{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 7730, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00517464424320828, "grad_norm": 4.884782314300537, "learning_rate": 3.0172413793103453e-07, "loss": 0.4996, "step": 8 }, { "epoch": 0.01034928848641656, "grad_norm": 10.670414924621582, "learning_rate": 6.465517241379311e-07, "loss": 0.4662, "step": 16 }, { "epoch": 0.015523932729624839, "grad_norm": 2.3693668842315674, "learning_rate": 9.913793103448276e-07, "loss": 0.4448, "step": 24 }, { "epoch": 0.02069857697283312, "grad_norm": 4.083646297454834, "learning_rate": 1.336206896551724e-06, "loss": 0.4917, "step": 32 }, { "epoch": 0.0258732212160414, "grad_norm": 8.517114639282227, "learning_rate": 1.681034482758621e-06, "loss": 0.5036, "step": 40 }, { "epoch": 0.031047865459249677, "grad_norm": 6.137472629547119, "learning_rate": 2.025862068965517e-06, "loss": 0.4726, "step": 48 }, { "epoch": 0.03622250970245795, "grad_norm": 2.339662551879883, "learning_rate": 2.370689655172414e-06, "loss": 0.4691, "step": 56 }, { "epoch": 0.04139715394566624, "grad_norm": 1.5891815423965454, "learning_rate": 2.7155172413793105e-06, "loss": 0.4667, "step": 64 }, { "epoch": 0.04657179818887452, "grad_norm": 8.638023376464844, "learning_rate": 3.0603448275862068e-06, "loss": 0.4776, "step": 72 }, { "epoch": 0.0517464424320828, "grad_norm": 1.5453442335128784, "learning_rate": 3.4051724137931034e-06, "loss": 0.4724, "step": 80 }, { "epoch": 0.056921086675291076, "grad_norm": 3.350733757019043, "learning_rate": 3.7500000000000005e-06, "loss": 0.4675, "step": 88 }, { "epoch": 0.062095730918499355, "grad_norm": 4.810270309448242, "learning_rate": 4.094827586206897e-06, "loss": 0.4592, "step": 96 }, { "epoch": 0.06727037516170763, "grad_norm": 1.514150619506836, "learning_rate": 4.439655172413794e-06, "loss": 0.4825, "step": 104 }, { "epoch": 0.0724450194049159, "grad_norm": 8.344229698181152, "learning_rate": 4.78448275862069e-06, "loss": 0.4604, "step": 112 }, { "epoch": 0.07761966364812418, "grad_norm": 6.710835933685303, "learning_rate": 5.129310344827587e-06, "loss": 0.4659, "step": 120 }, { "epoch": 0.08279430789133248, "grad_norm": 3.7732086181640625, "learning_rate": 5.474137931034483e-06, "loss": 0.4575, "step": 128 }, { "epoch": 0.08796895213454076, "grad_norm": 1.9706354141235352, "learning_rate": 5.81896551724138e-06, "loss": 0.4473, "step": 136 }, { "epoch": 0.09314359637774904, "grad_norm": 1.358317494392395, "learning_rate": 6.163793103448276e-06, "loss": 0.4724, "step": 144 }, { "epoch": 0.09831824062095731, "grad_norm": 2.7171244621276855, "learning_rate": 6.508620689655173e-06, "loss": 0.4687, "step": 152 }, { "epoch": 0.1034928848641656, "grad_norm": 1.794846773147583, "learning_rate": 6.853448275862069e-06, "loss": 0.466, "step": 160 }, { "epoch": 0.10866752910737387, "grad_norm": 2.4740686416625977, "learning_rate": 7.198275862068966e-06, "loss": 0.4562, "step": 168 }, { "epoch": 0.11384217335058215, "grad_norm": 12.792396545410156, "learning_rate": 7.543103448275862e-06, "loss": 0.4553, "step": 176 }, { "epoch": 0.11901681759379043, "grad_norm": 4.893735408782959, "learning_rate": 7.88793103448276e-06, "loss": 0.4499, "step": 184 }, { "epoch": 0.12419146183699871, "grad_norm": 2.4978456497192383, "learning_rate": 8.232758620689656e-06, "loss": 0.4405, "step": 192 }, { "epoch": 0.129366106080207, "grad_norm": 1.71527099609375, "learning_rate": 8.577586206896551e-06, "loss": 0.4694, "step": 200 }, { "epoch": 0.13454075032341525, "grad_norm": 1.814359188079834, "learning_rate": 8.922413793103449e-06, "loss": 0.4213, "step": 208 }, { "epoch": 0.13971539456662355, "grad_norm": 1.0052739381790161, "learning_rate": 9.267241379310346e-06, "loss": 0.4165, "step": 216 }, { "epoch": 0.1448900388098318, "grad_norm": 3.3144054412841797, "learning_rate": 9.612068965517242e-06, "loss": 0.4324, "step": 224 }, { "epoch": 0.1500646830530401, "grad_norm": 14.789776802062988, "learning_rate": 9.95689655172414e-06, "loss": 0.4362, "step": 232 }, { "epoch": 0.15523932729624837, "grad_norm": 2.4650723934173584, "learning_rate": 9.999978494742326e-06, "loss": 0.4394, "step": 240 }, { "epoch": 0.16041397153945666, "grad_norm": 1.6728233098983765, "learning_rate": 9.999901251622079e-06, "loss": 0.4686, "step": 248 }, { "epoch": 0.16558861578266496, "grad_norm": 1.9105558395385742, "learning_rate": 9.999767832624e-06, "loss": 0.4479, "step": 256 }, { "epoch": 0.17076326002587322, "grad_norm": 2.219156503677368, "learning_rate": 9.999578239247104e-06, "loss": 0.4562, "step": 264 }, { "epoch": 0.1759379042690815, "grad_norm": 2.4550633430480957, "learning_rate": 9.999332473621544e-06, "loss": 0.4546, "step": 272 }, { "epoch": 0.18111254851228978, "grad_norm": 2.5587055683135986, "learning_rate": 9.999030538508598e-06, "loss": 0.4527, "step": 280 }, { "epoch": 0.18628719275549807, "grad_norm": 3.2998480796813965, "learning_rate": 9.99867243730063e-06, "loss": 0.4528, "step": 288 }, { "epoch": 0.19146183699870634, "grad_norm": 6.697995662689209, "learning_rate": 9.998258174021043e-06, "loss": 0.4488, "step": 296 }, { "epoch": 0.19663648124191463, "grad_norm": 2.801957607269287, "learning_rate": 9.997787753324253e-06, "loss": 0.4644, "step": 304 }, { "epoch": 0.2018111254851229, "grad_norm": 7.488362789154053, "learning_rate": 9.997261180495623e-06, "loss": 0.4567, "step": 312 }, { "epoch": 0.2069857697283312, "grad_norm": 11.816390991210938, "learning_rate": 9.996678461451408e-06, "loss": 0.4542, "step": 320 }, { "epoch": 0.21216041397153945, "grad_norm": 4.340507984161377, "learning_rate": 9.996039602738688e-06, "loss": 0.4299, "step": 328 }, { "epoch": 0.21733505821474774, "grad_norm": 2.5847530364990234, "learning_rate": 9.995344611535295e-06, "loss": 0.4199, "step": 336 }, { "epoch": 0.222509702457956, "grad_norm": 7.996049404144287, "learning_rate": 9.994593495649733e-06, "loss": 0.4006, "step": 344 }, { "epoch": 0.2276843467011643, "grad_norm": 8.294257164001465, "learning_rate": 9.993786263521083e-06, "loss": 0.3779, "step": 352 }, { "epoch": 0.23285899094437257, "grad_norm": 8.907233238220215, "learning_rate": 9.992922924218924e-06, "loss": 0.3424, "step": 360 }, { "epoch": 0.23803363518758086, "grad_norm": 20.33425521850586, "learning_rate": 9.99200348744321e-06, "loss": 0.3199, "step": 368 }, { "epoch": 0.24320827943078913, "grad_norm": 13.30887508392334, "learning_rate": 9.991027963524188e-06, "loss": 0.2968, "step": 376 }, { "epoch": 0.24838292367399742, "grad_norm": 24.578723907470703, "learning_rate": 9.989996363422246e-06, "loss": 0.2971, "step": 384 }, { "epoch": 0.2535575679172057, "grad_norm": 7.940666198730469, "learning_rate": 9.988908698727828e-06, "loss": 0.2772, "step": 392 }, { "epoch": 0.258732212160414, "grad_norm": 48.541175842285156, "learning_rate": 9.987764981661278e-06, "loss": 0.2937, "step": 400 }, { "epoch": 0.26390685640362227, "grad_norm": 8.2228364944458, "learning_rate": 9.986565225072713e-06, "loss": 0.2775, "step": 408 }, { "epoch": 0.2690815006468305, "grad_norm": 37.225929260253906, "learning_rate": 9.98530944244187e-06, "loss": 0.2628, "step": 416 }, { "epoch": 0.2742561448900388, "grad_norm": 28.484724044799805, "learning_rate": 9.983997647877973e-06, "loss": 0.2671, "step": 424 }, { "epoch": 0.2794307891332471, "grad_norm": 7.030237197875977, "learning_rate": 9.98262985611955e-06, "loss": 0.2598, "step": 432 }, { "epoch": 0.2846054333764554, "grad_norm": 6.647619247436523, "learning_rate": 9.981206082534287e-06, "loss": 0.2519, "step": 440 }, { "epoch": 0.2897800776196636, "grad_norm": 17.09227180480957, "learning_rate": 9.979726343118847e-06, "loss": 0.2567, "step": 448 }, { "epoch": 0.2949547218628719, "grad_norm": 24.190093994140625, "learning_rate": 9.978190654498687e-06, "loss": 0.2476, "step": 456 }, { "epoch": 0.3001293661060802, "grad_norm": 160.548095703125, "learning_rate": 9.976599033927884e-06, "loss": 0.2546, "step": 464 }, { "epoch": 0.3053040103492885, "grad_norm": 5.440623760223389, "learning_rate": 9.974951499288925e-06, "loss": 0.2497, "step": 472 }, { "epoch": 0.31047865459249674, "grad_norm": 21.864320755004883, "learning_rate": 9.973248069092516e-06, "loss": 0.2766, "step": 480 }, { "epoch": 0.31565329883570503, "grad_norm": 1.826672911643982, "learning_rate": 9.971488762477373e-06, "loss": 0.2578, "step": 488 }, { "epoch": 0.3208279430789133, "grad_norm": 11.376463890075684, "learning_rate": 9.969673599210006e-06, "loss": 0.2619, "step": 496 }, { "epoch": 0.3260025873221216, "grad_norm": 4.004533290863037, "learning_rate": 9.967802599684494e-06, "loss": 0.2586, "step": 504 }, { "epoch": 0.3311772315653299, "grad_norm": 14.940202713012695, "learning_rate": 9.965875784922261e-06, "loss": 0.2624, "step": 512 }, { "epoch": 0.33635187580853815, "grad_norm": 5.1946940422058105, "learning_rate": 9.963893176571836e-06, "loss": 0.2569, "step": 520 }, { "epoch": 0.34152652005174644, "grad_norm": 6.96012544631958, "learning_rate": 9.961854796908615e-06, "loss": 0.2501, "step": 528 }, { "epoch": 0.34670116429495473, "grad_norm": 20.033016204833984, "learning_rate": 9.959760668834601e-06, "loss": 0.2466, "step": 536 }, { "epoch": 0.351875808538163, "grad_norm": 5.902473449707031, "learning_rate": 9.957610815878156e-06, "loss": 0.2424, "step": 544 }, { "epoch": 0.35705045278137126, "grad_norm": 2.925157070159912, "learning_rate": 9.955405262193731e-06, "loss": 0.2429, "step": 552 }, { "epoch": 0.36222509702457956, "grad_norm": 3.106954336166382, "learning_rate": 9.9531440325616e-06, "loss": 0.2367, "step": 560 }, { "epoch": 0.36739974126778785, "grad_norm": 8.999578475952148, "learning_rate": 9.950827152387575e-06, "loss": 0.2428, "step": 568 }, { "epoch": 0.37257438551099614, "grad_norm": 8.012633323669434, "learning_rate": 9.948454647702727e-06, "loss": 0.2447, "step": 576 }, { "epoch": 0.3777490297542044, "grad_norm": 17.479957580566406, "learning_rate": 9.94602654516309e-06, "loss": 0.2539, "step": 584 }, { "epoch": 0.3829236739974127, "grad_norm": 111.3963851928711, "learning_rate": 9.94354287204936e-06, "loss": 0.2536, "step": 592 }, { "epoch": 0.38809831824062097, "grad_norm": 67.85525512695312, "learning_rate": 9.941003656266589e-06, "loss": 0.263, "step": 600 }, { "epoch": 0.39327296248382926, "grad_norm": 5.47231912612915, "learning_rate": 9.93840892634388e-06, "loss": 0.2395, "step": 608 }, { "epoch": 0.3984476067270375, "grad_norm": 20.308883666992188, "learning_rate": 9.935758711434052e-06, "loss": 0.2433, "step": 616 }, { "epoch": 0.4036222509702458, "grad_norm": 5.908266544342041, "learning_rate": 9.933053041313325e-06, "loss": 0.2531, "step": 624 }, { "epoch": 0.4087968952134541, "grad_norm": 4.62359094619751, "learning_rate": 9.930291946380977e-06, "loss": 0.2572, "step": 632 }, { "epoch": 0.4139715394566624, "grad_norm": 33.82321548461914, "learning_rate": 9.927475457659007e-06, "loss": 0.252, "step": 640 }, { "epoch": 0.4191461836998706, "grad_norm": 2.1320619583129883, "learning_rate": 9.924603606791786e-06, "loss": 0.2484, "step": 648 }, { "epoch": 0.4243208279430789, "grad_norm": 5.690158367156982, "learning_rate": 9.921676426045698e-06, "loss": 0.242, "step": 656 }, { "epoch": 0.4294954721862872, "grad_norm": 4.768186092376709, "learning_rate": 9.918693948308783e-06, "loss": 0.2258, "step": 664 }, { "epoch": 0.4346701164294955, "grad_norm": 7.269800662994385, "learning_rate": 9.915656207090367e-06, "loss": 0.2397, "step": 672 }, { "epoch": 0.4398447606727037, "grad_norm": 30.480161666870117, "learning_rate": 9.912563236520675e-06, "loss": 0.2296, "step": 680 }, { "epoch": 0.445019404915912, "grad_norm": 22.762638092041016, "learning_rate": 9.909415071350464e-06, "loss": 0.233, "step": 688 }, { "epoch": 0.4501940491591203, "grad_norm": 13.787392616271973, "learning_rate": 9.90621174695062e-06, "loss": 0.2333, "step": 696 }, { "epoch": 0.4553686934023286, "grad_norm": 3.74238657951355, "learning_rate": 9.902953299311763e-06, "loss": 0.236, "step": 704 }, { "epoch": 0.46054333764553684, "grad_norm": 71.07587432861328, "learning_rate": 9.899639765043854e-06, "loss": 0.2549, "step": 712 }, { "epoch": 0.46571798188874514, "grad_norm": 3.8564956188201904, "learning_rate": 9.89627118137576e-06, "loss": 0.2488, "step": 720 }, { "epoch": 0.47089262613195343, "grad_norm": 3.945932626724243, "learning_rate": 9.892847586154863e-06, "loss": 0.2435, "step": 728 }, { "epoch": 0.4760672703751617, "grad_norm": 8.651078224182129, "learning_rate": 9.889369017846616e-06, "loss": 0.2301, "step": 736 }, { "epoch": 0.48124191461837, "grad_norm": 23.654037475585938, "learning_rate": 9.88583551553411e-06, "loss": 0.2408, "step": 744 }, { "epoch": 0.48641655886157825, "grad_norm": 9.601846694946289, "learning_rate": 9.882247118917656e-06, "loss": 0.2578, "step": 752 }, { "epoch": 0.49159120310478654, "grad_norm": 7.171313762664795, "learning_rate": 9.87860386831431e-06, "loss": 0.2412, "step": 760 }, { "epoch": 0.49676584734799484, "grad_norm": 13.73882007598877, "learning_rate": 9.874905804657445e-06, "loss": 0.235, "step": 768 }, { "epoch": 0.5019404915912031, "grad_norm": 8.866873741149902, "learning_rate": 9.871152969496274e-06, "loss": 0.2259, "step": 776 }, { "epoch": 0.5071151358344114, "grad_norm": 2.9023540019989014, "learning_rate": 9.867345404995393e-06, "loss": 0.2382, "step": 784 }, { "epoch": 0.5122897800776197, "grad_norm": 8.159908294677734, "learning_rate": 9.8634831539343e-06, "loss": 0.2362, "step": 792 }, { "epoch": 0.517464424320828, "grad_norm": 5.698594093322754, "learning_rate": 9.85956625970692e-06, "loss": 0.2312, "step": 800 }, { "epoch": 0.5226390685640362, "grad_norm": 1.7953461408615112, "learning_rate": 9.855594766321122e-06, "loss": 0.2341, "step": 808 }, { "epoch": 0.5278137128072445, "grad_norm": 6.394684791564941, "learning_rate": 9.85156871839821e-06, "loss": 0.2467, "step": 816 }, { "epoch": 0.5329883570504528, "grad_norm": 3.978597402572632, "learning_rate": 9.847488161172429e-06, "loss": 0.2411, "step": 824 }, { "epoch": 0.538163001293661, "grad_norm": 49.182125091552734, "learning_rate": 9.843353140490466e-06, "loss": 0.2394, "step": 832 }, { "epoch": 0.5433376455368694, "grad_norm": 95.73147583007812, "learning_rate": 9.839163702810922e-06, "loss": 0.2247, "step": 840 }, { "epoch": 0.5485122897800776, "grad_norm": 14.831591606140137, "learning_rate": 9.834919895203789e-06, "loss": 0.2471, "step": 848 }, { "epoch": 0.553686934023286, "grad_norm": 5.494821548461914, "learning_rate": 9.83062176534994e-06, "loss": 0.244, "step": 856 }, { "epoch": 0.5588615782664942, "grad_norm": 2.4115333557128906, "learning_rate": 9.826269361540565e-06, "loss": 0.2532, "step": 864 }, { "epoch": 0.5640362225097024, "grad_norm": 13.977892875671387, "learning_rate": 9.821862732676655e-06, "loss": 0.2507, "step": 872 }, { "epoch": 0.5692108667529108, "grad_norm": 9.79362964630127, "learning_rate": 9.817401928268435e-06, "loss": 0.2237, "step": 880 }, { "epoch": 0.574385510996119, "grad_norm": 1.0879400968551636, "learning_rate": 9.812886998434817e-06, "loss": 0.2281, "step": 888 }, { "epoch": 0.5795601552393272, "grad_norm": 7.494878768920898, "learning_rate": 9.80831799390283e-06, "loss": 0.2375, "step": 896 }, { "epoch": 0.5847347994825356, "grad_norm": 6.011063098907471, "learning_rate": 9.803694966007059e-06, "loss": 0.2363, "step": 904 }, { "epoch": 0.5899094437257438, "grad_norm": 3.7052977085113525, "learning_rate": 9.799017966689057e-06, "loss": 0.2217, "step": 912 }, { "epoch": 0.5950840879689522, "grad_norm": 1.6096714735031128, "learning_rate": 9.794287048496771e-06, "loss": 0.2399, "step": 920 }, { "epoch": 0.6002587322121604, "grad_norm": 4.586109638214111, "learning_rate": 9.789502264583949e-06, "loss": 0.2511, "step": 928 }, { "epoch": 0.6054333764553687, "grad_norm": 6.05584192276001, "learning_rate": 9.784663668709537e-06, "loss": 0.2411, "step": 936 }, { "epoch": 0.610608020698577, "grad_norm": 10.09545612335205, "learning_rate": 9.779771315237086e-06, "loss": 0.2565, "step": 944 }, { "epoch": 0.6157826649417852, "grad_norm": 21.93640899658203, "learning_rate": 9.77482525913413e-06, "loss": 0.2294, "step": 952 }, { "epoch": 0.6209573091849935, "grad_norm": 12.018024444580078, "learning_rate": 9.769825555971575e-06, "loss": 0.2268, "step": 960 }, { "epoch": 0.6261319534282018, "grad_norm": 17.39588165283203, "learning_rate": 9.764772261923074e-06, "loss": 0.2349, "step": 968 }, { "epoch": 0.6313065976714101, "grad_norm": 15.817480087280273, "learning_rate": 9.759665433764393e-06, "loss": 0.2238, "step": 976 }, { "epoch": 0.6364812419146184, "grad_norm": 6.889886379241943, "learning_rate": 9.754505128872778e-06, "loss": 0.2409, "step": 984 }, { "epoch": 0.6416558861578266, "grad_norm": 5.029130935668945, "learning_rate": 9.749291405226304e-06, "loss": 0.2388, "step": 992 }, { "epoch": 0.6468305304010349, "grad_norm": 7.987931728363037, "learning_rate": 9.744024321403229e-06, "loss": 0.2306, "step": 1000 }, { "epoch": 0.6520051746442432, "grad_norm": 3.22019100189209, "learning_rate": 9.738703936581333e-06, "loss": 0.2444, "step": 1008 }, { "epoch": 0.6571798188874515, "grad_norm": 13.65085220336914, "learning_rate": 9.733330310537255e-06, "loss": 0.248, "step": 1016 }, { "epoch": 0.6623544631306598, "grad_norm": 12.927434921264648, "learning_rate": 9.727903503645818e-06, "loss": 0.2226, "step": 1024 }, { "epoch": 0.6675291073738681, "grad_norm": 1.27034592628479, "learning_rate": 9.722423576879354e-06, "loss": 0.237, "step": 1032 }, { "epoch": 0.6727037516170763, "grad_norm": 266.6712951660156, "learning_rate": 9.71689059180702e-06, "loss": 0.2384, "step": 1040 }, { "epoch": 0.6778783958602846, "grad_norm": 8.363792419433594, "learning_rate": 9.711304610594104e-06, "loss": 0.2278, "step": 1048 }, { "epoch": 0.6830530401034929, "grad_norm": 2.284507989883423, "learning_rate": 9.70566569600132e-06, "loss": 0.2452, "step": 1056 }, { "epoch": 0.6882276843467011, "grad_norm": 3.4681687355041504, "learning_rate": 9.699973911384119e-06, "loss": 0.2338, "step": 1064 }, { "epoch": 0.6934023285899095, "grad_norm": 21.047893524169922, "learning_rate": 9.694229320691961e-06, "loss": 0.2189, "step": 1072 }, { "epoch": 0.6985769728331177, "grad_norm": 7.996535778045654, "learning_rate": 9.688431988467609e-06, "loss": 0.2475, "step": 1080 }, { "epoch": 0.703751617076326, "grad_norm": 3.221071481704712, "learning_rate": 9.682581979846388e-06, "loss": 0.2501, "step": 1088 }, { "epoch": 0.7089262613195343, "grad_norm": 14.775407791137695, "learning_rate": 9.676679360555479e-06, "loss": 0.2245, "step": 1096 }, { "epoch": 0.7141009055627425, "grad_norm": 3.179734945297241, "learning_rate": 9.670724196913149e-06, "loss": 0.2314, "step": 1104 }, { "epoch": 0.7192755498059509, "grad_norm": 36.58845901489258, "learning_rate": 9.66471655582803e-06, "loss": 0.2414, "step": 1112 }, { "epoch": 0.7244501940491591, "grad_norm": 2.0989410877227783, "learning_rate": 9.658656504798361e-06, "loss": 0.2373, "step": 1120 }, { "epoch": 0.7296248382923674, "grad_norm": 67.86420440673828, "learning_rate": 9.652544111911218e-06, "loss": 0.2414, "step": 1128 }, { "epoch": 0.7347994825355757, "grad_norm": 64.04298400878906, "learning_rate": 9.646379445841769e-06, "loss": 0.2419, "step": 1136 }, { "epoch": 0.7399741267787839, "grad_norm": 5.908459663391113, "learning_rate": 9.640162575852487e-06, "loss": 0.2328, "step": 1144 }, { "epoch": 0.7451487710219923, "grad_norm": 5.114171981811523, "learning_rate": 9.633893571792375e-06, "loss": 0.2117, "step": 1152 }, { "epoch": 0.7503234152652005, "grad_norm": 8.035240173339844, "learning_rate": 9.627572504096188e-06, "loss": 0.2344, "step": 1160 }, { "epoch": 0.7554980595084088, "grad_norm": 3.9648971557617188, "learning_rate": 9.621199443783633e-06, "loss": 0.2248, "step": 1168 }, { "epoch": 0.7606727037516171, "grad_norm": 17.44873046875, "learning_rate": 9.614774462458573e-06, "loss": 0.2377, "step": 1176 }, { "epoch": 0.7658473479948253, "grad_norm": 10.615303039550781, "learning_rate": 9.608297632308233e-06, "loss": 0.2278, "step": 1184 }, { "epoch": 0.7710219922380336, "grad_norm": 29.23194122314453, "learning_rate": 9.601769026102368e-06, "loss": 0.2213, "step": 1192 }, { "epoch": 0.7761966364812419, "grad_norm": 21.02916717529297, "learning_rate": 9.595188717192466e-06, "loss": 0.2382, "step": 1200 }, { "epoch": 0.7813712807244502, "grad_norm": 9.737796783447266, "learning_rate": 9.58855677951091e-06, "loss": 0.2347, "step": 1208 }, { "epoch": 0.7865459249676585, "grad_norm": 59.887420654296875, "learning_rate": 9.581873287570164e-06, "loss": 0.2379, "step": 1216 }, { "epoch": 0.7917205692108668, "grad_norm": 25.03373146057129, "learning_rate": 9.575138316461909e-06, "loss": 0.2358, "step": 1224 }, { "epoch": 0.796895213454075, "grad_norm": 3.6164708137512207, "learning_rate": 9.568351941856223e-06, "loss": 0.2511, "step": 1232 }, { "epoch": 0.8020698576972833, "grad_norm": 86.80042266845703, "learning_rate": 9.561514240000724e-06, "loss": 0.2262, "step": 1240 }, { "epoch": 0.8072445019404916, "grad_norm": 6.826155185699463, "learning_rate": 9.554625287719711e-06, "loss": 0.231, "step": 1248 }, { "epoch": 0.8124191461836999, "grad_norm": 1.1142396926879883, "learning_rate": 9.547685162413298e-06, "loss": 0.2187, "step": 1256 }, { "epoch": 0.8175937904269082, "grad_norm": 1.7491034269332886, "learning_rate": 9.540693942056553e-06, "loss": 0.237, "step": 1264 }, { "epoch": 0.8227684346701164, "grad_norm": 43.96466064453125, "learning_rate": 9.533651705198616e-06, "loss": 0.2347, "step": 1272 }, { "epoch": 0.8279430789133247, "grad_norm": 2.50284481048584, "learning_rate": 9.526558530961817e-06, "loss": 0.2199, "step": 1280 }, { "epoch": 0.833117723156533, "grad_norm": 3.6089391708374023, "learning_rate": 9.519414499040785e-06, "loss": 0.2341, "step": 1288 }, { "epoch": 0.8382923673997412, "grad_norm": 2.025322914123535, "learning_rate": 9.51221968970156e-06, "loss": 0.2317, "step": 1296 }, { "epoch": 0.8434670116429496, "grad_norm": 3.1698544025421143, "learning_rate": 9.504974183780686e-06, "loss": 0.2133, "step": 1304 }, { "epoch": 0.8486416558861578, "grad_norm": 2.9666683673858643, "learning_rate": 9.497678062684301e-06, "loss": 0.2224, "step": 1312 }, { "epoch": 0.8538163001293662, "grad_norm": 6.248025894165039, "learning_rate": 9.490331408387225e-06, "loss": 0.2145, "step": 1320 }, { "epoch": 0.8589909443725744, "grad_norm": 5.850505828857422, "learning_rate": 9.482934303432038e-06, "loss": 0.2277, "step": 1328 }, { "epoch": 0.8641655886157826, "grad_norm": 8.283476829528809, "learning_rate": 9.475486830928155e-06, "loss": 0.2219, "step": 1336 }, { "epoch": 0.869340232858991, "grad_norm": 1.7238540649414062, "learning_rate": 9.467989074550891e-06, "loss": 0.2384, "step": 1344 }, { "epoch": 0.8745148771021992, "grad_norm": 2.008312225341797, "learning_rate": 9.46044111854052e-06, "loss": 0.2006, "step": 1352 }, { "epoch": 0.8796895213454075, "grad_norm": 5.544405937194824, "learning_rate": 9.452843047701324e-06, "loss": 0.2313, "step": 1360 }, { "epoch": 0.8848641655886158, "grad_norm": 2.758505344390869, "learning_rate": 9.44519494740065e-06, "loss": 0.2333, "step": 1368 }, { "epoch": 0.890038809831824, "grad_norm": 1.708716630935669, "learning_rate": 9.437496903567946e-06, "loss": 0.2276, "step": 1376 }, { "epoch": 0.8952134540750324, "grad_norm": 14.639440536499023, "learning_rate": 9.429749002693793e-06, "loss": 0.233, "step": 1384 }, { "epoch": 0.9003880983182406, "grad_norm": 18.10475730895996, "learning_rate": 9.421951331828938e-06, "loss": 0.2351, "step": 1392 }, { "epoch": 0.9055627425614489, "grad_norm": 15.641255378723145, "learning_rate": 9.414103978583312e-06, "loss": 0.2297, "step": 1400 }, { "epoch": 0.9107373868046572, "grad_norm": 13.53865909576416, "learning_rate": 9.406207031125048e-06, "loss": 0.2171, "step": 1408 }, { "epoch": 0.9159120310478654, "grad_norm": 7.33931827545166, "learning_rate": 9.398260578179487e-06, "loss": 0.2258, "step": 1416 }, { "epoch": 0.9210866752910737, "grad_norm": 1.1460613012313843, "learning_rate": 9.390264709028189e-06, "loss": 0.2223, "step": 1424 }, { "epoch": 0.926261319534282, "grad_norm": 3.664703607559204, "learning_rate": 9.382219513507922e-06, "loss": 0.2291, "step": 1432 }, { "epoch": 0.9314359637774903, "grad_norm": 2.028292655944824, "learning_rate": 9.374125082009654e-06, "loss": 0.2197, "step": 1440 }, { "epoch": 0.9366106080206986, "grad_norm": 1.853790044784546, "learning_rate": 9.365981505477541e-06, "loss": 0.2086, "step": 1448 }, { "epoch": 0.9417852522639069, "grad_norm": 49.15569305419922, "learning_rate": 9.3577888754079e-06, "loss": 0.2245, "step": 1456 }, { "epoch": 0.9469598965071151, "grad_norm": 3.0362915992736816, "learning_rate": 9.34954728384819e-06, "loss": 0.2304, "step": 1464 }, { "epoch": 0.9521345407503234, "grad_norm": 4.143098831176758, "learning_rate": 9.341256823395965e-06, "loss": 0.237, "step": 1472 }, { "epoch": 0.9573091849935317, "grad_norm": 5.784548759460449, "learning_rate": 9.332917587197844e-06, "loss": 0.2176, "step": 1480 }, { "epoch": 0.96248382923674, "grad_norm": 13.278392791748047, "learning_rate": 9.324529668948459e-06, "loss": 0.2303, "step": 1488 }, { "epoch": 0.9676584734799483, "grad_norm": 46.16282272338867, "learning_rate": 9.316093162889407e-06, "loss": 0.2226, "step": 1496 }, { "epoch": 0.9728331177231565, "grad_norm": 2.1848196983337402, "learning_rate": 9.307608163808189e-06, "loss": 0.2481, "step": 1504 }, { "epoch": 0.9780077619663649, "grad_norm": 3.2306673526763916, "learning_rate": 9.299074767037137e-06, "loss": 0.2144, "step": 1512 }, { "epoch": 0.9831824062095731, "grad_norm": 7.1894755363464355, "learning_rate": 9.290493068452357e-06, "loss": 0.2319, "step": 1520 }, { "epoch": 0.9883570504527813, "grad_norm": 13.211376190185547, "learning_rate": 9.281863164472647e-06, "loss": 0.2368, "step": 1528 }, { "epoch": 0.9935316946959897, "grad_norm": 3.837167739868164, "learning_rate": 9.273185152058406e-06, "loss": 0.237, "step": 1536 }, { "epoch": 0.9987063389391979, "grad_norm": 26.63664436340332, "learning_rate": 9.26445912871055e-06, "loss": 0.2236, "step": 1544 }, { "epoch": 1.0038809831824063, "grad_norm": 5.332308292388916, "learning_rate": 9.255685192469424e-06, "loss": 0.2325, "step": 1552 }, { "epoch": 1.0090556274256144, "grad_norm": 1.7717430591583252, "learning_rate": 9.246863441913685e-06, "loss": 0.2244, "step": 1560 }, { "epoch": 1.0142302716688227, "grad_norm": 36.790260314941406, "learning_rate": 9.237993976159211e-06, "loss": 0.2414, "step": 1568 }, { "epoch": 1.019404915912031, "grad_norm": 7.347696781158447, "learning_rate": 9.229076894857973e-06, "loss": 0.2339, "step": 1576 }, { "epoch": 1.0245795601552394, "grad_norm": 7.630255699157715, "learning_rate": 9.220112298196922e-06, "loss": 0.2127, "step": 1584 }, { "epoch": 1.0297542043984476, "grad_norm": 4.760798454284668, "learning_rate": 9.211100286896865e-06, "loss": 0.231, "step": 1592 }, { "epoch": 1.034928848641656, "grad_norm": 17.15314483642578, "learning_rate": 9.202040962211334e-06, "loss": 0.233, "step": 1600 }, { "epoch": 1.0401034928848643, "grad_norm": 7.275695323944092, "learning_rate": 9.19293442592544e-06, "loss": 0.2205, "step": 1608 }, { "epoch": 1.0452781371280724, "grad_norm": 10.593427658081055, "learning_rate": 9.183780780354736e-06, "loss": 0.2137, "step": 1616 }, { "epoch": 1.0504527813712807, "grad_norm": 84.94709014892578, "learning_rate": 9.174580128344073e-06, "loss": 0.2119, "step": 1624 }, { "epoch": 1.055627425614489, "grad_norm": 6.472126007080078, "learning_rate": 9.16533257326643e-06, "loss": 0.235, "step": 1632 }, { "epoch": 1.0608020698576972, "grad_norm": 2.1055634021759033, "learning_rate": 9.156038219021764e-06, "loss": 0.2329, "step": 1640 }, { "epoch": 1.0659767141009056, "grad_norm": 4.520327091217041, "learning_rate": 9.146697170035839e-06, "loss": 0.2199, "step": 1648 }, { "epoch": 1.071151358344114, "grad_norm": 2.18851375579834, "learning_rate": 9.137309531259054e-06, "loss": 0.2305, "step": 1656 }, { "epoch": 1.076326002587322, "grad_norm": 7.414587497711182, "learning_rate": 9.127875408165261e-06, "loss": 0.2172, "step": 1664 }, { "epoch": 1.0815006468305304, "grad_norm": 40.298065185546875, "learning_rate": 9.118394906750585e-06, "loss": 0.2222, "step": 1672 }, { "epoch": 1.0866752910737387, "grad_norm": 2.0129568576812744, "learning_rate": 9.108868133532224e-06, "loss": 0.2385, "step": 1680 }, { "epoch": 1.0918499353169469, "grad_norm": 10.248085975646973, "learning_rate": 9.099295195547264e-06, "loss": 0.2252, "step": 1688 }, { "epoch": 1.0970245795601552, "grad_norm": 3.0196053981781006, "learning_rate": 9.089676200351467e-06, "loss": 0.2266, "step": 1696 }, { "epoch": 1.1021992238033635, "grad_norm": 2.057884931564331, "learning_rate": 9.08001125601807e-06, "loss": 0.2353, "step": 1704 }, { "epoch": 1.107373868046572, "grad_norm": 3.0884134769439697, "learning_rate": 9.07030047113656e-06, "loss": 0.2137, "step": 1712 }, { "epoch": 1.11254851228978, "grad_norm": 2.399594783782959, "learning_rate": 9.060543954811464e-06, "loss": 0.2122, "step": 1720 }, { "epoch": 1.1177231565329884, "grad_norm": 20.144880294799805, "learning_rate": 9.050741816661128e-06, "loss": 0.222, "step": 1728 }, { "epoch": 1.1228978007761967, "grad_norm": 3.0582709312438965, "learning_rate": 9.040894166816461e-06, "loss": 0.2162, "step": 1736 }, { "epoch": 1.1280724450194048, "grad_norm": 3.313769578933716, "learning_rate": 9.031001115919732e-06, "loss": 0.23, "step": 1744 }, { "epoch": 1.1332470892626132, "grad_norm": 2.804013967514038, "learning_rate": 9.02106277512329e-06, "loss": 0.2367, "step": 1752 }, { "epoch": 1.1384217335058215, "grad_norm": 9.238598823547363, "learning_rate": 9.011079256088355e-06, "loss": 0.2371, "step": 1760 }, { "epoch": 1.1435963777490297, "grad_norm": 10.921507835388184, "learning_rate": 9.001050670983721e-06, "loss": 0.2327, "step": 1768 }, { "epoch": 1.148771021992238, "grad_norm": 1.8150590658187866, "learning_rate": 8.990977132484535e-06, "loss": 0.233, "step": 1776 }, { "epoch": 1.1539456662354464, "grad_norm": 1.9657435417175293, "learning_rate": 8.980858753771002e-06, "loss": 0.2172, "step": 1784 }, { "epoch": 1.1591203104786545, "grad_norm": 3.5203697681427, "learning_rate": 8.970695648527132e-06, "loss": 0.2129, "step": 1792 }, { "epoch": 1.1642949547218628, "grad_norm": 8.29598617553711, "learning_rate": 8.96048793093945e-06, "loss": 0.2325, "step": 1800 }, { "epoch": 1.1694695989650712, "grad_norm": 1.7340894937515259, "learning_rate": 8.950235715695717e-06, "loss": 0.2177, "step": 1808 }, { "epoch": 1.1746442432082795, "grad_norm": 8.192721366882324, "learning_rate": 8.93993911798365e-06, "loss": 0.2385, "step": 1816 }, { "epoch": 1.1798188874514877, "grad_norm": 15.727509498596191, "learning_rate": 8.929598253489617e-06, "loss": 0.2367, "step": 1824 }, { "epoch": 1.184993531694696, "grad_norm": 15.4706449508667, "learning_rate": 8.91921323839734e-06, "loss": 0.2373, "step": 1832 }, { "epoch": 1.1901681759379044, "grad_norm": 7.3651227951049805, "learning_rate": 8.908784189386589e-06, "loss": 0.2352, "step": 1840 }, { "epoch": 1.1953428201811125, "grad_norm": 15.465596199035645, "learning_rate": 8.898311223631876e-06, "loss": 0.223, "step": 1848 }, { "epoch": 1.2005174644243208, "grad_norm": 7.5635528564453125, "learning_rate": 8.887794458801137e-06, "loss": 0.2179, "step": 1856 }, { "epoch": 1.2056921086675292, "grad_norm": 9.759207725524902, "learning_rate": 8.8772340130544e-06, "loss": 0.2189, "step": 1864 }, { "epoch": 1.2108667529107373, "grad_norm": 2.7430686950683594, "learning_rate": 8.866630005042476e-06, "loss": 0.2354, "step": 1872 }, { "epoch": 1.2160413971539457, "grad_norm": 6.708351135253906, "learning_rate": 8.855982553905604e-06, "loss": 0.2191, "step": 1880 }, { "epoch": 1.221216041397154, "grad_norm": 155.43638610839844, "learning_rate": 8.845291779272131e-06, "loss": 0.226, "step": 1888 }, { "epoch": 1.2263906856403621, "grad_norm": 2.132230281829834, "learning_rate": 8.834557801257162e-06, "loss": 0.2087, "step": 1896 }, { "epoch": 1.2315653298835705, "grad_norm": 4.70977783203125, "learning_rate": 8.823780740461204e-06, "loss": 0.2122, "step": 1904 }, { "epoch": 1.2367399741267788, "grad_norm": 1.3468469381332397, "learning_rate": 8.81296071796882e-06, "loss": 0.2225, "step": 1912 }, { "epoch": 1.2419146183699872, "grad_norm": 3.733586549758911, "learning_rate": 8.80209785534726e-06, "loss": 0.2395, "step": 1920 }, { "epoch": 1.2470892626131953, "grad_norm": 4.492749214172363, "learning_rate": 8.791192274645107e-06, "loss": 0.2138, "step": 1928 }, { "epoch": 1.2522639068564037, "grad_norm": 19.35169219970703, "learning_rate": 8.780244098390891e-06, "loss": 0.2287, "step": 1936 }, { "epoch": 1.2574385510996118, "grad_norm": 1.480233073234558, "learning_rate": 8.769253449591728e-06, "loss": 0.2347, "step": 1944 }, { "epoch": 1.2626131953428201, "grad_norm": 8.458354949951172, "learning_rate": 8.758220451731922e-06, "loss": 0.2327, "step": 1952 }, { "epoch": 1.2677878395860285, "grad_norm": 10.992264747619629, "learning_rate": 8.74714522877159e-06, "loss": 0.221, "step": 1960 }, { "epoch": 1.2729624838292368, "grad_norm": 15.178800582885742, "learning_rate": 8.736027905145265e-06, "loss": 0.2282, "step": 1968 }, { "epoch": 1.278137128072445, "grad_norm": 22.52798843383789, "learning_rate": 8.724868605760497e-06, "loss": 0.2238, "step": 1976 }, { "epoch": 1.2833117723156533, "grad_norm": 2.1530232429504395, "learning_rate": 8.713667455996449e-06, "loss": 0.2304, "step": 1984 }, { "epoch": 1.2884864165588616, "grad_norm": 3.1006431579589844, "learning_rate": 8.70242458170249e-06, "loss": 0.2453, "step": 1992 }, { "epoch": 1.2936610608020698, "grad_norm": 35.956756591796875, "learning_rate": 8.691140109196782e-06, "loss": 0.2018, "step": 2000 }, { "epoch": 1.2988357050452781, "grad_norm": 32.85151672363281, "learning_rate": 8.67981416526486e-06, "loss": 0.2131, "step": 2008 }, { "epoch": 1.3040103492884865, "grad_norm": 3.432091236114502, "learning_rate": 8.668446877158205e-06, "loss": 0.2433, "step": 2016 }, { "epoch": 1.3091849935316948, "grad_norm": 14.427757263183594, "learning_rate": 8.657038372592815e-06, "loss": 0.2315, "step": 2024 }, { "epoch": 1.314359637774903, "grad_norm": 2.7362518310546875, "learning_rate": 8.645588779747775e-06, "loss": 0.2295, "step": 2032 }, { "epoch": 1.3195342820181113, "grad_norm": 3.2137720584869385, "learning_rate": 8.634098227263809e-06, "loss": 0.221, "step": 2040 }, { "epoch": 1.3247089262613194, "grad_norm": 31.10691261291504, "learning_rate": 8.622566844241846e-06, "loss": 0.2174, "step": 2048 }, { "epoch": 1.3298835705045278, "grad_norm": 4.460792541503906, "learning_rate": 8.610994760241555e-06, "loss": 0.2277, "step": 2056 }, { "epoch": 1.3350582147477361, "grad_norm": 1.419631838798523, "learning_rate": 8.599382105279899e-06, "loss": 0.2259, "step": 2064 }, { "epoch": 1.3402328589909445, "grad_norm": 6.29133939743042, "learning_rate": 8.58772900982967e-06, "loss": 0.2272, "step": 2072 }, { "epoch": 1.3454075032341526, "grad_norm": 4.922664642333984, "learning_rate": 8.576035604818031e-06, "loss": 0.216, "step": 2080 }, { "epoch": 1.350582147477361, "grad_norm": 4.346408843994141, "learning_rate": 8.564302021625033e-06, "loss": 0.212, "step": 2088 }, { "epoch": 1.3557567917205693, "grad_norm": 27.521772384643555, "learning_rate": 8.552528392082147e-06, "loss": 0.2423, "step": 2096 }, { "epoch": 1.3609314359637774, "grad_norm": 4.437674522399902, "learning_rate": 8.54071484847078e-06, "loss": 0.2133, "step": 2104 }, { "epoch": 1.3661060802069858, "grad_norm": 18.69135284423828, "learning_rate": 8.528861523520792e-06, "loss": 0.2248, "step": 2112 }, { "epoch": 1.371280724450194, "grad_norm": 1.3831626176834106, "learning_rate": 8.516968550408998e-06, "loss": 0.2158, "step": 2120 }, { "epoch": 1.3764553686934025, "grad_norm": 2.99196720123291, "learning_rate": 8.505036062757677e-06, "loss": 0.2301, "step": 2128 }, { "epoch": 1.3816300129366106, "grad_norm": 25.202573776245117, "learning_rate": 8.493064194633072e-06, "loss": 0.213, "step": 2136 }, { "epoch": 1.386804657179819, "grad_norm": 40.16120529174805, "learning_rate": 8.481053080543879e-06, "loss": 0.2394, "step": 2144 }, { "epoch": 1.391979301423027, "grad_norm": 32.01914596557617, "learning_rate": 8.469002855439741e-06, "loss": 0.2155, "step": 2152 }, { "epoch": 1.3971539456662354, "grad_norm": 11.138020515441895, "learning_rate": 8.456913654709725e-06, "loss": 0.2337, "step": 2160 }, { "epoch": 1.4023285899094438, "grad_norm": 4.366079807281494, "learning_rate": 8.444785614180807e-06, "loss": 0.2186, "step": 2168 }, { "epoch": 1.407503234152652, "grad_norm": 19.91827392578125, "learning_rate": 8.432618870116339e-06, "loss": 0.2493, "step": 2176 }, { "epoch": 1.4126778783958602, "grad_norm": 9.727320671081543, "learning_rate": 8.42041355921453e-06, "loss": 0.2207, "step": 2184 }, { "epoch": 1.4178525226390686, "grad_norm": 15.481773376464844, "learning_rate": 8.4081698186069e-06, "loss": 0.2179, "step": 2192 }, { "epoch": 1.4230271668822767, "grad_norm": 4.066092491149902, "learning_rate": 8.39588778585674e-06, "loss": 0.2304, "step": 2200 }, { "epoch": 1.428201811125485, "grad_norm": 35.265018463134766, "learning_rate": 8.383567598957567e-06, "loss": 0.2237, "step": 2208 }, { "epoch": 1.4333764553686934, "grad_norm": 10.66234302520752, "learning_rate": 8.37120939633158e-06, "loss": 0.2202, "step": 2216 }, { "epoch": 1.4385510996119018, "grad_norm": 1.3266409635543823, "learning_rate": 8.358813316828097e-06, "loss": 0.2194, "step": 2224 }, { "epoch": 1.44372574385511, "grad_norm": 4.940774440765381, "learning_rate": 8.346379499722e-06, "loss": 0.205, "step": 2232 }, { "epoch": 1.4489003880983182, "grad_norm": 4.44492769241333, "learning_rate": 8.333908084712163e-06, "loss": 0.2241, "step": 2240 }, { "epoch": 1.4540750323415266, "grad_norm": 2.2839596271514893, "learning_rate": 8.321399211919893e-06, "loss": 0.2245, "step": 2248 }, { "epoch": 1.4592496765847347, "grad_norm": 22.686996459960938, "learning_rate": 8.308853021887346e-06, "loss": 0.2472, "step": 2256 }, { "epoch": 1.464424320827943, "grad_norm": 3.752237319946289, "learning_rate": 8.296269655575956e-06, "loss": 0.2201, "step": 2264 }, { "epoch": 1.4695989650711514, "grad_norm": 16.266639709472656, "learning_rate": 8.283649254364843e-06, "loss": 0.2298, "step": 2272 }, { "epoch": 1.4747736093143597, "grad_norm": 1.5946576595306396, "learning_rate": 8.270991960049231e-06, "loss": 0.2144, "step": 2280 }, { "epoch": 1.4799482535575679, "grad_norm": 38.729488372802734, "learning_rate": 8.25829791483885e-06, "loss": 0.2181, "step": 2288 }, { "epoch": 1.4851228978007762, "grad_norm": 8.792683601379395, "learning_rate": 8.245567261356347e-06, "loss": 0.2204, "step": 2296 }, { "epoch": 1.4902975420439843, "grad_norm": 3.90529203414917, "learning_rate": 8.232800142635675e-06, "loss": 0.2179, "step": 2304 }, { "epoch": 1.4954721862871927, "grad_norm": 4.205716609954834, "learning_rate": 8.219996702120482e-06, "loss": 0.2391, "step": 2312 }, { "epoch": 1.500646830530401, "grad_norm": 12.205682754516602, "learning_rate": 8.207157083662516e-06, "loss": 0.2365, "step": 2320 }, { "epoch": 1.5058214747736094, "grad_norm": 3.6196138858795166, "learning_rate": 8.19428143151999e-06, "loss": 0.2246, "step": 2328 }, { "epoch": 1.5109961190168177, "grad_norm": 3.0989346504211426, "learning_rate": 8.181369890355975e-06, "loss": 0.2266, "step": 2336 }, { "epoch": 1.5161707632600259, "grad_norm": 3.95747447013855, "learning_rate": 8.16842260523677e-06, "loss": 0.2292, "step": 2344 }, { "epoch": 1.521345407503234, "grad_norm": 4.447324752807617, "learning_rate": 8.155439721630265e-06, "loss": 0.2128, "step": 2352 }, { "epoch": 1.5265200517464423, "grad_norm": 4.018327713012695, "learning_rate": 8.14242138540432e-06, "loss": 0.2098, "step": 2360 }, { "epoch": 1.5316946959896507, "grad_norm": 6.639673709869385, "learning_rate": 8.129367742825117e-06, "loss": 0.2232, "step": 2368 }, { "epoch": 1.536869340232859, "grad_norm": 17.369224548339844, "learning_rate": 8.116278940555517e-06, "loss": 0.2291, "step": 2376 }, { "epoch": 1.5420439844760674, "grad_norm": 6.68892765045166, "learning_rate": 8.103155125653419e-06, "loss": 0.2425, "step": 2384 }, { "epoch": 1.5472186287192755, "grad_norm": 34.08677673339844, "learning_rate": 8.089996445570097e-06, "loss": 0.2296, "step": 2392 }, { "epoch": 1.5523932729624839, "grad_norm": 25.89212417602539, "learning_rate": 8.076803048148553e-06, "loss": 0.2526, "step": 2400 }, { "epoch": 1.557567917205692, "grad_norm": 12.489855766296387, "learning_rate": 8.06357508162185e-06, "loss": 0.2218, "step": 2408 }, { "epoch": 1.5627425614489003, "grad_norm": 1.4443012475967407, "learning_rate": 8.050312694611451e-06, "loss": 0.2239, "step": 2416 }, { "epoch": 1.5679172056921087, "grad_norm": 9.047863960266113, "learning_rate": 8.037016036125542e-06, "loss": 0.2096, "step": 2424 }, { "epoch": 1.573091849935317, "grad_norm": 8.109607696533203, "learning_rate": 8.023685255557368e-06, "loss": 0.2118, "step": 2432 }, { "epoch": 1.5782664941785254, "grad_norm": 36.13426971435547, "learning_rate": 8.010320502683549e-06, "loss": 0.2083, "step": 2440 }, { "epoch": 1.5834411384217335, "grad_norm": 17.517616271972656, "learning_rate": 7.996921927662395e-06, "loss": 0.2078, "step": 2448 }, { "epoch": 1.5886157826649416, "grad_norm": 4.454436779022217, "learning_rate": 7.983489681032219e-06, "loss": 0.2428, "step": 2456 }, { "epoch": 1.59379042690815, "grad_norm": 26.744815826416016, "learning_rate": 7.970023913709652e-06, "loss": 0.2263, "step": 2464 }, { "epoch": 1.5989650711513583, "grad_norm": 2.163850784301758, "learning_rate": 7.956524776987945e-06, "loss": 0.2253, "step": 2472 }, { "epoch": 1.6041397153945667, "grad_norm": 2.051708936691284, "learning_rate": 7.94299242253526e-06, "loss": 0.2352, "step": 2480 }, { "epoch": 1.609314359637775, "grad_norm": 9.460783004760742, "learning_rate": 7.929427002392981e-06, "loss": 0.2407, "step": 2488 }, { "epoch": 1.6144890038809832, "grad_norm": 13.439695358276367, "learning_rate": 7.915828668973992e-06, "loss": 0.2189, "step": 2496 }, { "epoch": 1.6196636481241915, "grad_norm": 14.777132987976074, "learning_rate": 7.902197575060978e-06, "loss": 0.2232, "step": 2504 }, { "epoch": 1.6248382923673996, "grad_norm": 36.123809814453125, "learning_rate": 7.888533873804693e-06, "loss": 0.2258, "step": 2512 }, { "epoch": 1.630012936610608, "grad_norm": 1.7682701349258423, "learning_rate": 7.874837718722254e-06, "loss": 0.2339, "step": 2520 }, { "epoch": 1.6351875808538163, "grad_norm": 13.471175193786621, "learning_rate": 7.861109263695405e-06, "loss": 0.2441, "step": 2528 }, { "epoch": 1.6403622250970247, "grad_norm": 2.4435620307922363, "learning_rate": 7.847348662968796e-06, "loss": 0.2245, "step": 2536 }, { "epoch": 1.645536869340233, "grad_norm": 2.9155242443084717, "learning_rate": 7.833556071148245e-06, "loss": 0.2229, "step": 2544 }, { "epoch": 1.6507115135834411, "grad_norm": 1.8082088232040405, "learning_rate": 7.819731643199006e-06, "loss": 0.2273, "step": 2552 }, { "epoch": 1.6558861578266493, "grad_norm": 1.26021409034729, "learning_rate": 7.805875534444016e-06, "loss": 0.2234, "step": 2560 }, { "epoch": 1.6610608020698576, "grad_norm": 1.4038159847259521, "learning_rate": 7.79198790056217e-06, "loss": 0.2318, "step": 2568 }, { "epoch": 1.666235446313066, "grad_norm": 217.4139862060547, "learning_rate": 7.77806889758655e-06, "loss": 0.2318, "step": 2576 }, { "epoch": 1.6714100905562743, "grad_norm": 3.6848537921905518, "learning_rate": 7.764118681902688e-06, "loss": 0.2276, "step": 2584 }, { "epoch": 1.6765847347994827, "grad_norm": 5.489861965179443, "learning_rate": 7.750137410246803e-06, "loss": 0.2255, "step": 2592 }, { "epoch": 1.6817593790426908, "grad_norm": 3.472351312637329, "learning_rate": 7.73612523970404e-06, "loss": 0.2106, "step": 2600 }, { "epoch": 1.6869340232858991, "grad_norm": 4.789738178253174, "learning_rate": 7.722082327706701e-06, "loss": 0.2432, "step": 2608 }, { "epoch": 1.6921086675291073, "grad_norm": 1.9528974294662476, "learning_rate": 7.708008832032485e-06, "loss": 0.2263, "step": 2616 }, { "epoch": 1.6972833117723156, "grad_norm": 21.182388305664062, "learning_rate": 7.693904910802712e-06, "loss": 0.2346, "step": 2624 }, { "epoch": 1.702457956015524, "grad_norm": 12.879313468933105, "learning_rate": 7.679770722480539e-06, "loss": 0.2041, "step": 2632 }, { "epoch": 1.7076326002587323, "grad_norm": 8.2993745803833, "learning_rate": 7.665606425869194e-06, "loss": 0.2193, "step": 2640 }, { "epoch": 1.7128072445019404, "grad_norm": 8.905396461486816, "learning_rate": 7.651412180110176e-06, "loss": 0.2067, "step": 2648 }, { "epoch": 1.7179818887451488, "grad_norm": 2.264564037322998, "learning_rate": 7.637188144681478e-06, "loss": 0.2225, "step": 2656 }, { "epoch": 1.723156532988357, "grad_norm": 6.169471263885498, "learning_rate": 7.622934479395792e-06, "loss": 0.2128, "step": 2664 }, { "epoch": 1.7283311772315653, "grad_norm": 13.35564136505127, "learning_rate": 7.608651344398713e-06, "loss": 0.2185, "step": 2672 }, { "epoch": 1.7335058214747736, "grad_norm": 3.8094420433044434, "learning_rate": 7.5943389001669395e-06, "loss": 0.2038, "step": 2680 }, { "epoch": 1.738680465717982, "grad_norm": 7.062588214874268, "learning_rate": 7.579997307506472e-06, "loss": 0.2247, "step": 2688 }, { "epoch": 1.7438551099611903, "grad_norm": 4.3430962562561035, "learning_rate": 7.565626727550804e-06, "loss": 0.213, "step": 2696 }, { "epoch": 1.7490297542043984, "grad_norm": 0.8228983283042908, "learning_rate": 7.551227321759111e-06, "loss": 0.2116, "step": 2704 }, { "epoch": 1.7542043984476066, "grad_norm": 2.99155592918396, "learning_rate": 7.536799251914442e-06, "loss": 0.2295, "step": 2712 }, { "epoch": 1.759379042690815, "grad_norm": 1.8452483415603638, "learning_rate": 7.522342680121897e-06, "loss": 0.2174, "step": 2720 }, { "epoch": 1.7645536869340233, "grad_norm": 9.412191390991211, "learning_rate": 7.507857768806803e-06, "loss": 0.2125, "step": 2728 }, { "epoch": 1.7697283311772316, "grad_norm": 1.4425644874572754, "learning_rate": 7.4933446807129e-06, "loss": 0.2283, "step": 2736 }, { "epoch": 1.77490297542044, "grad_norm": 2.3930792808532715, "learning_rate": 7.4788035789005e-06, "loss": 0.2288, "step": 2744 }, { "epoch": 1.780077619663648, "grad_norm": 8.610286712646484, "learning_rate": 7.464234626744659e-06, "loss": 0.2197, "step": 2752 }, { "epoch": 1.7852522639068564, "grad_norm": 2.6517043113708496, "learning_rate": 7.449637987933347e-06, "loss": 0.2278, "step": 2760 }, { "epoch": 1.7904269081500646, "grad_norm": 1.6406168937683105, "learning_rate": 7.435013826465601e-06, "loss": 0.2227, "step": 2768 }, { "epoch": 1.795601552393273, "grad_norm": 11.188384056091309, "learning_rate": 7.420362306649691e-06, "loss": 0.2139, "step": 2776 }, { "epoch": 1.8007761966364813, "grad_norm": 4.76533317565918, "learning_rate": 7.405683593101263e-06, "loss": 0.2279, "step": 2784 }, { "epoch": 1.8059508408796896, "grad_norm": 3.7414357662200928, "learning_rate": 7.390977850741498e-06, "loss": 0.2098, "step": 2792 }, { "epoch": 1.811125485122898, "grad_norm": 37.51686096191406, "learning_rate": 7.376245244795255e-06, "loss": 0.2204, "step": 2800 }, { "epoch": 1.816300129366106, "grad_norm": 29.107175827026367, "learning_rate": 7.361485940789221e-06, "loss": 0.2254, "step": 2808 }, { "epoch": 1.8214747736093142, "grad_norm": 51.822723388671875, "learning_rate": 7.346700104550042e-06, "loss": 0.2304, "step": 2816 }, { "epoch": 1.8266494178525226, "grad_norm": 20.180925369262695, "learning_rate": 7.331887902202463e-06, "loss": 0.2262, "step": 2824 }, { "epoch": 1.831824062095731, "grad_norm": 8.97240161895752, "learning_rate": 7.317049500167466e-06, "loss": 0.253, "step": 2832 }, { "epoch": 1.8369987063389392, "grad_norm": 24.334049224853516, "learning_rate": 7.3021850651603955e-06, "loss": 0.219, "step": 2840 }, { "epoch": 1.8421733505821476, "grad_norm": 14.938766479492188, "learning_rate": 7.2872947641890854e-06, "loss": 0.232, "step": 2848 }, { "epoch": 1.8473479948253557, "grad_norm": 0.8988103270530701, "learning_rate": 7.272378764551988e-06, "loss": 0.213, "step": 2856 }, { "epoch": 1.852522639068564, "grad_norm": 25.765361785888672, "learning_rate": 7.257437233836285e-06, "loss": 0.2185, "step": 2864 }, { "epoch": 1.8576972833117722, "grad_norm": 21.85658836364746, "learning_rate": 7.242470339916014e-06, "loss": 0.2175, "step": 2872 }, { "epoch": 1.8628719275549805, "grad_norm": 4.687010288238525, "learning_rate": 7.227478250950178e-06, "loss": 0.2221, "step": 2880 }, { "epoch": 1.868046571798189, "grad_norm": 1.0599603652954102, "learning_rate": 7.212461135380855e-06, "loss": 0.214, "step": 2888 }, { "epoch": 1.8732212160413972, "grad_norm": 8.412078857421875, "learning_rate": 7.197419161931305e-06, "loss": 0.2103, "step": 2896 }, { "epoch": 1.8783958602846056, "grad_norm": 3.0134875774383545, "learning_rate": 7.182352499604081e-06, "loss": 0.2114, "step": 2904 }, { "epoch": 1.8835705045278137, "grad_norm": 0.7685543298721313, "learning_rate": 7.167261317679121e-06, "loss": 0.2036, "step": 2912 }, { "epoch": 1.8887451487710218, "grad_norm": 2.3020284175872803, "learning_rate": 7.1521457857118525e-06, "loss": 0.2265, "step": 2920 }, { "epoch": 1.8939197930142302, "grad_norm": 18.331098556518555, "learning_rate": 7.137006073531285e-06, "loss": 0.2318, "step": 2928 }, { "epoch": 1.8990944372574385, "grad_norm": 6.871685981750488, "learning_rate": 7.121842351238102e-06, "loss": 0.1977, "step": 2936 }, { "epoch": 1.9042690815006469, "grad_norm": 8.529058456420898, "learning_rate": 7.106654789202751e-06, "loss": 0.1992, "step": 2944 }, { "epoch": 1.9094437257438552, "grad_norm": 11.424173355102539, "learning_rate": 7.0914435580635286e-06, "loss": 0.2185, "step": 2952 }, { "epoch": 1.9146183699870634, "grad_norm": 5.662269592285156, "learning_rate": 7.076208828724661e-06, "loss": 0.217, "step": 2960 }, { "epoch": 1.9197930142302717, "grad_norm": 17.576711654663086, "learning_rate": 7.060950772354389e-06, "loss": 0.2251, "step": 2968 }, { "epoch": 1.9249676584734798, "grad_norm": 65.82316589355469, "learning_rate": 7.045669560383039e-06, "loss": 0.2131, "step": 2976 }, { "epoch": 1.9301423027166882, "grad_norm": 5.220096588134766, "learning_rate": 7.030365364501104e-06, "loss": 0.2263, "step": 2984 }, { "epoch": 1.9353169469598965, "grad_norm": 13.813321113586426, "learning_rate": 7.015038356657303e-06, "loss": 0.2229, "step": 2992 }, { "epoch": 1.9404915912031049, "grad_norm": 3.5180106163024902, "learning_rate": 6.9996887090566645e-06, "loss": 0.2055, "step": 3000 }, { "epoch": 1.9456662354463132, "grad_norm": 13.823914527893066, "learning_rate": 6.98431659415858e-06, "loss": 0.2298, "step": 3008 }, { "epoch": 1.9508408796895214, "grad_norm": 9.340493202209473, "learning_rate": 6.968922184674868e-06, "loss": 0.21, "step": 3016 }, { "epoch": 1.9560155239327295, "grad_norm": 8.113641738891602, "learning_rate": 6.95350565356784e-06, "loss": 0.2215, "step": 3024 }, { "epoch": 1.9611901681759378, "grad_norm": 4.113650321960449, "learning_rate": 6.93806717404835e-06, "loss": 0.2176, "step": 3032 }, { "epoch": 1.9663648124191462, "grad_norm": 44.47676086425781, "learning_rate": 6.922606919573851e-06, "loss": 0.2188, "step": 3040 }, { "epoch": 1.9715394566623545, "grad_norm": 4.712839126586914, "learning_rate": 6.907125063846447e-06, "loss": 0.2101, "step": 3048 }, { "epoch": 1.9767141009055629, "grad_norm": 13.299867630004883, "learning_rate": 6.891621780810941e-06, "loss": 0.2207, "step": 3056 }, { "epoch": 1.981888745148771, "grad_norm": 2.58492112159729, "learning_rate": 6.876097244652879e-06, "loss": 0.2233, "step": 3064 }, { "epoch": 1.9870633893919794, "grad_norm": 4.740981101989746, "learning_rate": 6.860551629796597e-06, "loss": 0.2386, "step": 3072 }, { "epoch": 1.9922380336351875, "grad_norm": 3.3356964588165283, "learning_rate": 6.844985110903255e-06, "loss": 0.208, "step": 3080 }, { "epoch": 1.9974126778783958, "grad_norm": 11.826971054077148, "learning_rate": 6.829397862868878e-06, "loss": 0.2142, "step": 3088 }, { "epoch": 2.002587322121604, "grad_norm": 5.242908477783203, "learning_rate": 6.8137900608223985e-06, "loss": 0.2276, "step": 3096 }, { "epoch": 2.0077619663648125, "grad_norm": 7.4030327796936035, "learning_rate": 6.798161880123671e-06, "loss": 0.2199, "step": 3104 }, { "epoch": 2.012936610608021, "grad_norm": 1.0129386186599731, "learning_rate": 6.78251349636152e-06, "loss": 0.2151, "step": 3112 }, { "epoch": 2.0181112548512288, "grad_norm": 12.936131477355957, "learning_rate": 6.766845085351755e-06, "loss": 0.2103, "step": 3120 }, { "epoch": 2.023285899094437, "grad_norm": 3.0860114097595215, "learning_rate": 6.751156823135203e-06, "loss": 0.2312, "step": 3128 }, { "epoch": 2.0284605433376455, "grad_norm": 10.499250411987305, "learning_rate": 6.735448885975724e-06, "loss": 0.2236, "step": 3136 }, { "epoch": 2.033635187580854, "grad_norm": 1.9619132280349731, "learning_rate": 6.7197214503582355e-06, "loss": 0.2222, "step": 3144 }, { "epoch": 2.038809831824062, "grad_norm": 5.836687088012695, "learning_rate": 6.703974692986729e-06, "loss": 0.2057, "step": 3152 }, { "epoch": 2.0439844760672705, "grad_norm": 1.3874237537384033, "learning_rate": 6.68820879078228e-06, "loss": 0.2332, "step": 3160 }, { "epoch": 2.049159120310479, "grad_norm": 1.146099328994751, "learning_rate": 6.672423920881068e-06, "loss": 0.2266, "step": 3168 }, { "epoch": 2.0543337645536868, "grad_norm": 3.993424654006958, "learning_rate": 6.6566202606323806e-06, "loss": 0.2172, "step": 3176 }, { "epoch": 2.059508408796895, "grad_norm": 57.19075393676758, "learning_rate": 6.640797987596621e-06, "loss": 0.2056, "step": 3184 }, { "epoch": 2.0646830530401035, "grad_norm": 2.535454511642456, "learning_rate": 6.6249572795433155e-06, "loss": 0.2082, "step": 3192 }, { "epoch": 2.069857697283312, "grad_norm": 2.062868595123291, "learning_rate": 6.609098314449116e-06, "loss": 0.2182, "step": 3200 }, { "epoch": 2.07503234152652, "grad_norm": 0.7799398303031921, "learning_rate": 6.593221270495797e-06, "loss": 0.2168, "step": 3208 }, { "epoch": 2.0802069857697285, "grad_norm": 57.6228141784668, "learning_rate": 6.5773263260682595e-06, "loss": 0.2344, "step": 3216 }, { "epoch": 2.0853816300129364, "grad_norm": 1.8435810804367065, "learning_rate": 6.561413659752521e-06, "loss": 0.2159, "step": 3224 }, { "epoch": 2.0905562742561448, "grad_norm": 4.932186126708984, "learning_rate": 6.545483450333712e-06, "loss": 0.214, "step": 3232 }, { "epoch": 2.095730918499353, "grad_norm": 2.2944324016571045, "learning_rate": 6.529535876794069e-06, "loss": 0.2212, "step": 3240 }, { "epoch": 2.1009055627425615, "grad_norm": 2.7697372436523438, "learning_rate": 6.5135711183109156e-06, "loss": 0.2193, "step": 3248 }, { "epoch": 2.10608020698577, "grad_norm": 2.9319217205047607, "learning_rate": 6.497589354254662e-06, "loss": 0.2292, "step": 3256 }, { "epoch": 2.111254851228978, "grad_norm": 4.4201154708862305, "learning_rate": 6.481590764186778e-06, "loss": 0.2141, "step": 3264 }, { "epoch": 2.116429495472186, "grad_norm": 14.810309410095215, "learning_rate": 6.465575527857781e-06, "loss": 0.1982, "step": 3272 }, { "epoch": 2.1216041397153944, "grad_norm": 3.787808656692505, "learning_rate": 6.44954382520522e-06, "loss": 0.2116, "step": 3280 }, { "epoch": 2.1267787839586028, "grad_norm": 7.30560827255249, "learning_rate": 6.433495836351643e-06, "loss": 0.2088, "step": 3288 }, { "epoch": 2.131953428201811, "grad_norm": 1.6610970497131348, "learning_rate": 6.417431741602585e-06, "loss": 0.2189, "step": 3296 }, { "epoch": 2.1371280724450195, "grad_norm": 3.0157978534698486, "learning_rate": 6.401351721444533e-06, "loss": 0.2197, "step": 3304 }, { "epoch": 2.142302716688228, "grad_norm": 29.640609741210938, "learning_rate": 6.385255956542907e-06, "loss": 0.2209, "step": 3312 }, { "epoch": 2.147477360931436, "grad_norm": 1.1481863260269165, "learning_rate": 6.369144627740023e-06, "loss": 0.2099, "step": 3320 }, { "epoch": 2.152652005174644, "grad_norm": 4.599576473236084, "learning_rate": 6.353017916053063e-06, "loss": 0.2159, "step": 3328 }, { "epoch": 2.1578266494178524, "grad_norm": 4.746689319610596, "learning_rate": 6.336876002672042e-06, "loss": 0.2289, "step": 3336 }, { "epoch": 2.1630012936610608, "grad_norm": 14.463395118713379, "learning_rate": 6.3207190689577745e-06, "loss": 0.219, "step": 3344 }, { "epoch": 2.168175937904269, "grad_norm": 7.709287166595459, "learning_rate": 6.304547296439831e-06, "loss": 0.2419, "step": 3352 }, { "epoch": 2.1733505821474774, "grad_norm": 4.06973934173584, "learning_rate": 6.288360866814504e-06, "loss": 0.2434, "step": 3360 }, { "epoch": 2.178525226390686, "grad_norm": 2.9332635402679443, "learning_rate": 6.272159961942764e-06, "loss": 0.2202, "step": 3368 }, { "epoch": 2.1836998706338937, "grad_norm": 7.125377655029297, "learning_rate": 6.255944763848215e-06, "loss": 0.2049, "step": 3376 }, { "epoch": 2.188874514877102, "grad_norm": 2.0361697673797607, "learning_rate": 6.239715454715054e-06, "loss": 0.2237, "step": 3384 }, { "epoch": 2.1940491591203104, "grad_norm": 25.10670280456543, "learning_rate": 6.223472216886021e-06, "loss": 0.2088, "step": 3392 }, { "epoch": 2.1992238033635187, "grad_norm": 16.464994430541992, "learning_rate": 6.2072152328603464e-06, "loss": 0.2224, "step": 3400 }, { "epoch": 2.204398447606727, "grad_norm": 3.6457226276397705, "learning_rate": 6.190944685291708e-06, "loss": 0.2081, "step": 3408 }, { "epoch": 2.2095730918499354, "grad_norm": 1.3341186046600342, "learning_rate": 6.174660756986175e-06, "loss": 0.211, "step": 3416 }, { "epoch": 2.214747736093144, "grad_norm": 14.349130630493164, "learning_rate": 6.158363630900155e-06, "loss": 0.225, "step": 3424 }, { "epoch": 2.2199223803363517, "grad_norm": 13.548636436462402, "learning_rate": 6.142053490138335e-06, "loss": 0.2251, "step": 3432 }, { "epoch": 2.22509702457956, "grad_norm": 8.892589569091797, "learning_rate": 6.1257305179516315e-06, "loss": 0.2518, "step": 3440 }, { "epoch": 2.2302716688227684, "grad_norm": 3.8122220039367676, "learning_rate": 6.109394897735121e-06, "loss": 0.2342, "step": 3448 }, { "epoch": 2.2354463130659767, "grad_norm": 6.099566459655762, "learning_rate": 6.093046813025995e-06, "loss": 0.2175, "step": 3456 }, { "epoch": 2.240620957309185, "grad_norm": 1.4928990602493286, "learning_rate": 6.0766864475014785e-06, "loss": 0.2383, "step": 3464 }, { "epoch": 2.2457956015523934, "grad_norm": 11.29719352722168, "learning_rate": 6.060313984976783e-06, "loss": 0.2389, "step": 3472 }, { "epoch": 2.2509702457956013, "grad_norm": 2.987386465072632, "learning_rate": 6.043929609403032e-06, "loss": 0.2113, "step": 3480 }, { "epoch": 2.2561448900388097, "grad_norm": 2.351633310317993, "learning_rate": 6.027533504865196e-06, "loss": 0.2235, "step": 3488 }, { "epoch": 2.261319534282018, "grad_norm": 5.436527729034424, "learning_rate": 6.011125855580026e-06, "loss": 0.2204, "step": 3496 }, { "epoch": 2.2664941785252264, "grad_norm": 1.6124496459960938, "learning_rate": 5.994706845893986e-06, "loss": 0.2247, "step": 3504 }, { "epoch": 2.2716688227684347, "grad_norm": 8.689626693725586, "learning_rate": 5.978276660281174e-06, "loss": 0.2345, "step": 3512 }, { "epoch": 2.276843467011643, "grad_norm": 29.871366500854492, "learning_rate": 5.961835483341255e-06, "loss": 0.2154, "step": 3520 }, { "epoch": 2.2820181112548514, "grad_norm": 11.956281661987305, "learning_rate": 5.945383499797388e-06, "loss": 0.2351, "step": 3528 }, { "epoch": 2.2871927554980593, "grad_norm": 1.748079538345337, "learning_rate": 5.928920894494147e-06, "loss": 0.2083, "step": 3536 }, { "epoch": 2.2923673997412677, "grad_norm": 22.06855010986328, "learning_rate": 5.912447852395444e-06, "loss": 0.2149, "step": 3544 }, { "epoch": 2.297542043984476, "grad_norm": 7.1412529945373535, "learning_rate": 5.8959645585824575e-06, "loss": 0.2176, "step": 3552 }, { "epoch": 2.3027166882276844, "grad_norm": 11.964238166809082, "learning_rate": 5.879471198251544e-06, "loss": 0.2235, "step": 3560 }, { "epoch": 2.3078913324708927, "grad_norm": 4.59617280960083, "learning_rate": 5.86296795671216e-06, "loss": 0.2066, "step": 3568 }, { "epoch": 2.313065976714101, "grad_norm": 27.847808837890625, "learning_rate": 5.846455019384787e-06, "loss": 0.2031, "step": 3576 }, { "epoch": 2.318240620957309, "grad_norm": 1.6919810771942139, "learning_rate": 5.8299325717988355e-06, "loss": 0.2163, "step": 3584 }, { "epoch": 2.3234152652005173, "grad_norm": 2.3351593017578125, "learning_rate": 5.813400799590573e-06, "loss": 0.2211, "step": 3592 }, { "epoch": 2.3285899094437257, "grad_norm": 23.962520599365234, "learning_rate": 5.7968598885010315e-06, "loss": 0.2116, "step": 3600 }, { "epoch": 2.333764553686934, "grad_norm": 19.117528915405273, "learning_rate": 5.780310024373923e-06, "loss": 0.2227, "step": 3608 }, { "epoch": 2.3389391979301424, "grad_norm": 48.07025146484375, "learning_rate": 5.763751393153545e-06, "loss": 0.2183, "step": 3616 }, { "epoch": 2.3441138421733507, "grad_norm": 1.309008240699768, "learning_rate": 5.747184180882704e-06, "loss": 0.2098, "step": 3624 }, { "epoch": 2.349288486416559, "grad_norm": 2.3568334579467773, "learning_rate": 5.730608573700613e-06, "loss": 0.2062, "step": 3632 }, { "epoch": 2.354463130659767, "grad_norm": 1.478174090385437, "learning_rate": 5.714024757840806e-06, "loss": 0.2175, "step": 3640 }, { "epoch": 2.3596377749029753, "grad_norm": 4.405735492706299, "learning_rate": 5.697432919629048e-06, "loss": 0.2204, "step": 3648 }, { "epoch": 2.3648124191461837, "grad_norm": 1.8726969957351685, "learning_rate": 5.680833245481234e-06, "loss": 0.2205, "step": 3656 }, { "epoch": 2.369987063389392, "grad_norm": 2.596064805984497, "learning_rate": 5.664225921901302e-06, "loss": 0.2197, "step": 3664 }, { "epoch": 2.3751617076326004, "grad_norm": 3.998558282852173, "learning_rate": 5.647611135479133e-06, "loss": 0.2132, "step": 3672 }, { "epoch": 2.3803363518758087, "grad_norm": 23.586936950683594, "learning_rate": 5.6309890728884555e-06, "loss": 0.2174, "step": 3680 }, { "epoch": 2.3855109961190166, "grad_norm": 41.73709487915039, "learning_rate": 5.614359920884751e-06, "loss": 0.2214, "step": 3688 }, { "epoch": 2.390685640362225, "grad_norm": 25.809192657470703, "learning_rate": 5.5977238663031495e-06, "loss": 0.2193, "step": 3696 }, { "epoch": 2.3958602846054333, "grad_norm": 2.3301281929016113, "learning_rate": 5.581081096056337e-06, "loss": 0.2192, "step": 3704 }, { "epoch": 2.4010349288486417, "grad_norm": 2.47526478767395, "learning_rate": 5.564431797132454e-06, "loss": 0.2042, "step": 3712 }, { "epoch": 2.40620957309185, "grad_norm": 24.38184928894043, "learning_rate": 5.547776156592989e-06, "loss": 0.2235, "step": 3720 }, { "epoch": 2.4113842173350584, "grad_norm": 1.7953455448150635, "learning_rate": 5.531114361570684e-06, "loss": 0.231, "step": 3728 }, { "epoch": 2.4165588615782667, "grad_norm": 2.250443935394287, "learning_rate": 5.514446599267429e-06, "loss": 0.2206, "step": 3736 }, { "epoch": 2.4217335058214746, "grad_norm": 3.726274251937866, "learning_rate": 5.497773056952159e-06, "loss": 0.2133, "step": 3744 }, { "epoch": 2.426908150064683, "grad_norm": 1.7468153238296509, "learning_rate": 5.481093921958749e-06, "loss": 0.2299, "step": 3752 }, { "epoch": 2.4320827943078913, "grad_norm": 15.300987243652344, "learning_rate": 5.4644093816839086e-06, "loss": 0.2238, "step": 3760 }, { "epoch": 2.4372574385510997, "grad_norm": 1.6782792806625366, "learning_rate": 5.44771962358508e-06, "loss": 0.2315, "step": 3768 }, { "epoch": 2.442432082794308, "grad_norm": 18.028644561767578, "learning_rate": 5.4310248351783264e-06, "loss": 0.2366, "step": 3776 }, { "epoch": 2.4476067270375164, "grad_norm": 4.083730220794678, "learning_rate": 5.414325204036237e-06, "loss": 0.207, "step": 3784 }, { "epoch": 2.4527813712807243, "grad_norm": 1.102040410041809, "learning_rate": 5.397620917785799e-06, "loss": 0.2198, "step": 3792 }, { "epoch": 2.4579560155239326, "grad_norm": 13.378700256347656, "learning_rate": 5.380912164106312e-06, "loss": 0.2193, "step": 3800 }, { "epoch": 2.463130659767141, "grad_norm": 1.2787953615188599, "learning_rate": 5.364199130727262e-06, "loss": 0.2146, "step": 3808 }, { "epoch": 2.4683053040103493, "grad_norm": 5.001540184020996, "learning_rate": 5.347482005426224e-06, "loss": 0.2128, "step": 3816 }, { "epoch": 2.4734799482535577, "grad_norm": 32.12523651123047, "learning_rate": 5.330760976026744e-06, "loss": 0.2146, "step": 3824 }, { "epoch": 2.478654592496766, "grad_norm": 2.327051877975464, "learning_rate": 5.314036230396233e-06, "loss": 0.2224, "step": 3832 }, { "epoch": 2.4838292367399744, "grad_norm": 26.68153953552246, "learning_rate": 5.297307956443856e-06, "loss": 0.2238, "step": 3840 }, { "epoch": 2.4890038809831823, "grad_norm": 2.472919225692749, "learning_rate": 5.28057634211842e-06, "loss": 0.2116, "step": 3848 }, { "epoch": 2.4941785252263906, "grad_norm": 90.6447982788086, "learning_rate": 5.2638415754062625e-06, "loss": 0.2207, "step": 3856 }, { "epoch": 2.499353169469599, "grad_norm": 1.0954121351242065, "learning_rate": 5.247103844329137e-06, "loss": 0.2277, "step": 3864 }, { "epoch": 2.5045278137128073, "grad_norm": 1.563537836074829, "learning_rate": 5.230363336942105e-06, "loss": 0.2093, "step": 3872 }, { "epoch": 2.5097024579560157, "grad_norm": 42.30322265625, "learning_rate": 5.213620241331424e-06, "loss": 0.2162, "step": 3880 }, { "epoch": 2.5148771021992236, "grad_norm": 1.3426226377487183, "learning_rate": 5.196874745612425e-06, "loss": 0.2232, "step": 3888 }, { "epoch": 2.520051746442432, "grad_norm": 9.39691162109375, "learning_rate": 5.180127037927408e-06, "loss": 0.2242, "step": 3896 }, { "epoch": 2.5252263906856403, "grad_norm": 4.963808536529541, "learning_rate": 5.163377306443527e-06, "loss": 0.2156, "step": 3904 }, { "epoch": 2.5304010349288486, "grad_norm": 5.381854057312012, "learning_rate": 5.146625739350671e-06, "loss": 0.2267, "step": 3912 }, { "epoch": 2.535575679172057, "grad_norm": 11.522015571594238, "learning_rate": 5.129872524859356e-06, "loss": 0.2337, "step": 3920 }, { "epoch": 2.5407503234152653, "grad_norm": 1.7935131788253784, "learning_rate": 5.1131178511986045e-06, "loss": 0.2213, "step": 3928 }, { "epoch": 2.5459249676584736, "grad_norm": 3.484163284301758, "learning_rate": 5.096361906613836e-06, "loss": 0.2215, "step": 3936 }, { "epoch": 2.551099611901682, "grad_norm": 1.6095975637435913, "learning_rate": 5.079604879364746e-06, "loss": 0.2164, "step": 3944 }, { "epoch": 2.55627425614489, "grad_norm": 1.0419116020202637, "learning_rate": 5.062846957723194e-06, "loss": 0.2071, "step": 3952 }, { "epoch": 2.5614489003880982, "grad_norm": 2.196648359298706, "learning_rate": 5.046088329971095e-06, "loss": 0.2071, "step": 3960 }, { "epoch": 2.5666235446313066, "grad_norm": 11.791790962219238, "learning_rate": 5.0293291843982896e-06, "loss": 0.2325, "step": 3968 }, { "epoch": 2.571798188874515, "grad_norm": 3.8279497623443604, "learning_rate": 5.012569709300441e-06, "loss": 0.2219, "step": 3976 }, { "epoch": 2.5769728331177233, "grad_norm": 6.66563081741333, "learning_rate": 4.995810092976912e-06, "loss": 0.2211, "step": 3984 }, { "epoch": 2.582147477360931, "grad_norm": 2.60664701461792, "learning_rate": 4.979050523728654e-06, "loss": 0.2128, "step": 3992 }, { "epoch": 2.5873221216041395, "grad_norm": 7.721459865570068, "learning_rate": 4.962291189856089e-06, "loss": 0.2089, "step": 4000 }, { "epoch": 2.592496765847348, "grad_norm": 4.121895790100098, "learning_rate": 4.945532279656993e-06, "loss": 0.2183, "step": 4008 }, { "epoch": 2.5976714100905562, "grad_norm": 4.680197238922119, "learning_rate": 4.9287739814243835e-06, "loss": 0.2173, "step": 4016 }, { "epoch": 2.6028460543337646, "grad_norm": 3.2336695194244385, "learning_rate": 4.912016483444403e-06, "loss": 0.2069, "step": 4024 }, { "epoch": 2.608020698576973, "grad_norm": 5.2549309730529785, "learning_rate": 4.8952599739942015e-06, "loss": 0.2342, "step": 4032 }, { "epoch": 2.6131953428201813, "grad_norm": 1.6960891485214233, "learning_rate": 4.878504641339822e-06, "loss": 0.2158, "step": 4040 }, { "epoch": 2.6183699870633896, "grad_norm": 11.856524467468262, "learning_rate": 4.861750673734085e-06, "loss": 0.2135, "step": 4048 }, { "epoch": 2.6235446313065975, "grad_norm": 25.278499603271484, "learning_rate": 4.8449982594144786e-06, "loss": 0.2054, "step": 4056 }, { "epoch": 2.628719275549806, "grad_norm": 9.263405799865723, "learning_rate": 4.828247586601035e-06, "loss": 0.2099, "step": 4064 }, { "epoch": 2.6338939197930142, "grad_norm": 5.896005153656006, "learning_rate": 4.811498843494222e-06, "loss": 0.207, "step": 4072 }, { "epoch": 2.6390685640362226, "grad_norm": 1.7929736375808716, "learning_rate": 4.794752218272824e-06, "loss": 0.2267, "step": 4080 }, { "epoch": 2.644243208279431, "grad_norm": 1.0446430444717407, "learning_rate": 4.7780078990918326e-06, "loss": 0.2206, "step": 4088 }, { "epoch": 2.649417852522639, "grad_norm": 5.339548110961914, "learning_rate": 4.761266074080326e-06, "loss": 0.2187, "step": 4096 }, { "epoch": 2.654592496765847, "grad_norm": 14.899389266967773, "learning_rate": 4.744526931339367e-06, "loss": 0.207, "step": 4104 }, { "epoch": 2.6597671410090555, "grad_norm": 1.1871739625930786, "learning_rate": 4.727790658939875e-06, "loss": 0.2211, "step": 4112 }, { "epoch": 2.664941785252264, "grad_norm": 1.4944238662719727, "learning_rate": 4.711057444920522e-06, "loss": 0.2206, "step": 4120 }, { "epoch": 2.6701164294954722, "grad_norm": 62.428558349609375, "learning_rate": 4.694327477285619e-06, "loss": 0.2163, "step": 4128 }, { "epoch": 2.6752910737386806, "grad_norm": 9.660483360290527, "learning_rate": 4.6776009440030035e-06, "loss": 0.2123, "step": 4136 }, { "epoch": 2.680465717981889, "grad_norm": 7.151432991027832, "learning_rate": 4.660878033001922e-06, "loss": 0.2163, "step": 4144 }, { "epoch": 2.6856403622250973, "grad_norm": 8.276389122009277, "learning_rate": 4.644158932170929e-06, "loss": 0.2239, "step": 4152 }, { "epoch": 2.690815006468305, "grad_norm": 9.777862548828125, "learning_rate": 4.627443829355765e-06, "loss": 0.22, "step": 4160 }, { "epoch": 2.6959896507115135, "grad_norm": 3.226142168045044, "learning_rate": 4.610732912357256e-06, "loss": 0.2278, "step": 4168 }, { "epoch": 2.701164294954722, "grad_norm": 4.0428385734558105, "learning_rate": 4.5940263689291955e-06, "loss": 0.2135, "step": 4176 }, { "epoch": 2.7063389391979302, "grad_norm": 2.2790136337280273, "learning_rate": 4.57732438677624e-06, "loss": 0.2022, "step": 4184 }, { "epoch": 2.7115135834411386, "grad_norm": 8.921182632446289, "learning_rate": 4.560627153551795e-06, "loss": 0.2195, "step": 4192 }, { "epoch": 2.7166882276843465, "grad_norm": 1.772870421409607, "learning_rate": 4.543934856855913e-06, "loss": 0.2088, "step": 4200 }, { "epoch": 2.721862871927555, "grad_norm": 3.5123379230499268, "learning_rate": 4.527247684233185e-06, "loss": 0.2105, "step": 4208 }, { "epoch": 2.727037516170763, "grad_norm": 2.6325693130493164, "learning_rate": 4.510565823170625e-06, "loss": 0.2158, "step": 4216 }, { "epoch": 2.7322121604139715, "grad_norm": 79.38518524169922, "learning_rate": 4.493889461095574e-06, "loss": 0.2012, "step": 4224 }, { "epoch": 2.73738680465718, "grad_norm": 12.619338035583496, "learning_rate": 4.477218785373587e-06, "loss": 0.2151, "step": 4232 }, { "epoch": 2.742561448900388, "grad_norm": 1.3559527397155762, "learning_rate": 4.460553983306332e-06, "loss": 0.2048, "step": 4240 }, { "epoch": 2.7477360931435966, "grad_norm": 9.754837036132812, "learning_rate": 4.443895242129484e-06, "loss": 0.2134, "step": 4248 }, { "epoch": 2.752910737386805, "grad_norm": 4.612194538116455, "learning_rate": 4.4272427490106215e-06, "loss": 0.2063, "step": 4256 }, { "epoch": 2.758085381630013, "grad_norm": 5.114107608795166, "learning_rate": 4.410596691047123e-06, "loss": 0.2185, "step": 4264 }, { "epoch": 2.763260025873221, "grad_norm": 9.316654205322266, "learning_rate": 4.3939572552640645e-06, "loss": 0.2153, "step": 4272 }, { "epoch": 2.7684346701164295, "grad_norm": 6.500330448150635, "learning_rate": 4.377324628612123e-06, "loss": 0.2101, "step": 4280 }, { "epoch": 2.773609314359638, "grad_norm": 1.7955437898635864, "learning_rate": 4.36069899796547e-06, "loss": 0.2072, "step": 4288 }, { "epoch": 2.778783958602846, "grad_norm": 113.924072265625, "learning_rate": 4.344080550119672e-06, "loss": 0.2066, "step": 4296 }, { "epoch": 2.783958602846054, "grad_norm": 2.6472039222717285, "learning_rate": 4.327469471789597e-06, "loss": 0.2122, "step": 4304 }, { "epoch": 2.7891332470892625, "grad_norm": 7.95417594909668, "learning_rate": 4.310865949607311e-06, "loss": 0.1984, "step": 4312 }, { "epoch": 2.794307891332471, "grad_norm": 2.1271450519561768, "learning_rate": 4.294270170119987e-06, "loss": 0.2263, "step": 4320 }, { "epoch": 2.799482535575679, "grad_norm": 1.0342472791671753, "learning_rate": 4.277682319787802e-06, "loss": 0.2248, "step": 4328 }, { "epoch": 2.8046571798188875, "grad_norm": 143.9209747314453, "learning_rate": 4.261102584981848e-06, "loss": 0.2026, "step": 4336 }, { "epoch": 2.809831824062096, "grad_norm": 7.666977405548096, "learning_rate": 4.244531151982034e-06, "loss": 0.2195, "step": 4344 }, { "epoch": 2.815006468305304, "grad_norm": 123.94723510742188, "learning_rate": 4.227968206974999e-06, "loss": 0.2207, "step": 4352 }, { "epoch": 2.8201811125485126, "grad_norm": 52.61326599121094, "learning_rate": 4.211413936052013e-06, "loss": 0.2026, "step": 4360 }, { "epoch": 2.8253557567917205, "grad_norm": 6.78623628616333, "learning_rate": 4.194868525206887e-06, "loss": 0.2131, "step": 4368 }, { "epoch": 2.830530401034929, "grad_norm": 1.9670113325119019, "learning_rate": 4.178332160333891e-06, "loss": 0.2268, "step": 4376 }, { "epoch": 2.835705045278137, "grad_norm": 1.38176691532135, "learning_rate": 4.161805027225655e-06, "loss": 0.2192, "step": 4384 }, { "epoch": 2.8408796895213455, "grad_norm": 3.1774935722351074, "learning_rate": 4.145287311571089e-06, "loss": 0.2164, "step": 4392 }, { "epoch": 2.8460543337645534, "grad_norm": 4.968908786773682, "learning_rate": 4.1287791989532935e-06, "loss": 0.2089, "step": 4400 }, { "epoch": 2.8512289780077618, "grad_norm": 16.97532081604004, "learning_rate": 4.1122808748474745e-06, "loss": 0.2143, "step": 4408 }, { "epoch": 2.85640362225097, "grad_norm": 0.9508041739463806, "learning_rate": 4.095792524618861e-06, "loss": 0.2205, "step": 4416 }, { "epoch": 2.8615782664941785, "grad_norm": 15.078865051269531, "learning_rate": 4.079314333520623e-06, "loss": 0.2224, "step": 4424 }, { "epoch": 2.866752910737387, "grad_norm": 5.82922887802124, "learning_rate": 4.062846486691784e-06, "loss": 0.1991, "step": 4432 }, { "epoch": 2.871927554980595, "grad_norm": 1.030604362487793, "learning_rate": 4.04638916915515e-06, "loss": 0.2134, "step": 4440 }, { "epoch": 2.8771021992238035, "grad_norm": 1.6634279489517212, "learning_rate": 4.0299425658152255e-06, "loss": 0.2113, "step": 4448 }, { "epoch": 2.882276843467012, "grad_norm": 28.07861328125, "learning_rate": 4.013506861456136e-06, "loss": 0.2113, "step": 4456 }, { "epoch": 2.88745148771022, "grad_norm": 6.467888355255127, "learning_rate": 3.997082240739551e-06, "loss": 0.2299, "step": 4464 }, { "epoch": 2.892626131953428, "grad_norm": 2.268150568008423, "learning_rate": 3.9806688882026125e-06, "loss": 0.2134, "step": 4472 }, { "epoch": 2.8978007761966365, "grad_norm": 19.535581588745117, "learning_rate": 3.964266988255861e-06, "loss": 0.2224, "step": 4480 }, { "epoch": 2.902975420439845, "grad_norm": 3.764432907104492, "learning_rate": 3.94787672518116e-06, "loss": 0.2122, "step": 4488 }, { "epoch": 2.908150064683053, "grad_norm": 27.98623275756836, "learning_rate": 3.931498283129631e-06, "loss": 0.2009, "step": 4496 }, { "epoch": 2.913324708926261, "grad_norm": 1.0380629301071167, "learning_rate": 3.915131846119581e-06, "loss": 0.2076, "step": 4504 }, { "epoch": 2.9184993531694694, "grad_norm": 3.117368698120117, "learning_rate": 3.898777598034434e-06, "loss": 0.2179, "step": 4512 }, { "epoch": 2.9236739974126777, "grad_norm": 13.296104431152344, "learning_rate": 3.882435722620667e-06, "loss": 0.2045, "step": 4520 }, { "epoch": 2.928848641655886, "grad_norm": 2.6790828704833984, "learning_rate": 3.866106403485745e-06, "loss": 0.2138, "step": 4528 }, { "epoch": 2.9340232858990944, "grad_norm": 0.8345991373062134, "learning_rate": 3.849789824096061e-06, "loss": 0.1957, "step": 4536 }, { "epoch": 2.939197930142303, "grad_norm": 8.058691024780273, "learning_rate": 3.833486167774867e-06, "loss": 0.2193, "step": 4544 }, { "epoch": 2.944372574385511, "grad_norm": 2.5544962882995605, "learning_rate": 3.817195617700224e-06, "loss": 0.2215, "step": 4552 }, { "epoch": 2.9495472186287195, "grad_norm": 2.118175983428955, "learning_rate": 3.800918356902936e-06, "loss": 0.2082, "step": 4560 }, { "epoch": 2.9547218628719274, "grad_norm": 1.6129382848739624, "learning_rate": 3.784654568264497e-06, "loss": 0.2148, "step": 4568 }, { "epoch": 2.9598965071151357, "grad_norm": 14.738019943237305, "learning_rate": 3.768404434515038e-06, "loss": 0.216, "step": 4576 }, { "epoch": 2.965071151358344, "grad_norm": 10.588520050048828, "learning_rate": 3.7521681382312693e-06, "loss": 0.2179, "step": 4584 }, { "epoch": 2.9702457956015524, "grad_norm": 26.88380241394043, "learning_rate": 3.735945861834434e-06, "loss": 0.2132, "step": 4592 }, { "epoch": 2.975420439844761, "grad_norm": 8.701356887817383, "learning_rate": 3.7197377875882547e-06, "loss": 0.2174, "step": 4600 }, { "epoch": 2.9805950840879687, "grad_norm": 9.462160110473633, "learning_rate": 3.703544097596887e-06, "loss": 0.2296, "step": 4608 }, { "epoch": 2.985769728331177, "grad_norm": 7.985735893249512, "learning_rate": 3.6873649738028737e-06, "loss": 0.2121, "step": 4616 }, { "epoch": 2.9909443725743854, "grad_norm": 24.12420654296875, "learning_rate": 3.671200597985104e-06, "loss": 0.206, "step": 4624 }, { "epoch": 2.9961190168175937, "grad_norm": 1.4237028360366821, "learning_rate": 3.655051151756762e-06, "loss": 0.2072, "step": 4632 }, { "epoch": 3.001293661060802, "grad_norm": 4.103756427764893, "learning_rate": 3.638916816563298e-06, "loss": 0.1977, "step": 4640 }, { "epoch": 3.0064683053040104, "grad_norm": 2.717452049255371, "learning_rate": 3.622797773680379e-06, "loss": 0.2233, "step": 4648 }, { "epoch": 3.011642949547219, "grad_norm": 3.143430233001709, "learning_rate": 3.6066942042118568e-06, "loss": 0.2246, "step": 4656 }, { "epoch": 3.0168175937904267, "grad_norm": 0.8686035871505737, "learning_rate": 3.5906062890877368e-06, "loss": 0.2112, "step": 4664 }, { "epoch": 3.021992238033635, "grad_norm": 7.7689056396484375, "learning_rate": 3.5745342090621406e-06, "loss": 0.2288, "step": 4672 }, { "epoch": 3.0271668822768434, "grad_norm": 7.347503185272217, "learning_rate": 3.5584781447112737e-06, "loss": 0.1989, "step": 4680 }, { "epoch": 3.0323415265200517, "grad_norm": 29.066707611083984, "learning_rate": 3.542438276431401e-06, "loss": 0.1981, "step": 4688 }, { "epoch": 3.03751617076326, "grad_norm": 16.907032012939453, "learning_rate": 3.526414784436819e-06, "loss": 0.2241, "step": 4696 }, { "epoch": 3.0426908150064684, "grad_norm": 50.19180679321289, "learning_rate": 3.510407848757828e-06, "loss": 0.2103, "step": 4704 }, { "epoch": 3.047865459249677, "grad_norm": 3.969433069229126, "learning_rate": 3.494417649238713e-06, "loss": 0.2084, "step": 4712 }, { "epoch": 3.0530401034928847, "grad_norm": 1.613051176071167, "learning_rate": 3.47844436553572e-06, "loss": 0.207, "step": 4720 }, { "epoch": 3.058214747736093, "grad_norm": 15.627549171447754, "learning_rate": 3.462488177115041e-06, "loss": 0.2232, "step": 4728 }, { "epoch": 3.0633893919793014, "grad_norm": 4.300905704498291, "learning_rate": 3.4465492632507946e-06, "loss": 0.2122, "step": 4736 }, { "epoch": 3.0685640362225097, "grad_norm": 7.382449150085449, "learning_rate": 3.4306278030230143e-06, "loss": 0.2146, "step": 4744 }, { "epoch": 3.073738680465718, "grad_norm": 2.4655721187591553, "learning_rate": 3.4147239753156324e-06, "loss": 0.2172, "step": 4752 }, { "epoch": 3.0789133247089264, "grad_norm": 3.6668355464935303, "learning_rate": 3.398837958814475e-06, "loss": 0.2068, "step": 4760 }, { "epoch": 3.0840879689521343, "grad_norm": 2.1171956062316895, "learning_rate": 3.382969932005252e-06, "loss": 0.2049, "step": 4768 }, { "epoch": 3.0892626131953427, "grad_norm": 2.6610488891601562, "learning_rate": 3.367120073171548e-06, "loss": 0.2132, "step": 4776 }, { "epoch": 3.094437257438551, "grad_norm": 3.1115005016326904, "learning_rate": 3.351288560392833e-06, "loss": 0.2113, "step": 4784 }, { "epoch": 3.0996119016817594, "grad_norm": 39.48991775512695, "learning_rate": 3.335475571542442e-06, "loss": 0.1985, "step": 4792 }, { "epoch": 3.1047865459249677, "grad_norm": 4.187602996826172, "learning_rate": 3.3196812842855895e-06, "loss": 0.2209, "step": 4800 }, { "epoch": 3.109961190168176, "grad_norm": 2.5106544494628906, "learning_rate": 3.303905876077372e-06, "loss": 0.2136, "step": 4808 }, { "epoch": 3.1151358344113844, "grad_norm": 5.031343460083008, "learning_rate": 3.28814952416077e-06, "loss": 0.2079, "step": 4816 }, { "epoch": 3.1203104786545923, "grad_norm": 4.405430316925049, "learning_rate": 3.272412405564659e-06, "loss": 0.2209, "step": 4824 }, { "epoch": 3.1254851228978007, "grad_norm": 4.106354713439941, "learning_rate": 3.2566946971018225e-06, "loss": 0.2219, "step": 4832 }, { "epoch": 3.130659767141009, "grad_norm": 74.13800811767578, "learning_rate": 3.240996575366961e-06, "loss": 0.2264, "step": 4840 }, { "epoch": 3.1358344113842174, "grad_norm": 2.211841344833374, "learning_rate": 3.225318216734713e-06, "loss": 0.2095, "step": 4848 }, { "epoch": 3.1410090556274257, "grad_norm": 5.970486164093018, "learning_rate": 3.209659797357669e-06, "loss": 0.2156, "step": 4856 }, { "epoch": 3.146183699870634, "grad_norm": 2.485638380050659, "learning_rate": 3.1940214931643945e-06, "loss": 0.2137, "step": 4864 }, { "epoch": 3.151358344113842, "grad_norm": 1.4314054250717163, "learning_rate": 3.1784034798574514e-06, "loss": 0.2071, "step": 4872 }, { "epoch": 3.1565329883570503, "grad_norm": 2.9645638465881348, "learning_rate": 3.1628059329114286e-06, "loss": 0.2172, "step": 4880 }, { "epoch": 3.1617076326002587, "grad_norm": 2.3624343872070312, "learning_rate": 3.1472290275709642e-06, "loss": 0.2201, "step": 4888 }, { "epoch": 3.166882276843467, "grad_norm": 8.607010841369629, "learning_rate": 3.1316729388487815e-06, "loss": 0.2092, "step": 4896 }, { "epoch": 3.1720569210866754, "grad_norm": 1.539337396621704, "learning_rate": 3.1161378415237197e-06, "loss": 0.2105, "step": 4904 }, { "epoch": 3.1772315653298837, "grad_norm": 2.8210718631744385, "learning_rate": 3.1006239101387725e-06, "loss": 0.2279, "step": 4912 }, { "epoch": 3.1824062095730916, "grad_norm": 2.121821641921997, "learning_rate": 3.0851313189991226e-06, "loss": 0.2033, "step": 4920 }, { "epoch": 3.1875808538163, "grad_norm": 1.296933650970459, "learning_rate": 3.0696602421701943e-06, "loss": 0.2021, "step": 4928 }, { "epoch": 3.1927554980595083, "grad_norm": 6.50001335144043, "learning_rate": 3.054210853475682e-06, "loss": 0.209, "step": 4936 }, { "epoch": 3.1979301423027167, "grad_norm": 4.615538120269775, "learning_rate": 3.0387833264956078e-06, "loss": 0.2133, "step": 4944 }, { "epoch": 3.203104786545925, "grad_norm": 2.2612783908843994, "learning_rate": 3.02337783456437e-06, "loss": 0.2207, "step": 4952 }, { "epoch": 3.2082794307891334, "grad_norm": 5.742753028869629, "learning_rate": 3.007994550768793e-06, "loss": 0.2244, "step": 4960 }, { "epoch": 3.2134540750323417, "grad_norm": 2.2950289249420166, "learning_rate": 2.9926336479461846e-06, "loss": 0.2055, "step": 4968 }, { "epoch": 3.2186287192755496, "grad_norm": 1.1664949655532837, "learning_rate": 2.9772952986823943e-06, "loss": 0.2003, "step": 4976 }, { "epoch": 3.223803363518758, "grad_norm": 16.022438049316406, "learning_rate": 2.9619796753098716e-06, "loss": 0.2171, "step": 4984 }, { "epoch": 3.2289780077619663, "grad_norm": 1.7189433574676514, "learning_rate": 2.946686949905733e-06, "loss": 0.2308, "step": 4992 }, { "epoch": 3.2341526520051747, "grad_norm": 9.547940254211426, "learning_rate": 2.9314172942898257e-06, "loss": 0.2124, "step": 5000 }, { "epoch": 3.239327296248383, "grad_norm": 2.51373028755188, "learning_rate": 2.9161708800228e-06, "loss": 0.196, "step": 5008 }, { "epoch": 3.2445019404915914, "grad_norm": 2.1994516849517822, "learning_rate": 2.900947878404181e-06, "loss": 0.2283, "step": 5016 }, { "epoch": 3.2496765847347993, "grad_norm": 3.118130922317505, "learning_rate": 2.8857484604704415e-06, "loss": 0.2067, "step": 5024 }, { "epoch": 3.2548512289780076, "grad_norm": 11.828572273254395, "learning_rate": 2.870572796993084e-06, "loss": 0.1918, "step": 5032 }, { "epoch": 3.260025873221216, "grad_norm": 9.986909866333008, "learning_rate": 2.8554210584767188e-06, "loss": 0.2205, "step": 5040 }, { "epoch": 3.2652005174644243, "grad_norm": 10.845985412597656, "learning_rate": 2.8402934151571505e-06, "loss": 0.2055, "step": 5048 }, { "epoch": 3.2703751617076326, "grad_norm": 6.319619655609131, "learning_rate": 2.8251900369994645e-06, "loss": 0.2106, "step": 5056 }, { "epoch": 3.275549805950841, "grad_norm": 6.275879859924316, "learning_rate": 2.8101110936961153e-06, "loss": 0.2055, "step": 5064 }, { "epoch": 3.2807244501940493, "grad_norm": 56.02274703979492, "learning_rate": 2.795056754665028e-06, "loss": 0.2066, "step": 5072 }, { "epoch": 3.2858990944372573, "grad_norm": 4.814873218536377, "learning_rate": 2.7800271890476836e-06, "loss": 0.2145, "step": 5080 }, { "epoch": 3.2910737386804656, "grad_norm": 2.2435498237609863, "learning_rate": 2.765022565707226e-06, "loss": 0.2214, "step": 5088 }, { "epoch": 3.296248382923674, "grad_norm": 13.148270606994629, "learning_rate": 2.750043053226561e-06, "loss": 0.2017, "step": 5096 }, { "epoch": 3.3014230271668823, "grad_norm": 8.37467098236084, "learning_rate": 2.735088819906465e-06, "loss": 0.202, "step": 5104 }, { "epoch": 3.3065976714100906, "grad_norm": 17.98732566833496, "learning_rate": 2.7201600337636946e-06, "loss": 0.2121, "step": 5112 }, { "epoch": 3.311772315653299, "grad_norm": 6.076496601104736, "learning_rate": 2.7052568625290955e-06, "loss": 0.2187, "step": 5120 }, { "epoch": 3.316946959896507, "grad_norm": 2.1653237342834473, "learning_rate": 2.690379473645718e-06, "loss": 0.2119, "step": 5128 }, { "epoch": 3.3221216041397152, "grad_norm": 10.182047843933105, "learning_rate": 2.675528034266941e-06, "loss": 0.2204, "step": 5136 }, { "epoch": 3.3272962483829236, "grad_norm": 29.412364959716797, "learning_rate": 2.6607027112545893e-06, "loss": 0.2093, "step": 5144 }, { "epoch": 3.332470892626132, "grad_norm": 4.263775825500488, "learning_rate": 2.645903671177058e-06, "loss": 0.2191, "step": 5152 }, { "epoch": 3.3376455368693403, "grad_norm": 30.59326934814453, "learning_rate": 2.631131080307445e-06, "loss": 0.2026, "step": 5160 }, { "epoch": 3.3428201811125486, "grad_norm": 5.779555320739746, "learning_rate": 2.6163851046216813e-06, "loss": 0.2137, "step": 5168 }, { "epoch": 3.347994825355757, "grad_norm": 3.681560754776001, "learning_rate": 2.6016659097966636e-06, "loss": 0.2146, "step": 5176 }, { "epoch": 3.353169469598965, "grad_norm": 1.4380924701690674, "learning_rate": 2.5869736612083955e-06, "loss": 0.2087, "step": 5184 }, { "epoch": 3.3583441138421732, "grad_norm": 12.789270401000977, "learning_rate": 2.572308523930131e-06, "loss": 0.216, "step": 5192 }, { "epoch": 3.3635187580853816, "grad_norm": 17.20673370361328, "learning_rate": 2.557670662730515e-06, "loss": 0.2145, "step": 5200 }, { "epoch": 3.36869340232859, "grad_norm": 1.4245859384536743, "learning_rate": 2.5430602420717355e-06, "loss": 0.2107, "step": 5208 }, { "epoch": 3.3738680465717983, "grad_norm": 13.390450477600098, "learning_rate": 2.528477426107678e-06, "loss": 0.204, "step": 5216 }, { "epoch": 3.3790426908150066, "grad_norm": 1.8627618551254272, "learning_rate": 2.513922378682075e-06, "loss": 0.2112, "step": 5224 }, { "epoch": 3.3842173350582145, "grad_norm": 1.2581387758255005, "learning_rate": 2.499395263326669e-06, "loss": 0.2056, "step": 5232 }, { "epoch": 3.389391979301423, "grad_norm": 1.6016255617141724, "learning_rate": 2.484896243259375e-06, "loss": 0.2077, "step": 5240 }, { "epoch": 3.3945666235446312, "grad_norm": 6.12626314163208, "learning_rate": 2.470425481382447e-06, "loss": 0.2113, "step": 5248 }, { "epoch": 3.3997412677878396, "grad_norm": 2.2390005588531494, "learning_rate": 2.4559831402806454e-06, "loss": 0.2097, "step": 5256 }, { "epoch": 3.404915912031048, "grad_norm": 5.566039085388184, "learning_rate": 2.441569382219413e-06, "loss": 0.2065, "step": 5264 }, { "epoch": 3.4100905562742563, "grad_norm": 1.4189672470092773, "learning_rate": 2.427184369143051e-06, "loss": 0.2182, "step": 5272 }, { "epoch": 3.4152652005174646, "grad_norm": 22.4144287109375, "learning_rate": 2.4128282626728985e-06, "loss": 0.2052, "step": 5280 }, { "epoch": 3.4204398447606725, "grad_norm": 2.110011339187622, "learning_rate": 2.398501224105517e-06, "loss": 0.2091, "step": 5288 }, { "epoch": 3.425614489003881, "grad_norm": 2.668170928955078, "learning_rate": 2.384203414410878e-06, "loss": 0.2092, "step": 5296 }, { "epoch": 3.4307891332470892, "grad_norm": 3.0023293495178223, "learning_rate": 2.3699349942305603e-06, "loss": 0.2116, "step": 5304 }, { "epoch": 3.4359637774902976, "grad_norm": 4.757721900939941, "learning_rate": 2.355696123875934e-06, "loss": 0.2025, "step": 5312 }, { "epoch": 3.441138421733506, "grad_norm": 19.3017635345459, "learning_rate": 2.341486963326366e-06, "loss": 0.2227, "step": 5320 }, { "epoch": 3.4463130659767143, "grad_norm": 1.613916039466858, "learning_rate": 2.3273076722274233e-06, "loss": 0.1964, "step": 5328 }, { "epoch": 3.451487710219922, "grad_norm": 2.9506986141204834, "learning_rate": 2.3131584098890775e-06, "loss": 0.2258, "step": 5336 }, { "epoch": 3.4566623544631305, "grad_norm": 6.207396984100342, "learning_rate": 2.299039335283914e-06, "loss": 0.2156, "step": 5344 }, { "epoch": 3.461836998706339, "grad_norm": 1.0315911769866943, "learning_rate": 2.2849506070453466e-06, "loss": 0.1993, "step": 5352 }, { "epoch": 3.4670116429495472, "grad_norm": 39.45634078979492, "learning_rate": 2.27089238346584e-06, "loss": 0.201, "step": 5360 }, { "epoch": 3.4721862871927556, "grad_norm": 13.167795181274414, "learning_rate": 2.2568648224951217e-06, "loss": 0.2168, "step": 5368 }, { "epoch": 3.477360931435964, "grad_norm": 19.175676345825195, "learning_rate": 2.2428680817384153e-06, "loss": 0.1958, "step": 5376 }, { "epoch": 3.4825355756791723, "grad_norm": 6.49905252456665, "learning_rate": 2.228902318454666e-06, "loss": 0.2009, "step": 5384 }, { "epoch": 3.48771021992238, "grad_norm": 2.5502731800079346, "learning_rate": 2.214967689554775e-06, "loss": 0.2018, "step": 5392 }, { "epoch": 3.4928848641655885, "grad_norm": 17.03938102722168, "learning_rate": 2.201064351599837e-06, "loss": 0.2102, "step": 5400 }, { "epoch": 3.498059508408797, "grad_norm": 3.042534112930298, "learning_rate": 2.18719246079938e-06, "loss": 0.212, "step": 5408 }, { "epoch": 3.503234152652005, "grad_norm": 1.2638347148895264, "learning_rate": 2.17335217300961e-06, "loss": 0.2273, "step": 5416 }, { "epoch": 3.5084087968952136, "grad_norm": 13.239524841308594, "learning_rate": 2.1595436437316614e-06, "loss": 0.2107, "step": 5424 }, { "epoch": 3.5135834411384215, "grad_norm": 1.3805886507034302, "learning_rate": 2.1457670281098493e-06, "loss": 0.2167, "step": 5432 }, { "epoch": 3.51875808538163, "grad_norm": 4.028883934020996, "learning_rate": 2.132022480929926e-06, "loss": 0.2158, "step": 5440 }, { "epoch": 3.523932729624838, "grad_norm": 5.846325397491455, "learning_rate": 2.118310156617342e-06, "loss": 0.2237, "step": 5448 }, { "epoch": 3.5291073738680465, "grad_norm": 1.7205686569213867, "learning_rate": 2.1046302092355107e-06, "loss": 0.2206, "step": 5456 }, { "epoch": 3.534282018111255, "grad_norm": 16.556434631347656, "learning_rate": 2.0909827924840787e-06, "loss": 0.208, "step": 5464 }, { "epoch": 3.539456662354463, "grad_norm": 1.7649803161621094, "learning_rate": 2.0773680596971976e-06, "loss": 0.2087, "step": 5472 }, { "epoch": 3.5446313065976716, "grad_norm": 33.007301330566406, "learning_rate": 2.0637861638418003e-06, "loss": 0.2162, "step": 5480 }, { "epoch": 3.54980595084088, "grad_norm": 55.374752044677734, "learning_rate": 2.0502372575158865e-06, "loss": 0.2078, "step": 5488 }, { "epoch": 3.554980595084088, "grad_norm": 9.658388137817383, "learning_rate": 2.0367214929468036e-06, "loss": 0.2036, "step": 5496 }, { "epoch": 3.560155239327296, "grad_norm": 15.524160385131836, "learning_rate": 2.0232390219895364e-06, "loss": 0.2035, "step": 5504 }, { "epoch": 3.5653298835705045, "grad_norm": 3.2139875888824463, "learning_rate": 2.009789996125009e-06, "loss": 0.2099, "step": 5512 }, { "epoch": 3.570504527813713, "grad_norm": 1.951788067817688, "learning_rate": 1.99637456645837e-06, "loss": 0.2027, "step": 5520 }, { "epoch": 3.575679172056921, "grad_norm": 1.508257508277893, "learning_rate": 1.982992883717304e-06, "loss": 0.2064, "step": 5528 }, { "epoch": 3.580853816300129, "grad_norm": 1.6039284467697144, "learning_rate": 1.9696450982503356e-06, "loss": 0.2065, "step": 5536 }, { "epoch": 3.5860284605433375, "grad_norm": 7.59080696105957, "learning_rate": 1.95633136002514e-06, "loss": 0.2112, "step": 5544 }, { "epoch": 3.591203104786546, "grad_norm": 25.365097045898438, "learning_rate": 1.943051818626857e-06, "loss": 0.2115, "step": 5552 }, { "epoch": 3.596377749029754, "grad_norm": 1.8311065435409546, "learning_rate": 1.9298066232564135e-06, "loss": 0.203, "step": 5560 }, { "epoch": 3.6015523932729625, "grad_norm": 12.468267440795898, "learning_rate": 1.916595922728843e-06, "loss": 0.2106, "step": 5568 }, { "epoch": 3.606727037516171, "grad_norm": 3.0780019760131836, "learning_rate": 1.9034198654716163e-06, "loss": 0.2152, "step": 5576 }, { "epoch": 3.611901681759379, "grad_norm": 1.8586463928222656, "learning_rate": 1.890278599522975e-06, "loss": 0.203, "step": 5584 }, { "epoch": 3.6170763260025875, "grad_norm": 1.7706098556518555, "learning_rate": 1.8771722725302644e-06, "loss": 0.2188, "step": 5592 }, { "epoch": 3.6222509702457955, "grad_norm": 2.8252525329589844, "learning_rate": 1.864101031748277e-06, "loss": 0.2101, "step": 5600 }, { "epoch": 3.627425614489004, "grad_norm": 2.265062093734741, "learning_rate": 1.8510650240376e-06, "loss": 0.2018, "step": 5608 }, { "epoch": 3.632600258732212, "grad_norm": 2.893099546432495, "learning_rate": 1.8380643958629596e-06, "loss": 0.2047, "step": 5616 }, { "epoch": 3.6377749029754205, "grad_norm": 1.9583306312561035, "learning_rate": 1.8250992932915811e-06, "loss": 0.2101, "step": 5624 }, { "epoch": 3.642949547218629, "grad_norm": 1.5815550088882446, "learning_rate": 1.8121698619915457e-06, "loss": 0.2153, "step": 5632 }, { "epoch": 3.6481241914618368, "grad_norm": 20.85481834411621, "learning_rate": 1.7992762472301511e-06, "loss": 0.2095, "step": 5640 }, { "epoch": 3.653298835705045, "grad_norm": 3.830641269683838, "learning_rate": 1.7864185938722868e-06, "loss": 0.2056, "step": 5648 }, { "epoch": 3.6584734799482534, "grad_norm": 15.252459526062012, "learning_rate": 1.7735970463787967e-06, "loss": 0.2233, "step": 5656 }, { "epoch": 3.663648124191462, "grad_norm": 2.826512336730957, "learning_rate": 1.7608117488048636e-06, "loss": 0.2275, "step": 5664 }, { "epoch": 3.66882276843467, "grad_norm": 2.0030300617218018, "learning_rate": 1.7480628447983878e-06, "loss": 0.2101, "step": 5672 }, { "epoch": 3.6739974126778785, "grad_norm": 2.019261598587036, "learning_rate": 1.735350477598372e-06, "loss": 0.2121, "step": 5680 }, { "epoch": 3.679172056921087, "grad_norm": 2.8684234619140625, "learning_rate": 1.7226747900333135e-06, "loss": 0.2239, "step": 5688 }, { "epoch": 3.684346701164295, "grad_norm": 18.148910522460938, "learning_rate": 1.7100359245196035e-06, "loss": 0.2087, "step": 5696 }, { "epoch": 3.689521345407503, "grad_norm": 2.289294719696045, "learning_rate": 1.6974340230599173e-06, "loss": 0.1977, "step": 5704 }, { "epoch": 3.6946959896507114, "grad_norm": 7.737388610839844, "learning_rate": 1.6848692272416268e-06, "loss": 0.2152, "step": 5712 }, { "epoch": 3.69987063389392, "grad_norm": 17.832653045654297, "learning_rate": 1.6723416782352076e-06, "loss": 0.2132, "step": 5720 }, { "epoch": 3.705045278137128, "grad_norm": 2.064863443374634, "learning_rate": 1.659851516792651e-06, "loss": 0.2106, "step": 5728 }, { "epoch": 3.7102199223803365, "grad_norm": 4.075965404510498, "learning_rate": 1.647398883245886e-06, "loss": 0.2105, "step": 5736 }, { "epoch": 3.7153945666235444, "grad_norm": 18.446760177612305, "learning_rate": 1.6349839175051995e-06, "loss": 0.213, "step": 5744 }, { "epoch": 3.7205692108667527, "grad_norm": 22.746885299682617, "learning_rate": 1.622606759057666e-06, "loss": 0.2037, "step": 5752 }, { "epoch": 3.725743855109961, "grad_norm": 1.7176026105880737, "learning_rate": 1.610267546965581e-06, "loss": 0.2129, "step": 5760 }, { "epoch": 3.7309184993531694, "grad_norm": 1.911559820175171, "learning_rate": 1.5979664198648959e-06, "loss": 0.227, "step": 5768 }, { "epoch": 3.736093143596378, "grad_norm": 21.561500549316406, "learning_rate": 1.5857035159636625e-06, "loss": 0.2178, "step": 5776 }, { "epoch": 3.741267787839586, "grad_norm": 14.1412353515625, "learning_rate": 1.5734789730404815e-06, "loss": 0.2048, "step": 5784 }, { "epoch": 3.7464424320827945, "grad_norm": 8.0577392578125, "learning_rate": 1.5612929284429484e-06, "loss": 0.2079, "step": 5792 }, { "epoch": 3.751617076326003, "grad_norm": 10.920722007751465, "learning_rate": 1.549145519086122e-06, "loss": 0.1922, "step": 5800 }, { "epoch": 3.7567917205692107, "grad_norm": 1.764875888824463, "learning_rate": 1.5370368814509727e-06, "loss": 0.1979, "step": 5808 }, { "epoch": 3.761966364812419, "grad_norm": 4.495997428894043, "learning_rate": 1.5249671515828569e-06, "loss": 0.2098, "step": 5816 }, { "epoch": 3.7671410090556274, "grad_norm": 4.6347503662109375, "learning_rate": 1.5129364650899869e-06, "loss": 0.2254, "step": 5824 }, { "epoch": 3.772315653298836, "grad_norm": 7.4554009437561035, "learning_rate": 1.5009449571419077e-06, "loss": 0.2071, "step": 5832 }, { "epoch": 3.777490297542044, "grad_norm": 1.338654637336731, "learning_rate": 1.4889927624679762e-06, "loss": 0.2029, "step": 5840 }, { "epoch": 3.782664941785252, "grad_norm": 3.0357022285461426, "learning_rate": 1.4770800153558513e-06, "loss": 0.2136, "step": 5848 }, { "epoch": 3.7878395860284604, "grad_norm": 11.845126152038574, "learning_rate": 1.4652068496499804e-06, "loss": 0.2241, "step": 5856 }, { "epoch": 3.7930142302716687, "grad_norm": 1.81815505027771, "learning_rate": 1.4533733987501004e-06, "loss": 0.2151, "step": 5864 }, { "epoch": 3.798188874514877, "grad_norm": 1.015817403793335, "learning_rate": 1.4415797956097356e-06, "loss": 0.2179, "step": 5872 }, { "epoch": 3.8033635187580854, "grad_norm": 5.7766218185424805, "learning_rate": 1.4298261727347034e-06, "loss": 0.2151, "step": 5880 }, { "epoch": 3.8085381630012938, "grad_norm": 2.376643180847168, "learning_rate": 1.41811266218163e-06, "loss": 0.1969, "step": 5888 }, { "epoch": 3.813712807244502, "grad_norm": 3.778541088104248, "learning_rate": 1.4064393955564615e-06, "loss": 0.211, "step": 5896 }, { "epoch": 3.8188874514877105, "grad_norm": 8.010899543762207, "learning_rate": 1.3948065040129882e-06, "loss": 0.2075, "step": 5904 }, { "epoch": 3.8240620957309184, "grad_norm": 11.207403182983398, "learning_rate": 1.3832141182513699e-06, "loss": 0.2022, "step": 5912 }, { "epoch": 3.8292367399741267, "grad_norm": 1.3528246879577637, "learning_rate": 1.3716623685166685e-06, "loss": 0.2143, "step": 5920 }, { "epoch": 3.834411384217335, "grad_norm": 26.18692970275879, "learning_rate": 1.3601513845973835e-06, "loss": 0.2224, "step": 5928 }, { "epoch": 3.8395860284605434, "grad_norm": 1.8334400653839111, "learning_rate": 1.3486812958239931e-06, "loss": 0.2178, "step": 5936 }, { "epoch": 3.8447606727037518, "grad_norm": 5.360263824462891, "learning_rate": 1.3372522310675063e-06, "loss": 0.2175, "step": 5944 }, { "epoch": 3.8499353169469597, "grad_norm": 1.6526539325714111, "learning_rate": 1.3258643187380071e-06, "loss": 0.2074, "step": 5952 }, { "epoch": 3.855109961190168, "grad_norm": 10.19829273223877, "learning_rate": 1.3145176867832165e-06, "loss": 0.2067, "step": 5960 }, { "epoch": 3.8602846054333764, "grad_norm": 3.2970707416534424, "learning_rate": 1.3032124626870546e-06, "loss": 0.2229, "step": 5968 }, { "epoch": 3.8654592496765847, "grad_norm": 2.3017404079437256, "learning_rate": 1.2919487734682073e-06, "loss": 0.2071, "step": 5976 }, { "epoch": 3.870633893919793, "grad_norm": 11.9258394241333, "learning_rate": 1.2807267456787004e-06, "loss": 0.204, "step": 5984 }, { "epoch": 3.8758085381630014, "grad_norm": 1.4012444019317627, "learning_rate": 1.2695465054024752e-06, "loss": 0.2191, "step": 5992 }, { "epoch": 3.8809831824062098, "grad_norm": 1.8280630111694336, "learning_rate": 1.2584081782539764e-06, "loss": 0.2163, "step": 6000 }, { "epoch": 3.886157826649418, "grad_norm": 5.412600994110107, "learning_rate": 1.247311889376736e-06, "loss": 0.2066, "step": 6008 }, { "epoch": 3.891332470892626, "grad_norm": 18.54897117614746, "learning_rate": 1.2362577634419692e-06, "loss": 0.2104, "step": 6016 }, { "epoch": 3.8965071151358344, "grad_norm": 21.60243034362793, "learning_rate": 1.2252459246471754e-06, "loss": 0.2074, "step": 6024 }, { "epoch": 3.9016817593790427, "grad_norm": 24.753875732421875, "learning_rate": 1.2142764967147385e-06, "loss": 0.2005, "step": 6032 }, { "epoch": 3.906856403622251, "grad_norm": 12.543983459472656, "learning_rate": 1.2033496028905445e-06, "loss": 0.204, "step": 6040 }, { "epoch": 3.9120310478654594, "grad_norm": 8.506756782531738, "learning_rate": 1.1924653659425862e-06, "loss": 0.2109, "step": 6048 }, { "epoch": 3.9172056921086673, "grad_norm": 6.222147464752197, "learning_rate": 1.1816239081595926e-06, "loss": 0.203, "step": 6056 }, { "epoch": 3.9223803363518757, "grad_norm": 1.7484989166259766, "learning_rate": 1.1708253513496504e-06, "loss": 0.2183, "step": 6064 }, { "epoch": 3.927554980595084, "grad_norm": 1.252619981765747, "learning_rate": 1.160069816838838e-06, "loss": 0.2018, "step": 6072 }, { "epoch": 3.9327296248382924, "grad_norm": 8.604789733886719, "learning_rate": 1.1493574254698598e-06, "loss": 0.1997, "step": 6080 }, { "epoch": 3.9379042690815007, "grad_norm": 21.417587280273438, "learning_rate": 1.1386882976006897e-06, "loss": 0.1985, "step": 6088 }, { "epoch": 3.943078913324709, "grad_norm": 1.4360429048538208, "learning_rate": 1.128062553103223e-06, "loss": 0.214, "step": 6096 }, { "epoch": 3.9482535575679174, "grad_norm": 28.317441940307617, "learning_rate": 1.1174803113619204e-06, "loss": 0.2086, "step": 6104 }, { "epoch": 3.9534282018111258, "grad_norm": 4.1924052238464355, "learning_rate": 1.106941691272474e-06, "loss": 0.214, "step": 6112 }, { "epoch": 3.9586028460543337, "grad_norm": 11.092174530029297, "learning_rate": 1.0964468112404691e-06, "loss": 0.2052, "step": 6120 }, { "epoch": 3.963777490297542, "grad_norm": 0.8266966938972473, "learning_rate": 1.0859957891800548e-06, "loss": 0.2056, "step": 6128 }, { "epoch": 3.9689521345407504, "grad_norm": 1.4189459085464478, "learning_rate": 1.075588742512617e-06, "loss": 0.2043, "step": 6136 }, { "epoch": 3.9741267787839587, "grad_norm": 10.665148735046387, "learning_rate": 1.0652257881654625e-06, "loss": 0.2146, "step": 6144 }, { "epoch": 3.9793014230271666, "grad_norm": 11.610654830932617, "learning_rate": 1.0549070425705017e-06, "loss": 0.2126, "step": 6152 }, { "epoch": 3.984476067270375, "grad_norm": 1.4883841276168823, "learning_rate": 1.0446326216629422e-06, "loss": 0.2093, "step": 6160 }, { "epoch": 3.9896507115135833, "grad_norm": 3.509707450866699, "learning_rate": 1.0344026408799868e-06, "loss": 0.2055, "step": 6168 }, { "epoch": 3.9948253557567917, "grad_norm": 7.780543804168701, "learning_rate": 1.0242172151595365e-06, "loss": 0.2123, "step": 6176 }, { "epoch": 4.0, "grad_norm": 12.673335075378418, "learning_rate": 1.0140764589388963e-06, "loss": 0.2044, "step": 6184 }, { "epoch": 4.005174644243208, "grad_norm": 3.754645824432373, "learning_rate": 1.003980486153494e-06, "loss": 0.2155, "step": 6192 }, { "epoch": 4.010349288486417, "grad_norm": 0.9349867105484009, "learning_rate": 9.939294102355957e-07, "loss": 0.211, "step": 6200 }, { "epoch": 4.015523932729625, "grad_norm": 9.735404014587402, "learning_rate": 9.839233441130353e-07, "loss": 0.2043, "step": 6208 }, { "epoch": 4.020698576972833, "grad_norm": 4.292990207672119, "learning_rate": 9.739624002079412e-07, "loss": 0.2239, "step": 6216 }, { "epoch": 4.025873221216042, "grad_norm": 12.860719680786133, "learning_rate": 9.640466904354778e-07, "loss": 0.2163, "step": 6224 }, { "epoch": 4.03104786545925, "grad_norm": 1.3650522232055664, "learning_rate": 9.541763262025866e-07, "loss": 0.2082, "step": 6232 }, { "epoch": 4.0362225097024576, "grad_norm": 17.642620086669922, "learning_rate": 9.443514184067326e-07, "loss": 0.197, "step": 6240 }, { "epoch": 4.041397153945666, "grad_norm": 5.8212714195251465, "learning_rate": 9.345720774346589e-07, "loss": 0.2059, "step": 6248 }, { "epoch": 4.046571798188874, "grad_norm": 14.918822288513184, "learning_rate": 9.248384131611493e-07, "loss": 0.2074, "step": 6256 }, { "epoch": 4.051746442432083, "grad_norm": 3.4913625717163086, "learning_rate": 9.151505349477901e-07, "loss": 0.2251, "step": 6264 }, { "epoch": 4.056921086675291, "grad_norm": 17.606687545776367, "learning_rate": 9.055085516417439e-07, "loss": 0.2141, "step": 6272 }, { "epoch": 4.062095730918499, "grad_norm": 0.8561938405036926, "learning_rate": 8.959125715745248e-07, "loss": 0.2123, "step": 6280 }, { "epoch": 4.067270375161708, "grad_norm": 6.487976551055908, "learning_rate": 8.863627025607835e-07, "loss": 0.2218, "step": 6288 }, { "epoch": 4.072445019404916, "grad_norm": 24.814111709594727, "learning_rate": 8.768590518970938e-07, "loss": 0.1991, "step": 6296 }, { "epoch": 4.077619663648124, "grad_norm": 0.9031611084938049, "learning_rate": 8.674017263607488e-07, "loss": 0.2011, "step": 6304 }, { "epoch": 4.082794307891333, "grad_norm": 32.17177200317383, "learning_rate": 8.57990832208559e-07, "loss": 0.2109, "step": 6312 }, { "epoch": 4.087968952134541, "grad_norm": 4.342803478240967, "learning_rate": 8.486264751756607e-07, "loss": 0.1977, "step": 6320 }, { "epoch": 4.093143596377749, "grad_norm": 4.845144748687744, "learning_rate": 8.393087604743283e-07, "loss": 0.2082, "step": 6328 }, { "epoch": 4.098318240620958, "grad_norm": 7.410824298858643, "learning_rate": 8.300377927927888e-07, "loss": 0.2096, "step": 6336 }, { "epoch": 4.103492884864165, "grad_norm": 21.402700424194336, "learning_rate": 8.208136762940489e-07, "loss": 0.2133, "step": 6344 }, { "epoch": 4.1086675291073735, "grad_norm": 10.131089210510254, "learning_rate": 8.116365146147243e-07, "loss": 0.2217, "step": 6352 }, { "epoch": 4.113842173350582, "grad_norm": 2.5814476013183594, "learning_rate": 8.025064108638742e-07, "loss": 0.1901, "step": 6360 }, { "epoch": 4.11901681759379, "grad_norm": 3.8243138790130615, "learning_rate": 7.934234676218411e-07, "loss": 0.2239, "step": 6368 }, { "epoch": 4.124191461836999, "grad_norm": 1.469176173210144, "learning_rate": 7.843877869391053e-07, "loss": 0.2088, "step": 6376 }, { "epoch": 4.129366106080207, "grad_norm": 8.301488876342773, "learning_rate": 7.753994703351298e-07, "loss": 0.2082, "step": 6384 }, { "epoch": 4.134540750323415, "grad_norm": 5.9099249839782715, "learning_rate": 7.664586187972234e-07, "loss": 0.1966, "step": 6392 }, { "epoch": 4.139715394566624, "grad_norm": 10.461531639099121, "learning_rate": 7.575653327794075e-07, "loss": 0.2058, "step": 6400 }, { "epoch": 4.144890038809832, "grad_norm": 6.918221950531006, "learning_rate": 7.48719712201284e-07, "loss": 0.212, "step": 6408 }, { "epoch": 4.15006468305304, "grad_norm": 2.304258346557617, "learning_rate": 7.399218564469174e-07, "loss": 0.2005, "step": 6416 }, { "epoch": 4.155239327296249, "grad_norm": 6.370954990386963, "learning_rate": 7.311718643637134e-07, "loss": 0.1985, "step": 6424 }, { "epoch": 4.160413971539457, "grad_norm": 11.649602890014648, "learning_rate": 7.224698342613096e-07, "loss": 0.1978, "step": 6432 }, { "epoch": 4.165588615782665, "grad_norm": 6.221530437469482, "learning_rate": 7.138158639104748e-07, "loss": 0.2098, "step": 6440 }, { "epoch": 4.170763260025873, "grad_norm": 10.76377010345459, "learning_rate": 7.052100505420051e-07, "loss": 0.2189, "step": 6448 }, { "epoch": 4.175937904269081, "grad_norm": 12.636096954345703, "learning_rate": 6.96652490845634e-07, "loss": 0.2253, "step": 6456 }, { "epoch": 4.1811125485122895, "grad_norm": 1.3980625867843628, "learning_rate": 6.881432809689459e-07, "loss": 0.2044, "step": 6464 }, { "epoch": 4.186287192755498, "grad_norm": 40.41677474975586, "learning_rate": 6.796825165162951e-07, "loss": 0.2063, "step": 6472 }, { "epoch": 4.191461836998706, "grad_norm": 6.918355941772461, "learning_rate": 6.712702925477343e-07, "loss": 0.2095, "step": 6480 }, { "epoch": 4.196636481241915, "grad_norm": 5.543708801269531, "learning_rate": 6.62906703577943e-07, "loss": 0.203, "step": 6488 }, { "epoch": 4.201811125485123, "grad_norm": 0.7908322811126709, "learning_rate": 6.545918435751669e-07, "loss": 0.2164, "step": 6496 }, { "epoch": 4.206985769728331, "grad_norm": 23.913345336914062, "learning_rate": 6.463258059601635e-07, "loss": 0.1994, "step": 6504 }, { "epoch": 4.21216041397154, "grad_norm": 20.359169006347656, "learning_rate": 6.381086836051498e-07, "loss": 0.2175, "step": 6512 }, { "epoch": 4.217335058214748, "grad_norm": 2.3722023963928223, "learning_rate": 6.299405688327631e-07, "loss": 0.2055, "step": 6520 }, { "epoch": 4.222509702457956, "grad_norm": 96.0938491821289, "learning_rate": 6.218215534150185e-07, "loss": 0.1927, "step": 6528 }, { "epoch": 4.227684346701165, "grad_norm": 7.64237642288208, "learning_rate": 6.137517285722816e-07, "loss": 0.2043, "step": 6536 }, { "epoch": 4.232858990944372, "grad_norm": 4.935359477996826, "learning_rate": 6.057311849722419e-07, "loss": 0.2184, "step": 6544 }, { "epoch": 4.2380336351875805, "grad_norm": 3.5845251083374023, "learning_rate": 5.977600127288941e-07, "loss": 0.2137, "step": 6552 }, { "epoch": 4.243208279430789, "grad_norm": 16.71499252319336, "learning_rate": 5.898383014015275e-07, "loss": 0.2096, "step": 6560 }, { "epoch": 4.248382923673997, "grad_norm": 17.370206832885742, "learning_rate": 5.81966139993716e-07, "loss": 0.2067, "step": 6568 }, { "epoch": 4.2535575679172055, "grad_norm": 22.58631134033203, "learning_rate": 5.741436169523234e-07, "loss": 0.2232, "step": 6576 }, { "epoch": 4.258732212160414, "grad_norm": 3.885636806488037, "learning_rate": 5.663708201665041e-07, "loss": 0.2096, "step": 6584 }, { "epoch": 4.263906856403622, "grad_norm": 1.6453157663345337, "learning_rate": 5.586478369667203e-07, "loss": 0.2082, "step": 6592 }, { "epoch": 4.269081500646831, "grad_norm": 6.002994060516357, "learning_rate": 5.50974754123757e-07, "loss": 0.201, "step": 6600 }, { "epoch": 4.274256144890039, "grad_norm": 1.0541515350341797, "learning_rate": 5.433516578477504e-07, "loss": 0.2105, "step": 6608 }, { "epoch": 4.279430789133247, "grad_norm": 1.4043939113616943, "learning_rate": 5.357786337872168e-07, "loss": 0.2143, "step": 6616 }, { "epoch": 4.284605433376456, "grad_norm": 2.2832283973693848, "learning_rate": 5.282557670280914e-07, "loss": 0.2075, "step": 6624 }, { "epoch": 4.289780077619664, "grad_norm": 1.8749516010284424, "learning_rate": 5.207831420927722e-07, "loss": 0.1923, "step": 6632 }, { "epoch": 4.294954721862872, "grad_norm": 9.822781562805176, "learning_rate": 5.133608429391706e-07, "loss": 0.2093, "step": 6640 }, { "epoch": 4.300129366106081, "grad_norm": 8.075210571289062, "learning_rate": 5.059889529597678e-07, "loss": 0.1995, "step": 6648 }, { "epoch": 4.305304010349288, "grad_norm": 13.020132064819336, "learning_rate": 4.986675549806769e-07, "loss": 0.208, "step": 6656 }, { "epoch": 4.3104786545924965, "grad_norm": 1.0930315256118774, "learning_rate": 4.913967312607154e-07, "loss": 0.1978, "step": 6664 }, { "epoch": 4.315653298835705, "grad_norm": 1.784521222114563, "learning_rate": 4.841765634904777e-07, "loss": 0.1921, "step": 6672 }, { "epoch": 4.320827943078913, "grad_norm": 10.063488960266113, "learning_rate": 4.770071327914177e-07, "loss": 0.2094, "step": 6680 }, { "epoch": 4.3260025873221215, "grad_norm": 1.535333514213562, "learning_rate": 4.6988851971493886e-07, "loss": 0.2041, "step": 6688 }, { "epoch": 4.33117723156533, "grad_norm": 3.97757625579834, "learning_rate": 4.628208042414889e-07, "loss": 0.2225, "step": 6696 }, { "epoch": 4.336351875808538, "grad_norm": 8.291097640991211, "learning_rate": 4.558040657796603e-07, "loss": 0.2119, "step": 6704 }, { "epoch": 4.3415265200517466, "grad_norm": 5.780373573303223, "learning_rate": 4.4883838316529816e-07, "loss": 0.2099, "step": 6712 }, { "epoch": 4.346701164294955, "grad_norm": 23.213045120239258, "learning_rate": 4.4192383466061583e-07, "loss": 0.1992, "step": 6720 }, { "epoch": 4.351875808538163, "grad_norm": 6.386680603027344, "learning_rate": 4.350604979533135e-07, "loss": 0.2085, "step": 6728 }, { "epoch": 4.357050452781372, "grad_norm": 1.2865214347839355, "learning_rate": 4.2824845015570713e-07, "loss": 0.2168, "step": 6736 }, { "epoch": 4.36222509702458, "grad_norm": 3.7142112255096436, "learning_rate": 4.214877678038609e-07, "loss": 0.2087, "step": 6744 }, { "epoch": 4.367399741267787, "grad_norm": 15.213637351989746, "learning_rate": 4.1477852685672895e-07, "loss": 0.2107, "step": 6752 }, { "epoch": 4.372574385510996, "grad_norm": 2.587907075881958, "learning_rate": 4.0812080269529983e-07, "loss": 0.2178, "step": 6760 }, { "epoch": 4.377749029754204, "grad_norm": 3.547725200653076, "learning_rate": 4.015146701217493e-07, "loss": 0.2255, "step": 6768 }, { "epoch": 4.3829236739974125, "grad_norm": 2.6407039165496826, "learning_rate": 3.949602033586047e-07, "loss": 0.2035, "step": 6776 }, { "epoch": 4.388098318240621, "grad_norm": 7.901521682739258, "learning_rate": 3.884574760479037e-07, "loss": 0.2069, "step": 6784 }, { "epoch": 4.393272962483829, "grad_norm": 2.049207925796509, "learning_rate": 3.820065612503732e-07, "loss": 0.2042, "step": 6792 }, { "epoch": 4.3984476067270375, "grad_norm": 1.1140714883804321, "learning_rate": 3.756075314446045e-07, "loss": 0.2081, "step": 6800 }, { "epoch": 4.403622250970246, "grad_norm": 1.4943286180496216, "learning_rate": 3.6926045852624106e-07, "loss": 0.2066, "step": 6808 }, { "epoch": 4.408796895213454, "grad_norm": 2.857074499130249, "learning_rate": 3.629654138071692e-07, "loss": 0.2095, "step": 6816 }, { "epoch": 4.4139715394566625, "grad_norm": 8.338090896606445, "learning_rate": 3.56722468014718e-07, "loss": 0.2238, "step": 6824 }, { "epoch": 4.419146183699871, "grad_norm": 5.449411869049072, "learning_rate": 3.505316912908668e-07, "loss": 0.1984, "step": 6832 }, { "epoch": 4.424320827943079, "grad_norm": 26.998971939086914, "learning_rate": 3.443931531914507e-07, "loss": 0.199, "step": 6840 }, { "epoch": 4.429495472186288, "grad_norm": 2.477626085281372, "learning_rate": 3.3830692268538637e-07, "loss": 0.205, "step": 6848 }, { "epoch": 4.434670116429496, "grad_norm": 5.2165632247924805, "learning_rate": 3.3227306815389213e-07, "loss": 0.2037, "step": 6856 }, { "epoch": 4.439844760672703, "grad_norm": 1.6941031217575073, "learning_rate": 3.262916573897218e-07, "loss": 0.2006, "step": 6864 }, { "epoch": 4.445019404915912, "grad_norm": 1.2534488439559937, "learning_rate": 3.2036275759640245e-07, "loss": 0.1979, "step": 6872 }, { "epoch": 4.45019404915912, "grad_norm": 16.04631233215332, "learning_rate": 3.1448643538748045e-07, "loss": 0.2027, "step": 6880 }, { "epoch": 4.455368693402328, "grad_norm": 2.139188289642334, "learning_rate": 3.086627567857703e-07, "loss": 0.2088, "step": 6888 }, { "epoch": 4.460543337645537, "grad_norm": 1.968658685684204, "learning_rate": 3.0289178722261726e-07, "loss": 0.213, "step": 6896 }, { "epoch": 4.465717981888745, "grad_norm": 5.241852760314941, "learning_rate": 2.9717359153715707e-07, "loss": 0.2227, "step": 6904 }, { "epoch": 4.4708926261319535, "grad_norm": 2.9422459602355957, "learning_rate": 2.9150823397559094e-07, "loss": 0.2046, "step": 6912 }, { "epoch": 4.476067270375162, "grad_norm": 8.094049453735352, "learning_rate": 2.8589577819046364e-07, "loss": 0.198, "step": 6920 }, { "epoch": 4.48124191461837, "grad_norm": 1.4471999406814575, "learning_rate": 2.8033628723994623e-07, "loss": 0.2106, "step": 6928 }, { "epoch": 4.4864165588615785, "grad_norm": 2.6254570484161377, "learning_rate": 2.7482982358712885e-07, "loss": 0.211, "step": 6936 }, { "epoch": 4.491591203104787, "grad_norm": 11.787996292114258, "learning_rate": 2.6937644909931893e-07, "loss": 0.2103, "step": 6944 }, { "epoch": 4.496765847347995, "grad_norm": 4.06654691696167, "learning_rate": 2.639762250473482e-07, "loss": 0.2116, "step": 6952 }, { "epoch": 4.501940491591203, "grad_norm": 1.4215199947357178, "learning_rate": 2.5862921210487833e-07, "loss": 0.2039, "step": 6960 }, { "epoch": 4.507115135834411, "grad_norm": 4.604936122894287, "learning_rate": 2.5333547034772645e-07, "loss": 0.2126, "step": 6968 }, { "epoch": 4.512289780077619, "grad_norm": 29.087053298950195, "learning_rate": 2.480950592531844e-07, "loss": 0.195, "step": 6976 }, { "epoch": 4.517464424320828, "grad_norm": 6.165823459625244, "learning_rate": 2.429080376993537e-07, "loss": 0.2141, "step": 6984 }, { "epoch": 4.522639068564036, "grad_norm": 7.566137313842773, "learning_rate": 2.37774463964483e-07, "loss": 0.2013, "step": 6992 }, { "epoch": 4.527813712807244, "grad_norm": 17.188518524169922, "learning_rate": 2.3269439572631448e-07, "loss": 0.213, "step": 7000 }, { "epoch": 4.532988357050453, "grad_norm": 2.3657376766204834, "learning_rate": 2.2766789006143265e-07, "loss": 0.2087, "step": 7008 }, { "epoch": 4.538163001293661, "grad_norm": 131.63343811035156, "learning_rate": 2.226950034446279e-07, "loss": 0.2219, "step": 7016 }, { "epoch": 4.5433376455368695, "grad_norm": 9.159808158874512, "learning_rate": 2.1777579174825703e-07, "loss": 0.2194, "step": 7024 }, { "epoch": 4.548512289780078, "grad_norm": 1.9207276105880737, "learning_rate": 2.1291031024161856e-07, "loss": 0.2093, "step": 7032 }, { "epoch": 4.553686934023286, "grad_norm": 12.356361389160156, "learning_rate": 2.0809861359033124e-07, "loss": 0.214, "step": 7040 }, { "epoch": 4.5588615782664945, "grad_norm": 2.2436163425445557, "learning_rate": 2.0334075585571988e-07, "loss": 0.2149, "step": 7048 }, { "epoch": 4.564036222509703, "grad_norm": 3.7744526863098145, "learning_rate": 1.986367904942066e-07, "loss": 0.1967, "step": 7056 }, { "epoch": 4.569210866752911, "grad_norm": 1.290109395980835, "learning_rate": 1.9398677035671222e-07, "loss": 0.2186, "step": 7064 }, { "epoch": 4.574385510996119, "grad_norm": 1.6141724586486816, "learning_rate": 1.8939074768806076e-07, "loss": 0.2067, "step": 7072 }, { "epoch": 4.579560155239327, "grad_norm": 2.7260425090789795, "learning_rate": 1.8484877412639435e-07, "loss": 0.1964, "step": 7080 }, { "epoch": 4.584734799482535, "grad_norm": 1.980734944343567, "learning_rate": 1.8036090070259026e-07, "loss": 0.1991, "step": 7088 }, { "epoch": 4.589909443725744, "grad_norm": 2.165452241897583, "learning_rate": 1.7592717783969094e-07, "loss": 0.2146, "step": 7096 }, { "epoch": 4.595084087968952, "grad_norm": 1.303862452507019, "learning_rate": 1.7154765535233486e-07, "loss": 0.2152, "step": 7104 }, { "epoch": 4.60025873221216, "grad_norm": 7.211986064910889, "learning_rate": 1.6722238244619827e-07, "loss": 0.2248, "step": 7112 }, { "epoch": 4.605433376455369, "grad_norm": 7.8271870613098145, "learning_rate": 1.6295140771744044e-07, "loss": 0.209, "step": 7120 }, { "epoch": 4.610608020698577, "grad_norm": 1.4870136976242065, "learning_rate": 1.587347791521604e-07, "loss": 0.2148, "step": 7128 }, { "epoch": 4.6157826649417855, "grad_norm": 1.905441403388977, "learning_rate": 1.5457254412585666e-07, "loss": 0.2107, "step": 7136 }, { "epoch": 4.620957309184994, "grad_norm": 0.9246317148208618, "learning_rate": 1.5046474940289268e-07, "loss": 0.2177, "step": 7144 }, { "epoch": 4.626131953428202, "grad_norm": 12.103673934936523, "learning_rate": 1.4641144113597628e-07, "loss": 0.2049, "step": 7152 }, { "epoch": 4.63130659767141, "grad_norm": 1.047852635383606, "learning_rate": 1.4241266486563654e-07, "loss": 0.2062, "step": 7160 }, { "epoch": 4.636481241914618, "grad_norm": 0.9444906115531921, "learning_rate": 1.3846846551971272e-07, "loss": 0.2019, "step": 7168 }, { "epoch": 4.641655886157826, "grad_norm": 1.3876017332077026, "learning_rate": 1.3457888741285452e-07, "loss": 0.1979, "step": 7176 }, { "epoch": 4.646830530401035, "grad_norm": 1.2842018604278564, "learning_rate": 1.307439742460165e-07, "loss": 0.207, "step": 7184 }, { "epoch": 4.652005174644243, "grad_norm": 15.236653327941895, "learning_rate": 1.2696376910597275e-07, "loss": 0.2146, "step": 7192 }, { "epoch": 4.657179818887451, "grad_norm": 41.783870697021484, "learning_rate": 1.2323831446483025e-07, "loss": 0.207, "step": 7200 }, { "epoch": 4.66235446313066, "grad_norm": 1.3897194862365723, "learning_rate": 1.1956765217955302e-07, "loss": 0.1963, "step": 7208 }, { "epoch": 4.667529107373868, "grad_norm": 10.643123626708984, "learning_rate": 1.1595182349149026e-07, "loss": 0.2189, "step": 7216 }, { "epoch": 4.672703751617076, "grad_norm": 1.3099812269210815, "learning_rate": 1.1239086902591512e-07, "loss": 0.2271, "step": 7224 }, { "epoch": 4.677878395860285, "grad_norm": 9.640113830566406, "learning_rate": 1.0888482879156503e-07, "loss": 0.2085, "step": 7232 }, { "epoch": 4.683053040103493, "grad_norm": 8.829296112060547, "learning_rate": 1.0543374218019708e-07, "loss": 0.2029, "step": 7240 }, { "epoch": 4.6882276843467015, "grad_norm": 15.328158378601074, "learning_rate": 1.0203764796614057e-07, "loss": 0.2266, "step": 7248 }, { "epoch": 4.69340232858991, "grad_norm": 3.7764499187469482, "learning_rate": 9.869658430586349e-08, "loss": 0.216, "step": 7256 }, { "epoch": 4.698576972833118, "grad_norm": 8.380655288696289, "learning_rate": 9.541058873754394e-08, "loss": 0.213, "step": 7264 }, { "epoch": 4.7037516170763265, "grad_norm": 5.44411563873291, "learning_rate": 9.217969818064832e-08, "loss": 0.1983, "step": 7272 }, { "epoch": 4.708926261319534, "grad_norm": 20.862810134887695, "learning_rate": 8.900394893551655e-08, "loss": 0.2082, "step": 7280 }, { "epoch": 4.714100905562742, "grad_norm": 8.134232521057129, "learning_rate": 8.588337668295366e-08, "loss": 0.1995, "step": 7288 }, { "epoch": 4.719275549805951, "grad_norm": 5.843049049377441, "learning_rate": 8.28180164838288e-08, "loss": 0.1962, "step": 7296 }, { "epoch": 4.724450194049159, "grad_norm": 14.079636573791504, "learning_rate": 7.980790277868189e-08, "loss": 0.2213, "step": 7304 }, { "epoch": 4.729624838292367, "grad_norm": 1.857102870941162, "learning_rate": 7.685306938733761e-08, "loss": 0.2115, "step": 7312 }, { "epoch": 4.734799482535576, "grad_norm": 4.96279239654541, "learning_rate": 7.395354950852307e-08, "loss": 0.2191, "step": 7320 }, { "epoch": 4.739974126778784, "grad_norm": 3.5607805252075195, "learning_rate": 7.110937571949639e-08, "loss": 0.2076, "step": 7328 }, { "epoch": 4.745148771021992, "grad_norm": 3.995842218399048, "learning_rate": 6.832057997568087e-08, "loss": 0.1983, "step": 7336 }, { "epoch": 4.750323415265201, "grad_norm": 19.480527877807617, "learning_rate": 6.55871936103053e-08, "loss": 0.2037, "step": 7344 }, { "epoch": 4.755498059508409, "grad_norm": 2.151970624923706, "learning_rate": 6.290924733405201e-08, "loss": 0.2137, "step": 7352 }, { "epoch": 4.760672703751617, "grad_norm": 6.302921772003174, "learning_rate": 6.028677123471105e-08, "loss": 0.2095, "step": 7360 }, { "epoch": 4.765847347994825, "grad_norm": 1.0118603706359863, "learning_rate": 5.771979477684375e-08, "loss": 0.221, "step": 7368 }, { "epoch": 4.771021992238033, "grad_norm": 20.808563232421875, "learning_rate": 5.5208346801451376e-08, "loss": 0.2034, "step": 7376 }, { "epoch": 4.776196636481242, "grad_norm": 4.907052040100098, "learning_rate": 5.2752455525650334e-08, "loss": 0.2076, "step": 7384 }, { "epoch": 4.78137128072445, "grad_norm": 29.325668334960938, "learning_rate": 5.035214854235526e-08, "loss": 0.1882, "step": 7392 }, { "epoch": 4.786545924967658, "grad_norm": 6.801876544952393, "learning_rate": 4.8007452819968107e-08, "loss": 0.2004, "step": 7400 }, { "epoch": 4.791720569210867, "grad_norm": 42.309303283691406, "learning_rate": 4.571839470207839e-08, "loss": 0.2132, "step": 7408 }, { "epoch": 4.796895213454075, "grad_norm": 5.460014820098877, "learning_rate": 4.3484999907163484e-08, "loss": 0.1956, "step": 7416 }, { "epoch": 4.802069857697283, "grad_norm": 4.588939666748047, "learning_rate": 4.130729352830154e-08, "loss": 0.1942, "step": 7424 }, { "epoch": 4.807244501940492, "grad_norm": 12.402462005615234, "learning_rate": 3.9185300032889005e-08, "loss": 0.2013, "step": 7432 }, { "epoch": 4.8124191461837, "grad_norm": 2.2621071338653564, "learning_rate": 3.711904326236693e-08, "loss": 0.1916, "step": 7440 }, { "epoch": 4.817593790426908, "grad_norm": 0.7548274993896484, "learning_rate": 3.510854643195061e-08, "loss": 0.2083, "step": 7448 }, { "epoch": 4.822768434670117, "grad_norm": 6.128635883331299, "learning_rate": 3.3153832130371486e-08, "loss": 0.2125, "step": 7456 }, { "epoch": 4.827943078913325, "grad_norm": 4.110105514526367, "learning_rate": 3.1254922319621794e-08, "loss": 0.2127, "step": 7464 }, { "epoch": 4.833117723156533, "grad_norm": 13.073273658752441, "learning_rate": 2.941183833470751e-08, "loss": 0.2125, "step": 7472 }, { "epoch": 4.838292367399741, "grad_norm": 4.8851542472839355, "learning_rate": 2.7624600883410235e-08, "loss": 0.2054, "step": 7480 }, { "epoch": 4.843467011642949, "grad_norm": 12.184813499450684, "learning_rate": 2.589323004605293e-08, "loss": 0.2078, "step": 7488 }, { "epoch": 4.848641655886158, "grad_norm": 22.534168243408203, "learning_rate": 2.4217745275275094e-08, "loss": 0.1981, "step": 7496 }, { "epoch": 4.853816300129366, "grad_norm": 15.434218406677246, "learning_rate": 2.2598165395813498e-08, "loss": 0.2012, "step": 7504 }, { "epoch": 4.858990944372574, "grad_norm": 1.728147029876709, "learning_rate": 2.1034508604292904e-08, "loss": 0.2149, "step": 7512 }, { "epoch": 4.864165588615783, "grad_norm": 49.4590950012207, "learning_rate": 1.9526792469017896e-08, "loss": 0.2191, "step": 7520 }, { "epoch": 4.869340232858991, "grad_norm": 0.7744470834732056, "learning_rate": 1.807503392977916e-08, "loss": 0.2149, "step": 7528 }, { "epoch": 4.874514877102199, "grad_norm": 4.530129909515381, "learning_rate": 1.6679249297660847e-08, "loss": 0.2137, "step": 7536 }, { "epoch": 4.879689521345408, "grad_norm": 6.442441463470459, "learning_rate": 1.533945425485739e-08, "loss": 0.208, "step": 7544 }, { "epoch": 4.884864165588616, "grad_norm": 29.557950973510742, "learning_rate": 1.405566385449919e-08, "loss": 0.2039, "step": 7552 }, { "epoch": 4.890038809831824, "grad_norm": 1.7751787900924683, "learning_rate": 1.2827892520481667e-08, "loss": 0.2158, "step": 7560 }, { "epoch": 4.895213454075033, "grad_norm": 1.7882133722305298, "learning_rate": 1.1656154047303691e-08, "loss": 0.196, "step": 7568 }, { "epoch": 4.90038809831824, "grad_norm": 1.204350233078003, "learning_rate": 1.0540461599913287e-08, "loss": 0.1944, "step": 7576 }, { "epoch": 4.9055627425614485, "grad_norm": 3.3513760566711426, "learning_rate": 9.480827713557183e-09, "loss": 0.1995, "step": 7584 }, { "epoch": 4.910737386804657, "grad_norm": 4.574145317077637, "learning_rate": 8.47726429364426e-09, "loss": 0.1927, "step": 7592 }, { "epoch": 4.915912031047865, "grad_norm": 2.1629719734191895, "learning_rate": 7.529782615608439e-09, "loss": 0.1989, "step": 7600 }, { "epoch": 4.921086675291074, "grad_norm": 1.5303400754928589, "learning_rate": 6.638393324782111e-09, "loss": 0.2084, "step": 7608 }, { "epoch": 4.926261319534282, "grad_norm": 1.0018268823623657, "learning_rate": 5.803106436279571e-09, "loss": 0.2008, "step": 7616 }, { "epoch": 4.93143596377749, "grad_norm": 1.660049319267273, "learning_rate": 5.023931334879883e-09, "loss": 0.201, "step": 7624 }, { "epoch": 4.936610608020699, "grad_norm": 16.29326820373535, "learning_rate": 4.3008767749253e-09, "loss": 0.2237, "step": 7632 }, { "epoch": 4.941785252263907, "grad_norm": 1.205179214477539, "learning_rate": 3.6339508802213374e-09, "loss": 0.2053, "step": 7640 }, { "epoch": 4.946959896507115, "grad_norm": 1.7497557401657104, "learning_rate": 3.0231611439457407e-09, "loss": 0.2056, "step": 7648 }, { "epoch": 4.952134540750324, "grad_norm": 2.757598400115967, "learning_rate": 2.468514428563551e-09, "loss": 0.2057, "step": 7656 }, { "epoch": 4.957309184993532, "grad_norm": 0.9228636026382446, "learning_rate": 1.9700169657510537e-09, "loss": 0.2067, "step": 7664 }, { "epoch": 4.96248382923674, "grad_norm": 1.4917523860931396, "learning_rate": 1.5276743563258367e-09, "loss": 0.2222, "step": 7672 }, { "epoch": 4.967658473479949, "grad_norm": 4.442543029785156, "learning_rate": 1.141491570182396e-09, "loss": 0.2168, "step": 7680 }, { "epoch": 4.972833117723156, "grad_norm": 1.5017764568328857, "learning_rate": 8.114729462377346e-10, "loss": 0.2137, "step": 7688 }, { "epoch": 4.9780077619663645, "grad_norm": 3.795858144760132, "learning_rate": 5.376221923830694e-10, "loss": 0.2188, "step": 7696 }, { "epoch": 4.983182406209573, "grad_norm": 2.518415927886963, "learning_rate": 3.1994238543997526e-10, "loss": 0.2288, "step": 7704 }, { "epoch": 4.988357050452781, "grad_norm": 1.4229096174240112, "learning_rate": 1.5843597112707997e-10, "loss": 0.2053, "step": 7712 }, { "epoch": 4.99353169469599, "grad_norm": 8.819074630737305, "learning_rate": 5.3104764033973245e-11, "loss": 0.2206, "step": 7720 }, { "epoch": 4.998706338939198, "grad_norm": 4.728495121002197, "learning_rate": 3.949947598447246e-12, "loss": 0.2096, "step": 7728 }, { "epoch": 5.0, "step": 7730, "total_flos": 2.7880125934075904e+16, "train_loss": 0.22995005332097354, "train_runtime": 14201.6757, "train_samples_per_second": 69.632, "train_steps_per_second": 0.544 } ], "logging_steps": 8, "max_steps": 7730, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 387, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.7880125934075904e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }