{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3765, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007971303308090873, "grad_norm": 2.7439849590205014, "learning_rate": 2.387267904509284e-07, "loss": 0.4952, "step": 10 }, { "epoch": 0.015942606616181746, "grad_norm": 1.8269416958822857, "learning_rate": 5.039787798408489e-07, "loss": 0.4898, "step": 20 }, { "epoch": 0.023913909924272617, "grad_norm": 1.373955440239158, "learning_rate": 7.692307692307694e-07, "loss": 0.4661, "step": 30 }, { "epoch": 0.03188521323236349, "grad_norm": 0.7072675856563305, "learning_rate": 1.0344827586206898e-06, "loss": 0.4337, "step": 40 }, { "epoch": 0.03985651654045436, "grad_norm": 0.7236913112540133, "learning_rate": 1.29973474801061e-06, "loss": 0.4091, "step": 50 }, { "epoch": 0.047827819848545235, "grad_norm": 0.42830095771838445, "learning_rate": 1.5649867374005307e-06, "loss": 0.391, "step": 60 }, { "epoch": 0.05579912315663611, "grad_norm": 0.24985255941378587, "learning_rate": 1.830238726790451e-06, "loss": 0.3755, "step": 70 }, { "epoch": 0.06377042646472698, "grad_norm": 0.22053904249111797, "learning_rate": 2.0954907161803713e-06, "loss": 0.3646, "step": 80 }, { "epoch": 0.07174172977281786, "grad_norm": 0.1871454926752817, "learning_rate": 2.360742705570292e-06, "loss": 0.3585, "step": 90 }, { "epoch": 0.07971303308090873, "grad_norm": 0.16957059006933034, "learning_rate": 2.625994694960212e-06, "loss": 0.3537, "step": 100 }, { "epoch": 0.0876843363889996, "grad_norm": 0.15807877454450825, "learning_rate": 2.891246684350133e-06, "loss": 0.3481, "step": 110 }, { "epoch": 0.09565563969709047, "grad_norm": 0.1696204667980688, "learning_rate": 3.1564986737400535e-06, "loss": 0.3455, "step": 120 }, { "epoch": 0.10362694300518134, "grad_norm": 0.18296589316110073, "learning_rate": 3.4217506631299737e-06, "loss": 0.3374, "step": 130 }, { "epoch": 0.11159824631327223, "grad_norm": 0.1589874660304547, "learning_rate": 3.6870026525198943e-06, "loss": 0.3353, "step": 140 }, { "epoch": 0.1195695496213631, "grad_norm": 0.16713671478912018, "learning_rate": 3.9522546419098145e-06, "loss": 0.3311, "step": 150 }, { "epoch": 0.12754085292945397, "grad_norm": 0.19094654720327028, "learning_rate": 4.217506631299735e-06, "loss": 0.3298, "step": 160 }, { "epoch": 0.13551215623754484, "grad_norm": 0.17173127985031086, "learning_rate": 4.482758620689656e-06, "loss": 0.3297, "step": 170 }, { "epoch": 0.1434834595456357, "grad_norm": 0.16743742436159792, "learning_rate": 4.748010610079576e-06, "loss": 0.3255, "step": 180 }, { "epoch": 0.15145476285372658, "grad_norm": 0.20163674324120764, "learning_rate": 5.013262599469496e-06, "loss": 0.325, "step": 190 }, { "epoch": 0.15942606616181745, "grad_norm": 0.16037098448631043, "learning_rate": 5.278514588859417e-06, "loss": 0.3197, "step": 200 }, { "epoch": 0.16739736946990832, "grad_norm": 0.21353712697354443, "learning_rate": 5.5437665782493376e-06, "loss": 0.319, "step": 210 }, { "epoch": 0.1753686727779992, "grad_norm": 0.19869628289479196, "learning_rate": 5.809018567639257e-06, "loss": 0.3156, "step": 220 }, { "epoch": 0.18333997608609007, "grad_norm": 0.1975130616223235, "learning_rate": 6.074270557029178e-06, "loss": 0.3172, "step": 230 }, { "epoch": 0.19131127939418094, "grad_norm": 0.1902414828411451, "learning_rate": 6.339522546419099e-06, "loss": 0.3122, "step": 240 }, { "epoch": 0.1992825827022718, "grad_norm": 0.22225170253654492, "learning_rate": 6.6047745358090184e-06, "loss": 0.3073, "step": 250 }, { "epoch": 0.20725388601036268, "grad_norm": 0.19343451278128465, "learning_rate": 6.87002652519894e-06, "loss": 0.309, "step": 260 }, { "epoch": 0.21522518931845358, "grad_norm": 0.21183101802781412, "learning_rate": 7.1352785145888606e-06, "loss": 0.3124, "step": 270 }, { "epoch": 0.22319649262654445, "grad_norm": 0.26603427067291613, "learning_rate": 7.40053050397878e-06, "loss": 0.3059, "step": 280 }, { "epoch": 0.23116779593463532, "grad_norm": 0.22261379017016011, "learning_rate": 7.6657824933687e-06, "loss": 0.3094, "step": 290 }, { "epoch": 0.2391390992427262, "grad_norm": 0.2024824115097555, "learning_rate": 7.93103448275862e-06, "loss": 0.3084, "step": 300 }, { "epoch": 0.24711040255081707, "grad_norm": 0.2042942185636207, "learning_rate": 8.196286472148541e-06, "loss": 0.3057, "step": 310 }, { "epoch": 0.25508170585890794, "grad_norm": 0.18736233349181933, "learning_rate": 8.461538461538462e-06, "loss": 0.304, "step": 320 }, { "epoch": 0.2630530091669988, "grad_norm": 0.21102640951635082, "learning_rate": 8.726790450928383e-06, "loss": 0.3033, "step": 330 }, { "epoch": 0.2710243124750897, "grad_norm": 0.21618009137316338, "learning_rate": 8.992042440318303e-06, "loss": 0.3021, "step": 340 }, { "epoch": 0.27899561578318055, "grad_norm": 0.19515018580578244, "learning_rate": 9.257294429708224e-06, "loss": 0.2978, "step": 350 }, { "epoch": 0.2869669190912714, "grad_norm": 0.24140786468138617, "learning_rate": 9.522546419098145e-06, "loss": 0.3012, "step": 360 }, { "epoch": 0.2949382223993623, "grad_norm": 0.27497175793322665, "learning_rate": 9.787798408488064e-06, "loss": 0.3013, "step": 370 }, { "epoch": 0.30290952570745316, "grad_norm": 0.23551643286267343, "learning_rate": 9.99999140169557e-06, "loss": 0.3002, "step": 380 }, { "epoch": 0.31088082901554404, "grad_norm": 0.24946640275348875, "learning_rate": 9.99969046414561e-06, "loss": 0.2955, "step": 390 }, { "epoch": 0.3188521323236349, "grad_norm": 0.2424205054036822, "learning_rate": 9.998959640946033e-06, "loss": 0.2976, "step": 400 }, { "epoch": 0.3268234356317258, "grad_norm": 0.2844168535727217, "learning_rate": 9.997798994934812e-06, "loss": 0.2977, "step": 410 }, { "epoch": 0.33479473893981665, "grad_norm": 0.25483658348181026, "learning_rate": 9.99620862590714e-06, "loss": 0.2951, "step": 420 }, { "epoch": 0.3427660422479075, "grad_norm": 0.23711613447438198, "learning_rate": 9.994188670606845e-06, "loss": 0.2952, "step": 430 }, { "epoch": 0.3507373455559984, "grad_norm": 0.2028196656708933, "learning_rate": 9.99173930271464e-06, "loss": 0.2972, "step": 440 }, { "epoch": 0.35870864886408926, "grad_norm": 0.23753065276887678, "learning_rate": 9.988860732833183e-06, "loss": 0.294, "step": 450 }, { "epoch": 0.36667995217218013, "grad_norm": 0.21519789088821045, "learning_rate": 9.98555320846897e-06, "loss": 0.2949, "step": 460 }, { "epoch": 0.374651255480271, "grad_norm": 0.24780874257317498, "learning_rate": 9.981817014011066e-06, "loss": 0.2911, "step": 470 }, { "epoch": 0.3826225587883619, "grad_norm": 0.2419346606109402, "learning_rate": 9.977652470706629e-06, "loss": 0.2923, "step": 480 }, { "epoch": 0.39059386209645275, "grad_norm": 0.22080537805937434, "learning_rate": 9.973059936633308e-06, "loss": 0.2908, "step": 490 }, { "epoch": 0.3985651654045436, "grad_norm": 0.22915945812817434, "learning_rate": 9.968039806668448e-06, "loss": 0.2934, "step": 500 }, { "epoch": 0.4065364687126345, "grad_norm": 0.25832882794920076, "learning_rate": 9.96259251245514e-06, "loss": 0.2892, "step": 510 }, { "epoch": 0.41450777202072536, "grad_norm": 0.20890334095429705, "learning_rate": 9.956718522365098e-06, "loss": 0.29, "step": 520 }, { "epoch": 0.42247907532881623, "grad_norm": 0.19222031712823726, "learning_rate": 9.950418341458398e-06, "loss": 0.2936, "step": 530 }, { "epoch": 0.43045037863690716, "grad_norm": 0.229251945332095, "learning_rate": 9.943692511440051e-06, "loss": 0.2903, "step": 540 }, { "epoch": 0.43842168194499803, "grad_norm": 0.19910329695969214, "learning_rate": 9.936541610613417e-06, "loss": 0.2882, "step": 550 }, { "epoch": 0.4463929852530889, "grad_norm": 0.1980158753040197, "learning_rate": 9.928966253830492e-06, "loss": 0.288, "step": 560 }, { "epoch": 0.4543642885611798, "grad_norm": 0.21294837161238345, "learning_rate": 9.920967092439028e-06, "loss": 0.2901, "step": 570 }, { "epoch": 0.46233559186927065, "grad_norm": 0.2345212892254059, "learning_rate": 9.912544814226547e-06, "loss": 0.2889, "step": 580 }, { "epoch": 0.4703068951773615, "grad_norm": 0.2653674551152453, "learning_rate": 9.903700143361185e-06, "loss": 0.2884, "step": 590 }, { "epoch": 0.4782781984854524, "grad_norm": 0.22315589547019074, "learning_rate": 9.894433840329442e-06, "loss": 0.288, "step": 600 }, { "epoch": 0.48624950179354326, "grad_norm": 0.27243118100516245, "learning_rate": 9.884746701870778e-06, "loss": 0.2876, "step": 610 }, { "epoch": 0.49422080510163413, "grad_norm": 0.2642805191794303, "learning_rate": 9.874639560909118e-06, "loss": 0.2855, "step": 620 }, { "epoch": 0.502192108409725, "grad_norm": 0.21570422679762521, "learning_rate": 9.864113286481237e-06, "loss": 0.2848, "step": 630 }, { "epoch": 0.5101634117178159, "grad_norm": 0.20704843013033902, "learning_rate": 9.853168783662028e-06, "loss": 0.2873, "step": 640 }, { "epoch": 0.5181347150259067, "grad_norm": 0.18056431665581077, "learning_rate": 9.841806993486686e-06, "loss": 0.2839, "step": 650 }, { "epoch": 0.5261060183339976, "grad_norm": 0.2105787281096281, "learning_rate": 9.830028892869804e-06, "loss": 0.2813, "step": 660 }, { "epoch": 0.5340773216420884, "grad_norm": 0.21257758348049322, "learning_rate": 9.81783549452136e-06, "loss": 0.2846, "step": 670 }, { "epoch": 0.5420486249501794, "grad_norm": 0.21961213318400993, "learning_rate": 9.805227846859652e-06, "loss": 0.2829, "step": 680 }, { "epoch": 0.5500199282582702, "grad_norm": 0.25199803254623604, "learning_rate": 9.792207033921152e-06, "loss": 0.2883, "step": 690 }, { "epoch": 0.5579912315663611, "grad_norm": 0.20221809778648672, "learning_rate": 9.778774175267294e-06, "loss": 0.2842, "step": 700 }, { "epoch": 0.565962534874452, "grad_norm": 0.1898841298780325, "learning_rate": 9.764930425888216e-06, "loss": 0.282, "step": 710 }, { "epoch": 0.5739338381825428, "grad_norm": 0.20117780815968267, "learning_rate": 9.750676976103444e-06, "loss": 0.2839, "step": 720 }, { "epoch": 0.5819051414906338, "grad_norm": 0.21755712645273997, "learning_rate": 9.736015051459551e-06, "loss": 0.2819, "step": 730 }, { "epoch": 0.5898764447987246, "grad_norm": 0.20291021122674352, "learning_rate": 9.720945912624783e-06, "loss": 0.2836, "step": 740 }, { "epoch": 0.5978477481068155, "grad_norm": 0.20194444409505907, "learning_rate": 9.705470855280661e-06, "loss": 0.2833, "step": 750 }, { "epoch": 0.6058190514149063, "grad_norm": 0.20304767059054613, "learning_rate": 9.689591210010572e-06, "loss": 0.2825, "step": 760 }, { "epoch": 0.6137903547229973, "grad_norm": 0.2648940083523449, "learning_rate": 9.673308342185366e-06, "loss": 0.282, "step": 770 }, { "epoch": 0.6217616580310881, "grad_norm": 0.21769630697418604, "learning_rate": 9.65662365184596e-06, "loss": 0.2774, "step": 780 }, { "epoch": 0.629732961339179, "grad_norm": 0.392188226969322, "learning_rate": 9.639538573582952e-06, "loss": 0.2819, "step": 790 }, { "epoch": 0.6377042646472698, "grad_norm": 0.20739255958270508, "learning_rate": 9.62205457641328e-06, "loss": 0.2814, "step": 800 }, { "epoch": 0.6456755679553607, "grad_norm": 0.19797948805471874, "learning_rate": 9.604173163653906e-06, "loss": 0.2807, "step": 810 }, { "epoch": 0.6536468712634516, "grad_norm": 0.22378074781887625, "learning_rate": 9.58589587279256e-06, "loss": 0.2779, "step": 820 }, { "epoch": 0.6616181745715425, "grad_norm": 0.19730401928407432, "learning_rate": 9.567224275355538e-06, "loss": 0.2807, "step": 830 }, { "epoch": 0.6695894778796333, "grad_norm": 0.19232291589840167, "learning_rate": 9.548159976772593e-06, "loss": 0.2803, "step": 840 }, { "epoch": 0.6775607811877242, "grad_norm": 0.23171459344457662, "learning_rate": 9.528704616238875e-06, "loss": 0.2794, "step": 850 }, { "epoch": 0.685532084495815, "grad_norm": 0.2316345630782031, "learning_rate": 9.508859866574003e-06, "loss": 0.2802, "step": 860 }, { "epoch": 0.693503387803906, "grad_norm": 0.20634122518883097, "learning_rate": 9.488627434078232e-06, "loss": 0.2814, "step": 870 }, { "epoch": 0.7014746911119968, "grad_norm": 0.28423540381597584, "learning_rate": 9.468009058385735e-06, "loss": 0.277, "step": 880 }, { "epoch": 0.7094459944200877, "grad_norm": 0.19838823753708112, "learning_rate": 9.447006512315025e-06, "loss": 0.2775, "step": 890 }, { "epoch": 0.7174172977281785, "grad_norm": 0.1988655247541702, "learning_rate": 9.425621601716531e-06, "loss": 0.278, "step": 900 }, { "epoch": 0.7253886010362695, "grad_norm": 0.18011113329739104, "learning_rate": 9.403856165317322e-06, "loss": 0.2786, "step": 910 }, { "epoch": 0.7333599043443603, "grad_norm": 0.1844142249739523, "learning_rate": 9.381712074563006e-06, "loss": 0.2785, "step": 920 }, { "epoch": 0.7413312076524512, "grad_norm": 0.2148813967605786, "learning_rate": 9.359191233456821e-06, "loss": 0.2785, "step": 930 }, { "epoch": 0.749302510960542, "grad_norm": 0.1883568911372003, "learning_rate": 9.336295578395927e-06, "loss": 0.2789, "step": 940 }, { "epoch": 0.7572738142686329, "grad_norm": 0.2340210607240215, "learning_rate": 9.313027078004903e-06, "loss": 0.2789, "step": 950 }, { "epoch": 0.7652451175767238, "grad_norm": 0.19429300696808027, "learning_rate": 9.289387732966492e-06, "loss": 0.2788, "step": 960 }, { "epoch": 0.7732164208848147, "grad_norm": 0.1933521060098035, "learning_rate": 9.265379575849561e-06, "loss": 0.2743, "step": 970 }, { "epoch": 0.7811877241929055, "grad_norm": 0.2008930847687583, "learning_rate": 9.241004670934348e-06, "loss": 0.2746, "step": 980 }, { "epoch": 0.7891590275009964, "grad_norm": 0.20419942608243757, "learning_rate": 9.216265114034964e-06, "loss": 0.2761, "step": 990 }, { "epoch": 0.7971303308090872, "grad_norm": 0.20885521941941515, "learning_rate": 9.191163032319198e-06, "loss": 0.2799, "step": 1000 }, { "epoch": 0.8051016341171782, "grad_norm": 0.20033581634243633, "learning_rate": 9.1657005841256e-06, "loss": 0.2773, "step": 1010 }, { "epoch": 0.813072937425269, "grad_norm": 0.22147195992757276, "learning_rate": 9.139879958777931e-06, "loss": 0.275, "step": 1020 }, { "epoch": 0.8210442407333599, "grad_norm": 0.20166804562350057, "learning_rate": 9.113703376396885e-06, "loss": 0.2755, "step": 1030 }, { "epoch": 0.8290155440414507, "grad_norm": 0.19260698449209812, "learning_rate": 9.087173087709226e-06, "loss": 0.2742, "step": 1040 }, { "epoch": 0.8369868473495417, "grad_norm": 0.2116198417344192, "learning_rate": 9.060291373854252e-06, "loss": 0.2749, "step": 1050 }, { "epoch": 0.8449581506576325, "grad_norm": 0.27126785391794345, "learning_rate": 9.033060546187651e-06, "loss": 0.2774, "step": 1060 }, { "epoch": 0.8529294539657234, "grad_norm": 0.2361830324015449, "learning_rate": 9.005482946082784e-06, "loss": 0.2724, "step": 1070 }, { "epoch": 0.8609007572738143, "grad_norm": 0.23551093980233517, "learning_rate": 8.97756094472935e-06, "loss": 0.2762, "step": 1080 }, { "epoch": 0.8688720605819051, "grad_norm": 0.21298565207438697, "learning_rate": 8.949296942929515e-06, "loss": 0.2753, "step": 1090 }, { "epoch": 0.8768433638899961, "grad_norm": 0.20749916709788005, "learning_rate": 8.92069337089148e-06, "loss": 0.278, "step": 1100 }, { "epoch": 0.8848146671980869, "grad_norm": 0.17956706004286632, "learning_rate": 8.891752688020532e-06, "loss": 0.2775, "step": 1110 }, { "epoch": 0.8927859705061778, "grad_norm": 0.1953798521876003, "learning_rate": 8.862477382707569e-06, "loss": 0.2741, "step": 1120 }, { "epoch": 0.9007572738142686, "grad_norm": 0.2294774184802095, "learning_rate": 8.832869972115148e-06, "loss": 0.2736, "step": 1130 }, { "epoch": 0.9087285771223595, "grad_norm": 0.18067067247270632, "learning_rate": 8.802933001961058e-06, "loss": 0.2737, "step": 1140 }, { "epoch": 0.9166998804304504, "grad_norm": 0.2146301016081478, "learning_rate": 8.77266904629942e-06, "loss": 0.2733, "step": 1150 }, { "epoch": 0.9246711837385413, "grad_norm": 0.18563813225207978, "learning_rate": 8.742080707299381e-06, "loss": 0.2734, "step": 1160 }, { "epoch": 0.9326424870466321, "grad_norm": 0.19921223017975606, "learning_rate": 8.71117061502135e-06, "loss": 0.2762, "step": 1170 }, { "epoch": 0.940613790354723, "grad_norm": 0.19733160963975976, "learning_rate": 8.679941427190884e-06, "loss": 0.275, "step": 1180 }, { "epoch": 0.9485850936628138, "grad_norm": 0.20922845272298418, "learning_rate": 8.64839582897015e-06, "loss": 0.2712, "step": 1190 }, { "epoch": 0.9565563969709048, "grad_norm": 0.17679207911764114, "learning_rate": 8.616536532727062e-06, "loss": 0.2716, "step": 1200 }, { "epoch": 0.9645277002789956, "grad_norm": 0.1871171741554302, "learning_rate": 8.584366277802057e-06, "loss": 0.271, "step": 1210 }, { "epoch": 0.9724990035870865, "grad_norm": 0.20200784391473173, "learning_rate": 8.55188783027256e-06, "loss": 0.2722, "step": 1220 }, { "epoch": 0.9804703068951773, "grad_norm": 0.2060568121760763, "learning_rate": 8.519103982715158e-06, "loss": 0.2695, "step": 1230 }, { "epoch": 0.9884416102032683, "grad_norm": 0.19358577088132986, "learning_rate": 8.486017553965475e-06, "loss": 0.2701, "step": 1240 }, { "epoch": 0.9964129135113591, "grad_norm": 0.2434707357861983, "learning_rate": 8.452631388875814e-06, "loss": 0.2731, "step": 1250 }, { "epoch": 1.0039856516540455, "grad_norm": 0.2616385036392448, "learning_rate": 8.418948358070535e-06, "loss": 0.2665, "step": 1260 }, { "epoch": 1.0119569549621363, "grad_norm": 0.2115757131765118, "learning_rate": 8.384971357699255e-06, "loss": 0.2627, "step": 1270 }, { "epoch": 1.0199282582702272, "grad_norm": 0.20239812309837896, "learning_rate": 8.3507033091878e-06, "loss": 0.2662, "step": 1280 }, { "epoch": 1.027899561578318, "grad_norm": 0.20274705613036875, "learning_rate": 8.316147158987036e-06, "loss": 0.2637, "step": 1290 }, { "epoch": 1.035870864886409, "grad_norm": 0.1854664643809721, "learning_rate": 8.281305878319519e-06, "loss": 0.2627, "step": 1300 }, { "epoch": 1.0438421681944998, "grad_norm": 0.20204039521748993, "learning_rate": 8.246182462924022e-06, "loss": 0.2625, "step": 1310 }, { "epoch": 1.0518134715025906, "grad_norm": 0.21618960118522135, "learning_rate": 8.210779932797954e-06, "loss": 0.2693, "step": 1320 }, { "epoch": 1.0597847748106815, "grad_norm": 0.178361480629425, "learning_rate": 8.175101331937692e-06, "loss": 0.261, "step": 1330 }, { "epoch": 1.0677560781187725, "grad_norm": 0.25878248216098365, "learning_rate": 8.139149728076852e-06, "loss": 0.2634, "step": 1340 }, { "epoch": 1.0757273814268633, "grad_norm": 0.2033185860677756, "learning_rate": 8.102928212422519e-06, "loss": 0.2646, "step": 1350 }, { "epoch": 1.0836986847349541, "grad_norm": 0.2143942468103035, "learning_rate": 8.066439899389451e-06, "loss": 0.264, "step": 1360 }, { "epoch": 1.091669988043045, "grad_norm": 0.18331709491785228, "learning_rate": 8.02968792633231e-06, "loss": 0.2646, "step": 1370 }, { "epoch": 1.099641291351136, "grad_norm": 0.19827283155969086, "learning_rate": 7.99267545327588e-06, "loss": 0.2648, "step": 1380 }, { "epoch": 1.1076125946592268, "grad_norm": 0.24364988811035662, "learning_rate": 7.955405662643384e-06, "loss": 0.2601, "step": 1390 }, { "epoch": 1.1155838979673176, "grad_norm": 0.19923690656335546, "learning_rate": 7.917881758982838e-06, "loss": 0.2638, "step": 1400 }, { "epoch": 1.1235552012754084, "grad_norm": 0.19790386629961096, "learning_rate": 7.880106968691516e-06, "loss": 0.2647, "step": 1410 }, { "epoch": 1.1315265045834995, "grad_norm": 0.209982262643357, "learning_rate": 7.842084539738547e-06, "loss": 0.2629, "step": 1420 }, { "epoch": 1.1394978078915903, "grad_norm": 0.1865353794893146, "learning_rate": 7.803817741385636e-06, "loss": 0.2622, "step": 1430 }, { "epoch": 1.147469111199681, "grad_norm": 0.18591402004381719, "learning_rate": 7.765309863905965e-06, "loss": 0.2638, "step": 1440 }, { "epoch": 1.1554404145077721, "grad_norm": 0.22287948038133532, "learning_rate": 7.726564218301298e-06, "loss": 0.2658, "step": 1450 }, { "epoch": 1.163411717815863, "grad_norm": 0.18704477851949775, "learning_rate": 7.68758413601728e-06, "loss": 0.2632, "step": 1460 }, { "epoch": 1.1713830211239538, "grad_norm": 0.18630909945867757, "learning_rate": 7.648372968656995e-06, "loss": 0.2629, "step": 1470 }, { "epoch": 1.1793543244320446, "grad_norm": 0.5386080569158398, "learning_rate": 7.608934087692794e-06, "loss": 0.2612, "step": 1480 }, { "epoch": 1.1873256277401354, "grad_norm": 0.1746043720739328, "learning_rate": 7.569270884176401e-06, "loss": 0.2609, "step": 1490 }, { "epoch": 1.1952969310482264, "grad_norm": 0.19541388116589287, "learning_rate": 7.529386768447342e-06, "loss": 0.2642, "step": 1500 }, { "epoch": 1.2032682343563172, "grad_norm": 0.2015022076748336, "learning_rate": 7.4892851698397174e-06, "loss": 0.2638, "step": 1510 }, { "epoch": 1.211239537664408, "grad_norm": 0.18974402819968988, "learning_rate": 7.448969536387339e-06, "loss": 0.2617, "step": 1520 }, { "epoch": 1.219210840972499, "grad_norm": 0.19142664444724963, "learning_rate": 7.408443334527257e-06, "loss": 0.2644, "step": 1530 }, { "epoch": 1.22718214428059, "grad_norm": 0.18678354662021696, "learning_rate": 7.367710048801715e-06, "loss": 0.26, "step": 1540 }, { "epoch": 1.2351534475886807, "grad_norm": 0.20449677931821042, "learning_rate": 7.326773181558532e-06, "loss": 0.2593, "step": 1550 }, { "epoch": 1.2431247508967715, "grad_norm": 0.17656320872206302, "learning_rate": 7.285636252649965e-06, "loss": 0.2629, "step": 1560 }, { "epoch": 1.2510960542048624, "grad_norm": 0.18132917187985256, "learning_rate": 7.244302799130064e-06, "loss": 0.2632, "step": 1570 }, { "epoch": 1.2590673575129534, "grad_norm": 0.19148004299294089, "learning_rate": 7.202776374950549e-06, "loss": 0.262, "step": 1580 }, { "epoch": 1.2670386608210442, "grad_norm": 0.17407075104632905, "learning_rate": 7.161060550655227e-06, "loss": 0.2614, "step": 1590 }, { "epoch": 1.275009964129135, "grad_norm": 0.1805208863871377, "learning_rate": 7.119158913072996e-06, "loss": 0.2602, "step": 1600 }, { "epoch": 1.282981267437226, "grad_norm": 0.1700912662455128, "learning_rate": 7.0770750650094335e-06, "loss": 0.2601, "step": 1610 }, { "epoch": 1.2909525707453169, "grad_norm": 0.20872692606059245, "learning_rate": 7.03481262493702e-06, "loss": 0.2623, "step": 1620 }, { "epoch": 1.2989238740534077, "grad_norm": 0.18231553621037966, "learning_rate": 6.992375226684016e-06, "loss": 0.2612, "step": 1630 }, { "epoch": 1.3068951773614987, "grad_norm": 0.3260893903118034, "learning_rate": 6.949766519122021e-06, "loss": 0.2593, "step": 1640 }, { "epoch": 1.3148664806695896, "grad_norm": 0.1837719103909305, "learning_rate": 6.906990165852218e-06, "loss": 0.2631, "step": 1650 }, { "epoch": 1.3228377839776804, "grad_norm": 0.17385746180137812, "learning_rate": 6.864049844890389e-06, "loss": 0.2601, "step": 1660 }, { "epoch": 1.3308090872857712, "grad_norm": 0.18767118916567374, "learning_rate": 6.820949248350653e-06, "loss": 0.2599, "step": 1670 }, { "epoch": 1.338780390593862, "grad_norm": 0.18523703670706165, "learning_rate": 6.777692082128024e-06, "loss": 0.2611, "step": 1680 }, { "epoch": 1.346751693901953, "grad_norm": 0.17761446472146364, "learning_rate": 6.734282065579757e-06, "loss": 0.2596, "step": 1690 }, { "epoch": 1.3547229972100439, "grad_norm": 0.20263313570108268, "learning_rate": 6.690722931205551e-06, "loss": 0.2579, "step": 1700 }, { "epoch": 1.3626943005181347, "grad_norm": 0.19596135949066837, "learning_rate": 6.6470184243266235e-06, "loss": 0.2594, "step": 1710 }, { "epoch": 1.3706656038262257, "grad_norm": 0.17389328177855742, "learning_rate": 6.6031723027636775e-06, "loss": 0.2601, "step": 1720 }, { "epoch": 1.3786369071343165, "grad_norm": 0.17594519271532788, "learning_rate": 6.559188336513794e-06, "loss": 0.2609, "step": 1730 }, { "epoch": 1.3866082104424073, "grad_norm": 0.17131298098330294, "learning_rate": 6.515070307426279e-06, "loss": 0.2639, "step": 1740 }, { "epoch": 1.3945795137504982, "grad_norm": 0.18366655818427982, "learning_rate": 6.470822008877482e-06, "loss": 0.2643, "step": 1750 }, { "epoch": 1.402550817058589, "grad_norm": 0.16290637917636525, "learning_rate": 6.4264472454446535e-06, "loss": 0.2589, "step": 1760 }, { "epoch": 1.41052212036668, "grad_norm": 0.1923343830812872, "learning_rate": 6.381949832578796e-06, "loss": 0.2597, "step": 1770 }, { "epoch": 1.4184934236747708, "grad_norm": 0.16772495714207045, "learning_rate": 6.337333596276613e-06, "loss": 0.2605, "step": 1780 }, { "epoch": 1.4264647269828616, "grad_norm": 0.18790213961542337, "learning_rate": 6.292602372751536e-06, "loss": 0.2604, "step": 1790 }, { "epoch": 1.4344360302909527, "grad_norm": 0.18406526632785894, "learning_rate": 6.247760008103889e-06, "loss": 0.2607, "step": 1800 }, { "epoch": 1.4424073335990435, "grad_norm": 0.18222410408713163, "learning_rate": 6.2028103579901725e-06, "loss": 0.2615, "step": 1810 }, { "epoch": 1.4503786369071343, "grad_norm": 0.18828145882782565, "learning_rate": 6.157757287291557e-06, "loss": 0.2614, "step": 1820 }, { "epoch": 1.4583499402152251, "grad_norm": 0.16998195784557607, "learning_rate": 6.112604669781572e-06, "loss": 0.2581, "step": 1830 }, { "epoch": 1.466321243523316, "grad_norm": 0.16771798799179155, "learning_rate": 6.0673563877930244e-06, "loss": 0.259, "step": 1840 }, { "epoch": 1.474292546831407, "grad_norm": 0.17650338257347983, "learning_rate": 6.022016331884185e-06, "loss": 0.2611, "step": 1850 }, { "epoch": 1.4822638501394978, "grad_norm": 0.1883582677936213, "learning_rate": 5.9765884005042725e-06, "loss": 0.2577, "step": 1860 }, { "epoch": 1.4902351534475886, "grad_norm": 0.1903059355062031, "learning_rate": 5.931076499658258e-06, "loss": 0.2561, "step": 1870 }, { "epoch": 1.4982064567556796, "grad_norm": 0.170261118553718, "learning_rate": 5.8854845425710085e-06, "loss": 0.2574, "step": 1880 }, { "epoch": 1.5061777600637705, "grad_norm": 0.1693099595303142, "learning_rate": 5.839816449350824e-06, "loss": 0.2603, "step": 1890 }, { "epoch": 1.5141490633718613, "grad_norm": 0.1818861367650445, "learning_rate": 5.7940761466523795e-06, "loss": 0.2648, "step": 1900 }, { "epoch": 1.5221203666799523, "grad_norm": 0.21748921810474134, "learning_rate": 5.748267567339093e-06, "loss": 0.2555, "step": 1910 }, { "epoch": 1.530091669988043, "grad_norm": 0.1903700684723453, "learning_rate": 5.702394650144975e-06, "loss": 0.2602, "step": 1920 }, { "epoch": 1.538062973296134, "grad_norm": 0.17836978417633012, "learning_rate": 5.656461339335968e-06, "loss": 0.2577, "step": 1930 }, { "epoch": 1.5460342766042248, "grad_norm": 0.1627820959396175, "learning_rate": 5.6104715843708e-06, "loss": 0.2611, "step": 1940 }, { "epoch": 1.5540055799123156, "grad_norm": 0.15537918892349203, "learning_rate": 5.564429339561411e-06, "loss": 0.2592, "step": 1950 }, { "epoch": 1.5619768832204066, "grad_norm": 0.1692865509471413, "learning_rate": 5.518338563732945e-06, "loss": 0.2557, "step": 1960 }, { "epoch": 1.5699481865284974, "grad_norm": 0.18872166066900145, "learning_rate": 5.4722032198833595e-06, "loss": 0.2597, "step": 1970 }, { "epoch": 1.5779194898365883, "grad_norm": 0.16793201436135893, "learning_rate": 5.426027274842683e-06, "loss": 0.2612, "step": 1980 }, { "epoch": 1.5858907931446793, "grad_norm": 0.1614692177614517, "learning_rate": 5.379814698931935e-06, "loss": 0.257, "step": 1990 }, { "epoch": 1.5938620964527699, "grad_norm": 0.17158275465453685, "learning_rate": 5.3335694656217405e-06, "loss": 0.2604, "step": 2000 }, { "epoch": 1.601833399760861, "grad_norm": 0.16178053730940672, "learning_rate": 5.2872955511906974e-06, "loss": 0.258, "step": 2010 }, { "epoch": 1.6098047030689517, "grad_norm": 0.17144071756850604, "learning_rate": 5.2409969343834675e-06, "loss": 0.2596, "step": 2020 }, { "epoch": 1.6177760063770426, "grad_norm": 0.16798857484611837, "learning_rate": 5.194677596068689e-06, "loss": 0.2598, "step": 2030 }, { "epoch": 1.6257473096851336, "grad_norm": 0.16245877571502684, "learning_rate": 5.1483415188966855e-06, "loss": 0.2621, "step": 2040 }, { "epoch": 1.6337186129932244, "grad_norm": 0.22338180692114515, "learning_rate": 5.101992686957028e-06, "loss": 0.2579, "step": 2050 }, { "epoch": 1.6416899163013152, "grad_norm": 0.1777225738766675, "learning_rate": 5.055635085435972e-06, "loss": 0.2559, "step": 2060 }, { "epoch": 1.6496612196094063, "grad_norm": 0.17571885043922136, "learning_rate": 5.009272700273804e-06, "loss": 0.2598, "step": 2070 }, { "epoch": 1.6576325229174969, "grad_norm": 0.16140144516975566, "learning_rate": 4.962909517822125e-06, "loss": 0.2555, "step": 2080 }, { "epoch": 1.665603826225588, "grad_norm": 0.18160320235099942, "learning_rate": 4.91654952450108e-06, "loss": 0.2559, "step": 2090 }, { "epoch": 1.6735751295336787, "grad_norm": 0.18372880830782032, "learning_rate": 4.870196706456609e-06, "loss": 0.262, "step": 2100 }, { "epoch": 1.6815464328417695, "grad_norm": 0.16897256178696948, "learning_rate": 4.8238550492177065e-06, "loss": 0.2566, "step": 2110 }, { "epoch": 1.6895177361498606, "grad_norm": 0.16383359612759563, "learning_rate": 4.777528537353729e-06, "loss": 0.258, "step": 2120 }, { "epoch": 1.6974890394579514, "grad_norm": 0.16504846683572935, "learning_rate": 4.7312211541318e-06, "loss": 0.258, "step": 2130 }, { "epoch": 1.7054603427660422, "grad_norm": 0.18461915132331697, "learning_rate": 4.684936881174314e-06, "loss": 0.259, "step": 2140 }, { "epoch": 1.7134316460741332, "grad_norm": 0.18262043780620987, "learning_rate": 4.638679698116588e-06, "loss": 0.2597, "step": 2150 }, { "epoch": 1.721402949382224, "grad_norm": 0.17541591405265763, "learning_rate": 4.592453582264684e-06, "loss": 0.2554, "step": 2160 }, { "epoch": 1.7293742526903149, "grad_norm": 0.1653040886084589, "learning_rate": 4.546262508253429e-06, "loss": 0.2584, "step": 2170 }, { "epoch": 1.737345555998406, "grad_norm": 0.1585102987285458, "learning_rate": 4.500110447704666e-06, "loss": 0.2593, "step": 2180 }, { "epoch": 1.7453168593064965, "grad_norm": 0.16189117020291785, "learning_rate": 4.454001368885764e-06, "loss": 0.2568, "step": 2190 }, { "epoch": 1.7532881626145875, "grad_norm": 0.1754043030670264, "learning_rate": 4.40793923636842e-06, "loss": 0.2557, "step": 2200 }, { "epoch": 1.7612594659226783, "grad_norm": 0.1798179829068267, "learning_rate": 4.3619280106877716e-06, "loss": 0.2572, "step": 2210 }, { "epoch": 1.7692307692307692, "grad_norm": 0.16313092179418315, "learning_rate": 4.315971648001861e-06, "loss": 0.2556, "step": 2220 }, { "epoch": 1.7772020725388602, "grad_norm": 0.177395532354502, "learning_rate": 4.270074099751478e-06, "loss": 0.2542, "step": 2230 }, { "epoch": 1.785173375846951, "grad_norm": 0.16407314616761093, "learning_rate": 4.224239312320399e-06, "loss": 0.257, "step": 2240 }, { "epoch": 1.7931446791550418, "grad_norm": 0.1611929306688441, "learning_rate": 4.178471226696073e-06, "loss": 0.2572, "step": 2250 }, { "epoch": 1.8011159824631329, "grad_norm": 0.15344135774159312, "learning_rate": 4.132773778130766e-06, "loss": 0.2551, "step": 2260 }, { "epoch": 1.8090872857712235, "grad_norm": 0.18018747885039685, "learning_rate": 4.087150895803192e-06, "loss": 0.2562, "step": 2270 }, { "epoch": 1.8170585890793145, "grad_norm": 0.16109719534481134, "learning_rate": 4.041606502480684e-06, "loss": 0.2544, "step": 2280 }, { "epoch": 1.8250298923874053, "grad_norm": 0.15229386523298538, "learning_rate": 3.996144514181891e-06, "loss": 0.254, "step": 2290 }, { "epoch": 1.8330011956954961, "grad_norm": 0.1732832205396586, "learning_rate": 3.950768839840079e-06, "loss": 0.2568, "step": 2300 }, { "epoch": 1.8409724990035872, "grad_norm": 0.17030412533612874, "learning_rate": 3.905483380967027e-06, "loss": 0.2559, "step": 2310 }, { "epoch": 1.848943802311678, "grad_norm": 0.1646050569595746, "learning_rate": 3.8602920313175684e-06, "loss": 0.2584, "step": 2320 }, { "epoch": 1.8569151056197688, "grad_norm": 0.18112487950883321, "learning_rate": 3.815198676554794e-06, "loss": 0.2577, "step": 2330 }, { "epoch": 1.8648864089278598, "grad_norm": 0.15970698627462004, "learning_rate": 3.7702071939159535e-06, "loss": 0.2574, "step": 2340 }, { "epoch": 1.8728577122359504, "grad_norm": 0.16592497935881761, "learning_rate": 3.7253214518790814e-06, "loss": 0.2528, "step": 2350 }, { "epoch": 1.8808290155440415, "grad_norm": 0.1907236587733275, "learning_rate": 3.6805453098303757e-06, "loss": 0.2592, "step": 2360 }, { "epoch": 1.8888003188521323, "grad_norm": 0.1636603572774249, "learning_rate": 3.63588261773236e-06, "loss": 0.2602, "step": 2370 }, { "epoch": 1.896771622160223, "grad_norm": 0.15180772024085912, "learning_rate": 3.5913372157928515e-06, "loss": 0.2563, "step": 2380 }, { "epoch": 1.9047429254683141, "grad_norm": 0.1597945224171933, "learning_rate": 3.546912934134773e-06, "loss": 0.2549, "step": 2390 }, { "epoch": 1.912714228776405, "grad_norm": 0.1548083962863071, "learning_rate": 3.502613592466826e-06, "loss": 0.2572, "step": 2400 }, { "epoch": 1.9206855320844958, "grad_norm": 0.1481441665132999, "learning_rate": 3.4584429997550685e-06, "loss": 0.2575, "step": 2410 }, { "epoch": 1.9286568353925868, "grad_norm": 0.15555262051404722, "learning_rate": 3.414404953895406e-06, "loss": 0.2552, "step": 2420 }, { "epoch": 1.9366281387006774, "grad_norm": 0.15133232327459284, "learning_rate": 3.3705032413870402e-06, "loss": 0.2539, "step": 2430 }, { "epoch": 1.9445994420087684, "grad_norm": 0.15739551231138196, "learning_rate": 3.326741637006896e-06, "loss": 0.2546, "step": 2440 }, { "epoch": 1.9525707453168593, "grad_norm": 0.17129713309093755, "learning_rate": 3.2831239034850593e-06, "loss": 0.254, "step": 2450 }, { "epoch": 1.96054204862495, "grad_norm": 0.17872233586244216, "learning_rate": 3.2396537911812454e-06, "loss": 0.2535, "step": 2460 }, { "epoch": 1.9685133519330411, "grad_norm": 0.15620956109745782, "learning_rate": 3.196335037762337e-06, "loss": 0.2566, "step": 2470 }, { "epoch": 1.976484655241132, "grad_norm": 0.1442172315199126, "learning_rate": 3.1531713678810076e-06, "loss": 0.2529, "step": 2480 }, { "epoch": 1.9844559585492227, "grad_norm": 0.15796499558853128, "learning_rate": 3.110166492855468e-06, "loss": 0.2551, "step": 2490 }, { "epoch": 1.9924272618573138, "grad_norm": 0.15453646599732415, "learning_rate": 3.0673241103503572e-06, "loss": 0.2515, "step": 2500 }, { "epoch": 2.0, "grad_norm": 0.1972849604645947, "learning_rate": 3.0246479040588077e-06, "loss": 0.2551, "step": 2510 }, { "epoch": 2.007971303308091, "grad_norm": 0.1644857365812511, "learning_rate": 2.9821415433857174e-06, "loss": 0.2503, "step": 2520 }, { "epoch": 2.0159426066161816, "grad_norm": 0.15167361754961786, "learning_rate": 2.939808683132238e-06, "loss": 0.248, "step": 2530 }, { "epoch": 2.0239139099242727, "grad_norm": 0.15143886743143137, "learning_rate": 2.897652963181529e-06, "loss": 0.2475, "step": 2540 }, { "epoch": 2.0318852132323637, "grad_norm": 0.15557561304873513, "learning_rate": 2.8556780081857966e-06, "loss": 0.2502, "step": 2550 }, { "epoch": 2.0398565165404543, "grad_norm": 0.14833684638790712, "learning_rate": 2.813887427254626e-06, "loss": 0.247, "step": 2560 }, { "epoch": 2.0478278198485453, "grad_norm": 0.15160958104161015, "learning_rate": 2.772284813644675e-06, "loss": 0.2485, "step": 2570 }, { "epoch": 2.055799123156636, "grad_norm": 0.1463790587181485, "learning_rate": 2.7308737444507037e-06, "loss": 0.2464, "step": 2580 }, { "epoch": 2.063770426464727, "grad_norm": 0.14689650797158363, "learning_rate": 2.689657780298019e-06, "loss": 0.2491, "step": 2590 }, { "epoch": 2.071741729772818, "grad_norm": 0.15966093074503412, "learning_rate": 2.648640465036316e-06, "loss": 0.2485, "step": 2600 }, { "epoch": 2.0797130330809086, "grad_norm": 0.2092933023915233, "learning_rate": 2.6078253254349706e-06, "loss": 0.2468, "step": 2610 }, { "epoch": 2.0876843363889996, "grad_norm": 0.15161101989978912, "learning_rate": 2.5672158708797953e-06, "loss": 0.2474, "step": 2620 }, { "epoch": 2.0956556396970907, "grad_norm": 0.13872835429015348, "learning_rate": 2.526815593071306e-06, "loss": 0.2479, "step": 2630 }, { "epoch": 2.1036269430051813, "grad_norm": 0.15338466332562303, "learning_rate": 2.486627965724482e-06, "loss": 0.2498, "step": 2640 }, { "epoch": 2.1115982463132723, "grad_norm": 0.14140026219879168, "learning_rate": 2.4466564442700974e-06, "loss": 0.2498, "step": 2650 }, { "epoch": 2.119569549621363, "grad_norm": 0.15970341507801283, "learning_rate": 2.406904465557614e-06, "loss": 0.2466, "step": 2660 }, { "epoch": 2.127540852929454, "grad_norm": 0.1519435398581199, "learning_rate": 2.3673754475596634e-06, "loss": 0.2472, "step": 2670 }, { "epoch": 2.135512156237545, "grad_norm": 0.14371921030590182, "learning_rate": 2.3280727890781753e-06, "loss": 0.2471, "step": 2680 }, { "epoch": 2.1434834595456356, "grad_norm": 0.1480883729791422, "learning_rate": 2.2889998694521257e-06, "loss": 0.2486, "step": 2690 }, { "epoch": 2.1514547628537266, "grad_norm": 0.1635322392602264, "learning_rate": 2.2501600482669865e-06, "loss": 0.2503, "step": 2700 }, { "epoch": 2.1594260661618176, "grad_norm": 0.17232127721119667, "learning_rate": 2.211556665065854e-06, "loss": 0.2484, "step": 2710 }, { "epoch": 2.1673973694699082, "grad_norm": 0.1480271607060395, "learning_rate": 2.173193039062299e-06, "loss": 0.2507, "step": 2720 }, { "epoch": 2.1753686727779993, "grad_norm": 0.1602014965172441, "learning_rate": 2.1350724688549906e-06, "loss": 0.2514, "step": 2730 }, { "epoch": 2.18333997608609, "grad_norm": 0.1414641476617348, "learning_rate": 2.0971982321440553e-06, "loss": 0.248, "step": 2740 }, { "epoch": 2.191311279394181, "grad_norm": 0.15546919413747778, "learning_rate": 2.0595735854492675e-06, "loss": 0.2487, "step": 2750 }, { "epoch": 2.199282582702272, "grad_norm": 0.14859695582253002, "learning_rate": 2.0222017638300394e-06, "loss": 0.2469, "step": 2760 }, { "epoch": 2.2072538860103625, "grad_norm": 0.15297411724937537, "learning_rate": 1.9850859806072576e-06, "loss": 0.2449, "step": 2770 }, { "epoch": 2.2152251893184536, "grad_norm": 0.3819615108389116, "learning_rate": 1.9482294270870055e-06, "loss": 0.2469, "step": 2780 }, { "epoch": 2.2231964926265446, "grad_norm": 0.15939599378059569, "learning_rate": 1.9116352722861596e-06, "loss": 0.2472, "step": 2790 }, { "epoch": 2.231167795934635, "grad_norm": 0.1555518437281259, "learning_rate": 1.8753066626599086e-06, "loss": 0.2508, "step": 2800 }, { "epoch": 2.2391390992427262, "grad_norm": 0.1420441078873591, "learning_rate": 1.839246721831215e-06, "loss": 0.2484, "step": 2810 }, { "epoch": 2.247110402550817, "grad_norm": 0.1436721984643807, "learning_rate": 1.8034585503222441e-06, "loss": 0.2469, "step": 2820 }, { "epoch": 2.255081705858908, "grad_norm": 0.14135560041109888, "learning_rate": 1.7679452252877622e-06, "loss": 0.2465, "step": 2830 }, { "epoch": 2.263053009166999, "grad_norm": 0.1504613140361942, "learning_rate": 1.7327098002505681e-06, "loss": 0.2444, "step": 2840 }, { "epoch": 2.2710243124750895, "grad_norm": 0.17027410047278219, "learning_rate": 1.6977553048389306e-06, "loss": 0.2461, "step": 2850 }, { "epoch": 2.2789956157831806, "grad_norm": 0.1715439427086149, "learning_rate": 1.663084744526105e-06, "loss": 0.2481, "step": 2860 }, { "epoch": 2.2869669190912716, "grad_norm": 0.147378563017796, "learning_rate": 1.6287011003719105e-06, "loss": 0.2452, "step": 2870 }, { "epoch": 2.294938222399362, "grad_norm": 0.13827676791752133, "learning_rate": 1.5946073287664065e-06, "loss": 0.2492, "step": 2880 }, { "epoch": 2.302909525707453, "grad_norm": 0.15079331024865258, "learning_rate": 1.5608063611757058e-06, "loss": 0.249, "step": 2890 }, { "epoch": 2.3108808290155443, "grad_norm": 0.14708063904393764, "learning_rate": 1.5273011038899066e-06, "loss": 0.2514, "step": 2900 }, { "epoch": 2.318852132323635, "grad_norm": 0.14020201643369284, "learning_rate": 1.4940944377732168e-06, "loss": 0.2477, "step": 2910 }, { "epoch": 2.326823435631726, "grad_norm": 0.1528037921936168, "learning_rate": 1.4611892180162407e-06, "loss": 0.2469, "step": 2920 }, { "epoch": 2.3347947389398165, "grad_norm": 0.13915360250390427, "learning_rate": 1.4285882738904822e-06, "loss": 0.2468, "step": 2930 }, { "epoch": 2.3427660422479075, "grad_norm": 0.14250210180453843, "learning_rate": 1.3962944085050833e-06, "loss": 0.248, "step": 2940 }, { "epoch": 2.3507373455559986, "grad_norm": 0.15103609439825436, "learning_rate": 1.3643103985658047e-06, "loss": 0.2471, "step": 2950 }, { "epoch": 2.358708648864089, "grad_norm": 0.1343269921202141, "learning_rate": 1.332638994136269e-06, "loss": 0.2446, "step": 2960 }, { "epoch": 2.36667995217218, "grad_norm": 0.15449698277173599, "learning_rate": 1.301282918401518e-06, "loss": 0.2444, "step": 2970 }, { "epoch": 2.374651255480271, "grad_norm": 0.1445514536177799, "learning_rate": 1.270244867433853e-06, "loss": 0.2512, "step": 2980 }, { "epoch": 2.382622558788362, "grad_norm": 0.15118149213707782, "learning_rate": 1.2395275099610272e-06, "loss": 0.2495, "step": 2990 }, { "epoch": 2.390593862096453, "grad_norm": 0.13886439394677316, "learning_rate": 1.2091334871367838e-06, "loss": 0.246, "step": 3000 }, { "epoch": 2.3985651654045435, "grad_norm": 0.13698163877896719, "learning_rate": 1.1790654123137552e-06, "loss": 0.2487, "step": 3010 }, { "epoch": 2.4065364687126345, "grad_norm": 0.13611193490421775, "learning_rate": 1.1493258708187677e-06, "loss": 0.2462, "step": 3020 }, { "epoch": 2.4145077720207255, "grad_norm": 0.13752762254360268, "learning_rate": 1.1199174197305473e-06, "loss": 0.2464, "step": 3030 }, { "epoch": 2.422479075328816, "grad_norm": 0.13534708608338716, "learning_rate": 1.0908425876598512e-06, "loss": 0.2471, "step": 3040 }, { "epoch": 2.430450378636907, "grad_norm": 0.14072307465294032, "learning_rate": 1.0621038745320579e-06, "loss": 0.2507, "step": 3050 }, { "epoch": 2.438421681944998, "grad_norm": 0.18149670536468343, "learning_rate": 1.0337037513722154e-06, "loss": 0.2468, "step": 3060 }, { "epoch": 2.446392985253089, "grad_norm": 0.13777290158405223, "learning_rate": 1.0056446600925718e-06, "loss": 0.2467, "step": 3070 }, { "epoch": 2.45436428856118, "grad_norm": 0.13677789276728541, "learning_rate": 9.779290132826224e-07, "loss": 0.2481, "step": 3080 }, { "epoch": 2.4623355918692704, "grad_norm": 0.14535155203152114, "learning_rate": 9.505591940016601e-07, "loss": 0.246, "step": 3090 }, { "epoch": 2.4703068951773615, "grad_norm": 0.13976318624658268, "learning_rate": 9.235375555738824e-07, "loss": 0.2463, "step": 3100 }, { "epoch": 2.4782781984854525, "grad_norm": 0.13568166041286991, "learning_rate": 8.968664213860417e-07, "loss": 0.2472, "step": 3110 }, { "epoch": 2.486249501793543, "grad_norm": 0.13510001777472666, "learning_rate": 8.705480846876746e-07, "loss": 0.2453, "step": 3120 }, { "epoch": 2.494220805101634, "grad_norm": 0.14225142264009744, "learning_rate": 8.445848083939267e-07, "loss": 0.2466, "step": 3130 }, { "epoch": 2.5021921084097247, "grad_norm": 0.13719258420894617, "learning_rate": 8.189788248909763e-07, "loss": 0.246, "step": 3140 }, { "epoch": 2.5101634117178158, "grad_norm": 0.13873842823619842, "learning_rate": 7.937323358440935e-07, "loss": 0.2466, "step": 3150 }, { "epoch": 2.518134715025907, "grad_norm": 0.15089712483314988, "learning_rate": 7.688475120083349e-07, "loss": 0.2477, "step": 3160 }, { "epoch": 2.526106018333998, "grad_norm": 0.13447979411026342, "learning_rate": 7.443264930418886e-07, "loss": 0.2463, "step": 3170 }, { "epoch": 2.5340773216420884, "grad_norm": 0.1380057513415994, "learning_rate": 7.201713873221134e-07, "loss": 0.2495, "step": 3180 }, { "epoch": 2.5420486249501795, "grad_norm": 0.14450360624810707, "learning_rate": 6.963842717642488e-07, "loss": 0.2499, "step": 3190 }, { "epoch": 2.55001992825827, "grad_norm": 0.2060655057691808, "learning_rate": 6.72967191642836e-07, "loss": 0.247, "step": 3200 }, { "epoch": 2.557991231566361, "grad_norm": 0.12851695195695914, "learning_rate": 6.499221604158623e-07, "loss": 0.246, "step": 3210 }, { "epoch": 2.565962534874452, "grad_norm": 0.16068250540689558, "learning_rate": 6.2725115955164e-07, "loss": 0.2457, "step": 3220 }, { "epoch": 2.5739338381825427, "grad_norm": 0.1362948462489877, "learning_rate": 6.049561383584301e-07, "loss": 0.2485, "step": 3230 }, { "epoch": 2.5819051414906338, "grad_norm": 0.14318195107122192, "learning_rate": 5.830390138168435e-07, "loss": 0.2457, "step": 3240 }, { "epoch": 2.5898764447987244, "grad_norm": 0.15160361757805285, "learning_rate": 5.615016704150056e-07, "loss": 0.2437, "step": 3250 }, { "epoch": 2.5978477481068154, "grad_norm": 0.14208542489599402, "learning_rate": 5.403459599865307e-07, "loss": 0.2456, "step": 3260 }, { "epoch": 2.6058190514149064, "grad_norm": 0.1342083762390786, "learning_rate": 5.195737015512947e-07, "loss": 0.2449, "step": 3270 }, { "epoch": 2.6137903547229975, "grad_norm": 0.12996044889937128, "learning_rate": 4.991866811590268e-07, "loss": 0.2482, "step": 3280 }, { "epoch": 2.621761658031088, "grad_norm": 0.13276419525758598, "learning_rate": 4.791866517357491e-07, "loss": 0.2478, "step": 3290 }, { "epoch": 2.629732961339179, "grad_norm": 0.13410510712928858, "learning_rate": 4.5957533293304655e-07, "loss": 0.2465, "step": 3300 }, { "epoch": 2.6377042646472697, "grad_norm": 0.13836647678174321, "learning_rate": 4.403544109802144e-07, "loss": 0.2446, "step": 3310 }, { "epoch": 2.6456755679553607, "grad_norm": 0.14173844811223874, "learning_rate": 4.2152553853926914e-07, "loss": 0.2467, "step": 3320 }, { "epoch": 2.653646871263452, "grad_norm": 0.12963202838388832, "learning_rate": 4.0309033456284565e-07, "loss": 0.2461, "step": 3330 }, { "epoch": 2.6616181745715424, "grad_norm": 0.12985066342865778, "learning_rate": 3.850503841550024e-07, "loss": 0.2441, "step": 3340 }, { "epoch": 2.6695894778796334, "grad_norm": 0.13686704000194505, "learning_rate": 3.674072384349242e-07, "loss": 0.2494, "step": 3350 }, { "epoch": 2.677560781187724, "grad_norm": 0.13198096936683557, "learning_rate": 3.501624144035559e-07, "loss": 0.2466, "step": 3360 }, { "epoch": 2.685532084495815, "grad_norm": 0.13285862104009824, "learning_rate": 3.333173948131663e-07, "loss": 0.249, "step": 3370 }, { "epoch": 2.693503387803906, "grad_norm": 0.13123763580997755, "learning_rate": 3.1687362803985987e-07, "loss": 0.2485, "step": 3380 }, { "epoch": 2.7014746911119967, "grad_norm": 0.130635888923708, "learning_rate": 3.008325279590357e-07, "loss": 0.2464, "step": 3390 }, { "epoch": 2.7094459944200877, "grad_norm": 0.12903560375175332, "learning_rate": 2.851954738238277e-07, "loss": 0.2464, "step": 3400 }, { "epoch": 2.7174172977281783, "grad_norm": 0.13068172271012624, "learning_rate": 2.6996381014650353e-07, "loss": 0.2477, "step": 3410 }, { "epoch": 2.7253886010362693, "grad_norm": 0.12948155434817346, "learning_rate": 2.5513884658286745e-07, "loss": 0.2454, "step": 3420 }, { "epoch": 2.7333599043443604, "grad_norm": 0.13478087793899063, "learning_rate": 2.407218578196524e-07, "loss": 0.2446, "step": 3430 }, { "epoch": 2.7413312076524514, "grad_norm": 0.14200513684004365, "learning_rate": 2.267140834649123e-07, "loss": 0.2475, "step": 3440 }, { "epoch": 2.749302510960542, "grad_norm": 0.1388036591016315, "learning_rate": 2.13116727941447e-07, "loss": 0.2458, "step": 3450 }, { "epoch": 2.757273814268633, "grad_norm": 0.13274417766749047, "learning_rate": 1.9993096038323556e-07, "loss": 0.2478, "step": 3460 }, { "epoch": 2.7652451175767236, "grad_norm": 0.1309898112310216, "learning_rate": 1.8715791453491562e-07, "loss": 0.2473, "step": 3470 }, { "epoch": 2.7732164208848147, "grad_norm": 0.12855050002466045, "learning_rate": 1.7479868865430072e-07, "loss": 0.246, "step": 3480 }, { "epoch": 2.7811877241929057, "grad_norm": 0.1297939051422684, "learning_rate": 1.6285434541794598e-07, "loss": 0.2454, "step": 3490 }, { "epoch": 2.7891590275009963, "grad_norm": 0.1360452914842712, "learning_rate": 1.5132591182978107e-07, "loss": 0.2478, "step": 3500 }, { "epoch": 2.7971303308090874, "grad_norm": 0.1303778236512888, "learning_rate": 1.4021437913280366e-07, "loss": 0.2473, "step": 3510 }, { "epoch": 2.805101634117178, "grad_norm": 0.12494111165590116, "learning_rate": 1.2952070272384986e-07, "loss": 0.2472, "step": 3520 }, { "epoch": 2.813072937425269, "grad_norm": 0.13077601667810976, "learning_rate": 1.192458020714482e-07, "loss": 0.2447, "step": 3530 }, { "epoch": 2.82104424073336, "grad_norm": 0.13202872119004214, "learning_rate": 1.0939056063675846e-07, "loss": 0.2467, "step": 3540 }, { "epoch": 2.8290155440414506, "grad_norm": 0.1282605177841322, "learning_rate": 9.995582579761243e-08, "loss": 0.2479, "step": 3550 }, { "epoch": 2.8369868473495417, "grad_norm": 0.12641719729667697, "learning_rate": 9.094240877565441e-08, "loss": 0.2473, "step": 3560 }, { "epoch": 2.8449581506576322, "grad_norm": 0.133124543261481, "learning_rate": 8.235108456658814e-08, "loss": 0.2456, "step": 3570 }, { "epoch": 2.8529294539657233, "grad_norm": 0.126398551708617, "learning_rate": 7.418259187354227e-08, "loss": 0.248, "step": 3580 }, { "epoch": 2.8609007572738143, "grad_norm": 0.12941811506345907, "learning_rate": 6.643763304355566e-08, "loss": 0.2465, "step": 3590 }, { "epoch": 2.8688720605819054, "grad_norm": 0.12633613236354346, "learning_rate": 5.911687400718458e-08, "loss": 0.2484, "step": 3600 }, { "epoch": 2.876843363889996, "grad_norm": 0.13027882462021592, "learning_rate": 5.222094422124846e-08, "loss": 0.2478, "step": 3610 }, { "epoch": 2.884814667198087, "grad_norm": 0.130436804547978, "learning_rate": 4.57504366147038e-08, "loss": 0.2502, "step": 3620 }, { "epoch": 2.8927859705061776, "grad_norm": 0.13485396495421867, "learning_rate": 3.970590753766712e-08, "loss": 0.2475, "step": 3630 }, { "epoch": 2.9007572738142686, "grad_norm": 0.1357506612910079, "learning_rate": 3.408787671357494e-08, "loss": 0.2485, "step": 3640 }, { "epoch": 2.9087285771223597, "grad_norm": 0.1324337960159484, "learning_rate": 2.8896827194496713e-08, "loss": 0.2507, "step": 3650 }, { "epoch": 2.9166998804304503, "grad_norm": 0.1384258646412873, "learning_rate": 2.4133205319603614e-08, "loss": 0.2471, "step": 3660 }, { "epoch": 2.9246711837385413, "grad_norm": 0.12538463682333836, "learning_rate": 1.9797420676788692e-08, "loss": 0.2465, "step": 3670 }, { "epoch": 2.932642487046632, "grad_norm": 0.13488887387214157, "learning_rate": 1.5889846067450586e-08, "loss": 0.2473, "step": 3680 }, { "epoch": 2.940613790354723, "grad_norm": 0.1414038209388306, "learning_rate": 1.241081747443862e-08, "loss": 0.2481, "step": 3690 }, { "epoch": 2.948585093662814, "grad_norm": 0.13040100044435768, "learning_rate": 9.36063403316534e-09, "loss": 0.2463, "step": 3700 }, { "epoch": 2.956556396970905, "grad_norm": 0.12638246739069472, "learning_rate": 6.739558005884883e-09, "loss": 0.2465, "step": 3710 }, { "epoch": 2.9645277002789956, "grad_norm": 0.13324839473704572, "learning_rate": 4.547814759142122e-09, "loss": 0.2505, "step": 3720 }, { "epoch": 2.9724990035870866, "grad_norm": 0.1305883337577235, "learning_rate": 2.785592744398713e-09, "loss": 0.2478, "step": 3730 }, { "epoch": 2.9804703068951772, "grad_norm": 0.13393956028092843, "learning_rate": 1.453043481824401e-09, "loss": 0.2459, "step": 3740 }, { "epoch": 2.9884416102032683, "grad_norm": 0.13005754003113282, "learning_rate": 5.50281547275211e-10, "loss": 0.2488, "step": 3750 }, { "epoch": 2.9964129135113593, "grad_norm": 0.13834661200938075, "learning_rate": 7.738456243466808e-11, "loss": 0.2472, "step": 3760 } ], "logging_steps": 10, "max_steps": 3765, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.647275000450253e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }