{ "best_metric": null, "best_model_checkpoint": null, "epoch": 11.19194180190263, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005595970900951315, "grad_norm": 7.419506072998047, "learning_rate": 1.0000000000000002e-06, "loss": 0.9689, "step": 10 }, { "epoch": 0.01119194180190263, "grad_norm": 8.035171508789062, "learning_rate": 2.0000000000000003e-06, "loss": 0.8977, "step": 20 }, { "epoch": 0.016787912702853944, "grad_norm": 7.580524444580078, "learning_rate": 3e-06, "loss": 0.9942, "step": 30 }, { "epoch": 0.02238388360380526, "grad_norm": 5.7520976066589355, "learning_rate": 4.000000000000001e-06, "loss": 0.8421, "step": 40 }, { "epoch": 0.027979854504756575, "grad_norm": 4.714428901672363, "learning_rate": 5e-06, "loss": 0.6063, "step": 50 }, { "epoch": 0.03357582540570789, "grad_norm": 4.136861801147461, "learning_rate": 6e-06, "loss": 0.4259, "step": 60 }, { "epoch": 0.03917179630665921, "grad_norm": 2.1667540073394775, "learning_rate": 7.000000000000001e-06, "loss": 0.3447, "step": 70 }, { "epoch": 0.04476776720761052, "grad_norm": 2.3095765113830566, "learning_rate": 8.000000000000001e-06, "loss": 0.284, "step": 80 }, { "epoch": 0.05036373810856184, "grad_norm": 1.2860591411590576, "learning_rate": 9e-06, "loss": 0.2067, "step": 90 }, { "epoch": 0.05595970900951315, "grad_norm": 2.0302886962890625, "learning_rate": 1e-05, "loss": 0.1943, "step": 100 }, { "epoch": 0.06155567991046446, "grad_norm": 1.2757196426391602, "learning_rate": 1.1000000000000001e-05, "loss": 0.1442, "step": 110 }, { "epoch": 0.06715165081141578, "grad_norm": 1.5842756032943726, "learning_rate": 1.2e-05, "loss": 0.132, "step": 120 }, { "epoch": 0.0727476217123671, "grad_norm": 1.0327903032302856, "learning_rate": 1.3000000000000001e-05, "loss": 0.097, "step": 130 }, { "epoch": 0.07834359261331841, "grad_norm": 0.733019232749939, "learning_rate": 1.4000000000000001e-05, "loss": 0.0807, "step": 140 }, { "epoch": 0.08393956351426973, "grad_norm": 0.9548436999320984, "learning_rate": 1.5e-05, "loss": 0.0922, "step": 150 }, { "epoch": 0.08953553441522104, "grad_norm": 0.44906941056251526, "learning_rate": 1.6000000000000003e-05, "loss": 0.0841, "step": 160 }, { "epoch": 0.09513150531617236, "grad_norm": 0.9586009979248047, "learning_rate": 1.7000000000000003e-05, "loss": 0.0726, "step": 170 }, { "epoch": 0.10072747621712368, "grad_norm": 0.6236313581466675, "learning_rate": 1.8e-05, "loss": 0.0631, "step": 180 }, { "epoch": 0.10632344711807498, "grad_norm": 1.1688262224197388, "learning_rate": 1.9e-05, "loss": 0.0717, "step": 190 }, { "epoch": 0.1119194180190263, "grad_norm": 1.5576119422912598, "learning_rate": 2e-05, "loss": 0.0718, "step": 200 }, { "epoch": 0.11751538891997762, "grad_norm": 1.0707802772521973, "learning_rate": 2.1e-05, "loss": 0.0591, "step": 210 }, { "epoch": 0.12311135982092893, "grad_norm": 0.8612272143363953, "learning_rate": 2.2000000000000003e-05, "loss": 0.0623, "step": 220 }, { "epoch": 0.12870733072188026, "grad_norm": 0.796205997467041, "learning_rate": 2.3000000000000003e-05, "loss": 0.0563, "step": 230 }, { "epoch": 0.13430330162283155, "grad_norm": 1.127061367034912, "learning_rate": 2.4e-05, "loss": 0.0545, "step": 240 }, { "epoch": 0.13989927252378287, "grad_norm": 0.9559623003005981, "learning_rate": 2.5e-05, "loss": 0.0543, "step": 250 }, { "epoch": 0.1454952434247342, "grad_norm": 0.7295358777046204, "learning_rate": 2.6000000000000002e-05, "loss": 0.0554, "step": 260 }, { "epoch": 0.1510912143256855, "grad_norm": 0.8386074900627136, "learning_rate": 2.7000000000000002e-05, "loss": 0.0488, "step": 270 }, { "epoch": 0.15668718522663683, "grad_norm": 0.9443495869636536, "learning_rate": 2.8000000000000003e-05, "loss": 0.0639, "step": 280 }, { "epoch": 0.16228315612758815, "grad_norm": 0.8754186630249023, "learning_rate": 2.9e-05, "loss": 0.0477, "step": 290 }, { "epoch": 0.16787912702853947, "grad_norm": 0.5491052269935608, "learning_rate": 3e-05, "loss": 0.0509, "step": 300 }, { "epoch": 0.17347509792949076, "grad_norm": 0.7870469093322754, "learning_rate": 3.1e-05, "loss": 0.0478, "step": 310 }, { "epoch": 0.17907106883044208, "grad_norm": 0.9322296380996704, "learning_rate": 3.2000000000000005e-05, "loss": 0.0514, "step": 320 }, { "epoch": 0.1846670397313934, "grad_norm": 1.236414909362793, "learning_rate": 3.3e-05, "loss": 0.0504, "step": 330 }, { "epoch": 0.19026301063234471, "grad_norm": 1.2571903467178345, "learning_rate": 3.4000000000000007e-05, "loss": 0.0374, "step": 340 }, { "epoch": 0.19585898153329603, "grad_norm": 1.1705288887023926, "learning_rate": 3.5e-05, "loss": 0.0514, "step": 350 }, { "epoch": 0.20145495243424735, "grad_norm": 1.0005333423614502, "learning_rate": 3.6e-05, "loss": 0.0459, "step": 360 }, { "epoch": 0.20705092333519864, "grad_norm": 0.5335679054260254, "learning_rate": 3.7e-05, "loss": 0.0444, "step": 370 }, { "epoch": 0.21264689423614996, "grad_norm": 1.052669882774353, "learning_rate": 3.8e-05, "loss": 0.0409, "step": 380 }, { "epoch": 0.21824286513710128, "grad_norm": 0.44473376870155334, "learning_rate": 3.9000000000000006e-05, "loss": 0.0505, "step": 390 }, { "epoch": 0.2238388360380526, "grad_norm": 0.6711838841438293, "learning_rate": 4e-05, "loss": 0.0388, "step": 400 }, { "epoch": 0.22943480693900392, "grad_norm": 0.55412358045578, "learning_rate": 4.1e-05, "loss": 0.0416, "step": 410 }, { "epoch": 0.23503077783995524, "grad_norm": 1.0375343561172485, "learning_rate": 4.2e-05, "loss": 0.0501, "step": 420 }, { "epoch": 0.24062674874090656, "grad_norm": 0.7955525517463684, "learning_rate": 4.3e-05, "loss": 0.0461, "step": 430 }, { "epoch": 0.24622271964185785, "grad_norm": 0.8107234239578247, "learning_rate": 4.4000000000000006e-05, "loss": 0.0448, "step": 440 }, { "epoch": 0.2518186905428092, "grad_norm": 0.8368202447891235, "learning_rate": 4.5e-05, "loss": 0.0459, "step": 450 }, { "epoch": 0.2574146614437605, "grad_norm": 0.6938339471817017, "learning_rate": 4.600000000000001e-05, "loss": 0.034, "step": 460 }, { "epoch": 0.2630106323447118, "grad_norm": 0.8612020611763, "learning_rate": 4.7e-05, "loss": 0.0454, "step": 470 }, { "epoch": 0.2686066032456631, "grad_norm": 0.777197539806366, "learning_rate": 4.8e-05, "loss": 0.0381, "step": 480 }, { "epoch": 0.2742025741466144, "grad_norm": 0.6520339250564575, "learning_rate": 4.9e-05, "loss": 0.0381, "step": 490 }, { "epoch": 0.27979854504756574, "grad_norm": 0.5808746814727783, "learning_rate": 5e-05, "loss": 0.0285, "step": 500 }, { "epoch": 0.28539451594851706, "grad_norm": 0.9482337832450867, "learning_rate": 5.1000000000000006e-05, "loss": 0.0362, "step": 510 }, { "epoch": 0.2909904868494684, "grad_norm": 0.5615134239196777, "learning_rate": 5.2000000000000004e-05, "loss": 0.0322, "step": 520 }, { "epoch": 0.2965864577504197, "grad_norm": 1.2695409059524536, "learning_rate": 5.300000000000001e-05, "loss": 0.0411, "step": 530 }, { "epoch": 0.302182428651371, "grad_norm": 0.7221632599830627, "learning_rate": 5.4000000000000005e-05, "loss": 0.0422, "step": 540 }, { "epoch": 0.30777839955232233, "grad_norm": 1.1144938468933105, "learning_rate": 5.500000000000001e-05, "loss": 0.0334, "step": 550 }, { "epoch": 0.31337437045327365, "grad_norm": 0.6722885966300964, "learning_rate": 5.6000000000000006e-05, "loss": 0.0436, "step": 560 }, { "epoch": 0.318970341354225, "grad_norm": 1.0043433904647827, "learning_rate": 5.6999999999999996e-05, "loss": 0.0452, "step": 570 }, { "epoch": 0.3245663122551763, "grad_norm": 0.9483539462089539, "learning_rate": 5.8e-05, "loss": 0.0492, "step": 580 }, { "epoch": 0.3301622831561276, "grad_norm": 0.7825531363487244, "learning_rate": 5.9e-05, "loss": 0.0381, "step": 590 }, { "epoch": 0.33575825405707893, "grad_norm": 0.7982919216156006, "learning_rate": 6e-05, "loss": 0.0447, "step": 600 }, { "epoch": 0.3413542249580302, "grad_norm": 0.9162524342536926, "learning_rate": 6.1e-05, "loss": 0.0453, "step": 610 }, { "epoch": 0.3469501958589815, "grad_norm": 0.5597997903823853, "learning_rate": 6.2e-05, "loss": 0.0393, "step": 620 }, { "epoch": 0.35254616675993283, "grad_norm": 0.713256299495697, "learning_rate": 6.3e-05, "loss": 0.0394, "step": 630 }, { "epoch": 0.35814213766088415, "grad_norm": 0.7356066703796387, "learning_rate": 6.400000000000001e-05, "loss": 0.0339, "step": 640 }, { "epoch": 0.36373810856183547, "grad_norm": 0.5933259129524231, "learning_rate": 6.500000000000001e-05, "loss": 0.038, "step": 650 }, { "epoch": 0.3693340794627868, "grad_norm": 0.5277016162872314, "learning_rate": 6.6e-05, "loss": 0.0383, "step": 660 }, { "epoch": 0.3749300503637381, "grad_norm": 0.9106026887893677, "learning_rate": 6.7e-05, "loss": 0.0268, "step": 670 }, { "epoch": 0.38052602126468943, "grad_norm": 0.5941755771636963, "learning_rate": 6.800000000000001e-05, "loss": 0.0399, "step": 680 }, { "epoch": 0.38612199216564075, "grad_norm": 0.7207239270210266, "learning_rate": 6.9e-05, "loss": 0.0304, "step": 690 }, { "epoch": 0.39171796306659207, "grad_norm": 0.5808258652687073, "learning_rate": 7e-05, "loss": 0.0317, "step": 700 }, { "epoch": 0.3973139339675434, "grad_norm": 0.6304859519004822, "learning_rate": 7.1e-05, "loss": 0.0417, "step": 710 }, { "epoch": 0.4029099048684947, "grad_norm": 0.6625694036483765, "learning_rate": 7.2e-05, "loss": 0.0301, "step": 720 }, { "epoch": 0.408505875769446, "grad_norm": 0.6456591486930847, "learning_rate": 7.3e-05, "loss": 0.0416, "step": 730 }, { "epoch": 0.4141018466703973, "grad_norm": 0.8103715181350708, "learning_rate": 7.4e-05, "loss": 0.0398, "step": 740 }, { "epoch": 0.4196978175713486, "grad_norm": 0.592147707939148, "learning_rate": 7.500000000000001e-05, "loss": 0.0317, "step": 750 }, { "epoch": 0.4252937884722999, "grad_norm": 0.6823825836181641, "learning_rate": 7.6e-05, "loss": 0.031, "step": 760 }, { "epoch": 0.43088975937325125, "grad_norm": 0.3274383544921875, "learning_rate": 7.7e-05, "loss": 0.0305, "step": 770 }, { "epoch": 0.43648573027420257, "grad_norm": 0.3436225950717926, "learning_rate": 7.800000000000001e-05, "loss": 0.0338, "step": 780 }, { "epoch": 0.4420817011751539, "grad_norm": 0.8361327052116394, "learning_rate": 7.900000000000001e-05, "loss": 0.0264, "step": 790 }, { "epoch": 0.4476776720761052, "grad_norm": 0.5449605584144592, "learning_rate": 8e-05, "loss": 0.0321, "step": 800 }, { "epoch": 0.4532736429770565, "grad_norm": 0.31227922439575195, "learning_rate": 8.1e-05, "loss": 0.0272, "step": 810 }, { "epoch": 0.45886961387800784, "grad_norm": 0.6099038124084473, "learning_rate": 8.2e-05, "loss": 0.0504, "step": 820 }, { "epoch": 0.46446558477895916, "grad_norm": 0.6343345642089844, "learning_rate": 8.3e-05, "loss": 0.0343, "step": 830 }, { "epoch": 0.4700615556799105, "grad_norm": 0.7962288856506348, "learning_rate": 8.4e-05, "loss": 0.0292, "step": 840 }, { "epoch": 0.4756575265808618, "grad_norm": 0.3960738182067871, "learning_rate": 8.5e-05, "loss": 0.033, "step": 850 }, { "epoch": 0.4812534974818131, "grad_norm": 0.9380257725715637, "learning_rate": 8.6e-05, "loss": 0.0404, "step": 860 }, { "epoch": 0.4868494683827644, "grad_norm": 0.7713156342506409, "learning_rate": 8.7e-05, "loss": 0.0387, "step": 870 }, { "epoch": 0.4924454392837157, "grad_norm": 1.137207269668579, "learning_rate": 8.800000000000001e-05, "loss": 0.039, "step": 880 }, { "epoch": 0.498041410184667, "grad_norm": 0.7128203511238098, "learning_rate": 8.900000000000001e-05, "loss": 0.0354, "step": 890 }, { "epoch": 0.5036373810856184, "grad_norm": 0.6396750211715698, "learning_rate": 9e-05, "loss": 0.0367, "step": 900 }, { "epoch": 0.5092333519865697, "grad_norm": 0.6838144659996033, "learning_rate": 9.1e-05, "loss": 0.0369, "step": 910 }, { "epoch": 0.514829322887521, "grad_norm": 0.6156594157218933, "learning_rate": 9.200000000000001e-05, "loss": 0.0402, "step": 920 }, { "epoch": 0.5204252937884724, "grad_norm": 0.5517926812171936, "learning_rate": 9.300000000000001e-05, "loss": 0.0497, "step": 930 }, { "epoch": 0.5260212646894236, "grad_norm": 0.6177653670310974, "learning_rate": 9.4e-05, "loss": 0.0322, "step": 940 }, { "epoch": 0.5316172355903749, "grad_norm": 0.5705161094665527, "learning_rate": 9.5e-05, "loss": 0.0365, "step": 950 }, { "epoch": 0.5372132064913262, "grad_norm": 0.7966452836990356, "learning_rate": 9.6e-05, "loss": 0.0377, "step": 960 }, { "epoch": 0.5428091773922775, "grad_norm": 0.7984173893928528, "learning_rate": 9.7e-05, "loss": 0.0335, "step": 970 }, { "epoch": 0.5484051482932288, "grad_norm": 0.6380477547645569, "learning_rate": 9.8e-05, "loss": 0.0329, "step": 980 }, { "epoch": 0.5540011191941802, "grad_norm": 0.7180393934249878, "learning_rate": 9.900000000000001e-05, "loss": 0.0302, "step": 990 }, { "epoch": 0.5595970900951315, "grad_norm": 0.8885056972503662, "learning_rate": 0.0001, "loss": 0.0345, "step": 1000 }, { "epoch": 0.5651930609960828, "grad_norm": 0.41542354226112366, "learning_rate": 9.999993165095463e-05, "loss": 0.0445, "step": 1010 }, { "epoch": 0.5707890318970341, "grad_norm": 0.4343472421169281, "learning_rate": 9.999972660400536e-05, "loss": 0.0263, "step": 1020 }, { "epoch": 0.5763850027979854, "grad_norm": 0.7970145344734192, "learning_rate": 9.999938485971279e-05, "loss": 0.0322, "step": 1030 }, { "epoch": 0.5819809736989368, "grad_norm": 0.6129629015922546, "learning_rate": 9.999890641901125e-05, "loss": 0.0262, "step": 1040 }, { "epoch": 0.5875769445998881, "grad_norm": 0.5661425590515137, "learning_rate": 9.999829128320874e-05, "loss": 0.0317, "step": 1050 }, { "epoch": 0.5931729155008394, "grad_norm": 0.7532817721366882, "learning_rate": 9.999753945398704e-05, "loss": 0.0359, "step": 1060 }, { "epoch": 0.5987688864017907, "grad_norm": 0.42677804827690125, "learning_rate": 9.999665093340165e-05, "loss": 0.0273, "step": 1070 }, { "epoch": 0.604364857302742, "grad_norm": 0.6325145363807678, "learning_rate": 9.99956257238817e-05, "loss": 0.0377, "step": 1080 }, { "epoch": 0.6099608282036934, "grad_norm": 0.6003039479255676, "learning_rate": 9.999446382823013e-05, "loss": 0.0327, "step": 1090 }, { "epoch": 0.6155567991046447, "grad_norm": 0.36753129959106445, "learning_rate": 9.999316524962345e-05, "loss": 0.0285, "step": 1100 }, { "epoch": 0.621152770005596, "grad_norm": 0.43158769607543945, "learning_rate": 9.999172999161198e-05, "loss": 0.0275, "step": 1110 }, { "epoch": 0.6267487409065473, "grad_norm": 0.33566170930862427, "learning_rate": 9.999015805811965e-05, "loss": 0.0278, "step": 1120 }, { "epoch": 0.6323447118074986, "grad_norm": 0.671672523021698, "learning_rate": 9.998844945344405e-05, "loss": 0.0344, "step": 1130 }, { "epoch": 0.63794068270845, "grad_norm": 1.1190325021743774, "learning_rate": 9.998660418225645e-05, "loss": 0.0304, "step": 1140 }, { "epoch": 0.6435366536094013, "grad_norm": 0.6546229124069214, "learning_rate": 9.998462224960175e-05, "loss": 0.0343, "step": 1150 }, { "epoch": 0.6491326245103526, "grad_norm": 0.7560105323791504, "learning_rate": 9.998250366089848e-05, "loss": 0.0259, "step": 1160 }, { "epoch": 0.6547285954113039, "grad_norm": 0.6937676072120667, "learning_rate": 9.998024842193876e-05, "loss": 0.0308, "step": 1170 }, { "epoch": 0.6603245663122552, "grad_norm": 0.4479691684246063, "learning_rate": 9.997785653888835e-05, "loss": 0.0272, "step": 1180 }, { "epoch": 0.6659205372132065, "grad_norm": 0.38218632340431213, "learning_rate": 9.997532801828658e-05, "loss": 0.0313, "step": 1190 }, { "epoch": 0.6715165081141579, "grad_norm": 0.3345787525177002, "learning_rate": 9.997266286704631e-05, "loss": 0.0328, "step": 1200 }, { "epoch": 0.6771124790151091, "grad_norm": 0.3578011989593506, "learning_rate": 9.996986109245395e-05, "loss": 0.0373, "step": 1210 }, { "epoch": 0.6827084499160604, "grad_norm": 0.6602341532707214, "learning_rate": 9.996692270216947e-05, "loss": 0.0346, "step": 1220 }, { "epoch": 0.6883044208170117, "grad_norm": 0.4503819942474365, "learning_rate": 9.996384770422629e-05, "loss": 0.0243, "step": 1230 }, { "epoch": 0.693900391717963, "grad_norm": 0.753041684627533, "learning_rate": 9.996063610703137e-05, "loss": 0.0277, "step": 1240 }, { "epoch": 0.6994963626189143, "grad_norm": 0.3396258056163788, "learning_rate": 9.995728791936504e-05, "loss": 0.0219, "step": 1250 }, { "epoch": 0.7050923335198657, "grad_norm": 0.6529501676559448, "learning_rate": 9.995380315038119e-05, "loss": 0.0242, "step": 1260 }, { "epoch": 0.710688304420817, "grad_norm": 0.2462773472070694, "learning_rate": 9.9950181809607e-05, "loss": 0.021, "step": 1270 }, { "epoch": 0.7162842753217683, "grad_norm": 0.4511205554008484, "learning_rate": 9.994642390694308e-05, "loss": 0.0267, "step": 1280 }, { "epoch": 0.7218802462227196, "grad_norm": 0.5708833336830139, "learning_rate": 9.99425294526634e-05, "loss": 0.0288, "step": 1290 }, { "epoch": 0.7274762171236709, "grad_norm": 0.4378319978713989, "learning_rate": 9.993849845741524e-05, "loss": 0.0308, "step": 1300 }, { "epoch": 0.7330721880246223, "grad_norm": 0.44127964973449707, "learning_rate": 9.99343309322192e-05, "loss": 0.0282, "step": 1310 }, { "epoch": 0.7386681589255736, "grad_norm": 0.35624831914901733, "learning_rate": 9.993002688846913e-05, "loss": 0.0298, "step": 1320 }, { "epoch": 0.7442641298265249, "grad_norm": 0.45579585433006287, "learning_rate": 9.992558633793212e-05, "loss": 0.0325, "step": 1330 }, { "epoch": 0.7498601007274762, "grad_norm": 0.6297839283943176, "learning_rate": 9.992100929274846e-05, "loss": 0.0369, "step": 1340 }, { "epoch": 0.7554560716284275, "grad_norm": 0.29105043411254883, "learning_rate": 9.991629576543163e-05, "loss": 0.0253, "step": 1350 }, { "epoch": 0.7610520425293789, "grad_norm": 0.501181960105896, "learning_rate": 9.991144576886823e-05, "loss": 0.0355, "step": 1360 }, { "epoch": 0.7666480134303302, "grad_norm": 0.4630679488182068, "learning_rate": 9.990645931631796e-05, "loss": 0.0264, "step": 1370 }, { "epoch": 0.7722439843312815, "grad_norm": 0.6088075637817383, "learning_rate": 9.990133642141359e-05, "loss": 0.0282, "step": 1380 }, { "epoch": 0.7778399552322328, "grad_norm": 0.5682616233825684, "learning_rate": 9.989607709816091e-05, "loss": 0.0331, "step": 1390 }, { "epoch": 0.7834359261331841, "grad_norm": 0.4457339644432068, "learning_rate": 9.989068136093873e-05, "loss": 0.0309, "step": 1400 }, { "epoch": 0.7890318970341355, "grad_norm": 0.566882848739624, "learning_rate": 9.988514922449879e-05, "loss": 0.0436, "step": 1410 }, { "epoch": 0.7946278679350868, "grad_norm": 0.4208590090274811, "learning_rate": 9.987948070396571e-05, "loss": 0.0293, "step": 1420 }, { "epoch": 0.8002238388360381, "grad_norm": 0.5373462438583374, "learning_rate": 9.987367581483705e-05, "loss": 0.0333, "step": 1430 }, { "epoch": 0.8058198097369894, "grad_norm": 0.4833603799343109, "learning_rate": 9.986773457298311e-05, "loss": 0.0238, "step": 1440 }, { "epoch": 0.8114157806379407, "grad_norm": 0.3185485303401947, "learning_rate": 9.986165699464705e-05, "loss": 0.0279, "step": 1450 }, { "epoch": 0.817011751538892, "grad_norm": 0.32943880558013916, "learning_rate": 9.985544309644475e-05, "loss": 0.0259, "step": 1460 }, { "epoch": 0.8226077224398433, "grad_norm": 0.4028552174568176, "learning_rate": 9.984909289536473e-05, "loss": 0.0183, "step": 1470 }, { "epoch": 0.8282036933407946, "grad_norm": 0.3354315459728241, "learning_rate": 9.984260640876821e-05, "loss": 0.0279, "step": 1480 }, { "epoch": 0.8337996642417459, "grad_norm": 0.581444263458252, "learning_rate": 9.983598365438902e-05, "loss": 0.0231, "step": 1490 }, { "epoch": 0.8393956351426972, "grad_norm": 0.3263351321220398, "learning_rate": 9.98292246503335e-05, "loss": 0.0257, "step": 1500 }, { "epoch": 0.8449916060436485, "grad_norm": 0.4574286639690399, "learning_rate": 9.98223294150805e-05, "loss": 0.0172, "step": 1510 }, { "epoch": 0.8505875769445999, "grad_norm": 0.6482700705528259, "learning_rate": 9.981529796748134e-05, "loss": 0.0252, "step": 1520 }, { "epoch": 0.8561835478455512, "grad_norm": 0.22327029705047607, "learning_rate": 9.980813032675974e-05, "loss": 0.0296, "step": 1530 }, { "epoch": 0.8617795187465025, "grad_norm": 0.39261817932128906, "learning_rate": 9.980082651251175e-05, "loss": 0.0226, "step": 1540 }, { "epoch": 0.8673754896474538, "grad_norm": 0.3742023706436157, "learning_rate": 9.979338654470569e-05, "loss": 0.0283, "step": 1550 }, { "epoch": 0.8729714605484051, "grad_norm": 0.240834578871727, "learning_rate": 9.97858104436822e-05, "loss": 0.0176, "step": 1560 }, { "epoch": 0.8785674314493565, "grad_norm": 0.39040738344192505, "learning_rate": 9.977809823015401e-05, "loss": 0.0225, "step": 1570 }, { "epoch": 0.8841634023503078, "grad_norm": 0.3102349042892456, "learning_rate": 9.977024992520602e-05, "loss": 0.0229, "step": 1580 }, { "epoch": 0.8897593732512591, "grad_norm": 0.32893484830856323, "learning_rate": 9.976226555029522e-05, "loss": 0.0286, "step": 1590 }, { "epoch": 0.8953553441522104, "grad_norm": 0.3821198046207428, "learning_rate": 9.975414512725057e-05, "loss": 0.0278, "step": 1600 }, { "epoch": 0.9009513150531617, "grad_norm": 0.3672045171260834, "learning_rate": 9.974588867827301e-05, "loss": 0.0275, "step": 1610 }, { "epoch": 0.906547285954113, "grad_norm": 0.36223965883255005, "learning_rate": 9.973749622593534e-05, "loss": 0.028, "step": 1620 }, { "epoch": 0.9121432568550644, "grad_norm": 0.5474312901496887, "learning_rate": 9.972896779318219e-05, "loss": 0.0307, "step": 1630 }, { "epoch": 0.9177392277560157, "grad_norm": 0.7324241399765015, "learning_rate": 9.972030340333001e-05, "loss": 0.0246, "step": 1640 }, { "epoch": 0.923335198656967, "grad_norm": 0.44370922446250916, "learning_rate": 9.97115030800669e-05, "loss": 0.0229, "step": 1650 }, { "epoch": 0.9289311695579183, "grad_norm": 0.40400007367134094, "learning_rate": 9.970256684745258e-05, "loss": 0.0368, "step": 1660 }, { "epoch": 0.9345271404588696, "grad_norm": 0.4597970247268677, "learning_rate": 9.969349472991838e-05, "loss": 0.0215, "step": 1670 }, { "epoch": 0.940123111359821, "grad_norm": 0.41508862376213074, "learning_rate": 9.968428675226714e-05, "loss": 0.0251, "step": 1680 }, { "epoch": 0.9457190822607723, "grad_norm": 0.5726234316825867, "learning_rate": 9.967494293967312e-05, "loss": 0.0385, "step": 1690 }, { "epoch": 0.9513150531617236, "grad_norm": 0.47390761971473694, "learning_rate": 9.966546331768191e-05, "loss": 0.0269, "step": 1700 }, { "epoch": 0.9569110240626749, "grad_norm": 0.3252114951610565, "learning_rate": 9.965584791221048e-05, "loss": 0.023, "step": 1710 }, { "epoch": 0.9625069949636262, "grad_norm": 0.4773138761520386, "learning_rate": 9.964609674954696e-05, "loss": 0.0322, "step": 1720 }, { "epoch": 0.9681029658645776, "grad_norm": 0.45844170451164246, "learning_rate": 9.963620985635065e-05, "loss": 0.0233, "step": 1730 }, { "epoch": 0.9736989367655288, "grad_norm": 0.40978696942329407, "learning_rate": 9.962618725965196e-05, "loss": 0.0337, "step": 1740 }, { "epoch": 0.9792949076664801, "grad_norm": 0.43942537903785706, "learning_rate": 9.961602898685226e-05, "loss": 0.0225, "step": 1750 }, { "epoch": 0.9848908785674314, "grad_norm": 0.7744397521018982, "learning_rate": 9.96057350657239e-05, "loss": 0.0302, "step": 1760 }, { "epoch": 0.9904868494683827, "grad_norm": 0.3644595444202423, "learning_rate": 9.959530552441005e-05, "loss": 0.0252, "step": 1770 }, { "epoch": 0.996082820369334, "grad_norm": 0.29574769735336304, "learning_rate": 9.95847403914247e-05, "loss": 0.0222, "step": 1780 }, { "epoch": 1.0016787912702854, "grad_norm": 0.5153500437736511, "learning_rate": 9.95740396956525e-05, "loss": 0.0291, "step": 1790 }, { "epoch": 1.0072747621712368, "grad_norm": 0.5961137413978577, "learning_rate": 9.956320346634876e-05, "loss": 0.0266, "step": 1800 }, { "epoch": 1.012870733072188, "grad_norm": 0.48836737871170044, "learning_rate": 9.955223173313931e-05, "loss": 0.0213, "step": 1810 }, { "epoch": 1.0184667039731394, "grad_norm": 0.5610430240631104, "learning_rate": 9.954112452602045e-05, "loss": 0.0205, "step": 1820 }, { "epoch": 1.0240626748740906, "grad_norm": 0.4025803804397583, "learning_rate": 9.952988187535886e-05, "loss": 0.0224, "step": 1830 }, { "epoch": 1.029658645775042, "grad_norm": 0.605367124080658, "learning_rate": 9.95185038118915e-05, "loss": 0.0303, "step": 1840 }, { "epoch": 1.0352546166759933, "grad_norm": 0.3206970989704132, "learning_rate": 9.950699036672559e-05, "loss": 0.0231, "step": 1850 }, { "epoch": 1.0408505875769447, "grad_norm": 0.3495715260505676, "learning_rate": 9.949534157133844e-05, "loss": 0.024, "step": 1860 }, { "epoch": 1.046446558477896, "grad_norm": 0.3895197808742523, "learning_rate": 9.948355745757741e-05, "loss": 0.0203, "step": 1870 }, { "epoch": 1.0520425293788471, "grad_norm": 0.40038052201271057, "learning_rate": 9.94716380576598e-05, "loss": 0.0221, "step": 1880 }, { "epoch": 1.0576385002797986, "grad_norm": 0.479744553565979, "learning_rate": 9.945958340417283e-05, "loss": 0.028, "step": 1890 }, { "epoch": 1.0632344711807498, "grad_norm": 0.3020111322402954, "learning_rate": 9.944739353007344e-05, "loss": 0.0265, "step": 1900 }, { "epoch": 1.0688304420817012, "grad_norm": 0.3391585648059845, "learning_rate": 9.943506846868826e-05, "loss": 0.0233, "step": 1910 }, { "epoch": 1.0744264129826524, "grad_norm": 0.3941816985607147, "learning_rate": 9.942260825371358e-05, "loss": 0.0184, "step": 1920 }, { "epoch": 1.0800223838836038, "grad_norm": 0.31161707639694214, "learning_rate": 9.941001291921512e-05, "loss": 0.0229, "step": 1930 }, { "epoch": 1.085618354784555, "grad_norm": 0.33263275027275085, "learning_rate": 9.939728249962807e-05, "loss": 0.0227, "step": 1940 }, { "epoch": 1.0912143256855065, "grad_norm": 0.35178300738334656, "learning_rate": 9.938441702975689e-05, "loss": 0.0224, "step": 1950 }, { "epoch": 1.0968102965864577, "grad_norm": 0.374667227268219, "learning_rate": 9.937141654477528e-05, "loss": 0.0196, "step": 1960 }, { "epoch": 1.102406267487409, "grad_norm": 0.2080841362476349, "learning_rate": 9.93582810802261e-05, "loss": 0.0274, "step": 1970 }, { "epoch": 1.1080022383883603, "grad_norm": 0.29197070002555847, "learning_rate": 9.934501067202117e-05, "loss": 0.0242, "step": 1980 }, { "epoch": 1.1135982092893117, "grad_norm": 0.32980409264564514, "learning_rate": 9.93316053564413e-05, "loss": 0.0189, "step": 1990 }, { "epoch": 1.119194180190263, "grad_norm": 0.4776092767715454, "learning_rate": 9.931806517013612e-05, "loss": 0.022, "step": 2000 }, { "epoch": 1.1247901510912144, "grad_norm": 0.37389442324638367, "learning_rate": 9.930439015012396e-05, "loss": 0.0216, "step": 2010 }, { "epoch": 1.1303861219921656, "grad_norm": 0.22275716066360474, "learning_rate": 9.929058033379181e-05, "loss": 0.0192, "step": 2020 }, { "epoch": 1.135982092893117, "grad_norm": 0.5097452402114868, "learning_rate": 9.927663575889521e-05, "loss": 0.0198, "step": 2030 }, { "epoch": 1.1415780637940682, "grad_norm": 0.3198114037513733, "learning_rate": 9.926255646355804e-05, "loss": 0.0218, "step": 2040 }, { "epoch": 1.1471740346950197, "grad_norm": 0.1620880514383316, "learning_rate": 9.92483424862726e-05, "loss": 0.0227, "step": 2050 }, { "epoch": 1.1527700055959709, "grad_norm": 0.2927526831626892, "learning_rate": 9.923399386589933e-05, "loss": 0.0195, "step": 2060 }, { "epoch": 1.1583659764969223, "grad_norm": 0.2967079281806946, "learning_rate": 9.921951064166684e-05, "loss": 0.024, "step": 2070 }, { "epoch": 1.1639619473978735, "grad_norm": 0.19401852786540985, "learning_rate": 9.92048928531717e-05, "loss": 0.0223, "step": 2080 }, { "epoch": 1.169557918298825, "grad_norm": 0.28363627195358276, "learning_rate": 9.919014054037836e-05, "loss": 0.0188, "step": 2090 }, { "epoch": 1.1751538891997761, "grad_norm": 0.3623961806297302, "learning_rate": 9.917525374361912e-05, "loss": 0.0206, "step": 2100 }, { "epoch": 1.1807498601007276, "grad_norm": 0.503246545791626, "learning_rate": 9.91602325035939e-05, "loss": 0.0253, "step": 2110 }, { "epoch": 1.1863458310016788, "grad_norm": 0.7744673490524292, "learning_rate": 9.914507686137019e-05, "loss": 0.0337, "step": 2120 }, { "epoch": 1.19194180190263, "grad_norm": 0.48357081413269043, "learning_rate": 9.912978685838294e-05, "loss": 0.0309, "step": 2130 }, { "epoch": 1.1975377728035814, "grad_norm": 0.22658684849739075, "learning_rate": 9.911436253643445e-05, "loss": 0.0208, "step": 2140 }, { "epoch": 1.2031337437045329, "grad_norm": 0.40776172280311584, "learning_rate": 9.90988039376942e-05, "loss": 0.0232, "step": 2150 }, { "epoch": 1.208729714605484, "grad_norm": 0.48974546790122986, "learning_rate": 9.90831111046988e-05, "loss": 0.0278, "step": 2160 }, { "epoch": 1.2143256855064353, "grad_norm": 0.3066832423210144, "learning_rate": 9.90672840803519e-05, "loss": 0.018, "step": 2170 }, { "epoch": 1.2199216564073867, "grad_norm": 0.22434163093566895, "learning_rate": 9.905132290792394e-05, "loss": 0.0141, "step": 2180 }, { "epoch": 1.225517627308338, "grad_norm": 0.3365159034729004, "learning_rate": 9.903522763105218e-05, "loss": 0.0205, "step": 2190 }, { "epoch": 1.2311135982092893, "grad_norm": 0.3467719256877899, "learning_rate": 9.901899829374047e-05, "loss": 0.0206, "step": 2200 }, { "epoch": 1.2367095691102405, "grad_norm": 0.31818097829818726, "learning_rate": 9.900263494035921e-05, "loss": 0.0255, "step": 2210 }, { "epoch": 1.242305540011192, "grad_norm": 0.3118780851364136, "learning_rate": 9.89861376156452e-05, "loss": 0.0211, "step": 2220 }, { "epoch": 1.2479015109121432, "grad_norm": 0.2563456594944, "learning_rate": 9.896950636470147e-05, "loss": 0.0249, "step": 2230 }, { "epoch": 1.2534974818130946, "grad_norm": 0.4434971213340759, "learning_rate": 9.895274123299723e-05, "loss": 0.0214, "step": 2240 }, { "epoch": 1.2590934527140458, "grad_norm": 0.36243245005607605, "learning_rate": 9.893584226636772e-05, "loss": 0.0239, "step": 2250 }, { "epoch": 1.2646894236149973, "grad_norm": 0.4027983546257019, "learning_rate": 9.891880951101407e-05, "loss": 0.0328, "step": 2260 }, { "epoch": 1.2702853945159485, "grad_norm": 0.4992479383945465, "learning_rate": 9.890164301350318e-05, "loss": 0.0247, "step": 2270 }, { "epoch": 1.2758813654169, "grad_norm": 0.5188339948654175, "learning_rate": 9.888434282076758e-05, "loss": 0.0252, "step": 2280 }, { "epoch": 1.281477336317851, "grad_norm": 0.2691977620124817, "learning_rate": 9.886690898010535e-05, "loss": 0.0238, "step": 2290 }, { "epoch": 1.2870733072188025, "grad_norm": 0.42759424448013306, "learning_rate": 9.884934153917997e-05, "loss": 0.0252, "step": 2300 }, { "epoch": 1.2926692781197537, "grad_norm": 0.315560519695282, "learning_rate": 9.883164054602012e-05, "loss": 0.0184, "step": 2310 }, { "epoch": 1.2982652490207052, "grad_norm": 0.34518998861312866, "learning_rate": 9.881380604901964e-05, "loss": 0.026, "step": 2320 }, { "epoch": 1.3038612199216564, "grad_norm": 0.322465717792511, "learning_rate": 9.879583809693738e-05, "loss": 0.0217, "step": 2330 }, { "epoch": 1.3094571908226076, "grad_norm": 0.31809547543525696, "learning_rate": 9.877773673889701e-05, "loss": 0.0219, "step": 2340 }, { "epoch": 1.315053161723559, "grad_norm": 0.4411179721355438, "learning_rate": 9.8759502024387e-05, "loss": 0.0221, "step": 2350 }, { "epoch": 1.3206491326245104, "grad_norm": 0.44775789976119995, "learning_rate": 9.87411340032603e-05, "loss": 0.0234, "step": 2360 }, { "epoch": 1.3262451035254617, "grad_norm": 0.5176445245742798, "learning_rate": 9.872263272573443e-05, "loss": 0.0255, "step": 2370 }, { "epoch": 1.3318410744264129, "grad_norm": 0.36430883407592773, "learning_rate": 9.870399824239117e-05, "loss": 0.0205, "step": 2380 }, { "epoch": 1.3374370453273643, "grad_norm": 0.5294170379638672, "learning_rate": 9.868523060417646e-05, "loss": 0.0266, "step": 2390 }, { "epoch": 1.3430330162283157, "grad_norm": 0.3633783459663391, "learning_rate": 9.86663298624003e-05, "loss": 0.0208, "step": 2400 }, { "epoch": 1.348628987129267, "grad_norm": 0.5161033272743225, "learning_rate": 9.864729606873663e-05, "loss": 0.0201, "step": 2410 }, { "epoch": 1.3542249580302181, "grad_norm": 0.6746691465377808, "learning_rate": 9.862812927522309e-05, "loss": 0.0243, "step": 2420 }, { "epoch": 1.3598209289311696, "grad_norm": 0.2213054746389389, "learning_rate": 9.860882953426099e-05, "loss": 0.0209, "step": 2430 }, { "epoch": 1.365416899832121, "grad_norm": 0.6545590162277222, "learning_rate": 9.858939689861506e-05, "loss": 0.0225, "step": 2440 }, { "epoch": 1.3710128707330722, "grad_norm": 0.46804091334342957, "learning_rate": 9.856983142141339e-05, "loss": 0.0271, "step": 2450 }, { "epoch": 1.3766088416340234, "grad_norm": 0.38381436467170715, "learning_rate": 9.855013315614725e-05, "loss": 0.0233, "step": 2460 }, { "epoch": 1.3822048125349748, "grad_norm": 0.41659992933273315, "learning_rate": 9.853030215667093e-05, "loss": 0.0229, "step": 2470 }, { "epoch": 1.387800783435926, "grad_norm": 0.4473920464515686, "learning_rate": 9.851033847720166e-05, "loss": 0.0278, "step": 2480 }, { "epoch": 1.3933967543368775, "grad_norm": 0.3903592824935913, "learning_rate": 9.849024217231935e-05, "loss": 0.0222, "step": 2490 }, { "epoch": 1.3989927252378287, "grad_norm": 0.296999454498291, "learning_rate": 9.847001329696653e-05, "loss": 0.0287, "step": 2500 }, { "epoch": 1.4045886961387801, "grad_norm": 0.45139339566230774, "learning_rate": 9.844965190644817e-05, "loss": 0.0253, "step": 2510 }, { "epoch": 1.4101846670397313, "grad_norm": 0.29245492815971375, "learning_rate": 9.842915805643155e-05, "loss": 0.0149, "step": 2520 }, { "epoch": 1.4157806379406828, "grad_norm": 0.2889615595340729, "learning_rate": 9.840853180294608e-05, "loss": 0.0224, "step": 2530 }, { "epoch": 1.421376608841634, "grad_norm": 0.4102277457714081, "learning_rate": 9.838777320238312e-05, "loss": 0.0268, "step": 2540 }, { "epoch": 1.4269725797425854, "grad_norm": 0.5045889616012573, "learning_rate": 9.836688231149592e-05, "loss": 0.0195, "step": 2550 }, { "epoch": 1.4325685506435366, "grad_norm": 0.5412267446517944, "learning_rate": 9.834585918739936e-05, "loss": 0.0262, "step": 2560 }, { "epoch": 1.438164521544488, "grad_norm": 0.5022779703140259, "learning_rate": 9.832470388756987e-05, "loss": 0.0268, "step": 2570 }, { "epoch": 1.4437604924454392, "grad_norm": 0.5818321108818054, "learning_rate": 9.830341646984521e-05, "loss": 0.0262, "step": 2580 }, { "epoch": 1.4493564633463907, "grad_norm": 0.3627963066101074, "learning_rate": 9.82819969924244e-05, "loss": 0.0161, "step": 2590 }, { "epoch": 1.4549524342473419, "grad_norm": 0.35047340393066406, "learning_rate": 9.826044551386744e-05, "loss": 0.0245, "step": 2600 }, { "epoch": 1.4605484051482933, "grad_norm": 0.2970013916492462, "learning_rate": 9.823876209309527e-05, "loss": 0.0206, "step": 2610 }, { "epoch": 1.4661443760492445, "grad_norm": 0.39108118414878845, "learning_rate": 9.821694678938953e-05, "loss": 0.0229, "step": 2620 }, { "epoch": 1.4717403469501957, "grad_norm": 0.30723538994789124, "learning_rate": 9.819499966239243e-05, "loss": 0.0239, "step": 2630 }, { "epoch": 1.4773363178511472, "grad_norm": 0.316388338804245, "learning_rate": 9.817292077210659e-05, "loss": 0.0232, "step": 2640 }, { "epoch": 1.4829322887520986, "grad_norm": 0.2693226635456085, "learning_rate": 9.815071017889482e-05, "loss": 0.0201, "step": 2650 }, { "epoch": 1.4885282596530498, "grad_norm": 0.2165406197309494, "learning_rate": 9.812836794348004e-05, "loss": 0.0178, "step": 2660 }, { "epoch": 1.494124230554001, "grad_norm": 0.33953240513801575, "learning_rate": 9.81058941269451e-05, "loss": 0.0247, "step": 2670 }, { "epoch": 1.4997202014549524, "grad_norm": 0.37577569484710693, "learning_rate": 9.808328879073251e-05, "loss": 0.0188, "step": 2680 }, { "epoch": 1.5053161723559039, "grad_norm": 0.3397989273071289, "learning_rate": 9.806055199664446e-05, "loss": 0.0174, "step": 2690 }, { "epoch": 1.510912143256855, "grad_norm": 0.11495699733495712, "learning_rate": 9.803768380684242e-05, "loss": 0.0193, "step": 2700 }, { "epoch": 1.5165081141578063, "grad_norm": 0.3947618305683136, "learning_rate": 9.801468428384716e-05, "loss": 0.0195, "step": 2710 }, { "epoch": 1.5221040850587577, "grad_norm": 0.3024958670139313, "learning_rate": 9.799155349053851e-05, "loss": 0.021, "step": 2720 }, { "epoch": 1.5277000559597091, "grad_norm": 0.3651089072227478, "learning_rate": 9.796829149015517e-05, "loss": 0.0148, "step": 2730 }, { "epoch": 1.5332960268606604, "grad_norm": 0.6126254796981812, "learning_rate": 9.794489834629455e-05, "loss": 0.0187, "step": 2740 }, { "epoch": 1.5388919977616116, "grad_norm": 0.35577818751335144, "learning_rate": 9.792137412291265e-05, "loss": 0.0183, "step": 2750 }, { "epoch": 1.544487968662563, "grad_norm": 0.26784461736679077, "learning_rate": 9.789771888432375e-05, "loss": 0.0239, "step": 2760 }, { "epoch": 1.5500839395635144, "grad_norm": 0.3259308338165283, "learning_rate": 9.787393269520039e-05, "loss": 0.0174, "step": 2770 }, { "epoch": 1.5556799104644656, "grad_norm": 0.3289090394973755, "learning_rate": 9.785001562057309e-05, "loss": 0.0185, "step": 2780 }, { "epoch": 1.5612758813654168, "grad_norm": 0.41667595505714417, "learning_rate": 9.782596772583026e-05, "loss": 0.0264, "step": 2790 }, { "epoch": 1.5668718522663683, "grad_norm": 0.4217163324356079, "learning_rate": 9.780178907671789e-05, "loss": 0.0221, "step": 2800 }, { "epoch": 1.5724678231673195, "grad_norm": 0.3442951440811157, "learning_rate": 9.777747973933948e-05, "loss": 0.0195, "step": 2810 }, { "epoch": 1.578063794068271, "grad_norm": 0.38543257117271423, "learning_rate": 9.775303978015585e-05, "loss": 0.0189, "step": 2820 }, { "epoch": 1.5836597649692221, "grad_norm": 0.6017774939537048, "learning_rate": 9.772846926598491e-05, "loss": 0.0254, "step": 2830 }, { "epoch": 1.5892557358701733, "grad_norm": 0.5754305720329285, "learning_rate": 9.77037682640015e-05, "loss": 0.0224, "step": 2840 }, { "epoch": 1.5948517067711248, "grad_norm": 0.2952113747596741, "learning_rate": 9.767893684173721e-05, "loss": 0.0209, "step": 2850 }, { "epoch": 1.6004476776720762, "grad_norm": 0.3667709231376648, "learning_rate": 9.765397506708023e-05, "loss": 0.0221, "step": 2860 }, { "epoch": 1.6060436485730274, "grad_norm": 0.543677031993866, "learning_rate": 9.762888300827507e-05, "loss": 0.0216, "step": 2870 }, { "epoch": 1.6116396194739786, "grad_norm": 0.3521057069301605, "learning_rate": 9.760366073392246e-05, "loss": 0.02, "step": 2880 }, { "epoch": 1.61723559037493, "grad_norm": 0.35763946175575256, "learning_rate": 9.757830831297914e-05, "loss": 0.0244, "step": 2890 }, { "epoch": 1.6228315612758815, "grad_norm": 0.25549840927124023, "learning_rate": 9.755282581475769e-05, "loss": 0.0224, "step": 2900 }, { "epoch": 1.6284275321768327, "grad_norm": 0.22006206214427948, "learning_rate": 9.752721330892624e-05, "loss": 0.0178, "step": 2910 }, { "epoch": 1.6340235030777839, "grad_norm": 0.2791355550289154, "learning_rate": 9.750147086550844e-05, "loss": 0.0204, "step": 2920 }, { "epoch": 1.6396194739787353, "grad_norm": 0.34600383043289185, "learning_rate": 9.747559855488313e-05, "loss": 0.0206, "step": 2930 }, { "epoch": 1.6452154448796867, "grad_norm": 0.40189531445503235, "learning_rate": 9.744959644778422e-05, "loss": 0.0213, "step": 2940 }, { "epoch": 1.650811415780638, "grad_norm": 0.21385939419269562, "learning_rate": 9.742346461530048e-05, "loss": 0.0287, "step": 2950 }, { "epoch": 1.6564073866815892, "grad_norm": 0.4269281327724457, "learning_rate": 9.739720312887535e-05, "loss": 0.0226, "step": 2960 }, { "epoch": 1.6620033575825406, "grad_norm": 0.46277040243148804, "learning_rate": 9.73708120603067e-05, "loss": 0.0206, "step": 2970 }, { "epoch": 1.667599328483492, "grad_norm": 0.340044230222702, "learning_rate": 9.734429148174675e-05, "loss": 0.016, "step": 2980 }, { "epoch": 1.6731952993844432, "grad_norm": 0.33839765191078186, "learning_rate": 9.731764146570173e-05, "loss": 0.0208, "step": 2990 }, { "epoch": 1.6787912702853944, "grad_norm": 0.4214085042476654, "learning_rate": 9.729086208503174e-05, "loss": 0.0291, "step": 3000 }, { "epoch": 1.6843872411863459, "grad_norm": 0.29594293236732483, "learning_rate": 9.726395341295062e-05, "loss": 0.0194, "step": 3010 }, { "epoch": 1.6899832120872973, "grad_norm": 0.43080446124076843, "learning_rate": 9.723691552302562e-05, "loss": 0.0204, "step": 3020 }, { "epoch": 1.6955791829882485, "grad_norm": 0.3255208134651184, "learning_rate": 9.720974848917735e-05, "loss": 0.0219, "step": 3030 }, { "epoch": 1.7011751538891997, "grad_norm": 0.30094242095947266, "learning_rate": 9.718245238567939e-05, "loss": 0.0207, "step": 3040 }, { "epoch": 1.7067711247901511, "grad_norm": 0.27606436610221863, "learning_rate": 9.715502728715826e-05, "loss": 0.025, "step": 3050 }, { "epoch": 1.7123670956911026, "grad_norm": 0.21307139098644257, "learning_rate": 9.712747326859315e-05, "loss": 0.0202, "step": 3060 }, { "epoch": 1.7179630665920538, "grad_norm": 0.4076824188232422, "learning_rate": 9.709979040531569e-05, "loss": 0.0181, "step": 3070 }, { "epoch": 1.723559037493005, "grad_norm": 0.3973149359226227, "learning_rate": 9.707197877300974e-05, "loss": 0.0278, "step": 3080 }, { "epoch": 1.7291550083939562, "grad_norm": 0.3367111086845398, "learning_rate": 9.704403844771128e-05, "loss": 0.0284, "step": 3090 }, { "epoch": 1.7347509792949076, "grad_norm": 0.4137897193431854, "learning_rate": 9.701596950580806e-05, "loss": 0.0251, "step": 3100 }, { "epoch": 1.740346950195859, "grad_norm": 0.28888463973999023, "learning_rate": 9.698777202403953e-05, "loss": 0.0185, "step": 3110 }, { "epoch": 1.7459429210968103, "grad_norm": 0.2732876241207123, "learning_rate": 9.695944607949649e-05, "loss": 0.0206, "step": 3120 }, { "epoch": 1.7515388919977615, "grad_norm": 0.5475505590438843, "learning_rate": 9.693099174962103e-05, "loss": 0.0239, "step": 3130 }, { "epoch": 1.757134862898713, "grad_norm": 0.3212341070175171, "learning_rate": 9.690240911220618e-05, "loss": 0.0193, "step": 3140 }, { "epoch": 1.7627308337996643, "grad_norm": 0.38309773802757263, "learning_rate": 9.687369824539577e-05, "loss": 0.0228, "step": 3150 }, { "epoch": 1.7683268047006155, "grad_norm": 0.22085356712341309, "learning_rate": 9.684485922768422e-05, "loss": 0.0167, "step": 3160 }, { "epoch": 1.7739227756015667, "grad_norm": 0.32358717918395996, "learning_rate": 9.681589213791633e-05, "loss": 0.0216, "step": 3170 }, { "epoch": 1.7795187465025182, "grad_norm": 0.30354073643684387, "learning_rate": 9.6786797055287e-05, "loss": 0.0202, "step": 3180 }, { "epoch": 1.7851147174034696, "grad_norm": 0.3479655981063843, "learning_rate": 9.675757405934103e-05, "loss": 0.0167, "step": 3190 }, { "epoch": 1.7907106883044208, "grad_norm": 0.3674020767211914, "learning_rate": 9.672822322997305e-05, "loss": 0.0216, "step": 3200 }, { "epoch": 1.796306659205372, "grad_norm": 0.2632925808429718, "learning_rate": 9.669874464742705e-05, "loss": 0.0166, "step": 3210 }, { "epoch": 1.8019026301063235, "grad_norm": 0.22815559804439545, "learning_rate": 9.66691383922964e-05, "loss": 0.0182, "step": 3220 }, { "epoch": 1.8074986010072749, "grad_norm": 0.2246052771806717, "learning_rate": 9.663940454552342e-05, "loss": 0.0186, "step": 3230 }, { "epoch": 1.813094571908226, "grad_norm": 0.28712260723114014, "learning_rate": 9.660954318839933e-05, "loss": 0.0157, "step": 3240 }, { "epoch": 1.8186905428091773, "grad_norm": 0.2282487452030182, "learning_rate": 9.657955440256395e-05, "loss": 0.0201, "step": 3250 }, { "epoch": 1.8242865137101287, "grad_norm": 0.3279257118701935, "learning_rate": 9.654943827000548e-05, "loss": 0.0153, "step": 3260 }, { "epoch": 1.8298824846110802, "grad_norm": 0.3519797623157501, "learning_rate": 9.651919487306025e-05, "loss": 0.0217, "step": 3270 }, { "epoch": 1.8354784555120314, "grad_norm": 0.29638567566871643, "learning_rate": 9.648882429441257e-05, "loss": 0.0165, "step": 3280 }, { "epoch": 1.8410744264129826, "grad_norm": 0.3102523982524872, "learning_rate": 9.645832661709444e-05, "loss": 0.02, "step": 3290 }, { "epoch": 1.846670397313934, "grad_norm": 0.31784892082214355, "learning_rate": 9.642770192448536e-05, "loss": 0.0259, "step": 3300 }, { "epoch": 1.8522663682148854, "grad_norm": 0.31783589720726013, "learning_rate": 9.639695030031204e-05, "loss": 0.0154, "step": 3310 }, { "epoch": 1.8578623391158366, "grad_norm": 0.4002092778682709, "learning_rate": 9.636607182864827e-05, "loss": 0.0128, "step": 3320 }, { "epoch": 1.8634583100167879, "grad_norm": 0.3656691610813141, "learning_rate": 9.63350665939146e-05, "loss": 0.0167, "step": 3330 }, { "epoch": 1.869054280917739, "grad_norm": 0.34003934264183044, "learning_rate": 9.630393468087818e-05, "loss": 0.018, "step": 3340 }, { "epoch": 1.8746502518186905, "grad_norm": 0.3051067888736725, "learning_rate": 9.627267617465243e-05, "loss": 0.0192, "step": 3350 }, { "epoch": 1.880246222719642, "grad_norm": 0.32361093163490295, "learning_rate": 9.624129116069694e-05, "loss": 0.0262, "step": 3360 }, { "epoch": 1.8858421936205931, "grad_norm": 0.20856234431266785, "learning_rate": 9.620977972481716e-05, "loss": 0.0259, "step": 3370 }, { "epoch": 1.8914381645215443, "grad_norm": 0.3916553258895874, "learning_rate": 9.617814195316411e-05, "loss": 0.0184, "step": 3380 }, { "epoch": 1.8970341354224958, "grad_norm": 0.461211621761322, "learning_rate": 9.614637793223425e-05, "loss": 0.018, "step": 3390 }, { "epoch": 1.9026301063234472, "grad_norm": 0.4060401916503906, "learning_rate": 9.611448774886924e-05, "loss": 0.0196, "step": 3400 }, { "epoch": 1.9082260772243984, "grad_norm": 0.362894207239151, "learning_rate": 9.60824714902556e-05, "loss": 0.0149, "step": 3410 }, { "epoch": 1.9138220481253496, "grad_norm": 0.2224276214838028, "learning_rate": 9.605032924392457e-05, "loss": 0.0214, "step": 3420 }, { "epoch": 1.919418019026301, "grad_norm": 0.36570799350738525, "learning_rate": 9.601806109775179e-05, "loss": 0.019, "step": 3430 }, { "epoch": 1.9250139899272525, "grad_norm": 0.37845227122306824, "learning_rate": 9.598566713995718e-05, "loss": 0.0283, "step": 3440 }, { "epoch": 1.9306099608282037, "grad_norm": 0.2989262044429779, "learning_rate": 9.595314745910456e-05, "loss": 0.0195, "step": 3450 }, { "epoch": 1.936205931729155, "grad_norm": 0.4651845097541809, "learning_rate": 9.59205021441015e-05, "loss": 0.0221, "step": 3460 }, { "epoch": 1.9418019026301063, "grad_norm": 0.16341492533683777, "learning_rate": 9.588773128419906e-05, "loss": 0.0189, "step": 3470 }, { "epoch": 1.9473978735310578, "grad_norm": 0.3499149978160858, "learning_rate": 9.58548349689915e-05, "loss": 0.0163, "step": 3480 }, { "epoch": 1.952993844432009, "grad_norm": 0.5015300512313843, "learning_rate": 9.582181328841611e-05, "loss": 0.0287, "step": 3490 }, { "epoch": 1.9585898153329602, "grad_norm": 0.3239698112010956, "learning_rate": 9.578866633275288e-05, "loss": 0.0168, "step": 3500 }, { "epoch": 1.9641857862339116, "grad_norm": 0.29603099822998047, "learning_rate": 9.575539419262434e-05, "loss": 0.0204, "step": 3510 }, { "epoch": 1.969781757134863, "grad_norm": 0.4523886740207672, "learning_rate": 9.572199695899522e-05, "loss": 0.0247, "step": 3520 }, { "epoch": 1.9753777280358142, "grad_norm": 0.2664707899093628, "learning_rate": 9.568847472317232e-05, "loss": 0.0155, "step": 3530 }, { "epoch": 1.9809736989367654, "grad_norm": 0.3717735707759857, "learning_rate": 9.565482757680415e-05, "loss": 0.0279, "step": 3540 }, { "epoch": 1.9865696698377169, "grad_norm": 0.4721260070800781, "learning_rate": 9.562105561188069e-05, "loss": 0.017, "step": 3550 }, { "epoch": 1.9921656407386683, "grad_norm": 0.19504283368587494, "learning_rate": 9.558715892073323e-05, "loss": 0.0251, "step": 3560 }, { "epoch": 1.9977616116396195, "grad_norm": 0.3900291919708252, "learning_rate": 9.555313759603402e-05, "loss": 0.028, "step": 3570 }, { "epoch": 2.0033575825405707, "grad_norm": 0.3327538073062897, "learning_rate": 9.551899173079607e-05, "loss": 0.0214, "step": 3580 }, { "epoch": 2.008953553441522, "grad_norm": 0.5092990398406982, "learning_rate": 9.548472141837286e-05, "loss": 0.0204, "step": 3590 }, { "epoch": 2.0145495243424736, "grad_norm": 0.2563795745372772, "learning_rate": 9.545032675245813e-05, "loss": 0.0242, "step": 3600 }, { "epoch": 2.020145495243425, "grad_norm": 0.1788598746061325, "learning_rate": 9.541580782708557e-05, "loss": 0.0189, "step": 3610 }, { "epoch": 2.025741466144376, "grad_norm": 0.2857683598995209, "learning_rate": 9.538116473662861e-05, "loss": 0.0187, "step": 3620 }, { "epoch": 2.031337437045327, "grad_norm": 0.25776809453964233, "learning_rate": 9.534639757580013e-05, "loss": 0.0176, "step": 3630 }, { "epoch": 2.036933407946279, "grad_norm": 0.37827619910240173, "learning_rate": 9.531150643965223e-05, "loss": 0.0133, "step": 3640 }, { "epoch": 2.04252937884723, "grad_norm": 0.36484652757644653, "learning_rate": 9.527649142357596e-05, "loss": 0.021, "step": 3650 }, { "epoch": 2.0481253497481813, "grad_norm": 0.41479215025901794, "learning_rate": 9.524135262330098e-05, "loss": 0.0159, "step": 3660 }, { "epoch": 2.0537213206491325, "grad_norm": 0.261192262172699, "learning_rate": 9.520609013489547e-05, "loss": 0.0169, "step": 3670 }, { "epoch": 2.059317291550084, "grad_norm": 0.3758920431137085, "learning_rate": 9.517070405476575e-05, "loss": 0.02, "step": 3680 }, { "epoch": 2.0649132624510353, "grad_norm": 0.33406686782836914, "learning_rate": 9.513519447965595e-05, "loss": 0.0176, "step": 3690 }, { "epoch": 2.0705092333519866, "grad_norm": 0.18889296054840088, "learning_rate": 9.509956150664796e-05, "loss": 0.0167, "step": 3700 }, { "epoch": 2.0761052042529378, "grad_norm": 0.231406569480896, "learning_rate": 9.50638052331609e-05, "loss": 0.0232, "step": 3710 }, { "epoch": 2.0817011751538894, "grad_norm": 0.31842225790023804, "learning_rate": 9.502792575695112e-05, "loss": 0.0219, "step": 3720 }, { "epoch": 2.0872971460548406, "grad_norm": 0.2191598266363144, "learning_rate": 9.499192317611167e-05, "loss": 0.0207, "step": 3730 }, { "epoch": 2.092893116955792, "grad_norm": 0.3848901391029358, "learning_rate": 9.49557975890723e-05, "loss": 0.0205, "step": 3740 }, { "epoch": 2.098489087856743, "grad_norm": 0.3654007017612457, "learning_rate": 9.491954909459895e-05, "loss": 0.0202, "step": 3750 }, { "epoch": 2.1040850587576942, "grad_norm": 0.3708373010158539, "learning_rate": 9.488317779179361e-05, "loss": 0.0186, "step": 3760 }, { "epoch": 2.109681029658646, "grad_norm": 0.29888278245925903, "learning_rate": 9.484668378009408e-05, "loss": 0.0179, "step": 3770 }, { "epoch": 2.115277000559597, "grad_norm": 0.3273047208786011, "learning_rate": 9.481006715927351e-05, "loss": 0.0194, "step": 3780 }, { "epoch": 2.1208729714605483, "grad_norm": 0.30253902077674866, "learning_rate": 9.477332802944044e-05, "loss": 0.0172, "step": 3790 }, { "epoch": 2.1264689423614995, "grad_norm": 0.3017847537994385, "learning_rate": 9.473646649103818e-05, "loss": 0.0239, "step": 3800 }, { "epoch": 2.132064913262451, "grad_norm": 0.2024342566728592, "learning_rate": 9.46994826448448e-05, "loss": 0.0229, "step": 3810 }, { "epoch": 2.1376608841634024, "grad_norm": 0.25708290934562683, "learning_rate": 9.46623765919727e-05, "loss": 0.017, "step": 3820 }, { "epoch": 2.1432568550643536, "grad_norm": 0.38740572333335876, "learning_rate": 9.462514843386845e-05, "loss": 0.0186, "step": 3830 }, { "epoch": 2.148852825965305, "grad_norm": 0.41047894954681396, "learning_rate": 9.458779827231237e-05, "loss": 0.0197, "step": 3840 }, { "epoch": 2.1544487968662565, "grad_norm": 0.26995226740837097, "learning_rate": 9.45503262094184e-05, "loss": 0.0183, "step": 3850 }, { "epoch": 2.1600447677672077, "grad_norm": 0.3127893805503845, "learning_rate": 9.451273234763371e-05, "loss": 0.0206, "step": 3860 }, { "epoch": 2.165640738668159, "grad_norm": 0.33325016498565674, "learning_rate": 9.447501678973852e-05, "loss": 0.0208, "step": 3870 }, { "epoch": 2.17123670956911, "grad_norm": 0.2265041172504425, "learning_rate": 9.443717963884569e-05, "loss": 0.0177, "step": 3880 }, { "epoch": 2.1768326804700617, "grad_norm": 0.44378480315208435, "learning_rate": 9.439922099840054e-05, "loss": 0.0232, "step": 3890 }, { "epoch": 2.182428651371013, "grad_norm": 0.2953107953071594, "learning_rate": 9.43611409721806e-05, "loss": 0.0201, "step": 3900 }, { "epoch": 2.188024622271964, "grad_norm": 0.3876049518585205, "learning_rate": 9.432293966429514e-05, "loss": 0.0164, "step": 3910 }, { "epoch": 2.1936205931729154, "grad_norm": 0.2669300138950348, "learning_rate": 9.428461717918511e-05, "loss": 0.0153, "step": 3920 }, { "epoch": 2.199216564073867, "grad_norm": 0.6801855564117432, "learning_rate": 9.424617362162271e-05, "loss": 0.0185, "step": 3930 }, { "epoch": 2.204812534974818, "grad_norm": 0.3502347469329834, "learning_rate": 9.420760909671118e-05, "loss": 0.0253, "step": 3940 }, { "epoch": 2.2104085058757694, "grad_norm": 0.3213407099246979, "learning_rate": 9.416892370988444e-05, "loss": 0.0221, "step": 3950 }, { "epoch": 2.2160044767767206, "grad_norm": 0.45591723918914795, "learning_rate": 9.413011756690685e-05, "loss": 0.0303, "step": 3960 }, { "epoch": 2.2216004476776723, "grad_norm": 0.5190838575363159, "learning_rate": 9.409119077387294e-05, "loss": 0.0214, "step": 3970 }, { "epoch": 2.2271964185786235, "grad_norm": 0.24658669531345367, "learning_rate": 9.405214343720707e-05, "loss": 0.0169, "step": 3980 }, { "epoch": 2.2327923894795747, "grad_norm": 0.26745668053627014, "learning_rate": 9.401297566366318e-05, "loss": 0.0174, "step": 3990 }, { "epoch": 2.238388360380526, "grad_norm": 0.23573242127895355, "learning_rate": 9.397368756032445e-05, "loss": 0.0166, "step": 4000 }, { "epoch": 2.243984331281477, "grad_norm": 0.38697415590286255, "learning_rate": 9.393427923460308e-05, "loss": 0.0175, "step": 4010 }, { "epoch": 2.2495803021824288, "grad_norm": 0.26302671432495117, "learning_rate": 9.389475079423988e-05, "loss": 0.016, "step": 4020 }, { "epoch": 2.25517627308338, "grad_norm": 0.520627498626709, "learning_rate": 9.385510234730415e-05, "loss": 0.0196, "step": 4030 }, { "epoch": 2.260772243984331, "grad_norm": 0.3094232976436615, "learning_rate": 9.381533400219318e-05, "loss": 0.0197, "step": 4040 }, { "epoch": 2.266368214885283, "grad_norm": 0.3238268196582794, "learning_rate": 9.377544586763215e-05, "loss": 0.0242, "step": 4050 }, { "epoch": 2.271964185786234, "grad_norm": 0.37398698925971985, "learning_rate": 9.373543805267368e-05, "loss": 0.0225, "step": 4060 }, { "epoch": 2.2775601566871853, "grad_norm": 0.22411245107650757, "learning_rate": 9.369531066669758e-05, "loss": 0.0259, "step": 4070 }, { "epoch": 2.2831561275881365, "grad_norm": 0.2310367226600647, "learning_rate": 9.365506381941066e-05, "loss": 0.0198, "step": 4080 }, { "epoch": 2.2887520984890877, "grad_norm": 0.4910151958465576, "learning_rate": 9.36146976208462e-05, "loss": 0.0234, "step": 4090 }, { "epoch": 2.2943480693900393, "grad_norm": 0.2820461392402649, "learning_rate": 9.357421218136386e-05, "loss": 0.0176, "step": 4100 }, { "epoch": 2.2999440402909905, "grad_norm": 0.22990214824676514, "learning_rate": 9.353360761164931e-05, "loss": 0.0185, "step": 4110 }, { "epoch": 2.3055400111919417, "grad_norm": 0.33790138363838196, "learning_rate": 9.349288402271388e-05, "loss": 0.0178, "step": 4120 }, { "epoch": 2.311135982092893, "grad_norm": 0.3388676345348358, "learning_rate": 9.345204152589428e-05, "loss": 0.0147, "step": 4130 }, { "epoch": 2.3167319529938446, "grad_norm": 0.36007586121559143, "learning_rate": 9.341108023285238e-05, "loss": 0.0185, "step": 4140 }, { "epoch": 2.322327923894796, "grad_norm": 0.41096752882003784, "learning_rate": 9.337000025557476e-05, "loss": 0.0219, "step": 4150 }, { "epoch": 2.327923894795747, "grad_norm": 0.2878301441669464, "learning_rate": 9.332880170637252e-05, "loss": 0.0159, "step": 4160 }, { "epoch": 2.3335198656966982, "grad_norm": 0.32061803340911865, "learning_rate": 9.328748469788093e-05, "loss": 0.0216, "step": 4170 }, { "epoch": 2.33911583659765, "grad_norm": 0.29178762435913086, "learning_rate": 9.32460493430591e-05, "loss": 0.0178, "step": 4180 }, { "epoch": 2.344711807498601, "grad_norm": 0.32889455556869507, "learning_rate": 9.320449575518972e-05, "loss": 0.0194, "step": 4190 }, { "epoch": 2.3503077783995523, "grad_norm": 0.2980196475982666, "learning_rate": 9.316282404787871e-05, "loss": 0.015, "step": 4200 }, { "epoch": 2.3559037493005035, "grad_norm": 0.21256855130195618, "learning_rate": 9.31210343350549e-05, "loss": 0.0151, "step": 4210 }, { "epoch": 2.361499720201455, "grad_norm": 0.2378161996603012, "learning_rate": 9.30791267309698e-05, "loss": 0.0179, "step": 4220 }, { "epoch": 2.3670956911024064, "grad_norm": 0.211124449968338, "learning_rate": 9.30371013501972e-05, "loss": 0.0147, "step": 4230 }, { "epoch": 2.3726916620033576, "grad_norm": 0.3496321439743042, "learning_rate": 9.299495830763286e-05, "loss": 0.0144, "step": 4240 }, { "epoch": 2.378287632904309, "grad_norm": 0.2865016758441925, "learning_rate": 9.295269771849427e-05, "loss": 0.0209, "step": 4250 }, { "epoch": 2.38388360380526, "grad_norm": 0.22519885003566742, "learning_rate": 9.291031969832026e-05, "loss": 0.0177, "step": 4260 }, { "epoch": 2.3894795747062116, "grad_norm": 0.41060182452201843, "learning_rate": 9.286782436297073e-05, "loss": 0.0169, "step": 4270 }, { "epoch": 2.395075545607163, "grad_norm": 0.6265867352485657, "learning_rate": 9.282521182862629e-05, "loss": 0.0189, "step": 4280 }, { "epoch": 2.400671516508114, "grad_norm": 0.3811153173446655, "learning_rate": 9.278248221178798e-05, "loss": 0.0274, "step": 4290 }, { "epoch": 2.4062674874090657, "grad_norm": 0.2686716318130493, "learning_rate": 9.273963562927695e-05, "loss": 0.0198, "step": 4300 }, { "epoch": 2.411863458310017, "grad_norm": 0.31025633215904236, "learning_rate": 9.269667219823412e-05, "loss": 0.0159, "step": 4310 }, { "epoch": 2.417459429210968, "grad_norm": 0.23998180031776428, "learning_rate": 9.265359203611987e-05, "loss": 0.018, "step": 4320 }, { "epoch": 2.4230554001119193, "grad_norm": 0.45635882019996643, "learning_rate": 9.261039526071374e-05, "loss": 0.0199, "step": 4330 }, { "epoch": 2.4286513710128705, "grad_norm": 0.34626588225364685, "learning_rate": 9.256708199011401e-05, "loss": 0.0169, "step": 4340 }, { "epoch": 2.434247341913822, "grad_norm": 0.27278828620910645, "learning_rate": 9.252365234273755e-05, "loss": 0.0173, "step": 4350 }, { "epoch": 2.4398433128147734, "grad_norm": 0.5236303806304932, "learning_rate": 9.248010643731935e-05, "loss": 0.0226, "step": 4360 }, { "epoch": 2.4454392837157246, "grad_norm": 0.27782773971557617, "learning_rate": 9.243644439291223e-05, "loss": 0.0194, "step": 4370 }, { "epoch": 2.451035254616676, "grad_norm": 0.280048131942749, "learning_rate": 9.239266632888659e-05, "loss": 0.0174, "step": 4380 }, { "epoch": 2.4566312255176275, "grad_norm": 0.3045734763145447, "learning_rate": 9.234877236492997e-05, "loss": 0.0148, "step": 4390 }, { "epoch": 2.4622271964185787, "grad_norm": 0.1700965315103531, "learning_rate": 9.230476262104677e-05, "loss": 0.0155, "step": 4400 }, { "epoch": 2.46782316731953, "grad_norm": 0.3037347197532654, "learning_rate": 9.226063721755799e-05, "loss": 0.0132, "step": 4410 }, { "epoch": 2.473419138220481, "grad_norm": 0.29750266671180725, "learning_rate": 9.221639627510076e-05, "loss": 0.0149, "step": 4420 }, { "epoch": 2.4790151091214327, "grad_norm": 0.1919635832309723, "learning_rate": 9.217203991462815e-05, "loss": 0.015, "step": 4430 }, { "epoch": 2.484611080022384, "grad_norm": 0.2919257879257202, "learning_rate": 9.212756825740873e-05, "loss": 0.0177, "step": 4440 }, { "epoch": 2.490207050923335, "grad_norm": 0.17676684260368347, "learning_rate": 9.208298142502636e-05, "loss": 0.0175, "step": 4450 }, { "epoch": 2.4958030218242864, "grad_norm": 0.24397723376750946, "learning_rate": 9.20382795393797e-05, "loss": 0.0179, "step": 4460 }, { "epoch": 2.501398992725238, "grad_norm": 0.32645362615585327, "learning_rate": 9.199346272268199e-05, "loss": 0.0179, "step": 4470 }, { "epoch": 2.5069949636261892, "grad_norm": 0.35162001848220825, "learning_rate": 9.194853109746074e-05, "loss": 0.0174, "step": 4480 }, { "epoch": 2.5125909345271404, "grad_norm": 0.4019016623497009, "learning_rate": 9.190348478655724e-05, "loss": 0.015, "step": 4490 }, { "epoch": 2.5181869054280916, "grad_norm": 0.4017965495586395, "learning_rate": 9.185832391312644e-05, "loss": 0.0238, "step": 4500 }, { "epoch": 2.523782876329043, "grad_norm": 0.41645774245262146, "learning_rate": 9.18130486006364e-05, "loss": 0.0143, "step": 4510 }, { "epoch": 2.5293788472299945, "grad_norm": 0.28400033712387085, "learning_rate": 9.176765897286813e-05, "loss": 0.0196, "step": 4520 }, { "epoch": 2.5349748181309457, "grad_norm": 0.4045359492301941, "learning_rate": 9.17221551539151e-05, "loss": 0.0191, "step": 4530 }, { "epoch": 2.540570789031897, "grad_norm": 0.37660202383995056, "learning_rate": 9.167653726818305e-05, "loss": 0.0138, "step": 4540 }, { "epoch": 2.5461667599328486, "grad_norm": 0.35835906863212585, "learning_rate": 9.163080544038952e-05, "loss": 0.0213, "step": 4550 }, { "epoch": 2.5517627308338, "grad_norm": 0.3906223177909851, "learning_rate": 9.158495979556358e-05, "loss": 0.0204, "step": 4560 }, { "epoch": 2.557358701734751, "grad_norm": 0.23904386162757874, "learning_rate": 9.153900045904549e-05, "loss": 0.0193, "step": 4570 }, { "epoch": 2.562954672635702, "grad_norm": 0.3690219521522522, "learning_rate": 9.14929275564863e-05, "loss": 0.0218, "step": 4580 }, { "epoch": 2.5685506435366534, "grad_norm": 0.3098298907279968, "learning_rate": 9.144674121384757e-05, "loss": 0.0142, "step": 4590 }, { "epoch": 2.574146614437605, "grad_norm": 0.5726227164268494, "learning_rate": 9.140044155740101e-05, "loss": 0.0168, "step": 4600 }, { "epoch": 2.5797425853385563, "grad_norm": 0.32549935579299927, "learning_rate": 9.135402871372808e-05, "loss": 0.0228, "step": 4610 }, { "epoch": 2.5853385562395075, "grad_norm": 0.35607558488845825, "learning_rate": 9.130750280971978e-05, "loss": 0.0234, "step": 4620 }, { "epoch": 2.590934527140459, "grad_norm": 0.31833362579345703, "learning_rate": 9.126086397257612e-05, "loss": 0.0134, "step": 4630 }, { "epoch": 2.5965304980414103, "grad_norm": 0.5075991749763489, "learning_rate": 9.121411232980588e-05, "loss": 0.0181, "step": 4640 }, { "epoch": 2.6021264689423615, "grad_norm": 0.2868656814098358, "learning_rate": 9.116724800922629e-05, "loss": 0.0216, "step": 4650 }, { "epoch": 2.6077224398433128, "grad_norm": 0.38551998138427734, "learning_rate": 9.112027113896262e-05, "loss": 0.0218, "step": 4660 }, { "epoch": 2.613318410744264, "grad_norm": 0.3080727756023407, "learning_rate": 9.107318184744781e-05, "loss": 0.0263, "step": 4670 }, { "epoch": 2.618914381645215, "grad_norm": 0.2743169665336609, "learning_rate": 9.102598026342222e-05, "loss": 0.0143, "step": 4680 }, { "epoch": 2.624510352546167, "grad_norm": 0.286101758480072, "learning_rate": 9.097866651593317e-05, "loss": 0.0219, "step": 4690 }, { "epoch": 2.630106323447118, "grad_norm": 0.1881791204214096, "learning_rate": 9.093124073433463e-05, "loss": 0.015, "step": 4700 }, { "epoch": 2.6357022943480692, "grad_norm": 0.3556104004383087, "learning_rate": 9.088370304828685e-05, "loss": 0.0207, "step": 4710 }, { "epoch": 2.641298265249021, "grad_norm": 0.2784225344657898, "learning_rate": 9.083605358775612e-05, "loss": 0.0159, "step": 4720 }, { "epoch": 2.646894236149972, "grad_norm": 0.22262175381183624, "learning_rate": 9.078829248301417e-05, "loss": 0.0162, "step": 4730 }, { "epoch": 2.6524902070509233, "grad_norm": 0.16783557832241058, "learning_rate": 9.074041986463808e-05, "loss": 0.018, "step": 4740 }, { "epoch": 2.6580861779518745, "grad_norm": 0.31983381509780884, "learning_rate": 9.069243586350975e-05, "loss": 0.0168, "step": 4750 }, { "epoch": 2.6636821488528257, "grad_norm": 0.2954675555229187, "learning_rate": 9.064434061081562e-05, "loss": 0.0157, "step": 4760 }, { "epoch": 2.6692781197537774, "grad_norm": 0.37835440039634705, "learning_rate": 9.059613423804623e-05, "loss": 0.016, "step": 4770 }, { "epoch": 2.6748740906547286, "grad_norm": 0.30182933807373047, "learning_rate": 9.0547816876996e-05, "loss": 0.0223, "step": 4780 }, { "epoch": 2.68047006155568, "grad_norm": 0.3329738974571228, "learning_rate": 9.049938865976275e-05, "loss": 0.0232, "step": 4790 }, { "epoch": 2.6860660324566314, "grad_norm": 0.2866031527519226, "learning_rate": 9.045084971874738e-05, "loss": 0.0193, "step": 4800 }, { "epoch": 2.6916620033575827, "grad_norm": 0.3558676540851593, "learning_rate": 9.040220018665347e-05, "loss": 0.0181, "step": 4810 }, { "epoch": 2.697257974258534, "grad_norm": 0.22001361846923828, "learning_rate": 9.035344019648702e-05, "loss": 0.0124, "step": 4820 }, { "epoch": 2.702853945159485, "grad_norm": 0.28986766934394836, "learning_rate": 9.030456988155596e-05, "loss": 0.0179, "step": 4830 }, { "epoch": 2.7084499160604363, "grad_norm": 0.3889327347278595, "learning_rate": 9.025558937546988e-05, "loss": 0.0186, "step": 4840 }, { "epoch": 2.714045886961388, "grad_norm": 0.33833345770835876, "learning_rate": 9.020649881213958e-05, "loss": 0.0161, "step": 4850 }, { "epoch": 2.719641857862339, "grad_norm": 0.23896977305412292, "learning_rate": 9.015729832577681e-05, "loss": 0.0149, "step": 4860 }, { "epoch": 2.7252378287632903, "grad_norm": 0.44981443881988525, "learning_rate": 9.010798805089384e-05, "loss": 0.0221, "step": 4870 }, { "epoch": 2.730833799664242, "grad_norm": 0.4389462471008301, "learning_rate": 9.005856812230304e-05, "loss": 0.0175, "step": 4880 }, { "epoch": 2.736429770565193, "grad_norm": 0.2757073640823364, "learning_rate": 9.000903867511666e-05, "loss": 0.0176, "step": 4890 }, { "epoch": 2.7420257414661444, "grad_norm": 0.2381424754858017, "learning_rate": 8.995939984474624e-05, "loss": 0.0145, "step": 4900 }, { "epoch": 2.7476217123670956, "grad_norm": 0.25083616375923157, "learning_rate": 8.990965176690252e-05, "loss": 0.0184, "step": 4910 }, { "epoch": 2.753217683268047, "grad_norm": 0.3651309013366699, "learning_rate": 8.98597945775948e-05, "loss": 0.0201, "step": 4920 }, { "epoch": 2.7588136541689985, "grad_norm": 0.19562850892543793, "learning_rate": 8.980982841313074e-05, "loss": 0.0158, "step": 4930 }, { "epoch": 2.7644096250699497, "grad_norm": 0.646306037902832, "learning_rate": 8.975975341011596e-05, "loss": 0.0172, "step": 4940 }, { "epoch": 2.770005595970901, "grad_norm": 0.5771059393882751, "learning_rate": 8.970956970545355e-05, "loss": 0.0181, "step": 4950 }, { "epoch": 2.775601566871852, "grad_norm": 0.2918018400669098, "learning_rate": 8.965927743634391e-05, "loss": 0.0199, "step": 4960 }, { "epoch": 2.7811975377728038, "grad_norm": 0.5034765601158142, "learning_rate": 8.96088767402841e-05, "loss": 0.0172, "step": 4970 }, { "epoch": 2.786793508673755, "grad_norm": 0.29646632075309753, "learning_rate": 8.955836775506776e-05, "loss": 0.0147, "step": 4980 }, { "epoch": 2.792389479574706, "grad_norm": 0.2613969147205353, "learning_rate": 8.950775061878453e-05, "loss": 0.0164, "step": 4990 }, { "epoch": 2.7979854504756574, "grad_norm": 0.27573442459106445, "learning_rate": 8.945702546981969e-05, "loss": 0.018, "step": 5000 }, { "epoch": 2.8035814213766086, "grad_norm": 0.33170339465141296, "learning_rate": 8.940619244685388e-05, "loss": 0.019, "step": 5010 }, { "epoch": 2.8091773922775602, "grad_norm": 0.2994827628135681, "learning_rate": 8.935525168886262e-05, "loss": 0.019, "step": 5020 }, { "epoch": 2.8147733631785115, "grad_norm": 0.3199397921562195, "learning_rate": 8.930420333511606e-05, "loss": 0.0172, "step": 5030 }, { "epoch": 2.8203693340794627, "grad_norm": 0.24537423253059387, "learning_rate": 8.92530475251784e-05, "loss": 0.0146, "step": 5040 }, { "epoch": 2.8259653049804143, "grad_norm": 0.24761222302913666, "learning_rate": 8.920178439890765e-05, "loss": 0.0194, "step": 5050 }, { "epoch": 2.8315612758813655, "grad_norm": 0.2208421230316162, "learning_rate": 8.91504140964553e-05, "loss": 0.0123, "step": 5060 }, { "epoch": 2.8371572467823167, "grad_norm": 0.3568471074104309, "learning_rate": 8.909893675826574e-05, "loss": 0.0147, "step": 5070 }, { "epoch": 2.842753217683268, "grad_norm": 0.24207855761051178, "learning_rate": 8.90473525250761e-05, "loss": 0.0166, "step": 5080 }, { "epoch": 2.848349188584219, "grad_norm": 0.47056907415390015, "learning_rate": 8.899566153791566e-05, "loss": 0.0234, "step": 5090 }, { "epoch": 2.853945159485171, "grad_norm": 0.26351991295814514, "learning_rate": 8.894386393810563e-05, "loss": 0.0212, "step": 5100 }, { "epoch": 2.859541130386122, "grad_norm": 0.2002822756767273, "learning_rate": 8.889195986725865e-05, "loss": 0.0191, "step": 5110 }, { "epoch": 2.865137101287073, "grad_norm": 0.28489527106285095, "learning_rate": 8.883994946727849e-05, "loss": 0.0155, "step": 5120 }, { "epoch": 2.870733072188025, "grad_norm": 0.30861204862594604, "learning_rate": 8.878783288035957e-05, "loss": 0.0158, "step": 5130 }, { "epoch": 2.876329043088976, "grad_norm": 0.2856840193271637, "learning_rate": 8.873561024898668e-05, "loss": 0.0201, "step": 5140 }, { "epoch": 2.8819250139899273, "grad_norm": 0.3461334705352783, "learning_rate": 8.868328171593448e-05, "loss": 0.0184, "step": 5150 }, { "epoch": 2.8875209848908785, "grad_norm": 0.22160184383392334, "learning_rate": 8.863084742426719e-05, "loss": 0.0171, "step": 5160 }, { "epoch": 2.8931169557918297, "grad_norm": 0.2488642781972885, "learning_rate": 8.857830751733815e-05, "loss": 0.0153, "step": 5170 }, { "epoch": 2.8987129266927814, "grad_norm": 0.33482569456100464, "learning_rate": 8.852566213878947e-05, "loss": 0.0189, "step": 5180 }, { "epoch": 2.9043088975937326, "grad_norm": 0.2865656316280365, "learning_rate": 8.84729114325516e-05, "loss": 0.0168, "step": 5190 }, { "epoch": 2.9099048684946838, "grad_norm": 0.3801150321960449, "learning_rate": 8.842005554284296e-05, "loss": 0.0149, "step": 5200 }, { "epoch": 2.915500839395635, "grad_norm": 0.24389003217220306, "learning_rate": 8.836709461416952e-05, "loss": 0.0176, "step": 5210 }, { "epoch": 2.9210968102965866, "grad_norm": 0.4815085828304291, "learning_rate": 8.831402879132446e-05, "loss": 0.014, "step": 5220 }, { "epoch": 2.926692781197538, "grad_norm": 0.2196839153766632, "learning_rate": 8.82608582193877e-05, "loss": 0.0174, "step": 5230 }, { "epoch": 2.932288752098489, "grad_norm": 0.30073830485343933, "learning_rate": 8.820758304372557e-05, "loss": 0.0168, "step": 5240 }, { "epoch": 2.9378847229994403, "grad_norm": 0.21486796438694, "learning_rate": 8.815420340999033e-05, "loss": 0.0128, "step": 5250 }, { "epoch": 2.9434806939003915, "grad_norm": 0.31880220770835876, "learning_rate": 8.810071946411989e-05, "loss": 0.0209, "step": 5260 }, { "epoch": 2.949076664801343, "grad_norm": 0.20475736260414124, "learning_rate": 8.804713135233731e-05, "loss": 0.0152, "step": 5270 }, { "epoch": 2.9546726357022943, "grad_norm": 0.19735224545001984, "learning_rate": 8.799343922115044e-05, "loss": 0.0104, "step": 5280 }, { "epoch": 2.9602686066032455, "grad_norm": 0.17013341188430786, "learning_rate": 8.79396432173515e-05, "loss": 0.0129, "step": 5290 }, { "epoch": 2.965864577504197, "grad_norm": 0.38702845573425293, "learning_rate": 8.788574348801675e-05, "loss": 0.0239, "step": 5300 }, { "epoch": 2.9714605484051484, "grad_norm": 0.34306514263153076, "learning_rate": 8.783174018050594e-05, "loss": 0.03, "step": 5310 }, { "epoch": 2.9770565193060996, "grad_norm": 0.26854732632637024, "learning_rate": 8.77776334424621e-05, "loss": 0.019, "step": 5320 }, { "epoch": 2.982652490207051, "grad_norm": 0.28458869457244873, "learning_rate": 8.772342342181095e-05, "loss": 0.0213, "step": 5330 }, { "epoch": 2.988248461108002, "grad_norm": 0.28708454966545105, "learning_rate": 8.766911026676064e-05, "loss": 0.0173, "step": 5340 }, { "epoch": 2.9938444320089537, "grad_norm": 0.35600361227989197, "learning_rate": 8.761469412580125e-05, "loss": 0.0179, "step": 5350 }, { "epoch": 2.999440402909905, "grad_norm": 0.29637375473976135, "learning_rate": 8.756017514770443e-05, "loss": 0.0223, "step": 5360 }, { "epoch": 3.005036373810856, "grad_norm": 0.39075925946235657, "learning_rate": 8.750555348152298e-05, "loss": 0.0148, "step": 5370 }, { "epoch": 3.0106323447118073, "grad_norm": 0.3552566468715668, "learning_rate": 8.745082927659047e-05, "loss": 0.0187, "step": 5380 }, { "epoch": 3.016228315612759, "grad_norm": 0.2608230710029602, "learning_rate": 8.739600268252078e-05, "loss": 0.0205, "step": 5390 }, { "epoch": 3.02182428651371, "grad_norm": 0.2771034240722656, "learning_rate": 8.73410738492077e-05, "loss": 0.0187, "step": 5400 }, { "epoch": 3.0274202574146614, "grad_norm": 0.2750489413738251, "learning_rate": 8.728604292682459e-05, "loss": 0.0161, "step": 5410 }, { "epoch": 3.0330162283156126, "grad_norm": 0.3373420834541321, "learning_rate": 8.723091006582389e-05, "loss": 0.0193, "step": 5420 }, { "epoch": 3.0386121992165642, "grad_norm": 0.27592456340789795, "learning_rate": 8.717567541693673e-05, "loss": 0.0171, "step": 5430 }, { "epoch": 3.0442081701175154, "grad_norm": 0.3381069004535675, "learning_rate": 8.71203391311725e-05, "loss": 0.0185, "step": 5440 }, { "epoch": 3.0498041410184666, "grad_norm": 0.342650830745697, "learning_rate": 8.706490135981855e-05, "loss": 0.0223, "step": 5450 }, { "epoch": 3.055400111919418, "grad_norm": 0.2777611017227173, "learning_rate": 8.700936225443959e-05, "loss": 0.0135, "step": 5460 }, { "epoch": 3.0609960828203695, "grad_norm": 0.26987946033477783, "learning_rate": 8.695372196687743e-05, "loss": 0.0182, "step": 5470 }, { "epoch": 3.0665920537213207, "grad_norm": 0.24877256155014038, "learning_rate": 8.689798064925049e-05, "loss": 0.015, "step": 5480 }, { "epoch": 3.072188024622272, "grad_norm": 0.31654706597328186, "learning_rate": 8.684213845395339e-05, "loss": 0.0142, "step": 5490 }, { "epoch": 3.077783995523223, "grad_norm": 0.22976505756378174, "learning_rate": 8.678619553365659e-05, "loss": 0.0119, "step": 5500 }, { "epoch": 3.083379966424175, "grad_norm": 0.3443313241004944, "learning_rate": 8.673015204130586e-05, "loss": 0.0138, "step": 5510 }, { "epoch": 3.088975937325126, "grad_norm": 0.34815511107444763, "learning_rate": 8.6674008130122e-05, "loss": 0.0127, "step": 5520 }, { "epoch": 3.094571908226077, "grad_norm": 0.392868310213089, "learning_rate": 8.661776395360029e-05, "loss": 0.0148, "step": 5530 }, { "epoch": 3.1001678791270284, "grad_norm": 0.15690505504608154, "learning_rate": 8.656141966551019e-05, "loss": 0.0158, "step": 5540 }, { "epoch": 3.10576385002798, "grad_norm": 0.2958482503890991, "learning_rate": 8.650497541989482e-05, "loss": 0.015, "step": 5550 }, { "epoch": 3.1113598209289313, "grad_norm": 0.34652698040008545, "learning_rate": 8.644843137107059e-05, "loss": 0.0186, "step": 5560 }, { "epoch": 3.1169557918298825, "grad_norm": 0.2787473201751709, "learning_rate": 8.639178767362676e-05, "loss": 0.0171, "step": 5570 }, { "epoch": 3.1225517627308337, "grad_norm": 0.28770115971565247, "learning_rate": 8.633504448242505e-05, "loss": 0.0088, "step": 5580 }, { "epoch": 3.128147733631785, "grad_norm": 0.16269604861736298, "learning_rate": 8.627820195259918e-05, "loss": 0.0144, "step": 5590 }, { "epoch": 3.1337437045327365, "grad_norm": 0.2170538753271103, "learning_rate": 8.622126023955446e-05, "loss": 0.0145, "step": 5600 }, { "epoch": 3.1393396754336877, "grad_norm": 0.1933916211128235, "learning_rate": 8.616421949896734e-05, "loss": 0.0145, "step": 5610 }, { "epoch": 3.144935646334639, "grad_norm": 0.28321388363838196, "learning_rate": 8.610707988678503e-05, "loss": 0.0171, "step": 5620 }, { "epoch": 3.1505316172355906, "grad_norm": 0.1729007363319397, "learning_rate": 8.604984155922506e-05, "loss": 0.0103, "step": 5630 }, { "epoch": 3.156127588136542, "grad_norm": 0.41079893708229065, "learning_rate": 8.599250467277483e-05, "loss": 0.0159, "step": 5640 }, { "epoch": 3.161723559037493, "grad_norm": 0.4628431797027588, "learning_rate": 8.59350693841912e-05, "loss": 0.0184, "step": 5650 }, { "epoch": 3.1673195299384442, "grad_norm": 0.30907726287841797, "learning_rate": 8.587753585050004e-05, "loss": 0.0183, "step": 5660 }, { "epoch": 3.1729155008393954, "grad_norm": 0.19282157719135284, "learning_rate": 8.581990422899585e-05, "loss": 0.0127, "step": 5670 }, { "epoch": 3.178511471740347, "grad_norm": 0.27166658639907837, "learning_rate": 8.576217467724128e-05, "loss": 0.023, "step": 5680 }, { "epoch": 3.1841074426412983, "grad_norm": 0.3486577272415161, "learning_rate": 8.570434735306671e-05, "loss": 0.0108, "step": 5690 }, { "epoch": 3.1897034135422495, "grad_norm": 0.295238733291626, "learning_rate": 8.564642241456986e-05, "loss": 0.0181, "step": 5700 }, { "epoch": 3.1952993844432007, "grad_norm": 0.20616333186626434, "learning_rate": 8.558840002011528e-05, "loss": 0.0202, "step": 5710 }, { "epoch": 3.2008953553441524, "grad_norm": 0.12979304790496826, "learning_rate": 8.553028032833397e-05, "loss": 0.0125, "step": 5720 }, { "epoch": 3.2064913262451036, "grad_norm": 0.23997394740581512, "learning_rate": 8.547206349812298e-05, "loss": 0.0159, "step": 5730 }, { "epoch": 3.212087297146055, "grad_norm": 0.2359701246023178, "learning_rate": 8.541374968864487e-05, "loss": 0.0136, "step": 5740 }, { "epoch": 3.217683268047006, "grad_norm": 0.25309842824935913, "learning_rate": 8.535533905932738e-05, "loss": 0.0154, "step": 5750 }, { "epoch": 3.2232792389479576, "grad_norm": 0.26648661494255066, "learning_rate": 8.529683176986295e-05, "loss": 0.0132, "step": 5760 }, { "epoch": 3.228875209848909, "grad_norm": 0.32268235087394714, "learning_rate": 8.523822798020827e-05, "loss": 0.0133, "step": 5770 }, { "epoch": 3.23447118074986, "grad_norm": 0.2632688283920288, "learning_rate": 8.517952785058385e-05, "loss": 0.017, "step": 5780 }, { "epoch": 3.2400671516508113, "grad_norm": 0.16985219717025757, "learning_rate": 8.512073154147362e-05, "loss": 0.0143, "step": 5790 }, { "epoch": 3.245663122551763, "grad_norm": 0.23951981961727142, "learning_rate": 8.506183921362443e-05, "loss": 0.0157, "step": 5800 }, { "epoch": 3.251259093452714, "grad_norm": 0.36843812465667725, "learning_rate": 8.500285102804568e-05, "loss": 0.0198, "step": 5810 }, { "epoch": 3.2568550643536653, "grad_norm": 0.27591267228126526, "learning_rate": 8.494376714600878e-05, "loss": 0.0246, "step": 5820 }, { "epoch": 3.2624510352546165, "grad_norm": 0.3020281195640564, "learning_rate": 8.488458772904684e-05, "loss": 0.018, "step": 5830 }, { "epoch": 3.2680470061555678, "grad_norm": 0.20429036021232605, "learning_rate": 8.482531293895412e-05, "loss": 0.0154, "step": 5840 }, { "epoch": 3.2736429770565194, "grad_norm": 0.3011918067932129, "learning_rate": 8.476594293778561e-05, "loss": 0.0181, "step": 5850 }, { "epoch": 3.2792389479574706, "grad_norm": 0.20082388818264008, "learning_rate": 8.470647788785665e-05, "loss": 0.0118, "step": 5860 }, { "epoch": 3.284834918858422, "grad_norm": 0.25404563546180725, "learning_rate": 8.46469179517424e-05, "loss": 0.0122, "step": 5870 }, { "epoch": 3.2904308897593735, "grad_norm": 0.17162342369556427, "learning_rate": 8.458726329227747e-05, "loss": 0.0178, "step": 5880 }, { "epoch": 3.2960268606603247, "grad_norm": 0.2713855803012848, "learning_rate": 8.452751407255541e-05, "loss": 0.0127, "step": 5890 }, { "epoch": 3.301622831561276, "grad_norm": 0.25792196393013, "learning_rate": 8.44676704559283e-05, "loss": 0.0151, "step": 5900 }, { "epoch": 3.307218802462227, "grad_norm": 0.24708054959774017, "learning_rate": 8.44077326060063e-05, "loss": 0.0205, "step": 5910 }, { "epoch": 3.3128147733631783, "grad_norm": 0.22907878458499908, "learning_rate": 8.434770068665723e-05, "loss": 0.0196, "step": 5920 }, { "epoch": 3.31841074426413, "grad_norm": 0.42451682686805725, "learning_rate": 8.428757486200603e-05, "loss": 0.0181, "step": 5930 }, { "epoch": 3.324006715165081, "grad_norm": 0.2787477970123291, "learning_rate": 8.422735529643444e-05, "loss": 0.0163, "step": 5940 }, { "epoch": 3.3296026860660324, "grad_norm": 0.2536604404449463, "learning_rate": 8.416704215458043e-05, "loss": 0.0153, "step": 5950 }, { "epoch": 3.3351986569669836, "grad_norm": 0.27685803174972534, "learning_rate": 8.410663560133784e-05, "loss": 0.0171, "step": 5960 }, { "epoch": 3.3407946278679352, "grad_norm": 0.21129871904850006, "learning_rate": 8.404613580185585e-05, "loss": 0.0146, "step": 5970 }, { "epoch": 3.3463905987688864, "grad_norm": 0.2712884247303009, "learning_rate": 8.398554292153866e-05, "loss": 0.0124, "step": 5980 }, { "epoch": 3.3519865696698377, "grad_norm": 0.28807780146598816, "learning_rate": 8.392485712604483e-05, "loss": 0.0151, "step": 5990 }, { "epoch": 3.357582540570789, "grad_norm": 0.24215184152126312, "learning_rate": 8.386407858128706e-05, "loss": 0.0201, "step": 6000 }, { "epoch": 3.3631785114717405, "grad_norm": 0.3111182451248169, "learning_rate": 8.380320745343153e-05, "loss": 0.0148, "step": 6010 }, { "epoch": 3.3687744823726917, "grad_norm": 0.3122502267360687, "learning_rate": 8.37422439088976e-05, "loss": 0.0138, "step": 6020 }, { "epoch": 3.374370453273643, "grad_norm": 0.23829977214336395, "learning_rate": 8.368118811435726e-05, "loss": 0.0172, "step": 6030 }, { "epoch": 3.379966424174594, "grad_norm": 0.22568489611148834, "learning_rate": 8.362004023673474e-05, "loss": 0.0191, "step": 6040 }, { "epoch": 3.385562395075546, "grad_norm": 0.37260109186172485, "learning_rate": 8.355880044320598e-05, "loss": 0.0146, "step": 6050 }, { "epoch": 3.391158365976497, "grad_norm": 0.36467012763023376, "learning_rate": 8.349746890119826e-05, "loss": 0.0144, "step": 6060 }, { "epoch": 3.396754336877448, "grad_norm": 0.28992265462875366, "learning_rate": 8.343604577838964e-05, "loss": 0.014, "step": 6070 }, { "epoch": 3.4023503077783994, "grad_norm": 0.3018409311771393, "learning_rate": 8.337453124270863e-05, "loss": 0.0126, "step": 6080 }, { "epoch": 3.4079462786793506, "grad_norm": 0.31771036982536316, "learning_rate": 8.331292546233362e-05, "loss": 0.0124, "step": 6090 }, { "epoch": 3.4135422495803023, "grad_norm": 0.2008838802576065, "learning_rate": 8.32512286056924e-05, "loss": 0.0181, "step": 6100 }, { "epoch": 3.4191382204812535, "grad_norm": 0.3000880777835846, "learning_rate": 8.318944084146192e-05, "loss": 0.0178, "step": 6110 }, { "epoch": 3.4247341913822047, "grad_norm": 0.201462984085083, "learning_rate": 8.31275623385675e-05, "loss": 0.0121, "step": 6120 }, { "epoch": 3.4303301622831563, "grad_norm": 0.29394298791885376, "learning_rate": 8.306559326618259e-05, "loss": 0.019, "step": 6130 }, { "epoch": 3.4359261331841076, "grad_norm": 0.20683641731739044, "learning_rate": 8.300353379372834e-05, "loss": 0.0157, "step": 6140 }, { "epoch": 3.4415221040850588, "grad_norm": 0.2323373705148697, "learning_rate": 8.29413840908729e-05, "loss": 0.0132, "step": 6150 }, { "epoch": 3.44711807498601, "grad_norm": 0.28800690174102783, "learning_rate": 8.287914432753123e-05, "loss": 0.0149, "step": 6160 }, { "epoch": 3.452714045886961, "grad_norm": 0.24825571477413177, "learning_rate": 8.281681467386446e-05, "loss": 0.0143, "step": 6170 }, { "epoch": 3.458310016787913, "grad_norm": 0.26586174964904785, "learning_rate": 8.275439530027948e-05, "loss": 0.0193, "step": 6180 }, { "epoch": 3.463905987688864, "grad_norm": 0.384670615196228, "learning_rate": 8.269188637742846e-05, "loss": 0.0135, "step": 6190 }, { "epoch": 3.4695019585898152, "grad_norm": 0.2598379850387573, "learning_rate": 8.262928807620843e-05, "loss": 0.0192, "step": 6200 }, { "epoch": 3.4750979294907665, "grad_norm": 0.26824334263801575, "learning_rate": 8.256660056776076e-05, "loss": 0.017, "step": 6210 }, { "epoch": 3.480693900391718, "grad_norm": 0.29601970314979553, "learning_rate": 8.250382402347065e-05, "loss": 0.0236, "step": 6220 }, { "epoch": 3.4862898712926693, "grad_norm": 0.2569962739944458, "learning_rate": 8.244095861496686e-05, "loss": 0.0148, "step": 6230 }, { "epoch": 3.4918858421936205, "grad_norm": 0.18870459496974945, "learning_rate": 8.237800451412095e-05, "loss": 0.0166, "step": 6240 }, { "epoch": 3.4974818130945717, "grad_norm": 0.20874905586242676, "learning_rate": 8.231496189304704e-05, "loss": 0.012, "step": 6250 }, { "epoch": 3.5030777839955234, "grad_norm": 0.456989586353302, "learning_rate": 8.225183092410128e-05, "loss": 0.0174, "step": 6260 }, { "epoch": 3.5086737548964746, "grad_norm": 0.3724716305732727, "learning_rate": 8.218861177988129e-05, "loss": 0.0164, "step": 6270 }, { "epoch": 3.514269725797426, "grad_norm": 0.2510260343551636, "learning_rate": 8.212530463322583e-05, "loss": 0.014, "step": 6280 }, { "epoch": 3.519865696698377, "grad_norm": 0.17292679846286774, "learning_rate": 8.206190965721419e-05, "loss": 0.0135, "step": 6290 }, { "epoch": 3.5254616675993287, "grad_norm": 0.25856831669807434, "learning_rate": 8.199842702516583e-05, "loss": 0.0159, "step": 6300 }, { "epoch": 3.53105763850028, "grad_norm": 0.26525381207466125, "learning_rate": 8.193485691063985e-05, "loss": 0.0132, "step": 6310 }, { "epoch": 3.536653609401231, "grad_norm": 0.319915235042572, "learning_rate": 8.18711994874345e-05, "loss": 0.0113, "step": 6320 }, { "epoch": 3.5422495803021823, "grad_norm": 0.23749981820583344, "learning_rate": 8.180745492958674e-05, "loss": 0.0145, "step": 6330 }, { "epoch": 3.5478455512031335, "grad_norm": 0.25086531043052673, "learning_rate": 8.174362341137177e-05, "loss": 0.0165, "step": 6340 }, { "epoch": 3.553441522104085, "grad_norm": 0.19675312936306, "learning_rate": 8.167970510730253e-05, "loss": 0.0155, "step": 6350 }, { "epoch": 3.5590374930050364, "grad_norm": 0.2085702270269394, "learning_rate": 8.161570019212921e-05, "loss": 0.0155, "step": 6360 }, { "epoch": 3.5646334639059876, "grad_norm": 0.4404468536376953, "learning_rate": 8.155160884083881e-05, "loss": 0.0208, "step": 6370 }, { "epoch": 3.570229434806939, "grad_norm": 0.10625205188989639, "learning_rate": 8.148743122865463e-05, "loss": 0.015, "step": 6380 }, { "epoch": 3.5758254057078904, "grad_norm": 0.34253987669944763, "learning_rate": 8.14231675310358e-05, "loss": 0.0229, "step": 6390 }, { "epoch": 3.5814213766088416, "grad_norm": 0.43956324458122253, "learning_rate": 8.135881792367686e-05, "loss": 0.0181, "step": 6400 }, { "epoch": 3.587017347509793, "grad_norm": 0.45199209451675415, "learning_rate": 8.129438258250712e-05, "loss": 0.0198, "step": 6410 }, { "epoch": 3.592613318410744, "grad_norm": 0.2245771586894989, "learning_rate": 8.12298616836904e-05, "loss": 0.0141, "step": 6420 }, { "epoch": 3.5982092893116957, "grad_norm": 0.3338348865509033, "learning_rate": 8.116525540362434e-05, "loss": 0.0168, "step": 6430 }, { "epoch": 3.603805260212647, "grad_norm": 0.21632985770702362, "learning_rate": 8.110056391894005e-05, "loss": 0.0117, "step": 6440 }, { "epoch": 3.609401231113598, "grad_norm": 0.2893829643726349, "learning_rate": 8.103578740650156e-05, "loss": 0.0166, "step": 6450 }, { "epoch": 3.6149972020145498, "grad_norm": 0.24873918294906616, "learning_rate": 8.097092604340542e-05, "loss": 0.0139, "step": 6460 }, { "epoch": 3.620593172915501, "grad_norm": 0.31232985854148865, "learning_rate": 8.090598000698009e-05, "loss": 0.0122, "step": 6470 }, { "epoch": 3.626189143816452, "grad_norm": 0.20202654600143433, "learning_rate": 8.084094947478556e-05, "loss": 0.0126, "step": 6480 }, { "epoch": 3.6317851147174034, "grad_norm": 0.339890718460083, "learning_rate": 8.077583462461283e-05, "loss": 0.0107, "step": 6490 }, { "epoch": 3.6373810856183546, "grad_norm": 0.17959007620811462, "learning_rate": 8.07106356344834e-05, "loss": 0.0125, "step": 6500 }, { "epoch": 3.6429770565193063, "grad_norm": 0.21795189380645752, "learning_rate": 8.064535268264883e-05, "loss": 0.0202, "step": 6510 }, { "epoch": 3.6485730274202575, "grad_norm": 0.17131085693836212, "learning_rate": 8.057998594759022e-05, "loss": 0.0197, "step": 6520 }, { "epoch": 3.6541689983212087, "grad_norm": 0.180596724152565, "learning_rate": 8.051453560801772e-05, "loss": 0.0128, "step": 6530 }, { "epoch": 3.65976496922216, "grad_norm": 0.23086079955101013, "learning_rate": 8.044900184287007e-05, "loss": 0.0171, "step": 6540 }, { "epoch": 3.6653609401231115, "grad_norm": 0.40819284319877625, "learning_rate": 8.038338483131407e-05, "loss": 0.0162, "step": 6550 }, { "epoch": 3.6709569110240627, "grad_norm": 0.20544512569904327, "learning_rate": 8.031768475274413e-05, "loss": 0.01, "step": 6560 }, { "epoch": 3.676552881925014, "grad_norm": 0.3116811513900757, "learning_rate": 8.025190178678175e-05, "loss": 0.0183, "step": 6570 }, { "epoch": 3.682148852825965, "grad_norm": 0.3111719787120819, "learning_rate": 8.018603611327504e-05, "loss": 0.015, "step": 6580 }, { "epoch": 3.6877448237269164, "grad_norm": 0.20265722274780273, "learning_rate": 8.012008791229826e-05, "loss": 0.0136, "step": 6590 }, { "epoch": 3.693340794627868, "grad_norm": 0.35717812180519104, "learning_rate": 8.005405736415126e-05, "loss": 0.0098, "step": 6600 }, { "epoch": 3.6989367655288192, "grad_norm": 0.45737767219543457, "learning_rate": 7.998794464935904e-05, "loss": 0.0115, "step": 6610 }, { "epoch": 3.7045327364297704, "grad_norm": 0.3025696873664856, "learning_rate": 7.992174994867123e-05, "loss": 0.0159, "step": 6620 }, { "epoch": 3.710128707330722, "grad_norm": 0.3852231502532959, "learning_rate": 7.985547344306161e-05, "loss": 0.0116, "step": 6630 }, { "epoch": 3.7157246782316733, "grad_norm": 0.23505637049674988, "learning_rate": 7.978911531372765e-05, "loss": 0.012, "step": 6640 }, { "epoch": 3.7213206491326245, "grad_norm": 0.16072528064250946, "learning_rate": 7.972267574208991e-05, "loss": 0.0101, "step": 6650 }, { "epoch": 3.7269166200335757, "grad_norm": 0.2579629719257355, "learning_rate": 7.965615490979163e-05, "loss": 0.0172, "step": 6660 }, { "epoch": 3.732512590934527, "grad_norm": 0.170463427901268, "learning_rate": 7.958955299869825e-05, "loss": 0.0164, "step": 6670 }, { "epoch": 3.7381085618354786, "grad_norm": 0.2048628181219101, "learning_rate": 7.952287019089685e-05, "loss": 0.0095, "step": 6680 }, { "epoch": 3.74370453273643, "grad_norm": 0.1665850281715393, "learning_rate": 7.945610666869568e-05, "loss": 0.0131, "step": 6690 }, { "epoch": 3.749300503637381, "grad_norm": 0.184804305434227, "learning_rate": 7.938926261462366e-05, "loss": 0.0161, "step": 6700 }, { "epoch": 3.7548964745383326, "grad_norm": 0.17109259963035583, "learning_rate": 7.932233821142987e-05, "loss": 0.014, "step": 6710 }, { "epoch": 3.760492445439284, "grad_norm": 0.23285003006458282, "learning_rate": 7.925533364208309e-05, "loss": 0.0106, "step": 6720 }, { "epoch": 3.766088416340235, "grad_norm": 0.21361905336380005, "learning_rate": 7.918824908977123e-05, "loss": 0.0218, "step": 6730 }, { "epoch": 3.7716843872411863, "grad_norm": 0.22354750335216522, "learning_rate": 7.912108473790092e-05, "loss": 0.0203, "step": 6740 }, { "epoch": 3.7772803581421375, "grad_norm": 0.24767528474330902, "learning_rate": 7.905384077009693e-05, "loss": 0.0193, "step": 6750 }, { "epoch": 3.782876329043089, "grad_norm": 0.18995364010334015, "learning_rate": 7.898651737020166e-05, "loss": 0.0162, "step": 6760 }, { "epoch": 3.7884722999440403, "grad_norm": 0.13995826244354248, "learning_rate": 7.891911472227478e-05, "loss": 0.0187, "step": 6770 }, { "epoch": 3.7940682708449915, "grad_norm": 0.2525804340839386, "learning_rate": 7.88516330105925e-05, "loss": 0.0136, "step": 6780 }, { "epoch": 3.799664241745943, "grad_norm": 0.17206352949142456, "learning_rate": 7.878407241964729e-05, "loss": 0.0133, "step": 6790 }, { "epoch": 3.8052602126468944, "grad_norm": 0.17433176934719086, "learning_rate": 7.871643313414718e-05, "loss": 0.0257, "step": 6800 }, { "epoch": 3.8108561835478456, "grad_norm": 0.2698834240436554, "learning_rate": 7.864871533901544e-05, "loss": 0.0141, "step": 6810 }, { "epoch": 3.816452154448797, "grad_norm": 0.2874978482723236, "learning_rate": 7.858091921938988e-05, "loss": 0.0175, "step": 6820 }, { "epoch": 3.822048125349748, "grad_norm": 0.267092227935791, "learning_rate": 7.851304496062254e-05, "loss": 0.0169, "step": 6830 }, { "epoch": 3.8276440962506992, "grad_norm": 0.31751275062561035, "learning_rate": 7.844509274827907e-05, "loss": 0.0175, "step": 6840 }, { "epoch": 3.833240067151651, "grad_norm": 0.30981171131134033, "learning_rate": 7.837706276813819e-05, "loss": 0.0145, "step": 6850 }, { "epoch": 3.838836038052602, "grad_norm": 0.31560707092285156, "learning_rate": 7.830895520619128e-05, "loss": 0.0157, "step": 6860 }, { "epoch": 3.8444320089535533, "grad_norm": 0.22295020520687103, "learning_rate": 7.824077024864179e-05, "loss": 0.0108, "step": 6870 }, { "epoch": 3.850027979854505, "grad_norm": 0.25469842553138733, "learning_rate": 7.817250808190483e-05, "loss": 0.015, "step": 6880 }, { "epoch": 3.855623950755456, "grad_norm": 0.3890667259693146, "learning_rate": 7.810416889260653e-05, "loss": 0.0179, "step": 6890 }, { "epoch": 3.8612199216564074, "grad_norm": 0.1923862248659134, "learning_rate": 7.803575286758364e-05, "loss": 0.013, "step": 6900 }, { "epoch": 3.8668158925573586, "grad_norm": 0.17686985433101654, "learning_rate": 7.796726019388295e-05, "loss": 0.0143, "step": 6910 }, { "epoch": 3.87241186345831, "grad_norm": 0.1899517923593521, "learning_rate": 7.789869105876083e-05, "loss": 0.0178, "step": 6920 }, { "epoch": 3.8780078343592614, "grad_norm": 0.3056480586528778, "learning_rate": 7.783004564968263e-05, "loss": 0.0129, "step": 6930 }, { "epoch": 3.8836038052602126, "grad_norm": 0.27795109152793884, "learning_rate": 7.776132415432234e-05, "loss": 0.0151, "step": 6940 }, { "epoch": 3.889199776161164, "grad_norm": 0.22460781037807465, "learning_rate": 7.769252676056187e-05, "loss": 0.0145, "step": 6950 }, { "epoch": 3.8947957470621155, "grad_norm": 0.29980891942977905, "learning_rate": 7.762365365649067e-05, "loss": 0.015, "step": 6960 }, { "epoch": 3.9003917179630667, "grad_norm": 0.2440609186887741, "learning_rate": 7.755470503040516e-05, "loss": 0.0137, "step": 6970 }, { "epoch": 3.905987688864018, "grad_norm": 0.2510973811149597, "learning_rate": 7.748568107080832e-05, "loss": 0.0118, "step": 6980 }, { "epoch": 3.911583659764969, "grad_norm": 0.4981507956981659, "learning_rate": 7.741658196640892e-05, "loss": 0.0217, "step": 6990 }, { "epoch": 3.9171796306659203, "grad_norm": 0.28161290287971497, "learning_rate": 7.734740790612136e-05, "loss": 0.0154, "step": 7000 }, { "epoch": 3.922775601566872, "grad_norm": 0.40513697266578674, "learning_rate": 7.727815907906481e-05, "loss": 0.0169, "step": 7010 }, { "epoch": 3.928371572467823, "grad_norm": 0.31741997599601746, "learning_rate": 7.720883567456298e-05, "loss": 0.0156, "step": 7020 }, { "epoch": 3.9339675433687744, "grad_norm": 0.2534908652305603, "learning_rate": 7.713943788214337e-05, "loss": 0.0142, "step": 7030 }, { "epoch": 3.939563514269726, "grad_norm": 0.2655825912952423, "learning_rate": 7.70699658915369e-05, "loss": 0.0154, "step": 7040 }, { "epoch": 3.9451594851706773, "grad_norm": 0.32799914479255676, "learning_rate": 7.700041989267736e-05, "loss": 0.0137, "step": 7050 }, { "epoch": 3.9507554560716285, "grad_norm": 0.184087872505188, "learning_rate": 7.693080007570084e-05, "loss": 0.013, "step": 7060 }, { "epoch": 3.9563514269725797, "grad_norm": 0.31337958574295044, "learning_rate": 7.686110663094525e-05, "loss": 0.0203, "step": 7070 }, { "epoch": 3.961947397873531, "grad_norm": 0.44696512818336487, "learning_rate": 7.679133974894983e-05, "loss": 0.0136, "step": 7080 }, { "epoch": 3.967543368774482, "grad_norm": 0.2737766206264496, "learning_rate": 7.672149962045457e-05, "loss": 0.0157, "step": 7090 }, { "epoch": 3.9731393396754338, "grad_norm": 0.4152137339115143, "learning_rate": 7.66515864363997e-05, "loss": 0.0151, "step": 7100 }, { "epoch": 3.978735310576385, "grad_norm": 0.25766709446907043, "learning_rate": 7.658160038792518e-05, "loss": 0.0185, "step": 7110 }, { "epoch": 3.984331281477336, "grad_norm": 0.2175714522600174, "learning_rate": 7.651154166637025e-05, "loss": 0.013, "step": 7120 }, { "epoch": 3.989927252378288, "grad_norm": 0.2838795483112335, "learning_rate": 7.644141046327271e-05, "loss": 0.0152, "step": 7130 }, { "epoch": 3.995523223279239, "grad_norm": 0.17076176404953003, "learning_rate": 7.637120697036866e-05, "loss": 0.0161, "step": 7140 }, { "epoch": 4.00111919418019, "grad_norm": 0.34454286098480225, "learning_rate": 7.630093137959171e-05, "loss": 0.0155, "step": 7150 }, { "epoch": 4.0067151650811414, "grad_norm": 0.2543468773365021, "learning_rate": 7.623058388307269e-05, "loss": 0.0224, "step": 7160 }, { "epoch": 4.012311135982093, "grad_norm": 0.26474493741989136, "learning_rate": 7.616016467313891e-05, "loss": 0.0121, "step": 7170 }, { "epoch": 4.017907106883044, "grad_norm": 0.2469242513179779, "learning_rate": 7.608967394231387e-05, "loss": 0.0168, "step": 7180 }, { "epoch": 4.023503077783996, "grad_norm": 0.2605207562446594, "learning_rate": 7.60191118833165e-05, "loss": 0.0142, "step": 7190 }, { "epoch": 4.029099048684947, "grad_norm": 0.1799083948135376, "learning_rate": 7.594847868906076e-05, "loss": 0.02, "step": 7200 }, { "epoch": 4.034695019585898, "grad_norm": 0.179059699177742, "learning_rate": 7.587777455265515e-05, "loss": 0.0115, "step": 7210 }, { "epoch": 4.04029099048685, "grad_norm": 0.2233004868030548, "learning_rate": 7.580699966740201e-05, "loss": 0.0128, "step": 7220 }, { "epoch": 4.045886961387801, "grad_norm": 0.253635436296463, "learning_rate": 7.573615422679726e-05, "loss": 0.0149, "step": 7230 }, { "epoch": 4.051482932288752, "grad_norm": 0.3416047692298889, "learning_rate": 7.566523842452958e-05, "loss": 0.0125, "step": 7240 }, { "epoch": 4.057078903189703, "grad_norm": 0.27430468797683716, "learning_rate": 7.559425245448006e-05, "loss": 0.0153, "step": 7250 }, { "epoch": 4.062674874090654, "grad_norm": 0.26396802067756653, "learning_rate": 7.552319651072164e-05, "loss": 0.0128, "step": 7260 }, { "epoch": 4.068270844991606, "grad_norm": 0.1688843071460724, "learning_rate": 7.545207078751857e-05, "loss": 0.017, "step": 7270 }, { "epoch": 4.073866815892558, "grad_norm": 0.25092509388923645, "learning_rate": 7.538087547932585e-05, "loss": 0.0119, "step": 7280 }, { "epoch": 4.079462786793509, "grad_norm": 0.12876421213150024, "learning_rate": 7.530961078078873e-05, "loss": 0.0099, "step": 7290 }, { "epoch": 4.08505875769446, "grad_norm": 0.13818064332008362, "learning_rate": 7.52382768867422e-05, "loss": 0.0156, "step": 7300 }, { "epoch": 4.090654728595411, "grad_norm": 0.23580847680568695, "learning_rate": 7.516687399221037e-05, "loss": 0.0122, "step": 7310 }, { "epoch": 4.096250699496363, "grad_norm": 0.22529348731040955, "learning_rate": 7.509540229240601e-05, "loss": 0.0115, "step": 7320 }, { "epoch": 4.101846670397314, "grad_norm": 0.29066744446754456, "learning_rate": 7.50238619827301e-05, "loss": 0.0125, "step": 7330 }, { "epoch": 4.107442641298265, "grad_norm": 0.30195966362953186, "learning_rate": 7.495225325877103e-05, "loss": 0.0136, "step": 7340 }, { "epoch": 4.113038612199216, "grad_norm": 0.2478567361831665, "learning_rate": 7.488057631630437e-05, "loss": 0.0138, "step": 7350 }, { "epoch": 4.118634583100168, "grad_norm": 0.23493291437625885, "learning_rate": 7.480883135129211e-05, "loss": 0.0171, "step": 7360 }, { "epoch": 4.1242305540011195, "grad_norm": 0.28376439213752747, "learning_rate": 7.473701855988227e-05, "loss": 0.0161, "step": 7370 }, { "epoch": 4.129826524902071, "grad_norm": 0.183238685131073, "learning_rate": 7.466513813840825e-05, "loss": 0.0159, "step": 7380 }, { "epoch": 4.135422495803022, "grad_norm": 0.26259323954582214, "learning_rate": 7.45931902833884e-05, "loss": 0.0139, "step": 7390 }, { "epoch": 4.141018466703973, "grad_norm": 0.31283116340637207, "learning_rate": 7.452117519152542e-05, "loss": 0.0103, "step": 7400 }, { "epoch": 4.146614437604924, "grad_norm": 0.3131321370601654, "learning_rate": 7.444909305970578e-05, "loss": 0.0147, "step": 7410 }, { "epoch": 4.1522104085058755, "grad_norm": 0.22739440202713013, "learning_rate": 7.437694408499933e-05, "loss": 0.0199, "step": 7420 }, { "epoch": 4.157806379406827, "grad_norm": 0.22918283939361572, "learning_rate": 7.430472846465856e-05, "loss": 0.0152, "step": 7430 }, { "epoch": 4.163402350307779, "grad_norm": 0.3530014455318451, "learning_rate": 7.423244639611826e-05, "loss": 0.0123, "step": 7440 }, { "epoch": 4.16899832120873, "grad_norm": 0.32133522629737854, "learning_rate": 7.416009807699482e-05, "loss": 0.0151, "step": 7450 }, { "epoch": 4.174594292109681, "grad_norm": 0.13515067100524902, "learning_rate": 7.408768370508576e-05, "loss": 0.0123, "step": 7460 }, { "epoch": 4.1801902630106325, "grad_norm": 0.39963120222091675, "learning_rate": 7.401520347836926e-05, "loss": 0.0132, "step": 7470 }, { "epoch": 4.185786233911584, "grad_norm": 0.16310429573059082, "learning_rate": 7.394265759500348e-05, "loss": 0.0211, "step": 7480 }, { "epoch": 4.191382204812535, "grad_norm": 0.23062337934970856, "learning_rate": 7.387004625332608e-05, "loss": 0.0155, "step": 7490 }, { "epoch": 4.196978175713486, "grad_norm": 0.3456437289714813, "learning_rate": 7.379736965185368e-05, "loss": 0.0149, "step": 7500 }, { "epoch": 4.202574146614437, "grad_norm": 0.30712154507637024, "learning_rate": 7.372462798928137e-05, "loss": 0.0142, "step": 7510 }, { "epoch": 4.2081701175153885, "grad_norm": 0.40980008244514465, "learning_rate": 7.365182146448205e-05, "loss": 0.0185, "step": 7520 }, { "epoch": 4.213766088416341, "grad_norm": 0.3277069330215454, "learning_rate": 7.357895027650598e-05, "loss": 0.0202, "step": 7530 }, { "epoch": 4.219362059317292, "grad_norm": 0.2991955280303955, "learning_rate": 7.350601462458024e-05, "loss": 0.0129, "step": 7540 }, { "epoch": 4.224958030218243, "grad_norm": 0.3370542526245117, "learning_rate": 7.343301470810808e-05, "loss": 0.0186, "step": 7550 }, { "epoch": 4.230554001119194, "grad_norm": 0.31613653898239136, "learning_rate": 7.335995072666848e-05, "loss": 0.0123, "step": 7560 }, { "epoch": 4.236149972020145, "grad_norm": 0.21174335479736328, "learning_rate": 7.328682288001561e-05, "loss": 0.0088, "step": 7570 }, { "epoch": 4.241745942921097, "grad_norm": 0.18430404365062714, "learning_rate": 7.32136313680782e-05, "loss": 0.0136, "step": 7580 }, { "epoch": 4.247341913822048, "grad_norm": 0.161945641040802, "learning_rate": 7.3140376390959e-05, "loss": 0.0146, "step": 7590 }, { "epoch": 4.252937884722999, "grad_norm": 0.3349175453186035, "learning_rate": 7.30670581489344e-05, "loss": 0.0151, "step": 7600 }, { "epoch": 4.258533855623951, "grad_norm": 0.22331948578357697, "learning_rate": 7.299367684245362e-05, "loss": 0.0116, "step": 7610 }, { "epoch": 4.264129826524902, "grad_norm": 0.32214659452438354, "learning_rate": 7.292023267213835e-05, "loss": 0.0125, "step": 7620 }, { "epoch": 4.269725797425854, "grad_norm": 0.2628123164176941, "learning_rate": 7.284672583878219e-05, "loss": 0.021, "step": 7630 }, { "epoch": 4.275321768326805, "grad_norm": 0.17666281759738922, "learning_rate": 7.277315654334997e-05, "loss": 0.0129, "step": 7640 }, { "epoch": 4.280917739227756, "grad_norm": 0.13651759922504425, "learning_rate": 7.269952498697734e-05, "loss": 0.0136, "step": 7650 }, { "epoch": 4.286513710128707, "grad_norm": 0.19819198548793793, "learning_rate": 7.262583137097018e-05, "loss": 0.0178, "step": 7660 }, { "epoch": 4.292109681029658, "grad_norm": 0.30227622389793396, "learning_rate": 7.255207589680402e-05, "loss": 0.0099, "step": 7670 }, { "epoch": 4.29770565193061, "grad_norm": 0.1803039014339447, "learning_rate": 7.247825876612353e-05, "loss": 0.0125, "step": 7680 }, { "epoch": 4.303301622831562, "grad_norm": 0.2602524757385254, "learning_rate": 7.240438018074189e-05, "loss": 0.0128, "step": 7690 }, { "epoch": 4.308897593732513, "grad_norm": 0.22282052040100098, "learning_rate": 7.233044034264034e-05, "loss": 0.0105, "step": 7700 }, { "epoch": 4.314493564633464, "grad_norm": 0.3194449841976166, "learning_rate": 7.225643945396757e-05, "loss": 0.0133, "step": 7710 }, { "epoch": 4.320089535534415, "grad_norm": 0.31051668524742126, "learning_rate": 7.218237771703921e-05, "loss": 0.021, "step": 7720 }, { "epoch": 4.3256855064353665, "grad_norm": 0.23389574885368347, "learning_rate": 7.210825533433719e-05, "loss": 0.0151, "step": 7730 }, { "epoch": 4.331281477336318, "grad_norm": 0.16604237258434296, "learning_rate": 7.203407250850928e-05, "loss": 0.0101, "step": 7740 }, { "epoch": 4.336877448237269, "grad_norm": 0.26793259382247925, "learning_rate": 7.195982944236851e-05, "loss": 0.0177, "step": 7750 }, { "epoch": 4.34247341913822, "grad_norm": 0.21598176658153534, "learning_rate": 7.188552633889259e-05, "loss": 0.0168, "step": 7760 }, { "epoch": 4.348069390039171, "grad_norm": 0.30887526273727417, "learning_rate": 7.181116340122336e-05, "loss": 0.0122, "step": 7770 }, { "epoch": 4.3536653609401235, "grad_norm": 0.3463345468044281, "learning_rate": 7.173674083266624e-05, "loss": 0.0143, "step": 7780 }, { "epoch": 4.359261331841075, "grad_norm": 0.26217085123062134, "learning_rate": 7.166225883668969e-05, "loss": 0.0151, "step": 7790 }, { "epoch": 4.364857302742026, "grad_norm": 0.28720608353614807, "learning_rate": 7.158771761692464e-05, "loss": 0.0139, "step": 7800 }, { "epoch": 4.370453273642977, "grad_norm": 0.35230302810668945, "learning_rate": 7.151311737716397e-05, "loss": 0.0146, "step": 7810 }, { "epoch": 4.376049244543928, "grad_norm": 0.2841963469982147, "learning_rate": 7.143845832136188e-05, "loss": 0.0153, "step": 7820 }, { "epoch": 4.3816452154448795, "grad_norm": 0.3889724016189575, "learning_rate": 7.136374065363334e-05, "loss": 0.0147, "step": 7830 }, { "epoch": 4.387241186345831, "grad_norm": 0.2717784345149994, "learning_rate": 7.128896457825364e-05, "loss": 0.0161, "step": 7840 }, { "epoch": 4.392837157246782, "grad_norm": 0.27939334511756897, "learning_rate": 7.121413029965769e-05, "loss": 0.0127, "step": 7850 }, { "epoch": 4.398433128147734, "grad_norm": 0.24780631065368652, "learning_rate": 7.113923802243957e-05, "loss": 0.0134, "step": 7860 }, { "epoch": 4.404029099048685, "grad_norm": 0.2736693024635315, "learning_rate": 7.10642879513519e-05, "loss": 0.0157, "step": 7870 }, { "epoch": 4.409625069949636, "grad_norm": 0.2332269549369812, "learning_rate": 7.09892802913053e-05, "loss": 0.0155, "step": 7880 }, { "epoch": 4.415221040850588, "grad_norm": 0.3542332947254181, "learning_rate": 7.091421524736784e-05, "loss": 0.0161, "step": 7890 }, { "epoch": 4.420817011751539, "grad_norm": 0.29242730140686035, "learning_rate": 7.083909302476453e-05, "loss": 0.0137, "step": 7900 }, { "epoch": 4.42641298265249, "grad_norm": 0.33528995513916016, "learning_rate": 7.076391382887661e-05, "loss": 0.0146, "step": 7910 }, { "epoch": 4.432008953553441, "grad_norm": 0.34565469622612, "learning_rate": 7.068867786524116e-05, "loss": 0.0128, "step": 7920 }, { "epoch": 4.4376049244543925, "grad_norm": 0.29550039768218994, "learning_rate": 7.061338533955043e-05, "loss": 0.0143, "step": 7930 }, { "epoch": 4.443200895355345, "grad_norm": 0.18918676674365997, "learning_rate": 7.053803645765128e-05, "loss": 0.017, "step": 7940 }, { "epoch": 4.448796866256296, "grad_norm": 0.24842104315757751, "learning_rate": 7.04626314255447e-05, "loss": 0.0115, "step": 7950 }, { "epoch": 4.454392837157247, "grad_norm": 0.25395554304122925, "learning_rate": 7.038717044938519e-05, "loss": 0.0136, "step": 7960 }, { "epoch": 4.459988808058198, "grad_norm": 0.223357155919075, "learning_rate": 7.031165373548014e-05, "loss": 0.0159, "step": 7970 }, { "epoch": 4.465584778959149, "grad_norm": 0.2434312105178833, "learning_rate": 7.023608149028937e-05, "loss": 0.0113, "step": 7980 }, { "epoch": 4.471180749860101, "grad_norm": 0.27500098943710327, "learning_rate": 7.016045392042452e-05, "loss": 0.0127, "step": 7990 }, { "epoch": 4.476776720761052, "grad_norm": 0.1670360416173935, "learning_rate": 7.008477123264848e-05, "loss": 0.0151, "step": 8000 }, { "epoch": 4.482372691662003, "grad_norm": 0.3035995662212372, "learning_rate": 7.000903363387482e-05, "loss": 0.0143, "step": 8010 }, { "epoch": 4.487968662562954, "grad_norm": 0.25943461060523987, "learning_rate": 6.993324133116726e-05, "loss": 0.0099, "step": 8020 }, { "epoch": 4.493564633463906, "grad_norm": 0.20338699221611023, "learning_rate": 6.985739453173903e-05, "loss": 0.0127, "step": 8030 }, { "epoch": 4.4991606043648575, "grad_norm": 0.18308840692043304, "learning_rate": 6.978149344295242e-05, "loss": 0.012, "step": 8040 }, { "epoch": 4.504756575265809, "grad_norm": 0.142523393034935, "learning_rate": 6.97055382723181e-05, "loss": 0.0117, "step": 8050 }, { "epoch": 4.51035254616676, "grad_norm": 0.26383474469184875, "learning_rate": 6.962952922749457e-05, "loss": 0.0171, "step": 8060 }, { "epoch": 4.515948517067711, "grad_norm": 0.1817890852689743, "learning_rate": 6.955346651628771e-05, "loss": 0.0147, "step": 8070 }, { "epoch": 4.521544487968662, "grad_norm": 0.20679673552513123, "learning_rate": 6.947735034665002e-05, "loss": 0.0161, "step": 8080 }, { "epoch": 4.527140458869614, "grad_norm": 0.2073245346546173, "learning_rate": 6.940118092668022e-05, "loss": 0.0104, "step": 8090 }, { "epoch": 4.532736429770566, "grad_norm": 0.45759397745132446, "learning_rate": 6.932495846462261e-05, "loss": 0.0141, "step": 8100 }, { "epoch": 4.538332400671517, "grad_norm": 0.2275332510471344, "learning_rate": 6.924868316886649e-05, "loss": 0.0144, "step": 8110 }, { "epoch": 4.543928371572468, "grad_norm": 0.24839594960212708, "learning_rate": 6.917235524794558e-05, "loss": 0.0153, "step": 8120 }, { "epoch": 4.549524342473419, "grad_norm": 0.13045403361320496, "learning_rate": 6.909597491053751e-05, "loss": 0.0148, "step": 8130 }, { "epoch": 4.5551203133743705, "grad_norm": 0.298033207654953, "learning_rate": 6.901954236546323e-05, "loss": 0.0148, "step": 8140 }, { "epoch": 4.560716284275322, "grad_norm": 0.3102302849292755, "learning_rate": 6.894305782168638e-05, "loss": 0.0104, "step": 8150 }, { "epoch": 4.566312255176273, "grad_norm": 0.3511497378349304, "learning_rate": 6.886652148831279e-05, "loss": 0.0114, "step": 8160 }, { "epoch": 4.571908226077224, "grad_norm": 0.19204401969909668, "learning_rate": 6.878993357458986e-05, "loss": 0.0144, "step": 8170 }, { "epoch": 4.577504196978175, "grad_norm": 0.27601921558380127, "learning_rate": 6.871329428990602e-05, "loss": 0.0121, "step": 8180 }, { "epoch": 4.583100167879127, "grad_norm": 0.15351536870002747, "learning_rate": 6.863660384379017e-05, "loss": 0.017, "step": 8190 }, { "epoch": 4.588696138780079, "grad_norm": 0.34269094467163086, "learning_rate": 6.855986244591104e-05, "loss": 0.0164, "step": 8200 }, { "epoch": 4.59429210968103, "grad_norm": 0.20768719911575317, "learning_rate": 6.84830703060767e-05, "loss": 0.0186, "step": 8210 }, { "epoch": 4.599888080581981, "grad_norm": 0.29763510823249817, "learning_rate": 6.840622763423391e-05, "loss": 0.0134, "step": 8220 }, { "epoch": 4.605484051482932, "grad_norm": 0.29871609807014465, "learning_rate": 6.83293346404676e-05, "loss": 0.0118, "step": 8230 }, { "epoch": 4.6110800223838835, "grad_norm": 0.24642953276634216, "learning_rate": 6.825239153500029e-05, "loss": 0.015, "step": 8240 }, { "epoch": 4.616675993284835, "grad_norm": 0.20664198696613312, "learning_rate": 6.817539852819149e-05, "loss": 0.0165, "step": 8250 }, { "epoch": 4.622271964185786, "grad_norm": 0.1941448450088501, "learning_rate": 6.809835583053715e-05, "loss": 0.0129, "step": 8260 }, { "epoch": 4.627867935086737, "grad_norm": 0.21355387568473816, "learning_rate": 6.802126365266905e-05, "loss": 0.013, "step": 8270 }, { "epoch": 4.633463905987689, "grad_norm": 0.2642342746257782, "learning_rate": 6.794412220535426e-05, "loss": 0.0176, "step": 8280 }, { "epoch": 4.63905987688864, "grad_norm": 0.31280654668807983, "learning_rate": 6.786693169949455e-05, "loss": 0.017, "step": 8290 }, { "epoch": 4.644655847789592, "grad_norm": 0.2257363200187683, "learning_rate": 6.778969234612584e-05, "loss": 0.0099, "step": 8300 }, { "epoch": 4.650251818690543, "grad_norm": 0.16536390781402588, "learning_rate": 6.771240435641754e-05, "loss": 0.012, "step": 8310 }, { "epoch": 4.655847789591494, "grad_norm": 0.16031181812286377, "learning_rate": 6.763506794167208e-05, "loss": 0.0094, "step": 8320 }, { "epoch": 4.661443760492445, "grad_norm": 0.2519717514514923, "learning_rate": 6.755768331332424e-05, "loss": 0.0153, "step": 8330 }, { "epoch": 4.6670397313933965, "grad_norm": 0.11290234327316284, "learning_rate": 6.748025068294067e-05, "loss": 0.0187, "step": 8340 }, { "epoch": 4.6726357022943485, "grad_norm": 0.18607747554779053, "learning_rate": 6.740277026221923e-05, "loss": 0.0123, "step": 8350 }, { "epoch": 4.6782316731953, "grad_norm": 0.20653483271598816, "learning_rate": 6.732524226298841e-05, "loss": 0.0128, "step": 8360 }, { "epoch": 4.683827644096251, "grad_norm": 0.20888541638851166, "learning_rate": 6.72476668972068e-05, "loss": 0.0235, "step": 8370 }, { "epoch": 4.689423614997202, "grad_norm": 0.23816397786140442, "learning_rate": 6.71700443769625e-05, "loss": 0.0125, "step": 8380 }, { "epoch": 4.695019585898153, "grad_norm": 0.3250564932823181, "learning_rate": 6.709237491447249e-05, "loss": 0.011, "step": 8390 }, { "epoch": 4.700615556799105, "grad_norm": 0.3211959898471832, "learning_rate": 6.701465872208216e-05, "loss": 0.0124, "step": 8400 }, { "epoch": 4.706211527700056, "grad_norm": 0.3432743549346924, "learning_rate": 6.693689601226458e-05, "loss": 0.0119, "step": 8410 }, { "epoch": 4.711807498601007, "grad_norm": 0.2595174014568329, "learning_rate": 6.685908699762002e-05, "loss": 0.0111, "step": 8420 }, { "epoch": 4.717403469501958, "grad_norm": 0.283252090215683, "learning_rate": 6.67812318908754e-05, "loss": 0.0119, "step": 8430 }, { "epoch": 4.72299944040291, "grad_norm": 0.20471790432929993, "learning_rate": 6.670333090488356e-05, "loss": 0.013, "step": 8440 }, { "epoch": 4.7285954113038615, "grad_norm": 0.1850796490907669, "learning_rate": 6.662538425262285e-05, "loss": 0.0112, "step": 8450 }, { "epoch": 4.734191382204813, "grad_norm": 0.2515677213668823, "learning_rate": 6.654739214719641e-05, "loss": 0.0084, "step": 8460 }, { "epoch": 4.739787353105764, "grad_norm": 0.25231802463531494, "learning_rate": 6.646935480183173e-05, "loss": 0.0149, "step": 8470 }, { "epoch": 4.745383324006715, "grad_norm": 0.24691557884216309, "learning_rate": 6.639127242987988e-05, "loss": 0.0144, "step": 8480 }, { "epoch": 4.750979294907666, "grad_norm": 0.3806649446487427, "learning_rate": 6.631314524481513e-05, "loss": 0.0136, "step": 8490 }, { "epoch": 4.756575265808618, "grad_norm": 0.233370840549469, "learning_rate": 6.623497346023418e-05, "loss": 0.0119, "step": 8500 }, { "epoch": 4.762171236709569, "grad_norm": 0.16195163130760193, "learning_rate": 6.615675728985572e-05, "loss": 0.0178, "step": 8510 }, { "epoch": 4.76776720761052, "grad_norm": 0.25800469517707825, "learning_rate": 6.607849694751977e-05, "loss": 0.012, "step": 8520 }, { "epoch": 4.773363178511472, "grad_norm": 0.17752796411514282, "learning_rate": 6.600019264718713e-05, "loss": 0.0084, "step": 8530 }, { "epoch": 4.778959149412423, "grad_norm": 0.2168557047843933, "learning_rate": 6.592184460293877e-05, "loss": 0.0163, "step": 8540 }, { "epoch": 4.7845551203133745, "grad_norm": 0.2908076345920563, "learning_rate": 6.584345302897523e-05, "loss": 0.0091, "step": 8550 }, { "epoch": 4.790151091214326, "grad_norm": 0.16817107796669006, "learning_rate": 6.576501813961609e-05, "loss": 0.012, "step": 8560 }, { "epoch": 4.795747062115277, "grad_norm": 0.17607803642749786, "learning_rate": 6.568654014929932e-05, "loss": 0.0095, "step": 8570 }, { "epoch": 4.801343033016228, "grad_norm": 0.1395525336265564, "learning_rate": 6.56080192725808e-05, "loss": 0.0127, "step": 8580 }, { "epoch": 4.806939003917179, "grad_norm": 0.12721598148345947, "learning_rate": 6.552945572413358e-05, "loss": 0.0127, "step": 8590 }, { "epoch": 4.812534974818131, "grad_norm": 0.220106303691864, "learning_rate": 6.545084971874738e-05, "loss": 0.0124, "step": 8600 }, { "epoch": 4.818130945719083, "grad_norm": 0.1850575953722, "learning_rate": 6.537220147132805e-05, "loss": 0.0133, "step": 8610 }, { "epoch": 4.823726916620034, "grad_norm": 0.14641323685646057, "learning_rate": 6.529351119689688e-05, "loss": 0.0083, "step": 8620 }, { "epoch": 4.829322887520985, "grad_norm": 0.2565167546272278, "learning_rate": 6.521477911059008e-05, "loss": 0.0146, "step": 8630 }, { "epoch": 4.834918858421936, "grad_norm": 0.1807018518447876, "learning_rate": 6.513600542765817e-05, "loss": 0.0093, "step": 8640 }, { "epoch": 4.8405148293228875, "grad_norm": 0.22783279418945312, "learning_rate": 6.505719036346539e-05, "loss": 0.0105, "step": 8650 }, { "epoch": 4.846110800223839, "grad_norm": 0.18857407569885254, "learning_rate": 6.497833413348909e-05, "loss": 0.012, "step": 8660 }, { "epoch": 4.85170677112479, "grad_norm": 0.31593799591064453, "learning_rate": 6.489943695331923e-05, "loss": 0.013, "step": 8670 }, { "epoch": 4.857302742025741, "grad_norm": 0.3053518533706665, "learning_rate": 6.48204990386577e-05, "loss": 0.0106, "step": 8680 }, { "epoch": 4.862898712926693, "grad_norm": 0.2662791311740875, "learning_rate": 6.474152060531768e-05, "loss": 0.0151, "step": 8690 }, { "epoch": 4.868494683827644, "grad_norm": 0.13093920052051544, "learning_rate": 6.466250186922325e-05, "loss": 0.0108, "step": 8700 }, { "epoch": 4.874090654728596, "grad_norm": 0.17706599831581116, "learning_rate": 6.458344304640858e-05, "loss": 0.0118, "step": 8710 }, { "epoch": 4.879686625629547, "grad_norm": 0.19158832728862762, "learning_rate": 6.450434435301751e-05, "loss": 0.0116, "step": 8720 }, { "epoch": 4.885282596530498, "grad_norm": 0.12095298618078232, "learning_rate": 6.44252060053028e-05, "loss": 0.0134, "step": 8730 }, { "epoch": 4.890878567431449, "grad_norm": 0.2882150411605835, "learning_rate": 6.43460282196257e-05, "loss": 0.0112, "step": 8740 }, { "epoch": 4.8964745383324, "grad_norm": 0.34821435809135437, "learning_rate": 6.426681121245527e-05, "loss": 0.0111, "step": 8750 }, { "epoch": 4.902070509233352, "grad_norm": 0.28680020570755005, "learning_rate": 6.418755520036775e-05, "loss": 0.011, "step": 8760 }, { "epoch": 4.907666480134303, "grad_norm": 0.15372464060783386, "learning_rate": 6.410826040004607e-05, "loss": 0.0138, "step": 8770 }, { "epoch": 4.913262451035255, "grad_norm": 0.24093207716941833, "learning_rate": 6.402892702827916e-05, "loss": 0.0152, "step": 8780 }, { "epoch": 4.918858421936206, "grad_norm": 0.3779686689376831, "learning_rate": 6.394955530196147e-05, "loss": 0.0173, "step": 8790 }, { "epoch": 4.924454392837157, "grad_norm": 0.19445843994617462, "learning_rate": 6.387014543809223e-05, "loss": 0.0142, "step": 8800 }, { "epoch": 4.930050363738109, "grad_norm": 0.32286763191223145, "learning_rate": 6.3790697653775e-05, "loss": 0.0217, "step": 8810 }, { "epoch": 4.93564633463906, "grad_norm": 0.27731436491012573, "learning_rate": 6.371121216621698e-05, "loss": 0.0103, "step": 8820 }, { "epoch": 4.941242305540011, "grad_norm": 0.2174469232559204, "learning_rate": 6.363168919272846e-05, "loss": 0.0112, "step": 8830 }, { "epoch": 4.946838276440962, "grad_norm": 0.20424802601337433, "learning_rate": 6.355212895072223e-05, "loss": 0.0179, "step": 8840 }, { "epoch": 4.952434247341914, "grad_norm": 0.14288559556007385, "learning_rate": 6.34725316577129e-05, "loss": 0.0116, "step": 8850 }, { "epoch": 4.9580302182428655, "grad_norm": 0.21734347939491272, "learning_rate": 6.339289753131649e-05, "loss": 0.012, "step": 8860 }, { "epoch": 4.963626189143817, "grad_norm": 0.29445502161979675, "learning_rate": 6.331322678924962e-05, "loss": 0.0116, "step": 8870 }, { "epoch": 4.969222160044768, "grad_norm": 0.2319229543209076, "learning_rate": 6.323351964932908e-05, "loss": 0.0194, "step": 8880 }, { "epoch": 4.974818130945719, "grad_norm": 0.13166509568691254, "learning_rate": 6.315377632947115e-05, "loss": 0.0127, "step": 8890 }, { "epoch": 4.98041410184667, "grad_norm": 0.2546875774860382, "learning_rate": 6.307399704769099e-05, "loss": 0.0115, "step": 8900 }, { "epoch": 4.9860100727476215, "grad_norm": 0.2343253493309021, "learning_rate": 6.299418202210214e-05, "loss": 0.0123, "step": 8910 }, { "epoch": 4.991606043648573, "grad_norm": 0.12813247740268707, "learning_rate": 6.291433147091583e-05, "loss": 0.0121, "step": 8920 }, { "epoch": 4.997202014549524, "grad_norm": 0.11860624700784683, "learning_rate": 6.283444561244042e-05, "loss": 0.0125, "step": 8930 }, { "epoch": 5.002797985450476, "grad_norm": 0.1995118260383606, "learning_rate": 6.275452466508077e-05, "loss": 0.0112, "step": 8940 }, { "epoch": 5.008393956351427, "grad_norm": 0.2113560289144516, "learning_rate": 6.26745688473377e-05, "loss": 0.0118, "step": 8950 }, { "epoch": 5.0139899272523785, "grad_norm": 0.321319580078125, "learning_rate": 6.259457837780742e-05, "loss": 0.0145, "step": 8960 }, { "epoch": 5.01958589815333, "grad_norm": 0.15436704456806183, "learning_rate": 6.251455347518073e-05, "loss": 0.011, "step": 8970 }, { "epoch": 5.025181869054281, "grad_norm": 0.2929522693157196, "learning_rate": 6.243449435824276e-05, "loss": 0.0145, "step": 8980 }, { "epoch": 5.030777839955232, "grad_norm": 0.2311781346797943, "learning_rate": 6.235440124587198e-05, "loss": 0.0121, "step": 8990 }, { "epoch": 5.036373810856183, "grad_norm": 0.16461458802223206, "learning_rate": 6.227427435703997e-05, "loss": 0.016, "step": 9000 }, { "epoch": 5.0419697817571345, "grad_norm": 0.23925089836120605, "learning_rate": 6.219411391081055e-05, "loss": 0.0125, "step": 9010 }, { "epoch": 5.047565752658087, "grad_norm": 0.3376557230949402, "learning_rate": 6.211392012633932e-05, "loss": 0.0147, "step": 9020 }, { "epoch": 5.053161723559038, "grad_norm": 0.20988136529922485, "learning_rate": 6.203369322287306e-05, "loss": 0.0139, "step": 9030 }, { "epoch": 5.058757694459989, "grad_norm": 0.17247657477855682, "learning_rate": 6.195343341974899e-05, "loss": 0.0133, "step": 9040 }, { "epoch": 5.06435366536094, "grad_norm": 0.24936120212078094, "learning_rate": 6.187314093639444e-05, "loss": 0.0112, "step": 9050 }, { "epoch": 5.069949636261891, "grad_norm": 0.1587497889995575, "learning_rate": 6.179281599232591e-05, "loss": 0.0127, "step": 9060 }, { "epoch": 5.075545607162843, "grad_norm": 0.12296043336391449, "learning_rate": 6.17124588071488e-05, "loss": 0.0132, "step": 9070 }, { "epoch": 5.081141578063794, "grad_norm": 0.2310076504945755, "learning_rate": 6.163206960055651e-05, "loss": 0.013, "step": 9080 }, { "epoch": 5.086737548964745, "grad_norm": 0.1278199851512909, "learning_rate": 6.155164859233012e-05, "loss": 0.0127, "step": 9090 }, { "epoch": 5.092333519865696, "grad_norm": 0.225848987698555, "learning_rate": 6.147119600233758e-05, "loss": 0.0125, "step": 9100 }, { "epoch": 5.097929490766648, "grad_norm": 0.12778952717781067, "learning_rate": 6.13907120505332e-05, "loss": 0.0102, "step": 9110 }, { "epoch": 5.1035254616676, "grad_norm": 0.2868061065673828, "learning_rate": 6.131019695695702e-05, "loss": 0.0102, "step": 9120 }, { "epoch": 5.109121432568551, "grad_norm": 0.35349947214126587, "learning_rate": 6.122965094173424e-05, "loss": 0.0151, "step": 9130 }, { "epoch": 5.114717403469502, "grad_norm": 0.24252165853977203, "learning_rate": 6.11490742250746e-05, "loss": 0.0111, "step": 9140 }, { "epoch": 5.120313374370453, "grad_norm": 0.17868760228157043, "learning_rate": 6.106846702727172e-05, "loss": 0.0102, "step": 9150 }, { "epoch": 5.125909345271404, "grad_norm": 0.21379156410694122, "learning_rate": 6.0987829568702656e-05, "loss": 0.0137, "step": 9160 }, { "epoch": 5.131505316172356, "grad_norm": 0.29363685846328735, "learning_rate": 6.090716206982714e-05, "loss": 0.0131, "step": 9170 }, { "epoch": 5.137101287073307, "grad_norm": 0.330162912607193, "learning_rate": 6.0826464751186994e-05, "loss": 0.0129, "step": 9180 }, { "epoch": 5.142697257974259, "grad_norm": 0.2052110731601715, "learning_rate": 6.074573783340562e-05, "loss": 0.0108, "step": 9190 }, { "epoch": 5.14829322887521, "grad_norm": 0.17011559009552002, "learning_rate": 6.066498153718735e-05, "loss": 0.0125, "step": 9200 }, { "epoch": 5.153889199776161, "grad_norm": 0.3137349486351013, "learning_rate": 6.0584196083316794e-05, "loss": 0.0192, "step": 9210 }, { "epoch": 5.1594851706771125, "grad_norm": 0.3046635389328003, "learning_rate": 6.05033816926583e-05, "loss": 0.0119, "step": 9220 }, { "epoch": 5.165081141578064, "grad_norm": 0.1919318437576294, "learning_rate": 6.042253858615532e-05, "loss": 0.0139, "step": 9230 }, { "epoch": 5.170677112479015, "grad_norm": 0.3815397322177887, "learning_rate": 6.034166698482984e-05, "loss": 0.0176, "step": 9240 }, { "epoch": 5.176273083379966, "grad_norm": 0.23484662175178528, "learning_rate": 6.026076710978171e-05, "loss": 0.0137, "step": 9250 }, { "epoch": 5.181869054280917, "grad_norm": 0.1737549602985382, "learning_rate": 6.017983918218812e-05, "loss": 0.0112, "step": 9260 }, { "epoch": 5.1874650251818695, "grad_norm": 0.28736233711242676, "learning_rate": 6.009888342330292e-05, "loss": 0.0112, "step": 9270 }, { "epoch": 5.193060996082821, "grad_norm": 0.21343185007572174, "learning_rate": 6.001790005445607e-05, "loss": 0.0089, "step": 9280 }, { "epoch": 5.198656966983772, "grad_norm": 0.15162508189678192, "learning_rate": 5.9936889297052986e-05, "loss": 0.0156, "step": 9290 }, { "epoch": 5.204252937884723, "grad_norm": 0.2816758155822754, "learning_rate": 5.985585137257401e-05, "loss": 0.0093, "step": 9300 }, { "epoch": 5.209848908785674, "grad_norm": 0.1730954796075821, "learning_rate": 5.977478650257374e-05, "loss": 0.016, "step": 9310 }, { "epoch": 5.2154448796866255, "grad_norm": 0.18365302681922913, "learning_rate": 5.969369490868042e-05, "loss": 0.0259, "step": 9320 }, { "epoch": 5.221040850587577, "grad_norm": 0.12864327430725098, "learning_rate": 5.961257681259535e-05, "loss": 0.0119, "step": 9330 }, { "epoch": 5.226636821488528, "grad_norm": 0.16363385319709778, "learning_rate": 5.953143243609235e-05, "loss": 0.0129, "step": 9340 }, { "epoch": 5.23223279238948, "grad_norm": 0.15773551166057587, "learning_rate": 5.945026200101702e-05, "loss": 0.0083, "step": 9350 }, { "epoch": 5.237828763290431, "grad_norm": 0.22605851292610168, "learning_rate": 5.9369065729286245e-05, "loss": 0.0096, "step": 9360 }, { "epoch": 5.243424734191382, "grad_norm": 0.13637419044971466, "learning_rate": 5.92878438428875e-05, "loss": 0.0185, "step": 9370 }, { "epoch": 5.249020705092334, "grad_norm": 0.12795643508434296, "learning_rate": 5.9206596563878357e-05, "loss": 0.008, "step": 9380 }, { "epoch": 5.254616675993285, "grad_norm": 0.2635105550289154, "learning_rate": 5.912532411438576e-05, "loss": 0.0162, "step": 9390 }, { "epoch": 5.260212646894236, "grad_norm": 0.18397080898284912, "learning_rate": 5.90440267166055e-05, "loss": 0.013, "step": 9400 }, { "epoch": 5.265808617795187, "grad_norm": 0.23337115347385406, "learning_rate": 5.896270459280153e-05, "loss": 0.0105, "step": 9410 }, { "epoch": 5.2714045886961385, "grad_norm": 0.24963605403900146, "learning_rate": 5.888135796530544e-05, "loss": 0.0098, "step": 9420 }, { "epoch": 5.27700055959709, "grad_norm": 0.372761070728302, "learning_rate": 5.8799987056515804e-05, "loss": 0.0125, "step": 9430 }, { "epoch": 5.282596530498042, "grad_norm": 0.2931661009788513, "learning_rate": 5.871859208889759e-05, "loss": 0.012, "step": 9440 }, { "epoch": 5.288192501398993, "grad_norm": 0.2341478168964386, "learning_rate": 5.8637173284981526e-05, "loss": 0.0113, "step": 9450 }, { "epoch": 5.293788472299944, "grad_norm": 0.2445063441991806, "learning_rate": 5.85557308673635e-05, "loss": 0.0157, "step": 9460 }, { "epoch": 5.299384443200895, "grad_norm": 0.22766774892807007, "learning_rate": 5.847426505870399e-05, "loss": 0.011, "step": 9470 }, { "epoch": 5.304980414101847, "grad_norm": 0.25397437810897827, "learning_rate": 5.8392776081727385e-05, "loss": 0.0088, "step": 9480 }, { "epoch": 5.310576385002798, "grad_norm": 0.2036605179309845, "learning_rate": 5.831126415922148e-05, "loss": 0.0138, "step": 9490 }, { "epoch": 5.316172355903749, "grad_norm": 0.17595243453979492, "learning_rate": 5.8229729514036705e-05, "loss": 0.0102, "step": 9500 }, { "epoch": 5.3217683268047, "grad_norm": 0.14046894013881683, "learning_rate": 5.8148172369085686e-05, "loss": 0.0148, "step": 9510 }, { "epoch": 5.327364297705652, "grad_norm": 0.2699585556983948, "learning_rate": 5.8066592947342555e-05, "loss": 0.0107, "step": 9520 }, { "epoch": 5.3329602686066035, "grad_norm": 0.15614166855812073, "learning_rate": 5.798499147184233e-05, "loss": 0.0118, "step": 9530 }, { "epoch": 5.338556239507555, "grad_norm": 0.3686412572860718, "learning_rate": 5.7903368165680327e-05, "loss": 0.0122, "step": 9540 }, { "epoch": 5.344152210408506, "grad_norm": 0.2578679323196411, "learning_rate": 5.782172325201155e-05, "loss": 0.0152, "step": 9550 }, { "epoch": 5.349748181309457, "grad_norm": 0.24605675041675568, "learning_rate": 5.7740056954050084e-05, "loss": 0.0106, "step": 9560 }, { "epoch": 5.355344152210408, "grad_norm": 0.19138172268867493, "learning_rate": 5.765836949506843e-05, "loss": 0.0134, "step": 9570 }, { "epoch": 5.36094012311136, "grad_norm": 0.23657287657260895, "learning_rate": 5.757666109839702e-05, "loss": 0.0076, "step": 9580 }, { "epoch": 5.366536094012311, "grad_norm": 0.13402613997459412, "learning_rate": 5.74949319874235e-05, "loss": 0.0092, "step": 9590 }, { "epoch": 5.372132064913263, "grad_norm": 0.16487988829612732, "learning_rate": 5.74131823855921e-05, "loss": 0.0165, "step": 9600 }, { "epoch": 5.377728035814214, "grad_norm": 0.1842515617609024, "learning_rate": 5.733141251640315e-05, "loss": 0.0101, "step": 9610 }, { "epoch": 5.383324006715165, "grad_norm": 0.17961528897285461, "learning_rate": 5.72496226034123e-05, "loss": 0.012, "step": 9620 }, { "epoch": 5.3889199776161165, "grad_norm": 0.2516380548477173, "learning_rate": 5.7167812870230094e-05, "loss": 0.011, "step": 9630 }, { "epoch": 5.394515948517068, "grad_norm": 0.1506935954093933, "learning_rate": 5.7085983540521216e-05, "loss": 0.0075, "step": 9640 }, { "epoch": 5.400111919418019, "grad_norm": 0.3415573835372925, "learning_rate": 5.70041348380039e-05, "loss": 0.0142, "step": 9650 }, { "epoch": 5.40570789031897, "grad_norm": 0.2501567006111145, "learning_rate": 5.692226698644938e-05, "loss": 0.0126, "step": 9660 }, { "epoch": 5.411303861219921, "grad_norm": 0.15769636631011963, "learning_rate": 5.6840380209681255e-05, "loss": 0.0206, "step": 9670 }, { "epoch": 5.416899832120873, "grad_norm": 0.17793142795562744, "learning_rate": 5.675847473157485e-05, "loss": 0.0198, "step": 9680 }, { "epoch": 5.422495803021825, "grad_norm": 0.19135138392448425, "learning_rate": 5.667655077605659e-05, "loss": 0.0089, "step": 9690 }, { "epoch": 5.428091773922776, "grad_norm": 0.1910410374403, "learning_rate": 5.6594608567103456e-05, "loss": 0.0178, "step": 9700 }, { "epoch": 5.433687744823727, "grad_norm": 0.18896977603435516, "learning_rate": 5.65126483287423e-05, "loss": 0.0102, "step": 9710 }, { "epoch": 5.439283715724678, "grad_norm": 0.12857311964035034, "learning_rate": 5.6430670285049314e-05, "loss": 0.0147, "step": 9720 }, { "epoch": 5.4448796866256295, "grad_norm": 0.20521825551986694, "learning_rate": 5.634867466014932e-05, "loss": 0.0101, "step": 9730 }, { "epoch": 5.450475657526581, "grad_norm": 0.16037105023860931, "learning_rate": 5.6266661678215216e-05, "loss": 0.0114, "step": 9740 }, { "epoch": 5.456071628427532, "grad_norm": 0.15576882660388947, "learning_rate": 5.618463156346739e-05, "loss": 0.0138, "step": 9750 }, { "epoch": 5.461667599328483, "grad_norm": 0.24249835312366486, "learning_rate": 5.6102584540173006e-05, "loss": 0.0131, "step": 9760 }, { "epoch": 5.467263570229435, "grad_norm": 0.27811625599861145, "learning_rate": 5.602052083264555e-05, "loss": 0.0098, "step": 9770 }, { "epoch": 5.472859541130386, "grad_norm": 0.3673328459262848, "learning_rate": 5.5938440665244006e-05, "loss": 0.0131, "step": 9780 }, { "epoch": 5.478455512031338, "grad_norm": 0.2886298596858978, "learning_rate": 5.585634426237246e-05, "loss": 0.0141, "step": 9790 }, { "epoch": 5.484051482932289, "grad_norm": 0.2564665973186493, "learning_rate": 5.577423184847932e-05, "loss": 0.0104, "step": 9800 }, { "epoch": 5.48964745383324, "grad_norm": 0.22507299482822418, "learning_rate": 5.569210364805677e-05, "loss": 0.0116, "step": 9810 }, { "epoch": 5.495243424734191, "grad_norm": 0.09582646191120148, "learning_rate": 5.560995988564023e-05, "loss": 0.0107, "step": 9820 }, { "epoch": 5.5008393956351425, "grad_norm": 0.25511208176612854, "learning_rate": 5.552780078580756e-05, "loss": 0.0111, "step": 9830 }, { "epoch": 5.506435366536094, "grad_norm": 0.14793109893798828, "learning_rate": 5.544562657317863e-05, "loss": 0.0088, "step": 9840 }, { "epoch": 5.512031337437046, "grad_norm": 0.3215508759021759, "learning_rate": 5.5363437472414595e-05, "loss": 0.0132, "step": 9850 }, { "epoch": 5.517627308337997, "grad_norm": 0.357731431722641, "learning_rate": 5.52812337082173e-05, "loss": 0.0119, "step": 9860 }, { "epoch": 5.523223279238948, "grad_norm": 0.2520214915275574, "learning_rate": 5.519901550532871e-05, "loss": 0.0121, "step": 9870 }, { "epoch": 5.528819250139899, "grad_norm": 0.28353017568588257, "learning_rate": 5.511678308853026e-05, "loss": 0.0077, "step": 9880 }, { "epoch": 5.534415221040851, "grad_norm": 0.34384286403656006, "learning_rate": 5.5034536682642224e-05, "loss": 0.0125, "step": 9890 }, { "epoch": 5.540011191941802, "grad_norm": 0.21323193609714508, "learning_rate": 5.495227651252315e-05, "loss": 0.0121, "step": 9900 }, { "epoch": 5.545607162842753, "grad_norm": 0.3126833736896515, "learning_rate": 5.487000280306917e-05, "loss": 0.0125, "step": 9910 }, { "epoch": 5.551203133743704, "grad_norm": 0.29106199741363525, "learning_rate": 5.478771577921351e-05, "loss": 0.0098, "step": 9920 }, { "epoch": 5.556799104644655, "grad_norm": 0.2740892469882965, "learning_rate": 5.470541566592573e-05, "loss": 0.0135, "step": 9930 }, { "epoch": 5.5623950755456075, "grad_norm": 0.19003938138484955, "learning_rate": 5.462310268821118e-05, "loss": 0.0146, "step": 9940 }, { "epoch": 5.567991046446559, "grad_norm": 0.2251635491847992, "learning_rate": 5.454077707111042e-05, "loss": 0.0153, "step": 9950 }, { "epoch": 5.57358701734751, "grad_norm": 0.16961322724819183, "learning_rate": 5.445843903969854e-05, "loss": 0.0154, "step": 9960 }, { "epoch": 5.579182988248461, "grad_norm": 0.2752644419670105, "learning_rate": 5.4376088819084556e-05, "loss": 0.0102, "step": 9970 }, { "epoch": 5.584778959149412, "grad_norm": 0.24675792455673218, "learning_rate": 5.4293726634410855e-05, "loss": 0.0123, "step": 9980 }, { "epoch": 5.590374930050364, "grad_norm": 0.2074369490146637, "learning_rate": 5.4211352710852495e-05, "loss": 0.0095, "step": 9990 }, { "epoch": 5.595970900951315, "grad_norm": 0.22929449379444122, "learning_rate": 5.4128967273616625e-05, "loss": 0.0123, "step": 10000 }, { "epoch": 5.601566871852266, "grad_norm": 0.21107512712478638, "learning_rate": 5.404657054794189e-05, "loss": 0.01, "step": 10010 }, { "epoch": 5.607162842753217, "grad_norm": 0.3743564188480377, "learning_rate": 5.396416275909779e-05, "loss": 0.0173, "step": 10020 }, { "epoch": 5.612758813654169, "grad_norm": 0.19637951254844666, "learning_rate": 5.3881744132384104e-05, "loss": 0.0114, "step": 10030 }, { "epoch": 5.6183547845551205, "grad_norm": 0.2417994886636734, "learning_rate": 5.379931489313016e-05, "loss": 0.0117, "step": 10040 }, { "epoch": 5.623950755456072, "grad_norm": 0.18541017174720764, "learning_rate": 5.371687526669439e-05, "loss": 0.0139, "step": 10050 }, { "epoch": 5.629546726357023, "grad_norm": 0.26478803157806396, "learning_rate": 5.363442547846356e-05, "loss": 0.0108, "step": 10060 }, { "epoch": 5.635142697257974, "grad_norm": 0.23468017578125, "learning_rate": 5.355196575385225e-05, "loss": 0.0107, "step": 10070 }, { "epoch": 5.640738668158925, "grad_norm": 0.2251582145690918, "learning_rate": 5.3469496318302204e-05, "loss": 0.0105, "step": 10080 }, { "epoch": 5.6463346390598765, "grad_norm": 0.18580631911754608, "learning_rate": 5.3387017397281704e-05, "loss": 0.0107, "step": 10090 }, { "epoch": 5.651930609960829, "grad_norm": 0.14670825004577637, "learning_rate": 5.330452921628497e-05, "loss": 0.0103, "step": 10100 }, { "epoch": 5.65752658086178, "grad_norm": 0.22916555404663086, "learning_rate": 5.322203200083154e-05, "loss": 0.0113, "step": 10110 }, { "epoch": 5.663122551762731, "grad_norm": 0.1360463947057724, "learning_rate": 5.313952597646568e-05, "loss": 0.0121, "step": 10120 }, { "epoch": 5.668718522663682, "grad_norm": 0.24525059759616852, "learning_rate": 5.305701136875566e-05, "loss": 0.0092, "step": 10130 }, { "epoch": 5.6743144935646335, "grad_norm": 0.1451522707939148, "learning_rate": 5.297448840329329e-05, "loss": 0.0081, "step": 10140 }, { "epoch": 5.679910464465585, "grad_norm": 0.1923244744539261, "learning_rate": 5.2891957305693205e-05, "loss": 0.0117, "step": 10150 }, { "epoch": 5.685506435366536, "grad_norm": 0.18804806470870972, "learning_rate": 5.280941830159227e-05, "loss": 0.0095, "step": 10160 }, { "epoch": 5.691102406267487, "grad_norm": 0.1880972534418106, "learning_rate": 5.2726871616649e-05, "loss": 0.0111, "step": 10170 }, { "epoch": 5.696698377168438, "grad_norm": 0.18024373054504395, "learning_rate": 5.264431747654284e-05, "loss": 0.0119, "step": 10180 }, { "epoch": 5.70229434806939, "grad_norm": 0.16494502127170563, "learning_rate": 5.2561756106973656e-05, "loss": 0.0131, "step": 10190 }, { "epoch": 5.707890318970342, "grad_norm": 0.2051820605993271, "learning_rate": 5.247918773366112e-05, "loss": 0.0136, "step": 10200 }, { "epoch": 5.713486289871293, "grad_norm": 0.21385324001312256, "learning_rate": 5.2396612582343986e-05, "loss": 0.0101, "step": 10210 }, { "epoch": 5.719082260772244, "grad_norm": 0.2170487344264984, "learning_rate": 5.231403087877955e-05, "loss": 0.0107, "step": 10220 }, { "epoch": 5.724678231673195, "grad_norm": 0.23433655500411987, "learning_rate": 5.2231442848743064e-05, "loss": 0.0139, "step": 10230 }, { "epoch": 5.730274202574146, "grad_norm": 0.2549709379673004, "learning_rate": 5.214884871802703e-05, "loss": 0.0178, "step": 10240 }, { "epoch": 5.735870173475098, "grad_norm": 0.11975869536399841, "learning_rate": 5.2066248712440656e-05, "loss": 0.0101, "step": 10250 }, { "epoch": 5.74146614437605, "grad_norm": 0.39216071367263794, "learning_rate": 5.198364305780922e-05, "loss": 0.0131, "step": 10260 }, { "epoch": 5.747062115277, "grad_norm": 0.2390432357788086, "learning_rate": 5.1901031979973394e-05, "loss": 0.0097, "step": 10270 }, { "epoch": 5.752658086177952, "grad_norm": 0.1686331033706665, "learning_rate": 5.1818415704788725e-05, "loss": 0.0104, "step": 10280 }, { "epoch": 5.758254057078903, "grad_norm": 0.28812578320503235, "learning_rate": 5.1735794458124956e-05, "loss": 0.01, "step": 10290 }, { "epoch": 5.763850027979855, "grad_norm": 0.4722854197025299, "learning_rate": 5.165316846586541e-05, "loss": 0.0125, "step": 10300 }, { "epoch": 5.769445998880806, "grad_norm": 0.19151827692985535, "learning_rate": 5.157053795390642e-05, "loss": 0.0134, "step": 10310 }, { "epoch": 5.775041969781757, "grad_norm": 0.2533670961856842, "learning_rate": 5.148790314815663e-05, "loss": 0.011, "step": 10320 }, { "epoch": 5.780637940682708, "grad_norm": 0.1756027489900589, "learning_rate": 5.1405264274536445e-05, "loss": 0.0092, "step": 10330 }, { "epoch": 5.786233911583659, "grad_norm": 0.2753913700580597, "learning_rate": 5.132262155897739e-05, "loss": 0.0118, "step": 10340 }, { "epoch": 5.7918298824846115, "grad_norm": 0.17530974745750427, "learning_rate": 5.123997522742151e-05, "loss": 0.0092, "step": 10350 }, { "epoch": 5.797425853385563, "grad_norm": 0.3250185251235962, "learning_rate": 5.1157325505820694e-05, "loss": 0.0135, "step": 10360 }, { "epoch": 5.803021824286514, "grad_norm": 0.2266574651002884, "learning_rate": 5.107467262013614e-05, "loss": 0.0174, "step": 10370 }, { "epoch": 5.808617795187465, "grad_norm": 0.15442338585853577, "learning_rate": 5.0992016796337686e-05, "loss": 0.0112, "step": 10380 }, { "epoch": 5.814213766088416, "grad_norm": 0.16227369010448456, "learning_rate": 5.0909358260403186e-05, "loss": 0.0141, "step": 10390 }, { "epoch": 5.8198097369893675, "grad_norm": 0.288241982460022, "learning_rate": 5.0826697238317935e-05, "loss": 0.0142, "step": 10400 }, { "epoch": 5.825405707890319, "grad_norm": 0.17878948152065277, "learning_rate": 5.074403395607399e-05, "loss": 0.0115, "step": 10410 }, { "epoch": 5.83100167879127, "grad_norm": 0.2224341630935669, "learning_rate": 5.066136863966963e-05, "loss": 0.0106, "step": 10420 }, { "epoch": 5.836597649692221, "grad_norm": 0.1762062907218933, "learning_rate": 5.057870151510864e-05, "loss": 0.0115, "step": 10430 }, { "epoch": 5.842193620593173, "grad_norm": 0.15165816247463226, "learning_rate": 5.0496032808399815e-05, "loss": 0.0116, "step": 10440 }, { "epoch": 5.8477895914941245, "grad_norm": 0.23350821435451508, "learning_rate": 5.041336274555625e-05, "loss": 0.0124, "step": 10450 }, { "epoch": 5.853385562395076, "grad_norm": 0.3131781816482544, "learning_rate": 5.033069155259471e-05, "loss": 0.0136, "step": 10460 }, { "epoch": 5.858981533296027, "grad_norm": 0.25165101885795593, "learning_rate": 5.02480194555351e-05, "loss": 0.0081, "step": 10470 }, { "epoch": 5.864577504196978, "grad_norm": 0.17109723389148712, "learning_rate": 5.016534668039976e-05, "loss": 0.0104, "step": 10480 }, { "epoch": 5.870173475097929, "grad_norm": 0.14172928035259247, "learning_rate": 5.0082673453212914e-05, "loss": 0.0096, "step": 10490 }, { "epoch": 5.8757694459988805, "grad_norm": 0.15533624589443207, "learning_rate": 5e-05, "loss": 0.0075, "step": 10500 }, { "epoch": 5.881365416899833, "grad_norm": 0.12869463860988617, "learning_rate": 4.991732654678709e-05, "loss": 0.0114, "step": 10510 }, { "epoch": 5.886961387800784, "grad_norm": 0.3376826345920563, "learning_rate": 4.9834653319600246e-05, "loss": 0.0135, "step": 10520 }, { "epoch": 5.892557358701735, "grad_norm": 0.20675431191921234, "learning_rate": 4.975198054446492e-05, "loss": 0.0106, "step": 10530 }, { "epoch": 5.898153329602686, "grad_norm": 0.14309728145599365, "learning_rate": 4.96693084474053e-05, "loss": 0.0122, "step": 10540 }, { "epoch": 5.903749300503637, "grad_norm": 0.13042593002319336, "learning_rate": 4.9586637254443756e-05, "loss": 0.0114, "step": 10550 }, { "epoch": 5.909345271404589, "grad_norm": 0.14101748168468475, "learning_rate": 4.950396719160018e-05, "loss": 0.0104, "step": 10560 }, { "epoch": 5.91494124230554, "grad_norm": 0.22409436106681824, "learning_rate": 4.942129848489137e-05, "loss": 0.0109, "step": 10570 }, { "epoch": 5.920537213206491, "grad_norm": 0.22155794501304626, "learning_rate": 4.93386313603304e-05, "loss": 0.0091, "step": 10580 }, { "epoch": 5.926133184107442, "grad_norm": 0.1839323341846466, "learning_rate": 4.925596604392603e-05, "loss": 0.0086, "step": 10590 }, { "epoch": 5.931729155008394, "grad_norm": 0.1160067617893219, "learning_rate": 4.917330276168208e-05, "loss": 0.0103, "step": 10600 }, { "epoch": 5.937325125909346, "grad_norm": 0.2413625419139862, "learning_rate": 4.909064173959681e-05, "loss": 0.0117, "step": 10610 }, { "epoch": 5.942921096810297, "grad_norm": 0.19037237763404846, "learning_rate": 4.9007983203662326e-05, "loss": 0.011, "step": 10620 }, { "epoch": 5.948517067711248, "grad_norm": 0.17303366959095, "learning_rate": 4.892532737986387e-05, "loss": 0.0094, "step": 10630 }, { "epoch": 5.954113038612199, "grad_norm": 0.2476578801870346, "learning_rate": 4.884267449417931e-05, "loss": 0.0118, "step": 10640 }, { "epoch": 5.95970900951315, "grad_norm": 0.29616495966911316, "learning_rate": 4.87600247725785e-05, "loss": 0.0118, "step": 10650 }, { "epoch": 5.965304980414102, "grad_norm": 0.1653703898191452, "learning_rate": 4.867737844102261e-05, "loss": 0.0093, "step": 10660 }, { "epoch": 5.970900951315053, "grad_norm": 0.2089630663394928, "learning_rate": 4.8594735725463567e-05, "loss": 0.0113, "step": 10670 }, { "epoch": 5.976496922216004, "grad_norm": 0.14042207598686218, "learning_rate": 4.851209685184338e-05, "loss": 0.0091, "step": 10680 }, { "epoch": 5.982092893116956, "grad_norm": 0.17145408689975739, "learning_rate": 4.8429462046093585e-05, "loss": 0.0103, "step": 10690 }, { "epoch": 5.987688864017907, "grad_norm": 0.2082109898328781, "learning_rate": 4.834683153413459e-05, "loss": 0.0109, "step": 10700 }, { "epoch": 5.9932848349188586, "grad_norm": 0.3018309473991394, "learning_rate": 4.826420554187506e-05, "loss": 0.0125, "step": 10710 }, { "epoch": 5.99888080581981, "grad_norm": 0.1233690157532692, "learning_rate": 4.818158429521129e-05, "loss": 0.0093, "step": 10720 }, { "epoch": 6.004476776720761, "grad_norm": 0.226378932595253, "learning_rate": 4.809896802002662e-05, "loss": 0.0124, "step": 10730 }, { "epoch": 6.010072747621712, "grad_norm": 0.149214506149292, "learning_rate": 4.801635694219079e-05, "loss": 0.0105, "step": 10740 }, { "epoch": 6.015668718522663, "grad_norm": 0.35911405086517334, "learning_rate": 4.7933751287559335e-05, "loss": 0.0097, "step": 10750 }, { "epoch": 6.021264689423615, "grad_norm": 0.3472690284252167, "learning_rate": 4.785115128197298e-05, "loss": 0.0115, "step": 10760 }, { "epoch": 6.026860660324567, "grad_norm": 0.1740999072790146, "learning_rate": 4.776855715125694e-05, "loss": 0.0088, "step": 10770 }, { "epoch": 6.032456631225518, "grad_norm": 0.22089268267154694, "learning_rate": 4.7685969121220456e-05, "loss": 0.0087, "step": 10780 }, { "epoch": 6.038052602126469, "grad_norm": 0.17993643879890442, "learning_rate": 4.7603387417656026e-05, "loss": 0.0086, "step": 10790 }, { "epoch": 6.04364857302742, "grad_norm": 0.3000619113445282, "learning_rate": 4.7520812266338885e-05, "loss": 0.0117, "step": 10800 }, { "epoch": 6.0492445439283715, "grad_norm": 0.16510385274887085, "learning_rate": 4.743824389302635e-05, "loss": 0.0098, "step": 10810 }, { "epoch": 6.054840514829323, "grad_norm": 0.17736104130744934, "learning_rate": 4.735568252345718e-05, "loss": 0.0111, "step": 10820 }, { "epoch": 6.060436485730274, "grad_norm": 0.17262353003025055, "learning_rate": 4.7273128383351015e-05, "loss": 0.0075, "step": 10830 }, { "epoch": 6.066032456631225, "grad_norm": 0.15096010267734528, "learning_rate": 4.7190581698407725e-05, "loss": 0.0086, "step": 10840 }, { "epoch": 6.071628427532177, "grad_norm": 0.16276976466178894, "learning_rate": 4.710804269430681e-05, "loss": 0.0102, "step": 10850 }, { "epoch": 6.0772243984331284, "grad_norm": 0.42808446288108826, "learning_rate": 4.702551159670672e-05, "loss": 0.0094, "step": 10860 }, { "epoch": 6.08282036933408, "grad_norm": 0.17846183478832245, "learning_rate": 4.694298863124435e-05, "loss": 0.0092, "step": 10870 }, { "epoch": 6.088416340235031, "grad_norm": 0.2053506076335907, "learning_rate": 4.6860474023534335e-05, "loss": 0.0086, "step": 10880 }, { "epoch": 6.094012311135982, "grad_norm": 0.2614595592021942, "learning_rate": 4.677796799916845e-05, "loss": 0.017, "step": 10890 }, { "epoch": 6.099608282036933, "grad_norm": 0.2127176970243454, "learning_rate": 4.669547078371504e-05, "loss": 0.014, "step": 10900 }, { "epoch": 6.1052042529378845, "grad_norm": 0.2204008847475052, "learning_rate": 4.66129826027183e-05, "loss": 0.0116, "step": 10910 }, { "epoch": 6.110800223838836, "grad_norm": 0.3794216215610504, "learning_rate": 4.65305036816978e-05, "loss": 0.0112, "step": 10920 }, { "epoch": 6.116396194739787, "grad_norm": 0.22125349938869476, "learning_rate": 4.6448034246147754e-05, "loss": 0.0086, "step": 10930 }, { "epoch": 6.121992165640739, "grad_norm": 0.21079552173614502, "learning_rate": 4.6365574521536445e-05, "loss": 0.0118, "step": 10940 }, { "epoch": 6.12758813654169, "grad_norm": 0.17766894400119781, "learning_rate": 4.6283124733305624e-05, "loss": 0.007, "step": 10950 }, { "epoch": 6.133184107442641, "grad_norm": 0.23495835065841675, "learning_rate": 4.620068510686985e-05, "loss": 0.0092, "step": 10960 }, { "epoch": 6.138780078343593, "grad_norm": 0.25509214401245117, "learning_rate": 4.611825586761591e-05, "loss": 0.0098, "step": 10970 }, { "epoch": 6.144376049244544, "grad_norm": 0.2415831834077835, "learning_rate": 4.60358372409022e-05, "loss": 0.0105, "step": 10980 }, { "epoch": 6.149972020145495, "grad_norm": 0.1638316661119461, "learning_rate": 4.5953429452058135e-05, "loss": 0.0092, "step": 10990 }, { "epoch": 6.155567991046446, "grad_norm": 0.17809127271175385, "learning_rate": 4.5871032726383386e-05, "loss": 0.0089, "step": 11000 }, { "epoch": 6.1611639619473975, "grad_norm": 0.22080188989639282, "learning_rate": 4.5788647289147516e-05, "loss": 0.008, "step": 11010 }, { "epoch": 6.16675993284835, "grad_norm": 0.19198036193847656, "learning_rate": 4.570627336558915e-05, "loss": 0.0099, "step": 11020 }, { "epoch": 6.172355903749301, "grad_norm": 0.1567138433456421, "learning_rate": 4.562391118091544e-05, "loss": 0.0081, "step": 11030 }, { "epoch": 6.177951874650252, "grad_norm": 0.10507390648126602, "learning_rate": 4.554156096030149e-05, "loss": 0.0068, "step": 11040 }, { "epoch": 6.183547845551203, "grad_norm": 0.2201065570116043, "learning_rate": 4.545922292888959e-05, "loss": 0.0111, "step": 11050 }, { "epoch": 6.189143816452154, "grad_norm": 0.2924385666847229, "learning_rate": 4.537689731178883e-05, "loss": 0.0198, "step": 11060 }, { "epoch": 6.194739787353106, "grad_norm": 0.18973895907402039, "learning_rate": 4.529458433407429e-05, "loss": 0.0113, "step": 11070 }, { "epoch": 6.200335758254057, "grad_norm": 0.2131788432598114, "learning_rate": 4.5212284220786494e-05, "loss": 0.0093, "step": 11080 }, { "epoch": 6.205931729155008, "grad_norm": 0.17389249801635742, "learning_rate": 4.5129997196930845e-05, "loss": 0.0066, "step": 11090 }, { "epoch": 6.21152770005596, "grad_norm": 0.21684075891971588, "learning_rate": 4.504772348747687e-05, "loss": 0.0071, "step": 11100 }, { "epoch": 6.217123670956911, "grad_norm": 0.19866231083869934, "learning_rate": 4.496546331735778e-05, "loss": 0.0096, "step": 11110 }, { "epoch": 6.2227196418578625, "grad_norm": 0.19832220673561096, "learning_rate": 4.488321691146975e-05, "loss": 0.0068, "step": 11120 }, { "epoch": 6.228315612758814, "grad_norm": 0.12977780401706696, "learning_rate": 4.480098449467132e-05, "loss": 0.0089, "step": 11130 }, { "epoch": 6.233911583659765, "grad_norm": 0.32740047574043274, "learning_rate": 4.471876629178273e-05, "loss": 0.0092, "step": 11140 }, { "epoch": 6.239507554560716, "grad_norm": 0.12163751572370529, "learning_rate": 4.463656252758542e-05, "loss": 0.0089, "step": 11150 }, { "epoch": 6.245103525461667, "grad_norm": 0.21914434432983398, "learning_rate": 4.4554373426821374e-05, "loss": 0.0084, "step": 11160 }, { "epoch": 6.250699496362619, "grad_norm": 0.23196600377559662, "learning_rate": 4.447219921419244e-05, "loss": 0.0095, "step": 11170 }, { "epoch": 6.25629546726357, "grad_norm": 0.19451774656772614, "learning_rate": 4.439004011435979e-05, "loss": 0.01, "step": 11180 }, { "epoch": 6.261891438164522, "grad_norm": 0.20714877545833588, "learning_rate": 4.430789635194324e-05, "loss": 0.0124, "step": 11190 }, { "epoch": 6.267487409065473, "grad_norm": 0.1735510528087616, "learning_rate": 4.4225768151520694e-05, "loss": 0.0089, "step": 11200 }, { "epoch": 6.273083379966424, "grad_norm": 0.2282591164112091, "learning_rate": 4.414365573762755e-05, "loss": 0.0166, "step": 11210 }, { "epoch": 6.2786793508673755, "grad_norm": 0.2207183688879013, "learning_rate": 4.406155933475599e-05, "loss": 0.0089, "step": 11220 }, { "epoch": 6.284275321768327, "grad_norm": 0.252380907535553, "learning_rate": 4.3979479167354477e-05, "loss": 0.0111, "step": 11230 }, { "epoch": 6.289871292669278, "grad_norm": 0.18762193620204926, "learning_rate": 4.3897415459827e-05, "loss": 0.0099, "step": 11240 }, { "epoch": 6.295467263570229, "grad_norm": 0.15788224339485168, "learning_rate": 4.381536843653262e-05, "loss": 0.0086, "step": 11250 }, { "epoch": 6.301063234471181, "grad_norm": 0.22205393016338348, "learning_rate": 4.373333832178478e-05, "loss": 0.0081, "step": 11260 }, { "epoch": 6.306659205372132, "grad_norm": 0.2042773962020874, "learning_rate": 4.365132533985071e-05, "loss": 0.0112, "step": 11270 }, { "epoch": 6.312255176273084, "grad_norm": 0.15884517133235931, "learning_rate": 4.3569329714950704e-05, "loss": 0.011, "step": 11280 }, { "epoch": 6.317851147174035, "grad_norm": 0.1604417860507965, "learning_rate": 4.348735167125771e-05, "loss": 0.0126, "step": 11290 }, { "epoch": 6.323447118074986, "grad_norm": 0.1566859632730484, "learning_rate": 4.3405391432896555e-05, "loss": 0.0078, "step": 11300 }, { "epoch": 6.329043088975937, "grad_norm": 0.2835988700389862, "learning_rate": 4.3323449223943416e-05, "loss": 0.0096, "step": 11310 }, { "epoch": 6.3346390598768885, "grad_norm": 0.2758636772632599, "learning_rate": 4.324152526842517e-05, "loss": 0.0118, "step": 11320 }, { "epoch": 6.34023503077784, "grad_norm": 0.09336747974157333, "learning_rate": 4.315961979031875e-05, "loss": 0.0111, "step": 11330 }, { "epoch": 6.345831001678791, "grad_norm": 0.16241887211799622, "learning_rate": 4.307773301355062e-05, "loss": 0.0106, "step": 11340 }, { "epoch": 6.351426972579743, "grad_norm": 0.20391559600830078, "learning_rate": 4.2995865161996105e-05, "loss": 0.0081, "step": 11350 }, { "epoch": 6.357022943480694, "grad_norm": 0.12543804943561554, "learning_rate": 4.291401645947879e-05, "loss": 0.0137, "step": 11360 }, { "epoch": 6.362618914381645, "grad_norm": 0.24983376264572144, "learning_rate": 4.283218712976992e-05, "loss": 0.0095, "step": 11370 }, { "epoch": 6.368214885282597, "grad_norm": 0.2291889637708664, "learning_rate": 4.275037739658771e-05, "loss": 0.0113, "step": 11380 }, { "epoch": 6.373810856183548, "grad_norm": 0.1601787656545639, "learning_rate": 4.2668587483596864e-05, "loss": 0.0128, "step": 11390 }, { "epoch": 6.379406827084499, "grad_norm": 0.14628605544567108, "learning_rate": 4.2586817614407895e-05, "loss": 0.0076, "step": 11400 }, { "epoch": 6.38500279798545, "grad_norm": 0.16742217540740967, "learning_rate": 4.250506801257653e-05, "loss": 0.0104, "step": 11410 }, { "epoch": 6.390598768886401, "grad_norm": 0.20203527808189392, "learning_rate": 4.2423338901602985e-05, "loss": 0.0112, "step": 11420 }, { "epoch": 6.396194739787353, "grad_norm": 0.2605644762516022, "learning_rate": 4.234163050493158e-05, "loss": 0.0166, "step": 11430 }, { "epoch": 6.401790710688305, "grad_norm": 0.22104188799858093, "learning_rate": 4.2259943045949934e-05, "loss": 0.0069, "step": 11440 }, { "epoch": 6.407386681589256, "grad_norm": 0.2080865204334259, "learning_rate": 4.2178276747988446e-05, "loss": 0.0136, "step": 11450 }, { "epoch": 6.412982652490207, "grad_norm": 0.22961939871311188, "learning_rate": 4.209663183431969e-05, "loss": 0.0184, "step": 11460 }, { "epoch": 6.418578623391158, "grad_norm": 0.3134923577308655, "learning_rate": 4.201500852815768e-05, "loss": 0.0108, "step": 11470 }, { "epoch": 6.42417459429211, "grad_norm": 0.11267667263746262, "learning_rate": 4.1933407052657456e-05, "loss": 0.0113, "step": 11480 }, { "epoch": 6.429770565193061, "grad_norm": 0.11718063056468964, "learning_rate": 4.1851827630914305e-05, "loss": 0.0069, "step": 11490 }, { "epoch": 6.435366536094012, "grad_norm": 0.15294240415096283, "learning_rate": 4.17702704859633e-05, "loss": 0.0087, "step": 11500 }, { "epoch": 6.440962506994964, "grad_norm": 0.16003765165805817, "learning_rate": 4.1688735840778546e-05, "loss": 0.0087, "step": 11510 }, { "epoch": 6.446558477895915, "grad_norm": 0.28345319628715515, "learning_rate": 4.160722391827262e-05, "loss": 0.0119, "step": 11520 }, { "epoch": 6.4521544487968665, "grad_norm": 0.18619926273822784, "learning_rate": 4.1525734941296026e-05, "loss": 0.01, "step": 11530 }, { "epoch": 6.457750419697818, "grad_norm": 0.1567833423614502, "learning_rate": 4.14442691326365e-05, "loss": 0.0089, "step": 11540 }, { "epoch": 6.463346390598769, "grad_norm": 0.16688846051692963, "learning_rate": 4.13628267150185e-05, "loss": 0.0078, "step": 11550 }, { "epoch": 6.46894236149972, "grad_norm": 0.19638372957706451, "learning_rate": 4.1281407911102425e-05, "loss": 0.0119, "step": 11560 }, { "epoch": 6.474538332400671, "grad_norm": 0.13919275999069214, "learning_rate": 4.120001294348421e-05, "loss": 0.0105, "step": 11570 }, { "epoch": 6.4801343033016225, "grad_norm": 0.17611968517303467, "learning_rate": 4.111864203469457e-05, "loss": 0.0145, "step": 11580 }, { "epoch": 6.485730274202574, "grad_norm": 0.15707933902740479, "learning_rate": 4.103729540719847e-05, "loss": 0.0088, "step": 11590 }, { "epoch": 6.491326245103526, "grad_norm": 0.16832014918327332, "learning_rate": 4.095597328339452e-05, "loss": 0.0087, "step": 11600 }, { "epoch": 6.496922216004477, "grad_norm": 0.16573460400104523, "learning_rate": 4.087467588561424e-05, "loss": 0.0085, "step": 11610 }, { "epoch": 6.502518186905428, "grad_norm": 0.16878801584243774, "learning_rate": 4.079340343612165e-05, "loss": 0.0081, "step": 11620 }, { "epoch": 6.5081141578063795, "grad_norm": 0.10650831460952759, "learning_rate": 4.07121561571125e-05, "loss": 0.0088, "step": 11630 }, { "epoch": 6.513710128707331, "grad_norm": 0.15549488365650177, "learning_rate": 4.063093427071376e-05, "loss": 0.008, "step": 11640 }, { "epoch": 6.519306099608282, "grad_norm": 0.17358443140983582, "learning_rate": 4.0549737998983e-05, "loss": 0.0133, "step": 11650 }, { "epoch": 6.524902070509233, "grad_norm": 0.24347983300685883, "learning_rate": 4.046856756390767e-05, "loss": 0.0123, "step": 11660 }, { "epoch": 6.530498041410184, "grad_norm": 0.31662797927856445, "learning_rate": 4.038742318740465e-05, "loss": 0.0108, "step": 11670 }, { "epoch": 6.5360940123111355, "grad_norm": 0.21490415930747986, "learning_rate": 4.0306305091319595e-05, "loss": 0.0116, "step": 11680 }, { "epoch": 6.541689983212088, "grad_norm": 0.10896732658147812, "learning_rate": 4.0225213497426276e-05, "loss": 0.0088, "step": 11690 }, { "epoch": 6.547285954113039, "grad_norm": 0.22287431359291077, "learning_rate": 4.0144148627425993e-05, "loss": 0.0157, "step": 11700 }, { "epoch": 6.55288192501399, "grad_norm": 0.2492447942495346, "learning_rate": 4.006311070294702e-05, "loss": 0.0155, "step": 11710 }, { "epoch": 6.558477895914941, "grad_norm": 0.09591550379991531, "learning_rate": 3.9982099945543945e-05, "loss": 0.0076, "step": 11720 }, { "epoch": 6.564073866815892, "grad_norm": 0.21364928781986237, "learning_rate": 3.9901116576697083e-05, "loss": 0.0109, "step": 11730 }, { "epoch": 6.569669837716844, "grad_norm": 0.2347889095544815, "learning_rate": 3.982016081781189e-05, "loss": 0.009, "step": 11740 }, { "epoch": 6.575265808617795, "grad_norm": 0.07959645986557007, "learning_rate": 3.973923289021829e-05, "loss": 0.007, "step": 11750 }, { "epoch": 6.580861779518747, "grad_norm": 0.18356555700302124, "learning_rate": 3.965833301517017e-05, "loss": 0.014, "step": 11760 }, { "epoch": 6.586457750419698, "grad_norm": 0.16104575991630554, "learning_rate": 3.9577461413844684e-05, "loss": 0.0159, "step": 11770 }, { "epoch": 6.592053721320649, "grad_norm": 0.2652454972267151, "learning_rate": 3.949661830734172e-05, "loss": 0.0103, "step": 11780 }, { "epoch": 6.597649692221601, "grad_norm": 0.29040461778640747, "learning_rate": 3.9415803916683224e-05, "loss": 0.0077, "step": 11790 }, { "epoch": 6.603245663122552, "grad_norm": 0.3047587275505066, "learning_rate": 3.933501846281267e-05, "loss": 0.0137, "step": 11800 }, { "epoch": 6.608841634023503, "grad_norm": 0.15864235162734985, "learning_rate": 3.925426216659438e-05, "loss": 0.0097, "step": 11810 }, { "epoch": 6.614437604924454, "grad_norm": 0.20918135344982147, "learning_rate": 3.917353524881302e-05, "loss": 0.008, "step": 11820 }, { "epoch": 6.620033575825405, "grad_norm": 0.17880207300186157, "learning_rate": 3.9092837930172884e-05, "loss": 0.0119, "step": 11830 }, { "epoch": 6.625629546726357, "grad_norm": 0.16844668984413147, "learning_rate": 3.901217043129735e-05, "loss": 0.0092, "step": 11840 }, { "epoch": 6.631225517627309, "grad_norm": 0.2069406360387802, "learning_rate": 3.8931532972728285e-05, "loss": 0.0116, "step": 11850 }, { "epoch": 6.63682148852826, "grad_norm": 0.2709522843360901, "learning_rate": 3.8850925774925425e-05, "loss": 0.0076, "step": 11860 }, { "epoch": 6.642417459429211, "grad_norm": 0.16224393248558044, "learning_rate": 3.877034905826577e-05, "loss": 0.0099, "step": 11870 }, { "epoch": 6.648013430330162, "grad_norm": 0.238708034157753, "learning_rate": 3.8689803043043e-05, "loss": 0.0073, "step": 11880 }, { "epoch": 6.6536094012311136, "grad_norm": 0.12267536669969559, "learning_rate": 3.860928794946682e-05, "loss": 0.0086, "step": 11890 }, { "epoch": 6.659205372132065, "grad_norm": 0.1931445449590683, "learning_rate": 3.852880399766243e-05, "loss": 0.0098, "step": 11900 }, { "epoch": 6.664801343033016, "grad_norm": 0.23762571811676025, "learning_rate": 3.844835140766988e-05, "loss": 0.0091, "step": 11910 }, { "epoch": 6.670397313933967, "grad_norm": 0.1977052241563797, "learning_rate": 3.836793039944349e-05, "loss": 0.0079, "step": 11920 }, { "epoch": 6.675993284834918, "grad_norm": 0.10921810567378998, "learning_rate": 3.828754119285123e-05, "loss": 0.0072, "step": 11930 }, { "epoch": 6.6815892557358705, "grad_norm": 0.2423611879348755, "learning_rate": 3.820718400767409e-05, "loss": 0.0119, "step": 11940 }, { "epoch": 6.687185226636822, "grad_norm": 0.19429948925971985, "learning_rate": 3.812685906360557e-05, "loss": 0.0081, "step": 11950 }, { "epoch": 6.692781197537773, "grad_norm": 0.104859858751297, "learning_rate": 3.8046566580251e-05, "loss": 0.0064, "step": 11960 }, { "epoch": 6.698377168438724, "grad_norm": 0.11694277077913284, "learning_rate": 3.796630677712697e-05, "loss": 0.0086, "step": 11970 }, { "epoch": 6.703973139339675, "grad_norm": 0.2368919551372528, "learning_rate": 3.788607987366069e-05, "loss": 0.0059, "step": 11980 }, { "epoch": 6.7095691102406265, "grad_norm": 0.20411504805088043, "learning_rate": 3.780588608918947e-05, "loss": 0.0133, "step": 11990 }, { "epoch": 6.715165081141578, "grad_norm": 0.11036452651023865, "learning_rate": 3.772572564296005e-05, "loss": 0.0085, "step": 12000 }, { "epoch": 6.72076105204253, "grad_norm": 0.09863012284040451, "learning_rate": 3.764559875412803e-05, "loss": 0.0064, "step": 12010 }, { "epoch": 6.726357022943481, "grad_norm": 0.12064427882432938, "learning_rate": 3.756550564175727e-05, "loss": 0.009, "step": 12020 }, { "epoch": 6.731952993844432, "grad_norm": 0.11138517409563065, "learning_rate": 3.748544652481927e-05, "loss": 0.0082, "step": 12030 }, { "epoch": 6.7375489647453835, "grad_norm": 0.1209891140460968, "learning_rate": 3.74054216221926e-05, "loss": 0.0074, "step": 12040 }, { "epoch": 6.743144935646335, "grad_norm": 0.22739742696285248, "learning_rate": 3.73254311526623e-05, "loss": 0.0082, "step": 12050 }, { "epoch": 6.748740906547286, "grad_norm": 0.19938482344150543, "learning_rate": 3.7245475334919246e-05, "loss": 0.0087, "step": 12060 }, { "epoch": 6.754336877448237, "grad_norm": 0.18825367093086243, "learning_rate": 3.716555438755961e-05, "loss": 0.0091, "step": 12070 }, { "epoch": 6.759932848349188, "grad_norm": 0.18540059030056, "learning_rate": 3.7085668529084184e-05, "loss": 0.0096, "step": 12080 }, { "epoch": 6.7655288192501395, "grad_norm": 0.11188949644565582, "learning_rate": 3.700581797789786e-05, "loss": 0.0081, "step": 12090 }, { "epoch": 6.771124790151092, "grad_norm": 0.09911153465509415, "learning_rate": 3.6926002952309016e-05, "loss": 0.0065, "step": 12100 }, { "epoch": 6.776720761052043, "grad_norm": 0.2001970112323761, "learning_rate": 3.684622367052887e-05, "loss": 0.007, "step": 12110 }, { "epoch": 6.782316731952994, "grad_norm": 0.256001740694046, "learning_rate": 3.676648035067093e-05, "loss": 0.0101, "step": 12120 }, { "epoch": 6.787912702853945, "grad_norm": 0.16810284554958344, "learning_rate": 3.6686773210750385e-05, "loss": 0.0084, "step": 12130 }, { "epoch": 6.793508673754896, "grad_norm": 0.21629579365253448, "learning_rate": 3.6607102468683526e-05, "loss": 0.0066, "step": 12140 }, { "epoch": 6.799104644655848, "grad_norm": 0.2616669237613678, "learning_rate": 3.65274683422871e-05, "loss": 0.0111, "step": 12150 }, { "epoch": 6.804700615556799, "grad_norm": 0.18898139894008636, "learning_rate": 3.6447871049277796e-05, "loss": 0.0103, "step": 12160 }, { "epoch": 6.81029658645775, "grad_norm": 0.20177505910396576, "learning_rate": 3.636831080727154e-05, "loss": 0.0064, "step": 12170 }, { "epoch": 6.815892557358701, "grad_norm": 0.18514911830425262, "learning_rate": 3.628878783378302e-05, "loss": 0.0118, "step": 12180 }, { "epoch": 6.821488528259653, "grad_norm": 0.25894469022750854, "learning_rate": 3.6209302346225006e-05, "loss": 0.0083, "step": 12190 }, { "epoch": 6.827084499160605, "grad_norm": 0.16605038940906525, "learning_rate": 3.612985456190778e-05, "loss": 0.0049, "step": 12200 }, { "epoch": 6.832680470061556, "grad_norm": 0.17524683475494385, "learning_rate": 3.605044469803854e-05, "loss": 0.0066, "step": 12210 }, { "epoch": 6.838276440962507, "grad_norm": 0.10738332569599152, "learning_rate": 3.597107297172084e-05, "loss": 0.0087, "step": 12220 }, { "epoch": 6.843872411863458, "grad_norm": 0.19934684038162231, "learning_rate": 3.5891739599953945e-05, "loss": 0.009, "step": 12230 }, { "epoch": 6.849468382764409, "grad_norm": 0.12639135122299194, "learning_rate": 3.581244479963225e-05, "loss": 0.0092, "step": 12240 }, { "epoch": 6.855064353665361, "grad_norm": 0.1152096539735794, "learning_rate": 3.5733188787544745e-05, "loss": 0.007, "step": 12250 }, { "epoch": 6.860660324566313, "grad_norm": 0.2878243625164032, "learning_rate": 3.5653971780374295e-05, "loss": 0.0096, "step": 12260 }, { "epoch": 6.866256295467264, "grad_norm": 0.2725951075553894, "learning_rate": 3.557479399469721e-05, "loss": 0.0081, "step": 12270 }, { "epoch": 6.871852266368215, "grad_norm": 0.16931770741939545, "learning_rate": 3.5495655646982505e-05, "loss": 0.0085, "step": 12280 }, { "epoch": 6.877448237269166, "grad_norm": 0.11503436416387558, "learning_rate": 3.541655695359142e-05, "loss": 0.0062, "step": 12290 }, { "epoch": 6.8830442081701175, "grad_norm": 0.18025194108486176, "learning_rate": 3.533749813077677e-05, "loss": 0.0082, "step": 12300 }, { "epoch": 6.888640179071069, "grad_norm": 0.1392613649368286, "learning_rate": 3.525847939468233e-05, "loss": 0.0086, "step": 12310 }, { "epoch": 6.89423614997202, "grad_norm": 0.2620909512042999, "learning_rate": 3.517950096134232e-05, "loss": 0.0108, "step": 12320 }, { "epoch": 6.899832120872971, "grad_norm": 0.12296637147665024, "learning_rate": 3.5100563046680764e-05, "loss": 0.008, "step": 12330 }, { "epoch": 6.905428091773922, "grad_norm": 0.13329119980335236, "learning_rate": 3.5021665866510925e-05, "loss": 0.0104, "step": 12340 }, { "epoch": 6.9110240626748745, "grad_norm": 0.18710525333881378, "learning_rate": 3.494280963653463e-05, "loss": 0.0096, "step": 12350 }, { "epoch": 6.916620033575826, "grad_norm": 0.199269637465477, "learning_rate": 3.4863994572341843e-05, "loss": 0.0098, "step": 12360 }, { "epoch": 6.922216004476777, "grad_norm": 0.24953125417232513, "learning_rate": 3.478522088940993e-05, "loss": 0.01, "step": 12370 }, { "epoch": 6.927811975377728, "grad_norm": 0.1573137789964676, "learning_rate": 3.470648880310313e-05, "loss": 0.0119, "step": 12380 }, { "epoch": 6.933407946278679, "grad_norm": 0.24244867265224457, "learning_rate": 3.462779852867197e-05, "loss": 0.0129, "step": 12390 }, { "epoch": 6.9390039171796305, "grad_norm": 0.12841010093688965, "learning_rate": 3.4549150281252636e-05, "loss": 0.0074, "step": 12400 }, { "epoch": 6.944599888080582, "grad_norm": 0.17973212897777557, "learning_rate": 3.447054427586644e-05, "loss": 0.0084, "step": 12410 }, { "epoch": 6.950195858981533, "grad_norm": 0.2083815336227417, "learning_rate": 3.439198072741921e-05, "loss": 0.0096, "step": 12420 }, { "epoch": 6.955791829882484, "grad_norm": 0.21580283343791962, "learning_rate": 3.431345985070067e-05, "loss": 0.009, "step": 12430 }, { "epoch": 6.961387800783436, "grad_norm": 0.22562581300735474, "learning_rate": 3.423498186038393e-05, "loss": 0.0105, "step": 12440 }, { "epoch": 6.966983771684387, "grad_norm": 0.19070309400558472, "learning_rate": 3.4156546971024784e-05, "loss": 0.0074, "step": 12450 }, { "epoch": 6.972579742585339, "grad_norm": 0.2400059998035431, "learning_rate": 3.407815539706124e-05, "loss": 0.0102, "step": 12460 }, { "epoch": 6.97817571348629, "grad_norm": 0.13252539932727814, "learning_rate": 3.399980735281286e-05, "loss": 0.0066, "step": 12470 }, { "epoch": 6.983771684387241, "grad_norm": 0.2826622426509857, "learning_rate": 3.392150305248024e-05, "loss": 0.0103, "step": 12480 }, { "epoch": 6.989367655288192, "grad_norm": 0.2674136757850647, "learning_rate": 3.384324271014429e-05, "loss": 0.0089, "step": 12490 }, { "epoch": 6.9949636261891435, "grad_norm": 0.09753147512674332, "learning_rate": 3.3765026539765834e-05, "loss": 0.0126, "step": 12500 }, { "epoch": 7.000559597090096, "grad_norm": 0.13642564415931702, "learning_rate": 3.368685475518488e-05, "loss": 0.01, "step": 12510 }, { "epoch": 7.006155567991047, "grad_norm": 0.2658902704715729, "learning_rate": 3.360872757012011e-05, "loss": 0.0168, "step": 12520 }, { "epoch": 7.011751538891998, "grad_norm": 0.12951083481311798, "learning_rate": 3.3530645198168295e-05, "loss": 0.0081, "step": 12530 }, { "epoch": 7.017347509792949, "grad_norm": 0.23773689568042755, "learning_rate": 3.3452607852803584e-05, "loss": 0.0082, "step": 12540 }, { "epoch": 7.0229434806939, "grad_norm": 0.21580462157726288, "learning_rate": 3.337461574737716e-05, "loss": 0.0106, "step": 12550 }, { "epoch": 7.028539451594852, "grad_norm": 0.15399706363677979, "learning_rate": 3.329666909511645e-05, "loss": 0.0103, "step": 12560 }, { "epoch": 7.034135422495803, "grad_norm": 0.21200086176395416, "learning_rate": 3.321876810912461e-05, "loss": 0.0141, "step": 12570 }, { "epoch": 7.039731393396754, "grad_norm": 0.2530173063278198, "learning_rate": 3.3140913002379995e-05, "loss": 0.0101, "step": 12580 }, { "epoch": 7.045327364297705, "grad_norm": 0.16888059675693512, "learning_rate": 3.3063103987735433e-05, "loss": 0.0068, "step": 12590 }, { "epoch": 7.050923335198657, "grad_norm": 0.213544562458992, "learning_rate": 3.298534127791785e-05, "loss": 0.0099, "step": 12600 }, { "epoch": 7.0565193060996085, "grad_norm": 0.2427508383989334, "learning_rate": 3.2907625085527503e-05, "loss": 0.0078, "step": 12610 }, { "epoch": 7.06211527700056, "grad_norm": 0.3301132023334503, "learning_rate": 3.282995562303754e-05, "loss": 0.0091, "step": 12620 }, { "epoch": 7.067711247901511, "grad_norm": 0.15243375301361084, "learning_rate": 3.275233310279321e-05, "loss": 0.0058, "step": 12630 }, { "epoch": 7.073307218802462, "grad_norm": 0.14671820402145386, "learning_rate": 3.267475773701161e-05, "loss": 0.0062, "step": 12640 }, { "epoch": 7.078903189703413, "grad_norm": 0.22168104350566864, "learning_rate": 3.2597229737780774e-05, "loss": 0.0079, "step": 12650 }, { "epoch": 7.084499160604365, "grad_norm": 0.25640955567359924, "learning_rate": 3.251974931705933e-05, "loss": 0.0085, "step": 12660 }, { "epoch": 7.090095131505316, "grad_norm": 0.2436077892780304, "learning_rate": 3.244231668667578e-05, "loss": 0.0078, "step": 12670 }, { "epoch": 7.095691102406268, "grad_norm": 0.19463610649108887, "learning_rate": 3.236493205832795e-05, "loss": 0.0066, "step": 12680 }, { "epoch": 7.101287073307219, "grad_norm": 0.22004422545433044, "learning_rate": 3.228759564358248e-05, "loss": 0.0078, "step": 12690 }, { "epoch": 7.10688304420817, "grad_norm": 0.1793327033519745, "learning_rate": 3.221030765387417e-05, "loss": 0.0059, "step": 12700 }, { "epoch": 7.1124790151091215, "grad_norm": 0.2823750376701355, "learning_rate": 3.2133068300505455e-05, "loss": 0.0072, "step": 12710 }, { "epoch": 7.118074986010073, "grad_norm": 0.3006185293197632, "learning_rate": 3.205587779464576e-05, "loss": 0.0099, "step": 12720 }, { "epoch": 7.123670956911024, "grad_norm": 0.15955254435539246, "learning_rate": 3.197873634733096e-05, "loss": 0.01, "step": 12730 }, { "epoch": 7.129266927811975, "grad_norm": 0.3392355442047119, "learning_rate": 3.190164416946285e-05, "loss": 0.0096, "step": 12740 }, { "epoch": 7.134862898712926, "grad_norm": 0.209779292345047, "learning_rate": 3.18246014718085e-05, "loss": 0.0083, "step": 12750 }, { "epoch": 7.140458869613878, "grad_norm": 0.13492996990680695, "learning_rate": 3.1747608464999725e-05, "loss": 0.0085, "step": 12760 }, { "epoch": 7.14605484051483, "grad_norm": 0.20543181896209717, "learning_rate": 3.167066535953242e-05, "loss": 0.0099, "step": 12770 }, { "epoch": 7.151650811415781, "grad_norm": 0.24595800042152405, "learning_rate": 3.1593772365766105e-05, "loss": 0.0089, "step": 12780 }, { "epoch": 7.157246782316732, "grad_norm": 0.24962860345840454, "learning_rate": 3.1516929693923315e-05, "loss": 0.0111, "step": 12790 }, { "epoch": 7.162842753217683, "grad_norm": 0.236158549785614, "learning_rate": 3.144013755408895e-05, "loss": 0.0092, "step": 12800 }, { "epoch": 7.1684387241186345, "grad_norm": 0.09373817592859268, "learning_rate": 3.136339615620985e-05, "loss": 0.0073, "step": 12810 }, { "epoch": 7.174034695019586, "grad_norm": 0.3018852770328522, "learning_rate": 3.128670571009399e-05, "loss": 0.0109, "step": 12820 }, { "epoch": 7.179630665920537, "grad_norm": 0.22144253551959991, "learning_rate": 3.121006642541014e-05, "loss": 0.008, "step": 12830 }, { "epoch": 7.185226636821488, "grad_norm": 0.14473740756511688, "learning_rate": 3.113347851168721e-05, "loss": 0.0095, "step": 12840 }, { "epoch": 7.19082260772244, "grad_norm": 0.14747409522533417, "learning_rate": 3.105694217831361e-05, "loss": 0.0062, "step": 12850 }, { "epoch": 7.196418578623391, "grad_norm": 0.2111588716506958, "learning_rate": 3.098045763453678e-05, "loss": 0.0074, "step": 12860 }, { "epoch": 7.202014549524343, "grad_norm": 0.2098371833562851, "learning_rate": 3.090402508946249e-05, "loss": 0.0084, "step": 12870 }, { "epoch": 7.207610520425294, "grad_norm": 0.1614372432231903, "learning_rate": 3.082764475205442e-05, "loss": 0.007, "step": 12880 }, { "epoch": 7.213206491326245, "grad_norm": 0.0742206946015358, "learning_rate": 3.075131683113352e-05, "loss": 0.006, "step": 12890 }, { "epoch": 7.218802462227196, "grad_norm": 0.07135152816772461, "learning_rate": 3.0675041535377405e-05, "loss": 0.0057, "step": 12900 }, { "epoch": 7.2243984331281474, "grad_norm": 0.20988823473453522, "learning_rate": 3.059881907331979e-05, "loss": 0.0071, "step": 12910 }, { "epoch": 7.229994404029099, "grad_norm": 0.10817866027355194, "learning_rate": 3.052264965335e-05, "loss": 0.0049, "step": 12920 }, { "epoch": 7.235590374930051, "grad_norm": 0.13764233887195587, "learning_rate": 3.0446533483712304e-05, "loss": 0.0088, "step": 12930 }, { "epoch": 7.241186345831002, "grad_norm": 0.17063380777835846, "learning_rate": 3.0370470772505433e-05, "loss": 0.0073, "step": 12940 }, { "epoch": 7.246782316731953, "grad_norm": 0.11198591440916061, "learning_rate": 3.0294461727681932e-05, "loss": 0.0112, "step": 12950 }, { "epoch": 7.252378287632904, "grad_norm": 0.1855844408273697, "learning_rate": 3.0218506557047598e-05, "loss": 0.0069, "step": 12960 }, { "epoch": 7.257974258533856, "grad_norm": 0.10013962537050247, "learning_rate": 3.0142605468260978e-05, "loss": 0.0063, "step": 12970 }, { "epoch": 7.263570229434807, "grad_norm": 0.16480940580368042, "learning_rate": 3.006675866883275e-05, "loss": 0.0062, "step": 12980 }, { "epoch": 7.269166200335758, "grad_norm": 0.2087039351463318, "learning_rate": 2.999096636612518e-05, "loss": 0.0085, "step": 12990 }, { "epoch": 7.274762171236709, "grad_norm": 0.15215320885181427, "learning_rate": 2.991522876735154e-05, "loss": 0.0077, "step": 13000 }, { "epoch": 7.280358142137661, "grad_norm": 0.2687567472457886, "learning_rate": 2.9839546079575497e-05, "loss": 0.0105, "step": 13010 }, { "epoch": 7.2859541130386125, "grad_norm": 0.23126524686813354, "learning_rate": 2.976391850971065e-05, "loss": 0.0076, "step": 13020 }, { "epoch": 7.291550083939564, "grad_norm": 0.10021013021469116, "learning_rate": 2.9688346264519866e-05, "loss": 0.01, "step": 13030 }, { "epoch": 7.297146054840515, "grad_norm": 0.16525714099407196, "learning_rate": 2.9612829550614836e-05, "loss": 0.0082, "step": 13040 }, { "epoch": 7.302742025741466, "grad_norm": 0.16742092370986938, "learning_rate": 2.9537368574455304e-05, "loss": 0.0141, "step": 13050 }, { "epoch": 7.308337996642417, "grad_norm": 0.07409677654504776, "learning_rate": 2.9461963542348737e-05, "loss": 0.0083, "step": 13060 }, { "epoch": 7.3139339675433686, "grad_norm": 0.2794577181339264, "learning_rate": 2.9386614660449596e-05, "loss": 0.0091, "step": 13070 }, { "epoch": 7.31952993844432, "grad_norm": 0.16768626868724823, "learning_rate": 2.931132213475884e-05, "loss": 0.0128, "step": 13080 }, { "epoch": 7.325125909345271, "grad_norm": 0.19670413434505463, "learning_rate": 2.9236086171123404e-05, "loss": 0.0058, "step": 13090 }, { "epoch": 7.330721880246223, "grad_norm": 0.1663038730621338, "learning_rate": 2.916090697523549e-05, "loss": 0.0081, "step": 13100 }, { "epoch": 7.336317851147174, "grad_norm": 0.2468092292547226, "learning_rate": 2.9085784752632157e-05, "loss": 0.0094, "step": 13110 }, { "epoch": 7.3419138220481255, "grad_norm": 0.20476868748664856, "learning_rate": 2.9010719708694722e-05, "loss": 0.0095, "step": 13120 }, { "epoch": 7.347509792949077, "grad_norm": 0.19373807311058044, "learning_rate": 2.8935712048648112e-05, "loss": 0.0077, "step": 13130 }, { "epoch": 7.353105763850028, "grad_norm": 0.16226400434970856, "learning_rate": 2.8860761977560436e-05, "loss": 0.0105, "step": 13140 }, { "epoch": 7.358701734750979, "grad_norm": 0.2760455906391144, "learning_rate": 2.878586970034232e-05, "loss": 0.017, "step": 13150 }, { "epoch": 7.36429770565193, "grad_norm": 0.269136518239975, "learning_rate": 2.8711035421746367e-05, "loss": 0.0127, "step": 13160 }, { "epoch": 7.3698936765528815, "grad_norm": 0.2237207144498825, "learning_rate": 2.8636259346366666e-05, "loss": 0.007, "step": 13170 }, { "epoch": 7.375489647453834, "grad_norm": 0.1836055964231491, "learning_rate": 2.8561541678638142e-05, "loss": 0.0077, "step": 13180 }, { "epoch": 7.381085618354785, "grad_norm": 0.1962578445672989, "learning_rate": 2.8486882622836026e-05, "loss": 0.0078, "step": 13190 }, { "epoch": 7.386681589255736, "grad_norm": 0.16476459801197052, "learning_rate": 2.8412282383075363e-05, "loss": 0.0093, "step": 13200 }, { "epoch": 7.392277560156687, "grad_norm": 0.17988111078739166, "learning_rate": 2.8337741163310317e-05, "loss": 0.0081, "step": 13210 }, { "epoch": 7.3978735310576385, "grad_norm": 0.21751411259174347, "learning_rate": 2.8263259167333777e-05, "loss": 0.0092, "step": 13220 }, { "epoch": 7.40346950195859, "grad_norm": 0.150657057762146, "learning_rate": 2.8188836598776662e-05, "loss": 0.0094, "step": 13230 }, { "epoch": 7.409065472859541, "grad_norm": 0.16722621023654938, "learning_rate": 2.811447366110741e-05, "loss": 0.0074, "step": 13240 }, { "epoch": 7.414661443760492, "grad_norm": 0.16167713701725006, "learning_rate": 2.804017055763149e-05, "loss": 0.0063, "step": 13250 }, { "epoch": 7.420257414661444, "grad_norm": 0.07585649192333221, "learning_rate": 2.7965927491490705e-05, "loss": 0.0112, "step": 13260 }, { "epoch": 7.425853385562395, "grad_norm": 0.19306915998458862, "learning_rate": 2.7891744665662823e-05, "loss": 0.0069, "step": 13270 }, { "epoch": 7.431449356463347, "grad_norm": 0.23972170054912567, "learning_rate": 2.7817622282960815e-05, "loss": 0.0062, "step": 13280 }, { "epoch": 7.437045327364298, "grad_norm": 0.15592247247695923, "learning_rate": 2.774356054603243e-05, "loss": 0.0055, "step": 13290 }, { "epoch": 7.442641298265249, "grad_norm": 0.20682460069656372, "learning_rate": 2.766955965735968e-05, "loss": 0.0052, "step": 13300 }, { "epoch": 7.4482372691662, "grad_norm": 0.09251468628644943, "learning_rate": 2.7595619819258116e-05, "loss": 0.0077, "step": 13310 }, { "epoch": 7.453833240067151, "grad_norm": 0.1358599066734314, "learning_rate": 2.7521741233876496e-05, "loss": 0.0098, "step": 13320 }, { "epoch": 7.459429210968103, "grad_norm": 0.10552109777927399, "learning_rate": 2.7447924103195976e-05, "loss": 0.0045, "step": 13330 }, { "epoch": 7.465025181869054, "grad_norm": 0.22331656515598297, "learning_rate": 2.7374168629029813e-05, "loss": 0.0075, "step": 13340 }, { "epoch": 7.470621152770006, "grad_norm": 0.25520750880241394, "learning_rate": 2.7300475013022663e-05, "loss": 0.0079, "step": 13350 }, { "epoch": 7.476217123670957, "grad_norm": 0.3160042464733124, "learning_rate": 2.7226843456650037e-05, "loss": 0.0123, "step": 13360 }, { "epoch": 7.481813094571908, "grad_norm": 0.1619534194469452, "learning_rate": 2.7153274161217846e-05, "loss": 0.0049, "step": 13370 }, { "epoch": 7.48740906547286, "grad_norm": 0.3031173646450043, "learning_rate": 2.707976732786166e-05, "loss": 0.0098, "step": 13380 }, { "epoch": 7.493005036373811, "grad_norm": 0.1819227635860443, "learning_rate": 2.7006323157546386e-05, "loss": 0.0065, "step": 13390 }, { "epoch": 7.498601007274762, "grad_norm": 0.17307765781879425, "learning_rate": 2.693294185106562e-05, "loss": 0.0087, "step": 13400 }, { "epoch": 7.504196978175713, "grad_norm": 0.1600845456123352, "learning_rate": 2.6859623609040984e-05, "loss": 0.0061, "step": 13410 }, { "epoch": 7.509792949076665, "grad_norm": 0.21853172779083252, "learning_rate": 2.6786368631921836e-05, "loss": 0.0054, "step": 13420 }, { "epoch": 7.5153889199776165, "grad_norm": 0.16434265673160553, "learning_rate": 2.67131771199844e-05, "loss": 0.0104, "step": 13430 }, { "epoch": 7.520984890878568, "grad_norm": 0.1688595563173294, "learning_rate": 2.6640049273331515e-05, "loss": 0.0068, "step": 13440 }, { "epoch": 7.526580861779519, "grad_norm": 0.10968342423439026, "learning_rate": 2.656698529189193e-05, "loss": 0.0072, "step": 13450 }, { "epoch": 7.53217683268047, "grad_norm": 0.12489527463912964, "learning_rate": 2.6493985375419778e-05, "loss": 0.0067, "step": 13460 }, { "epoch": 7.537772803581421, "grad_norm": 0.3275364935398102, "learning_rate": 2.642104972349403e-05, "loss": 0.0066, "step": 13470 }, { "epoch": 7.5433687744823725, "grad_norm": 0.10653702169656754, "learning_rate": 2.6348178535517966e-05, "loss": 0.0133, "step": 13480 }, { "epoch": 7.548964745383324, "grad_norm": 0.16446645557880402, "learning_rate": 2.6275372010718635e-05, "loss": 0.0075, "step": 13490 }, { "epoch": 7.554560716284275, "grad_norm": 0.17610448598861694, "learning_rate": 2.6202630348146324e-05, "loss": 0.0077, "step": 13500 }, { "epoch": 7.560156687185227, "grad_norm": 0.1589246541261673, "learning_rate": 2.612995374667394e-05, "loss": 0.0044, "step": 13510 }, { "epoch": 7.565752658086178, "grad_norm": 0.3019932806491852, "learning_rate": 2.6057342404996522e-05, "loss": 0.0067, "step": 13520 }, { "epoch": 7.5713486289871295, "grad_norm": 0.19549022614955902, "learning_rate": 2.5984796521630737e-05, "loss": 0.0083, "step": 13530 }, { "epoch": 7.576944599888081, "grad_norm": 0.1532057523727417, "learning_rate": 2.591231629491423e-05, "loss": 0.0043, "step": 13540 }, { "epoch": 7.582540570789032, "grad_norm": 0.1547580510377884, "learning_rate": 2.5839901923005205e-05, "loss": 0.0083, "step": 13550 }, { "epoch": 7.588136541689983, "grad_norm": 0.30122992396354675, "learning_rate": 2.5767553603881767e-05, "loss": 0.0064, "step": 13560 }, { "epoch": 7.593732512590934, "grad_norm": 0.12354984134435654, "learning_rate": 2.5695271535341443e-05, "loss": 0.0059, "step": 13570 }, { "epoch": 7.5993284834918855, "grad_norm": 0.14805443584918976, "learning_rate": 2.562305591500069e-05, "loss": 0.0072, "step": 13580 }, { "epoch": 7.604924454392837, "grad_norm": 0.15644380450248718, "learning_rate": 2.555090694029421e-05, "loss": 0.0076, "step": 13590 }, { "epoch": 7.610520425293789, "grad_norm": 0.22504927217960358, "learning_rate": 2.547882480847461e-05, "loss": 0.0114, "step": 13600 }, { "epoch": 7.61611639619474, "grad_norm": 0.10872774571180344, "learning_rate": 2.540680971661161e-05, "loss": 0.0098, "step": 13610 }, { "epoch": 7.621712367095691, "grad_norm": 0.1415761411190033, "learning_rate": 2.5334861861591753e-05, "loss": 0.0059, "step": 13620 }, { "epoch": 7.627308337996642, "grad_norm": 0.18380744755268097, "learning_rate": 2.526298144011775e-05, "loss": 0.0074, "step": 13630 }, { "epoch": 7.632904308897594, "grad_norm": 0.13029605150222778, "learning_rate": 2.5191168648707887e-05, "loss": 0.0046, "step": 13640 }, { "epoch": 7.638500279798545, "grad_norm": 0.11022605746984482, "learning_rate": 2.511942368369566e-05, "loss": 0.0052, "step": 13650 }, { "epoch": 7.644096250699496, "grad_norm": 0.1933964192867279, "learning_rate": 2.5047746741228978e-05, "loss": 0.0062, "step": 13660 }, { "epoch": 7.649692221600448, "grad_norm": 0.10140606015920639, "learning_rate": 2.4976138017269908e-05, "loss": 0.005, "step": 13670 }, { "epoch": 7.655288192501399, "grad_norm": 0.1074545681476593, "learning_rate": 2.490459770759398e-05, "loss": 0.0081, "step": 13680 }, { "epoch": 7.660884163402351, "grad_norm": 0.11866219341754913, "learning_rate": 2.4833126007789653e-05, "loss": 0.0063, "step": 13690 }, { "epoch": 7.666480134303302, "grad_norm": 0.14528554677963257, "learning_rate": 2.476172311325783e-05, "loss": 0.0075, "step": 13700 }, { "epoch": 7.672076105204253, "grad_norm": 0.12533891201019287, "learning_rate": 2.4690389219211273e-05, "loss": 0.0056, "step": 13710 }, { "epoch": 7.677672076105204, "grad_norm": 0.2228127419948578, "learning_rate": 2.4619124520674146e-05, "loss": 0.007, "step": 13720 }, { "epoch": 7.683268047006155, "grad_norm": 0.167043074965477, "learning_rate": 2.4547929212481435e-05, "loss": 0.0092, "step": 13730 }, { "epoch": 7.688864017907107, "grad_norm": 0.1956396847963333, "learning_rate": 2.447680348927837e-05, "loss": 0.0104, "step": 13740 }, { "epoch": 7.694459988808058, "grad_norm": 0.3440028429031372, "learning_rate": 2.4405747545519963e-05, "loss": 0.0101, "step": 13750 }, { "epoch": 7.70005595970901, "grad_norm": 0.19462288916110992, "learning_rate": 2.433476157547044e-05, "loss": 0.0123, "step": 13760 }, { "epoch": 7.705651930609961, "grad_norm": 0.2774219512939453, "learning_rate": 2.4263845773202736e-05, "loss": 0.012, "step": 13770 }, { "epoch": 7.711247901510912, "grad_norm": 0.15917648375034332, "learning_rate": 2.419300033259798e-05, "loss": 0.0072, "step": 13780 }, { "epoch": 7.7168438724118635, "grad_norm": 0.17087779939174652, "learning_rate": 2.4122225447344875e-05, "loss": 0.0051, "step": 13790 }, { "epoch": 7.722439843312815, "grad_norm": 0.3049764931201935, "learning_rate": 2.405152131093926e-05, "loss": 0.0068, "step": 13800 }, { "epoch": 7.728035814213766, "grad_norm": 0.23013077676296234, "learning_rate": 2.3980888116683515e-05, "loss": 0.0093, "step": 13810 }, { "epoch": 7.733631785114717, "grad_norm": 0.25196191668510437, "learning_rate": 2.3910326057686127e-05, "loss": 0.0063, "step": 13820 }, { "epoch": 7.739227756015668, "grad_norm": 0.13192011415958405, "learning_rate": 2.3839835326861104e-05, "loss": 0.0077, "step": 13830 }, { "epoch": 7.74482372691662, "grad_norm": 0.14442972838878632, "learning_rate": 2.3769416116927335e-05, "loss": 0.0131, "step": 13840 }, { "epoch": 7.750419697817572, "grad_norm": 0.1425463706254959, "learning_rate": 2.3699068620408304e-05, "loss": 0.0066, "step": 13850 }, { "epoch": 7.756015668718523, "grad_norm": 0.1162482276558876, "learning_rate": 2.362879302963135e-05, "loss": 0.007, "step": 13860 }, { "epoch": 7.761611639619474, "grad_norm": 0.21869398653507233, "learning_rate": 2.3558589536727277e-05, "loss": 0.0045, "step": 13870 }, { "epoch": 7.767207610520425, "grad_norm": 0.1804109364748001, "learning_rate": 2.3488458333629777e-05, "loss": 0.0064, "step": 13880 }, { "epoch": 7.7728035814213765, "grad_norm": 0.18711616098880768, "learning_rate": 2.341839961207482e-05, "loss": 0.0082, "step": 13890 }, { "epoch": 7.778399552322328, "grad_norm": 0.17115071415901184, "learning_rate": 2.3348413563600325e-05, "loss": 0.008, "step": 13900 }, { "epoch": 7.783995523223279, "grad_norm": 0.3199642300605774, "learning_rate": 2.3278500379545436e-05, "loss": 0.008, "step": 13910 }, { "epoch": 7.789591494124231, "grad_norm": 0.16800075769424438, "learning_rate": 2.3208660251050158e-05, "loss": 0.0054, "step": 13920 }, { "epoch": 7.795187465025182, "grad_norm": 0.11445470154285431, "learning_rate": 2.3138893369054766e-05, "loss": 0.0067, "step": 13930 }, { "epoch": 7.800783435926133, "grad_norm": 0.1465342938899994, "learning_rate": 2.3069199924299174e-05, "loss": 0.0046, "step": 13940 }, { "epoch": 7.806379406827085, "grad_norm": 0.10726216435432434, "learning_rate": 2.2999580107322653e-05, "loss": 0.013, "step": 13950 }, { "epoch": 7.811975377728036, "grad_norm": 0.2467944324016571, "learning_rate": 2.29300341084631e-05, "loss": 0.006, "step": 13960 }, { "epoch": 7.817571348628987, "grad_norm": 0.18158167600631714, "learning_rate": 2.2860562117856647e-05, "loss": 0.0065, "step": 13970 }, { "epoch": 7.823167319529938, "grad_norm": 0.1618615835905075, "learning_rate": 2.279116432543705e-05, "loss": 0.0065, "step": 13980 }, { "epoch": 7.8287632904308895, "grad_norm": 0.1069146990776062, "learning_rate": 2.2721840920935196e-05, "loss": 0.0105, "step": 13990 }, { "epoch": 7.834359261331841, "grad_norm": 0.12003065645694733, "learning_rate": 2.2652592093878666e-05, "loss": 0.0049, "step": 14000 }, { "epoch": 7.839955232232793, "grad_norm": 0.09423186630010605, "learning_rate": 2.258341803359108e-05, "loss": 0.0061, "step": 14010 }, { "epoch": 7.845551203133744, "grad_norm": 0.35245028138160706, "learning_rate": 2.251431892919171e-05, "loss": 0.0091, "step": 14020 }, { "epoch": 7.851147174034695, "grad_norm": 0.11108125001192093, "learning_rate": 2.2445294969594844e-05, "loss": 0.007, "step": 14030 }, { "epoch": 7.856743144935646, "grad_norm": 0.10527674853801727, "learning_rate": 2.237634634350934e-05, "loss": 0.0042, "step": 14040 }, { "epoch": 7.862339115836598, "grad_norm": 0.2263229489326477, "learning_rate": 2.2307473239438154e-05, "loss": 0.0056, "step": 14050 }, { "epoch": 7.867935086737549, "grad_norm": 0.13221915066242218, "learning_rate": 2.2238675845677663e-05, "loss": 0.0068, "step": 14060 }, { "epoch": 7.8735310576385, "grad_norm": 0.17508424818515778, "learning_rate": 2.2169954350317374e-05, "loss": 0.007, "step": 14070 }, { "epoch": 7.879127028539451, "grad_norm": 0.24999241530895233, "learning_rate": 2.2101308941239203e-05, "loss": 0.0085, "step": 14080 }, { "epoch": 7.8847229994404024, "grad_norm": 0.12810635566711426, "learning_rate": 2.2032739806117058e-05, "loss": 0.0084, "step": 14090 }, { "epoch": 7.8903189703413545, "grad_norm": 0.22745615243911743, "learning_rate": 2.196424713241637e-05, "loss": 0.0145, "step": 14100 }, { "epoch": 7.895914941242306, "grad_norm": 0.0886574536561966, "learning_rate": 2.1895831107393484e-05, "loss": 0.0071, "step": 14110 }, { "epoch": 7.901510912143257, "grad_norm": 0.18623238801956177, "learning_rate": 2.182749191809518e-05, "loss": 0.0077, "step": 14120 }, { "epoch": 7.907106883044208, "grad_norm": 0.20176784694194794, "learning_rate": 2.1759229751358217e-05, "loss": 0.008, "step": 14130 }, { "epoch": 7.912702853945159, "grad_norm": 0.18935443460941315, "learning_rate": 2.1691044793808734e-05, "loss": 0.0069, "step": 14140 }, { "epoch": 7.918298824846111, "grad_norm": 0.18812550604343414, "learning_rate": 2.1622937231861822e-05, "loss": 0.0051, "step": 14150 }, { "epoch": 7.923894795747062, "grad_norm": 0.12224578857421875, "learning_rate": 2.1554907251720945e-05, "loss": 0.0053, "step": 14160 }, { "epoch": 7.929490766648014, "grad_norm": 0.12175440043210983, "learning_rate": 2.148695503937745e-05, "loss": 0.0075, "step": 14170 }, { "epoch": 7.935086737548965, "grad_norm": 0.11878049373626709, "learning_rate": 2.1419080780610123e-05, "loss": 0.0062, "step": 14180 }, { "epoch": 7.940682708449916, "grad_norm": 0.19284716248512268, "learning_rate": 2.1351284660984572e-05, "loss": 0.0063, "step": 14190 }, { "epoch": 7.9462786793508675, "grad_norm": 0.159319207072258, "learning_rate": 2.128356686585282e-05, "loss": 0.0064, "step": 14200 }, { "epoch": 7.951874650251819, "grad_norm": 0.16800148785114288, "learning_rate": 2.121592758035273e-05, "loss": 0.0054, "step": 14210 }, { "epoch": 7.95747062115277, "grad_norm": 0.23277972638607025, "learning_rate": 2.1148366989407496e-05, "loss": 0.0056, "step": 14220 }, { "epoch": 7.963066592053721, "grad_norm": 0.08594591915607452, "learning_rate": 2.1080885277725236e-05, "loss": 0.0054, "step": 14230 }, { "epoch": 7.968662562954672, "grad_norm": 0.21676327288150787, "learning_rate": 2.1013482629798333e-05, "loss": 0.0071, "step": 14240 }, { "epoch": 7.9742585338556236, "grad_norm": 0.1778232604265213, "learning_rate": 2.094615922990309e-05, "loss": 0.0067, "step": 14250 }, { "epoch": 7.979854504756576, "grad_norm": 0.2177736759185791, "learning_rate": 2.0878915262099098e-05, "loss": 0.0068, "step": 14260 }, { "epoch": 7.985450475657527, "grad_norm": 0.25127291679382324, "learning_rate": 2.0811750910228774e-05, "loss": 0.0104, "step": 14270 }, { "epoch": 7.991046446558478, "grad_norm": 0.08792544901371002, "learning_rate": 2.0744666357916925e-05, "loss": 0.0064, "step": 14280 }, { "epoch": 7.996642417459429, "grad_norm": 0.1125119999051094, "learning_rate": 2.067766178857013e-05, "loss": 0.0099, "step": 14290 }, { "epoch": 8.00223838836038, "grad_norm": 0.18561410903930664, "learning_rate": 2.061073738537635e-05, "loss": 0.0089, "step": 14300 }, { "epoch": 8.007834359261333, "grad_norm": 0.10987678915262222, "learning_rate": 2.0543893331304333e-05, "loss": 0.0071, "step": 14310 }, { "epoch": 8.013430330162283, "grad_norm": 0.10636857897043228, "learning_rate": 2.0477129809103147e-05, "loss": 0.007, "step": 14320 }, { "epoch": 8.019026301063235, "grad_norm": 0.16379332542419434, "learning_rate": 2.0410447001301753e-05, "loss": 0.006, "step": 14330 }, { "epoch": 8.024622271964185, "grad_norm": 0.09951362758874893, "learning_rate": 2.0343845090208368e-05, "loss": 0.0052, "step": 14340 }, { "epoch": 8.030218242865137, "grad_norm": 0.1974375694990158, "learning_rate": 2.0277324257910106e-05, "loss": 0.0061, "step": 14350 }, { "epoch": 8.035814213766088, "grad_norm": 0.16213521361351013, "learning_rate": 2.0210884686272368e-05, "loss": 0.0056, "step": 14360 }, { "epoch": 8.04141018466704, "grad_norm": 0.32907333970069885, "learning_rate": 2.0144526556938387e-05, "loss": 0.011, "step": 14370 }, { "epoch": 8.047006155567992, "grad_norm": 0.24763990938663483, "learning_rate": 2.0078250051328784e-05, "loss": 0.0059, "step": 14380 }, { "epoch": 8.052602126468942, "grad_norm": 0.06522991508245468, "learning_rate": 2.0012055350640986e-05, "loss": 0.0075, "step": 14390 }, { "epoch": 8.058198097369894, "grad_norm": 0.1594466120004654, "learning_rate": 1.9945942635848748e-05, "loss": 0.0107, "step": 14400 }, { "epoch": 8.063794068270845, "grad_norm": 0.11248297244310379, "learning_rate": 1.9879912087701753e-05, "loss": 0.0043, "step": 14410 }, { "epoch": 8.069390039171797, "grad_norm": 0.11491246521472931, "learning_rate": 1.981396388672496e-05, "loss": 0.0043, "step": 14420 }, { "epoch": 8.074986010072747, "grad_norm": 0.22106263041496277, "learning_rate": 1.974809821321827e-05, "loss": 0.0055, "step": 14430 }, { "epoch": 8.0805819809737, "grad_norm": 0.16226910054683685, "learning_rate": 1.9682315247255894e-05, "loss": 0.0085, "step": 14440 }, { "epoch": 8.08617795187465, "grad_norm": 0.09066546708345413, "learning_rate": 1.9616615168685943e-05, "loss": 0.0083, "step": 14450 }, { "epoch": 8.091773922775602, "grad_norm": 0.11933751404285431, "learning_rate": 1.9550998157129946e-05, "loss": 0.0057, "step": 14460 }, { "epoch": 8.097369893676554, "grad_norm": 0.1404096931219101, "learning_rate": 1.9485464391982284e-05, "loss": 0.0047, "step": 14470 }, { "epoch": 8.102965864577504, "grad_norm": 0.2508150339126587, "learning_rate": 1.942001405240979e-05, "loss": 0.0076, "step": 14480 }, { "epoch": 8.108561835478456, "grad_norm": 0.17527352273464203, "learning_rate": 1.9354647317351188e-05, "loss": 0.0077, "step": 14490 }, { "epoch": 8.114157806379406, "grad_norm": 0.11819542944431305, "learning_rate": 1.928936436551661e-05, "loss": 0.0048, "step": 14500 }, { "epoch": 8.119753777280359, "grad_norm": 0.17159508168697357, "learning_rate": 1.9224165375387193e-05, "loss": 0.0072, "step": 14510 }, { "epoch": 8.125349748181309, "grad_norm": 0.1392519325017929, "learning_rate": 1.9159050525214452e-05, "loss": 0.0058, "step": 14520 }, { "epoch": 8.130945719082261, "grad_norm": 0.2096053957939148, "learning_rate": 1.909401999301993e-05, "loss": 0.007, "step": 14530 }, { "epoch": 8.136541689983211, "grad_norm": 0.2075774371623993, "learning_rate": 1.9029073956594606e-05, "loss": 0.0063, "step": 14540 }, { "epoch": 8.142137660884163, "grad_norm": 0.0607825368642807, "learning_rate": 1.8964212593498442e-05, "loss": 0.0046, "step": 14550 }, { "epoch": 8.147733631785115, "grad_norm": 0.20028991997241974, "learning_rate": 1.8899436081059975e-05, "loss": 0.0067, "step": 14560 }, { "epoch": 8.153329602686066, "grad_norm": 0.12437421083450317, "learning_rate": 1.8834744596375666e-05, "loss": 0.0045, "step": 14570 }, { "epoch": 8.158925573587018, "grad_norm": 0.09412521868944168, "learning_rate": 1.877013831630961e-05, "loss": 0.0053, "step": 14580 }, { "epoch": 8.164521544487968, "grad_norm": 0.30078214406967163, "learning_rate": 1.8705617417492883e-05, "loss": 0.0088, "step": 14590 }, { "epoch": 8.17011751538892, "grad_norm": 0.2020367681980133, "learning_rate": 1.8641182076323148e-05, "loss": 0.0074, "step": 14600 }, { "epoch": 8.17571348628987, "grad_norm": 0.12557435035705566, "learning_rate": 1.85768324689642e-05, "loss": 0.0054, "step": 14610 }, { "epoch": 8.181309457190823, "grad_norm": 0.11945895105600357, "learning_rate": 1.851256877134538e-05, "loss": 0.0078, "step": 14620 }, { "epoch": 8.186905428091775, "grad_norm": 0.33773839473724365, "learning_rate": 1.8448391159161204e-05, "loss": 0.0101, "step": 14630 }, { "epoch": 8.192501398992725, "grad_norm": 0.2184380739927292, "learning_rate": 1.838429980787081e-05, "loss": 0.0059, "step": 14640 }, { "epoch": 8.198097369893677, "grad_norm": 0.06359529495239258, "learning_rate": 1.8320294892697478e-05, "loss": 0.006, "step": 14650 }, { "epoch": 8.203693340794628, "grad_norm": 0.1690957248210907, "learning_rate": 1.8256376588628238e-05, "loss": 0.008, "step": 14660 }, { "epoch": 8.20928931169558, "grad_norm": 0.296812504529953, "learning_rate": 1.8192545070413282e-05, "loss": 0.0069, "step": 14670 }, { "epoch": 8.21488528259653, "grad_norm": 0.08360179513692856, "learning_rate": 1.8128800512565513e-05, "loss": 0.007, "step": 14680 }, { "epoch": 8.220481253497482, "grad_norm": 0.12985661625862122, "learning_rate": 1.8065143089360172e-05, "loss": 0.0079, "step": 14690 }, { "epoch": 8.226077224398432, "grad_norm": 0.10445982962846756, "learning_rate": 1.800157297483417e-05, "loss": 0.0036, "step": 14700 }, { "epoch": 8.231673195299384, "grad_norm": 0.1876983791589737, "learning_rate": 1.7938090342785817e-05, "loss": 0.0058, "step": 14710 }, { "epoch": 8.237269166200337, "grad_norm": 0.07933235913515091, "learning_rate": 1.787469536677419e-05, "loss": 0.0048, "step": 14720 }, { "epoch": 8.242865137101287, "grad_norm": 0.2597578465938568, "learning_rate": 1.7811388220118707e-05, "loss": 0.0077, "step": 14730 }, { "epoch": 8.248461108002239, "grad_norm": 0.1318414807319641, "learning_rate": 1.774816907589873e-05, "loss": 0.0038, "step": 14740 }, { "epoch": 8.25405707890319, "grad_norm": 0.23657891154289246, "learning_rate": 1.768503810695295e-05, "loss": 0.0074, "step": 14750 }, { "epoch": 8.259653049804141, "grad_norm": 0.12084835767745972, "learning_rate": 1.7621995485879062e-05, "loss": 0.0086, "step": 14760 }, { "epoch": 8.265249020705092, "grad_norm": 0.2077346295118332, "learning_rate": 1.755904138503316e-05, "loss": 0.0066, "step": 14770 }, { "epoch": 8.270844991606044, "grad_norm": 0.26253417134284973, "learning_rate": 1.749617597652934e-05, "loss": 0.0107, "step": 14780 }, { "epoch": 8.276440962506994, "grad_norm": 0.25481829047203064, "learning_rate": 1.743339943223926e-05, "loss": 0.0044, "step": 14790 }, { "epoch": 8.282036933407946, "grad_norm": 0.23157408833503723, "learning_rate": 1.7370711923791567e-05, "loss": 0.0069, "step": 14800 }, { "epoch": 8.287632904308898, "grad_norm": 0.10085418075323105, "learning_rate": 1.7308113622571544e-05, "loss": 0.0036, "step": 14810 }, { "epoch": 8.293228875209849, "grad_norm": 0.10876370966434479, "learning_rate": 1.7245604699720535e-05, "loss": 0.007, "step": 14820 }, { "epoch": 8.2988248461108, "grad_norm": 0.20935757458209991, "learning_rate": 1.7183185326135543e-05, "loss": 0.0055, "step": 14830 }, { "epoch": 8.304420817011751, "grad_norm": 0.13824748992919922, "learning_rate": 1.712085567246878e-05, "loss": 0.0072, "step": 14840 }, { "epoch": 8.310016787912703, "grad_norm": 0.3369564414024353, "learning_rate": 1.70586159091271e-05, "loss": 0.0069, "step": 14850 }, { "epoch": 8.315612758813653, "grad_norm": 0.2684394419193268, "learning_rate": 1.699646620627168e-05, "loss": 0.0061, "step": 14860 }, { "epoch": 8.321208729714606, "grad_norm": 0.23020261526107788, "learning_rate": 1.6934406733817414e-05, "loss": 0.0126, "step": 14870 }, { "epoch": 8.326804700615558, "grad_norm": 0.23905567824840546, "learning_rate": 1.6872437661432517e-05, "loss": 0.0057, "step": 14880 }, { "epoch": 8.332400671516508, "grad_norm": 0.11183072626590729, "learning_rate": 1.6810559158538092e-05, "loss": 0.0061, "step": 14890 }, { "epoch": 8.33799664241746, "grad_norm": 0.11450804024934769, "learning_rate": 1.6748771394307585e-05, "loss": 0.0041, "step": 14900 }, { "epoch": 8.34359261331841, "grad_norm": 0.14276103675365448, "learning_rate": 1.6687074537666398e-05, "loss": 0.0046, "step": 14910 }, { "epoch": 8.349188584219362, "grad_norm": 0.1129729300737381, "learning_rate": 1.662546875729138e-05, "loss": 0.0063, "step": 14920 }, { "epoch": 8.354784555120313, "grad_norm": 0.18285100162029266, "learning_rate": 1.6563954221610355e-05, "loss": 0.0106, "step": 14930 }, { "epoch": 8.360380526021265, "grad_norm": 0.10539596527814865, "learning_rate": 1.6502531098801753e-05, "loss": 0.0043, "step": 14940 }, { "epoch": 8.365976496922215, "grad_norm": 0.13819168508052826, "learning_rate": 1.6441199556794033e-05, "loss": 0.0065, "step": 14950 }, { "epoch": 8.371572467823167, "grad_norm": 0.19076746702194214, "learning_rate": 1.637995976326527e-05, "loss": 0.01, "step": 14960 }, { "epoch": 8.37716843872412, "grad_norm": 0.24138867855072021, "learning_rate": 1.631881188564275e-05, "loss": 0.0082, "step": 14970 }, { "epoch": 8.38276440962507, "grad_norm": 0.1397552490234375, "learning_rate": 1.62577560911024e-05, "loss": 0.0047, "step": 14980 }, { "epoch": 8.388360380526022, "grad_norm": 0.08066073060035706, "learning_rate": 1.6196792546568472e-05, "loss": 0.0076, "step": 14990 }, { "epoch": 8.393956351426972, "grad_norm": 0.2772653102874756, "learning_rate": 1.6135921418712956e-05, "loss": 0.008, "step": 15000 }, { "epoch": 8.399552322327924, "grad_norm": 0.1933654099702835, "learning_rate": 1.6075142873955164e-05, "loss": 0.0049, "step": 15010 }, { "epoch": 8.405148293228875, "grad_norm": 0.09738892316818237, "learning_rate": 1.6014457078461353e-05, "loss": 0.0046, "step": 15020 }, { "epoch": 8.410744264129827, "grad_norm": 0.11632133275270462, "learning_rate": 1.5953864198144135e-05, "loss": 0.0079, "step": 15030 }, { "epoch": 8.416340235030777, "grad_norm": 0.10637476295232773, "learning_rate": 1.5893364398662176e-05, "loss": 0.0052, "step": 15040 }, { "epoch": 8.421936205931729, "grad_norm": 0.22587163746356964, "learning_rate": 1.583295784541958e-05, "loss": 0.0064, "step": 15050 }, { "epoch": 8.427532176832681, "grad_norm": 0.15165762603282928, "learning_rate": 1.5772644703565565e-05, "loss": 0.0068, "step": 15060 }, { "epoch": 8.433128147733632, "grad_norm": 0.13497453927993774, "learning_rate": 1.5712425137993973e-05, "loss": 0.0076, "step": 15070 }, { "epoch": 8.438724118634584, "grad_norm": 0.1444980800151825, "learning_rate": 1.5652299313342773e-05, "loss": 0.0066, "step": 15080 }, { "epoch": 8.444320089535534, "grad_norm": 0.32101383805274963, "learning_rate": 1.5592267393993716e-05, "loss": 0.0054, "step": 15090 }, { "epoch": 8.449916060436486, "grad_norm": 0.26894599199295044, "learning_rate": 1.553232954407171e-05, "loss": 0.0039, "step": 15100 }, { "epoch": 8.455512031337436, "grad_norm": 0.26109951734542847, "learning_rate": 1.5472485927444597e-05, "loss": 0.0057, "step": 15110 }, { "epoch": 8.461108002238388, "grad_norm": 0.09691357612609863, "learning_rate": 1.5412736707722537e-05, "loss": 0.0036, "step": 15120 }, { "epoch": 8.46670397313934, "grad_norm": 0.08756586909294128, "learning_rate": 1.5353082048257596e-05, "loss": 0.0059, "step": 15130 }, { "epoch": 8.47229994404029, "grad_norm": 0.0936000794172287, "learning_rate": 1.5293522112143373e-05, "loss": 0.0042, "step": 15140 }, { "epoch": 8.477895914941243, "grad_norm": 0.20747262239456177, "learning_rate": 1.5234057062214402e-05, "loss": 0.0118, "step": 15150 }, { "epoch": 8.483491885842193, "grad_norm": 0.11843043565750122, "learning_rate": 1.517468706104589e-05, "loss": 0.0072, "step": 15160 }, { "epoch": 8.489087856743145, "grad_norm": 0.23854964971542358, "learning_rate": 1.5115412270953167e-05, "loss": 0.0066, "step": 15170 }, { "epoch": 8.494683827644096, "grad_norm": 0.1770446002483368, "learning_rate": 1.5056232853991209e-05, "loss": 0.0062, "step": 15180 }, { "epoch": 8.500279798545048, "grad_norm": 0.23799461126327515, "learning_rate": 1.4997148971954344e-05, "loss": 0.0075, "step": 15190 }, { "epoch": 8.505875769445998, "grad_norm": 0.3780512511730194, "learning_rate": 1.4938160786375572e-05, "loss": 0.0081, "step": 15200 }, { "epoch": 8.51147174034695, "grad_norm": 0.11119966208934784, "learning_rate": 1.4879268458526379e-05, "loss": 0.0046, "step": 15210 }, { "epoch": 8.517067711247902, "grad_norm": 0.09658356010913849, "learning_rate": 1.4820472149416154e-05, "loss": 0.007, "step": 15220 }, { "epoch": 8.522663682148853, "grad_norm": 0.17144611477851868, "learning_rate": 1.4761772019791748e-05, "loss": 0.0056, "step": 15230 }, { "epoch": 8.528259653049805, "grad_norm": 0.14623138308525085, "learning_rate": 1.470316823013707e-05, "loss": 0.0051, "step": 15240 }, { "epoch": 8.533855623950755, "grad_norm": 0.1579722911119461, "learning_rate": 1.4644660940672627e-05, "loss": 0.0049, "step": 15250 }, { "epoch": 8.539451594851707, "grad_norm": 0.14990709722042084, "learning_rate": 1.4586250311355132e-05, "loss": 0.006, "step": 15260 }, { "epoch": 8.545047565752657, "grad_norm": 0.24695487320423126, "learning_rate": 1.4527936501877032e-05, "loss": 0.0072, "step": 15270 }, { "epoch": 8.55064353665361, "grad_norm": 0.2550105154514313, "learning_rate": 1.4469719671666043e-05, "loss": 0.0058, "step": 15280 }, { "epoch": 8.556239507554562, "grad_norm": 0.17998188734054565, "learning_rate": 1.4411599979884744e-05, "loss": 0.0089, "step": 15290 }, { "epoch": 8.561835478455512, "grad_norm": 0.3639971613883972, "learning_rate": 1.435357758543015e-05, "loss": 0.0085, "step": 15300 }, { "epoch": 8.567431449356464, "grad_norm": 0.12687824666500092, "learning_rate": 1.4295652646933277e-05, "loss": 0.0061, "step": 15310 }, { "epoch": 8.573027420257414, "grad_norm": 0.1352899670600891, "learning_rate": 1.4237825322758736e-05, "loss": 0.0066, "step": 15320 }, { "epoch": 8.578623391158366, "grad_norm": 0.2139214277267456, "learning_rate": 1.4180095771004154e-05, "loss": 0.006, "step": 15330 }, { "epoch": 8.584219362059317, "grad_norm": 0.13526403903961182, "learning_rate": 1.412246414949997e-05, "loss": 0.0061, "step": 15340 }, { "epoch": 8.589815332960269, "grad_norm": 0.10206010937690735, "learning_rate": 1.4064930615808808e-05, "loss": 0.0042, "step": 15350 }, { "epoch": 8.59541130386122, "grad_norm": 0.1680195927619934, "learning_rate": 1.4007495327225162e-05, "loss": 0.0063, "step": 15360 }, { "epoch": 8.601007274762171, "grad_norm": 0.2092961072921753, "learning_rate": 1.3950158440774957e-05, "loss": 0.0089, "step": 15370 }, { "epoch": 8.606603245663123, "grad_norm": 0.24639266729354858, "learning_rate": 1.389292011321498e-05, "loss": 0.0037, "step": 15380 }, { "epoch": 8.612199216564074, "grad_norm": 0.20889121294021606, "learning_rate": 1.383578050103268e-05, "loss": 0.0036, "step": 15390 }, { "epoch": 8.617795187465026, "grad_norm": 0.1731806993484497, "learning_rate": 1.3778739760445552e-05, "loss": 0.0049, "step": 15400 }, { "epoch": 8.623391158365976, "grad_norm": 0.15791241824626923, "learning_rate": 1.3721798047400813e-05, "loss": 0.0064, "step": 15410 }, { "epoch": 8.628987129266928, "grad_norm": 0.2612980604171753, "learning_rate": 1.3664955517574968e-05, "loss": 0.0056, "step": 15420 }, { "epoch": 8.634583100167879, "grad_norm": 0.12942969799041748, "learning_rate": 1.3608212326373249e-05, "loss": 0.0044, "step": 15430 }, { "epoch": 8.64017907106883, "grad_norm": 0.224086731672287, "learning_rate": 1.3551568628929434e-05, "loss": 0.0065, "step": 15440 }, { "epoch": 8.645775041969781, "grad_norm": 0.234924778342247, "learning_rate": 1.3495024580105192e-05, "loss": 0.0055, "step": 15450 }, { "epoch": 8.651371012870733, "grad_norm": 0.14701171219348907, "learning_rate": 1.343858033448982e-05, "loss": 0.0078, "step": 15460 }, { "epoch": 8.656966983771685, "grad_norm": 0.06672263145446777, "learning_rate": 1.3382236046399722e-05, "loss": 0.0057, "step": 15470 }, { "epoch": 8.662562954672635, "grad_norm": 0.11234284192323685, "learning_rate": 1.3325991869878013e-05, "loss": 0.0053, "step": 15480 }, { "epoch": 8.668158925573588, "grad_norm": 0.2150266021490097, "learning_rate": 1.3269847958694148e-05, "loss": 0.0045, "step": 15490 }, { "epoch": 8.673754896474538, "grad_norm": 0.37493982911109924, "learning_rate": 1.3213804466343421e-05, "loss": 0.0058, "step": 15500 }, { "epoch": 8.67935086737549, "grad_norm": 0.054848652333021164, "learning_rate": 1.3157861546046613e-05, "loss": 0.0062, "step": 15510 }, { "epoch": 8.68494683827644, "grad_norm": 0.30526259541511536, "learning_rate": 1.3102019350749528e-05, "loss": 0.005, "step": 15520 }, { "epoch": 8.690542809177392, "grad_norm": 0.11414709687232971, "learning_rate": 1.3046278033122577e-05, "loss": 0.0055, "step": 15530 }, { "epoch": 8.696138780078343, "grad_norm": 0.19409357011318207, "learning_rate": 1.299063774556042e-05, "loss": 0.0048, "step": 15540 }, { "epoch": 8.701734750979295, "grad_norm": 0.0840323343873024, "learning_rate": 1.293509864018146e-05, "loss": 0.0062, "step": 15550 }, { "epoch": 8.707330721880247, "grad_norm": 0.2921426594257355, "learning_rate": 1.2879660868827508e-05, "loss": 0.0055, "step": 15560 }, { "epoch": 8.712926692781197, "grad_norm": 0.18921242654323578, "learning_rate": 1.2824324583063302e-05, "loss": 0.0065, "step": 15570 }, { "epoch": 8.71852266368215, "grad_norm": 0.2043517678976059, "learning_rate": 1.2769089934176126e-05, "loss": 0.0048, "step": 15580 }, { "epoch": 8.7241186345831, "grad_norm": 0.14090007543563843, "learning_rate": 1.2713957073175425e-05, "loss": 0.0043, "step": 15590 }, { "epoch": 8.729714605484052, "grad_norm": 0.13512486219406128, "learning_rate": 1.2658926150792322e-05, "loss": 0.009, "step": 15600 }, { "epoch": 8.735310576385002, "grad_norm": 0.16850633919239044, "learning_rate": 1.2603997317479238e-05, "loss": 0.0043, "step": 15610 }, { "epoch": 8.740906547285954, "grad_norm": 0.0671689510345459, "learning_rate": 1.2549170723409549e-05, "loss": 0.0047, "step": 15620 }, { "epoch": 8.746502518186904, "grad_norm": 0.17265447974205017, "learning_rate": 1.2494446518477022e-05, "loss": 0.0078, "step": 15630 }, { "epoch": 8.752098489087857, "grad_norm": 0.09633443504571915, "learning_rate": 1.243982485229559e-05, "loss": 0.01, "step": 15640 }, { "epoch": 8.757694459988809, "grad_norm": 0.07608158886432648, "learning_rate": 1.2385305874198776e-05, "loss": 0.008, "step": 15650 }, { "epoch": 8.763290430889759, "grad_norm": 0.1386493295431137, "learning_rate": 1.233088973323937e-05, "loss": 0.0141, "step": 15660 }, { "epoch": 8.768886401790711, "grad_norm": 0.22368523478507996, "learning_rate": 1.2276576578189064e-05, "loss": 0.0046, "step": 15670 }, { "epoch": 8.774482372691661, "grad_norm": 0.1423027664422989, "learning_rate": 1.2222366557537911e-05, "loss": 0.0059, "step": 15680 }, { "epoch": 8.780078343592614, "grad_norm": 0.09472924470901489, "learning_rate": 1.2168259819494066e-05, "loss": 0.0078, "step": 15690 }, { "epoch": 8.785674314493564, "grad_norm": 0.1385987550020218, "learning_rate": 1.2114256511983274e-05, "loss": 0.0044, "step": 15700 }, { "epoch": 8.791270285394516, "grad_norm": 0.1465826779603958, "learning_rate": 1.2060356782648503e-05, "loss": 0.0035, "step": 15710 }, { "epoch": 8.796866256295468, "grad_norm": 0.3275586664676666, "learning_rate": 1.2006560778849578e-05, "loss": 0.0057, "step": 15720 }, { "epoch": 8.802462227196418, "grad_norm": 0.09989197552204132, "learning_rate": 1.1952868647662696e-05, "loss": 0.006, "step": 15730 }, { "epoch": 8.80805819809737, "grad_norm": 0.12719599902629852, "learning_rate": 1.1899280535880119e-05, "loss": 0.0042, "step": 15740 }, { "epoch": 8.81365416899832, "grad_norm": 0.3480566740036011, "learning_rate": 1.1845796590009683e-05, "loss": 0.0073, "step": 15750 }, { "epoch": 8.819250139899273, "grad_norm": 0.1562948226928711, "learning_rate": 1.1792416956274444e-05, "loss": 0.0066, "step": 15760 }, { "epoch": 8.824846110800223, "grad_norm": 0.23169738054275513, "learning_rate": 1.1739141780612306e-05, "loss": 0.0067, "step": 15770 }, { "epoch": 8.830442081701175, "grad_norm": 0.1328081339597702, "learning_rate": 1.1685971208675539e-05, "loss": 0.0051, "step": 15780 }, { "epoch": 8.836038052602127, "grad_norm": 0.10535513609647751, "learning_rate": 1.1632905385830484e-05, "loss": 0.0061, "step": 15790 }, { "epoch": 8.841634023503078, "grad_norm": 0.08534829318523407, "learning_rate": 1.157994445715706e-05, "loss": 0.0052, "step": 15800 }, { "epoch": 8.84722999440403, "grad_norm": 0.21224470436573029, "learning_rate": 1.1527088567448407e-05, "loss": 0.0066, "step": 15810 }, { "epoch": 8.85282596530498, "grad_norm": 0.20451109111309052, "learning_rate": 1.1474337861210543e-05, "loss": 0.0067, "step": 15820 }, { "epoch": 8.858421936205932, "grad_norm": 0.21763543784618378, "learning_rate": 1.1421692482661856e-05, "loss": 0.0089, "step": 15830 }, { "epoch": 8.864017907106883, "grad_norm": 0.14212079346179962, "learning_rate": 1.1369152575732822e-05, "loss": 0.0048, "step": 15840 }, { "epoch": 8.869613878007835, "grad_norm": 0.1489504873752594, "learning_rate": 1.1316718284065537e-05, "loss": 0.0046, "step": 15850 }, { "epoch": 8.875209848908785, "grad_norm": 0.09450363367795944, "learning_rate": 1.1264389751013326e-05, "loss": 0.0053, "step": 15860 }, { "epoch": 8.880805819809737, "grad_norm": 0.2034289836883545, "learning_rate": 1.1212167119640438e-05, "loss": 0.0081, "step": 15870 }, { "epoch": 8.88640179071069, "grad_norm": 0.13935258984565735, "learning_rate": 1.1160050532721528e-05, "loss": 0.0064, "step": 15880 }, { "epoch": 8.89199776161164, "grad_norm": 0.08578619360923767, "learning_rate": 1.1108040132741354e-05, "loss": 0.0111, "step": 15890 }, { "epoch": 8.897593732512592, "grad_norm": 0.0884697362780571, "learning_rate": 1.1056136061894384e-05, "loss": 0.0108, "step": 15900 }, { "epoch": 8.903189703413542, "grad_norm": 0.28323593735694885, "learning_rate": 1.100433846208434e-05, "loss": 0.0116, "step": 15910 }, { "epoch": 8.908785674314494, "grad_norm": 0.14971330761909485, "learning_rate": 1.095264747492391e-05, "loss": 0.0079, "step": 15920 }, { "epoch": 8.914381645215444, "grad_norm": 0.18808728456497192, "learning_rate": 1.090106324173426e-05, "loss": 0.0082, "step": 15930 }, { "epoch": 8.919977616116396, "grad_norm": 0.16924549639225006, "learning_rate": 1.0849585903544706e-05, "loss": 0.0064, "step": 15940 }, { "epoch": 8.925573587017347, "grad_norm": 0.1466728150844574, "learning_rate": 1.0798215601092354e-05, "loss": 0.0106, "step": 15950 }, { "epoch": 8.931169557918299, "grad_norm": 0.18614622950553894, "learning_rate": 1.0746952474821614e-05, "loss": 0.0089, "step": 15960 }, { "epoch": 8.936765528819251, "grad_norm": 0.04307910427451134, "learning_rate": 1.069579666488395e-05, "loss": 0.0092, "step": 15970 }, { "epoch": 8.942361499720201, "grad_norm": 0.20207299292087555, "learning_rate": 1.0644748311137376e-05, "loss": 0.0077, "step": 15980 }, { "epoch": 8.947957470621153, "grad_norm": 0.12527382373809814, "learning_rate": 1.059380755314613e-05, "loss": 0.008, "step": 15990 }, { "epoch": 8.953553441522104, "grad_norm": 0.3143978416919708, "learning_rate": 1.0542974530180327e-05, "loss": 0.0061, "step": 16000 }, { "epoch": 8.959149412423056, "grad_norm": 0.0894945040345192, "learning_rate": 1.049224938121548e-05, "loss": 0.0041, "step": 16010 }, { "epoch": 8.964745383324006, "grad_norm": 0.23625624179840088, "learning_rate": 1.0441632244932237e-05, "loss": 0.0067, "step": 16020 }, { "epoch": 8.970341354224958, "grad_norm": 0.14668506383895874, "learning_rate": 1.0391123259715906e-05, "loss": 0.0056, "step": 16030 }, { "epoch": 8.975937325125908, "grad_norm": 0.17659400403499603, "learning_rate": 1.0340722563656107e-05, "loss": 0.0066, "step": 16040 }, { "epoch": 8.98153329602686, "grad_norm": 0.2076718956232071, "learning_rate": 1.0290430294546449e-05, "loss": 0.0074, "step": 16050 }, { "epoch": 8.987129266927813, "grad_norm": 0.1386403888463974, "learning_rate": 1.0240246589884044e-05, "loss": 0.0052, "step": 16060 }, { "epoch": 8.992725237828763, "grad_norm": 0.12247960269451141, "learning_rate": 1.0190171586869258e-05, "loss": 0.0059, "step": 16070 }, { "epoch": 8.998321208729715, "grad_norm": 0.08335962146520615, "learning_rate": 1.0140205422405214e-05, "loss": 0.0045, "step": 16080 }, { "epoch": 9.003917179630665, "grad_norm": 0.13206073641777039, "learning_rate": 1.009034823309749e-05, "loss": 0.0049, "step": 16090 }, { "epoch": 9.009513150531617, "grad_norm": 0.1473735272884369, "learning_rate": 1.0040600155253765e-05, "loss": 0.0035, "step": 16100 }, { "epoch": 9.015109121432568, "grad_norm": 0.07891960442066193, "learning_rate": 9.990961324883358e-06, "loss": 0.0064, "step": 16110 }, { "epoch": 9.02070509233352, "grad_norm": 0.16706879436969757, "learning_rate": 9.941431877696955e-06, "loss": 0.0039, "step": 16120 }, { "epoch": 9.026301063234472, "grad_norm": 0.0876656025648117, "learning_rate": 9.892011949106172e-06, "loss": 0.008, "step": 16130 }, { "epoch": 9.031897034135422, "grad_norm": 0.10205890983343124, "learning_rate": 9.842701674223187e-06, "loss": 0.0071, "step": 16140 }, { "epoch": 9.037493005036374, "grad_norm": 0.16774903237819672, "learning_rate": 9.793501187860432e-06, "loss": 0.0037, "step": 16150 }, { "epoch": 9.043088975937325, "grad_norm": 0.2676295340061188, "learning_rate": 9.744410624530148e-06, "loss": 0.0062, "step": 16160 }, { "epoch": 9.048684946838277, "grad_norm": 0.2096317857503891, "learning_rate": 9.695430118444048e-06, "loss": 0.0036, "step": 16170 }, { "epoch": 9.054280917739227, "grad_norm": 0.09436144679784775, "learning_rate": 9.646559803512994e-06, "loss": 0.0045, "step": 16180 }, { "epoch": 9.05987688864018, "grad_norm": 0.17315761744976044, "learning_rate": 9.597799813346525e-06, "loss": 0.0064, "step": 16190 }, { "epoch": 9.06547285954113, "grad_norm": 0.07326121628284454, "learning_rate": 9.549150281252633e-06, "loss": 0.0035, "step": 16200 }, { "epoch": 9.071068830442082, "grad_norm": 0.14720216393470764, "learning_rate": 9.500611340237258e-06, "loss": 0.0055, "step": 16210 }, { "epoch": 9.076664801343034, "grad_norm": 0.0691135823726654, "learning_rate": 9.452183123004e-06, "loss": 0.0077, "step": 16220 }, { "epoch": 9.082260772243984, "grad_norm": 0.13588427007198334, "learning_rate": 9.403865761953779e-06, "loss": 0.0046, "step": 16230 }, { "epoch": 9.087856743144936, "grad_norm": 0.13852879405021667, "learning_rate": 9.355659389184396e-06, "loss": 0.0046, "step": 16240 }, { "epoch": 9.093452714045887, "grad_norm": 0.0626252144575119, "learning_rate": 9.307564136490254e-06, "loss": 0.0069, "step": 16250 }, { "epoch": 9.099048684946839, "grad_norm": 0.25919991731643677, "learning_rate": 9.259580135361929e-06, "loss": 0.0046, "step": 16260 }, { "epoch": 9.104644655847789, "grad_norm": 0.0894588977098465, "learning_rate": 9.211707516985829e-06, "loss": 0.0046, "step": 16270 }, { "epoch": 9.110240626748741, "grad_norm": 0.45610806345939636, "learning_rate": 9.163946412243896e-06, "loss": 0.0069, "step": 16280 }, { "epoch": 9.115836597649691, "grad_norm": 0.1714649349451065, "learning_rate": 9.116296951713133e-06, "loss": 0.0058, "step": 16290 }, { "epoch": 9.121432568550643, "grad_norm": 0.20788055658340454, "learning_rate": 9.068759265665384e-06, "loss": 0.0046, "step": 16300 }, { "epoch": 9.127028539451596, "grad_norm": 0.13281454145908356, "learning_rate": 9.02133348406684e-06, "loss": 0.0073, "step": 16310 }, { "epoch": 9.132624510352546, "grad_norm": 0.20327745378017426, "learning_rate": 8.974019736577777e-06, "loss": 0.0061, "step": 16320 }, { "epoch": 9.138220481253498, "grad_norm": 0.1418776661157608, "learning_rate": 8.92681815255219e-06, "loss": 0.0054, "step": 16330 }, { "epoch": 9.143816452154448, "grad_norm": 0.08617481589317322, "learning_rate": 8.879728861037384e-06, "loss": 0.0057, "step": 16340 }, { "epoch": 9.1494124230554, "grad_norm": 0.14362642168998718, "learning_rate": 8.832751990773714e-06, "loss": 0.0059, "step": 16350 }, { "epoch": 9.15500839395635, "grad_norm": 0.05195459723472595, "learning_rate": 8.785887670194138e-06, "loss": 0.0063, "step": 16360 }, { "epoch": 9.160604364857303, "grad_norm": 0.1765775829553604, "learning_rate": 8.739136027423894e-06, "loss": 0.0075, "step": 16370 }, { "epoch": 9.166200335758255, "grad_norm": 0.1646648496389389, "learning_rate": 8.692497190280224e-06, "loss": 0.0065, "step": 16380 }, { "epoch": 9.171796306659205, "grad_norm": 0.16203129291534424, "learning_rate": 8.645971286271904e-06, "loss": 0.0049, "step": 16390 }, { "epoch": 9.177392277560157, "grad_norm": 0.07584717124700546, "learning_rate": 8.599558442598998e-06, "loss": 0.0071, "step": 16400 }, { "epoch": 9.182988248461108, "grad_norm": 0.14030073583126068, "learning_rate": 8.55325878615244e-06, "loss": 0.0033, "step": 16410 }, { "epoch": 9.18858421936206, "grad_norm": 0.09595508873462677, "learning_rate": 8.507072443513702e-06, "loss": 0.0034, "step": 16420 }, { "epoch": 9.19418019026301, "grad_norm": 0.2346934825181961, "learning_rate": 8.460999540954517e-06, "loss": 0.0091, "step": 16430 }, { "epoch": 9.199776161163962, "grad_norm": 0.11720654368400574, "learning_rate": 8.415040204436426e-06, "loss": 0.0056, "step": 16440 }, { "epoch": 9.205372132064912, "grad_norm": 0.18266266584396362, "learning_rate": 8.369194559610482e-06, "loss": 0.0044, "step": 16450 }, { "epoch": 9.210968102965865, "grad_norm": 0.11530566215515137, "learning_rate": 8.323462731816961e-06, "loss": 0.0091, "step": 16460 }, { "epoch": 9.216564073866817, "grad_norm": 0.15264108777046204, "learning_rate": 8.277844846084898e-06, "loss": 0.0056, "step": 16470 }, { "epoch": 9.222160044767767, "grad_norm": 0.12221037596464157, "learning_rate": 8.232341027131885e-06, "loss": 0.0046, "step": 16480 }, { "epoch": 9.227756015668719, "grad_norm": 0.18118728697299957, "learning_rate": 8.186951399363613e-06, "loss": 0.0048, "step": 16490 }, { "epoch": 9.23335198656967, "grad_norm": 0.11156457662582397, "learning_rate": 8.141676086873572e-06, "loss": 0.0038, "step": 16500 }, { "epoch": 9.238947957470621, "grad_norm": 0.24215921759605408, "learning_rate": 8.096515213442762e-06, "loss": 0.0053, "step": 16510 }, { "epoch": 9.244543928371572, "grad_norm": 0.1042838767170906, "learning_rate": 8.051468902539272e-06, "loss": 0.0038, "step": 16520 }, { "epoch": 9.250139899272524, "grad_norm": 0.15312840044498444, "learning_rate": 8.00653727731801e-06, "loss": 0.0056, "step": 16530 }, { "epoch": 9.255735870173474, "grad_norm": 0.12216275930404663, "learning_rate": 7.96172046062032e-06, "loss": 0.009, "step": 16540 }, { "epoch": 9.261331841074426, "grad_norm": 0.14912450313568115, "learning_rate": 7.917018574973645e-06, "loss": 0.0104, "step": 16550 }, { "epoch": 9.266927811975378, "grad_norm": 0.2108585089445114, "learning_rate": 7.872431742591268e-06, "loss": 0.0068, "step": 16560 }, { "epoch": 9.272523782876329, "grad_norm": 0.0906781554222107, "learning_rate": 7.827960085371855e-06, "loss": 0.0044, "step": 16570 }, { "epoch": 9.27811975377728, "grad_norm": 0.13947215676307678, "learning_rate": 7.783603724899257e-06, "loss": 0.0057, "step": 16580 }, { "epoch": 9.283715724678231, "grad_norm": 0.11844757199287415, "learning_rate": 7.739362782442021e-06, "loss": 0.0044, "step": 16590 }, { "epoch": 9.289311695579183, "grad_norm": 0.13809189200401306, "learning_rate": 7.695237378953223e-06, "loss": 0.0064, "step": 16600 }, { "epoch": 9.294907666480134, "grad_norm": 0.33429670333862305, "learning_rate": 7.651227635070041e-06, "loss": 0.0033, "step": 16610 }, { "epoch": 9.300503637381086, "grad_norm": 0.15949353575706482, "learning_rate": 7.607333671113409e-06, "loss": 0.0142, "step": 16620 }, { "epoch": 9.306099608282038, "grad_norm": 0.30085355043411255, "learning_rate": 7.56355560708778e-06, "loss": 0.0064, "step": 16630 }, { "epoch": 9.311695579182988, "grad_norm": 0.09114662557840347, "learning_rate": 7.519893562680663e-06, "loss": 0.0062, "step": 16640 }, { "epoch": 9.31729155008394, "grad_norm": 0.3248306214809418, "learning_rate": 7.476347657262456e-06, "loss": 0.0063, "step": 16650 }, { "epoch": 9.32288752098489, "grad_norm": 0.15951383113861084, "learning_rate": 7.432918009885997e-06, "loss": 0.0069, "step": 16660 }, { "epoch": 9.328483491885843, "grad_norm": 0.1393985003232956, "learning_rate": 7.389604739286271e-06, "loss": 0.0046, "step": 16670 }, { "epoch": 9.334079462786793, "grad_norm": 0.14699183404445648, "learning_rate": 7.3464079638801365e-06, "loss": 0.0047, "step": 16680 }, { "epoch": 9.339675433687745, "grad_norm": 0.14034835994243622, "learning_rate": 7.30332780176588e-06, "loss": 0.0068, "step": 16690 }, { "epoch": 9.345271404588695, "grad_norm": 0.202976793050766, "learning_rate": 7.260364370723044e-06, "loss": 0.007, "step": 16700 }, { "epoch": 9.350867375489647, "grad_norm": 0.1574084311723709, "learning_rate": 7.217517788212025e-06, "loss": 0.0037, "step": 16710 }, { "epoch": 9.3564633463906, "grad_norm": 0.23007866740226746, "learning_rate": 7.174788171373731e-06, "loss": 0.006, "step": 16720 }, { "epoch": 9.36205931729155, "grad_norm": 0.06488067656755447, "learning_rate": 7.132175637029293e-06, "loss": 0.0038, "step": 16730 }, { "epoch": 9.367655288192502, "grad_norm": 0.08520302921533585, "learning_rate": 7.089680301679752e-06, "loss": 0.0035, "step": 16740 }, { "epoch": 9.373251259093452, "grad_norm": 0.1132565289735794, "learning_rate": 7.047302281505736e-06, "loss": 0.0033, "step": 16750 }, { "epoch": 9.378847229994404, "grad_norm": 0.29900556802749634, "learning_rate": 7.005041692367154e-06, "loss": 0.0083, "step": 16760 }, { "epoch": 9.384443200895355, "grad_norm": 0.21089625358581543, "learning_rate": 6.962898649802823e-06, "loss": 0.004, "step": 16770 }, { "epoch": 9.390039171796307, "grad_norm": 0.1411179006099701, "learning_rate": 6.92087326903022e-06, "loss": 0.0051, "step": 16780 }, { "epoch": 9.395635142697259, "grad_norm": 0.20569784939289093, "learning_rate": 6.878965664945108e-06, "loss": 0.0057, "step": 16790 }, { "epoch": 9.40123111359821, "grad_norm": 0.13673344254493713, "learning_rate": 6.837175952121306e-06, "loss": 0.0029, "step": 16800 }, { "epoch": 9.406827084499161, "grad_norm": 0.07221835851669312, "learning_rate": 6.795504244810285e-06, "loss": 0.0028, "step": 16810 }, { "epoch": 9.412423055400112, "grad_norm": 0.15173490345478058, "learning_rate": 6.753950656940905e-06, "loss": 0.0055, "step": 16820 }, { "epoch": 9.418019026301064, "grad_norm": 0.12818996608257294, "learning_rate": 6.712515302119077e-06, "loss": 0.0047, "step": 16830 }, { "epoch": 9.423614997202014, "grad_norm": 0.2607164978981018, "learning_rate": 6.671198293627479e-06, "loss": 0.0062, "step": 16840 }, { "epoch": 9.429210968102966, "grad_norm": 0.1782405823469162, "learning_rate": 6.629999744425236e-06, "loss": 0.0038, "step": 16850 }, { "epoch": 9.434806939003916, "grad_norm": 0.1047229990363121, "learning_rate": 6.588919767147639e-06, "loss": 0.0038, "step": 16860 }, { "epoch": 9.440402909904869, "grad_norm": 0.21528460085391998, "learning_rate": 6.5479584741057255e-06, "loss": 0.0044, "step": 16870 }, { "epoch": 9.44599888080582, "grad_norm": 0.033052559942007065, "learning_rate": 6.5071159772861436e-06, "loss": 0.0043, "step": 16880 }, { "epoch": 9.451594851706771, "grad_norm": 0.08729052543640137, "learning_rate": 6.466392388350695e-06, "loss": 0.0067, "step": 16890 }, { "epoch": 9.457190822607723, "grad_norm": 0.1754913330078125, "learning_rate": 6.425787818636131e-06, "loss": 0.0038, "step": 16900 }, { "epoch": 9.462786793508673, "grad_norm": 0.13821344077587128, "learning_rate": 6.385302379153818e-06, "loss": 0.0046, "step": 16910 }, { "epoch": 9.468382764409625, "grad_norm": 0.1275906264781952, "learning_rate": 6.344936180589351e-06, "loss": 0.0036, "step": 16920 }, { "epoch": 9.473978735310576, "grad_norm": 0.14954271912574768, "learning_rate": 6.304689333302416e-06, "loss": 0.0034, "step": 16930 }, { "epoch": 9.479574706211528, "grad_norm": 0.12982557713985443, "learning_rate": 6.264561947326331e-06, "loss": 0.0043, "step": 16940 }, { "epoch": 9.485170677112478, "grad_norm": 0.06912703812122345, "learning_rate": 6.22455413236786e-06, "loss": 0.0055, "step": 16950 }, { "epoch": 9.49076664801343, "grad_norm": 0.19244985282421112, "learning_rate": 6.184665997806832e-06, "loss": 0.0043, "step": 16960 }, { "epoch": 9.496362618914382, "grad_norm": 0.08739597350358963, "learning_rate": 6.144897652695864e-06, "loss": 0.0151, "step": 16970 }, { "epoch": 9.501958589815333, "grad_norm": 0.11885930597782135, "learning_rate": 6.1052492057601275e-06, "loss": 0.0073, "step": 16980 }, { "epoch": 9.507554560716285, "grad_norm": 0.07571222633123398, "learning_rate": 6.0657207653969315e-06, "loss": 0.0032, "step": 16990 }, { "epoch": 9.513150531617235, "grad_norm": 0.07605729252099991, "learning_rate": 6.026312439675552e-06, "loss": 0.0036, "step": 17000 }, { "epoch": 9.518746502518187, "grad_norm": 0.20224310457706451, "learning_rate": 5.9870243363368275e-06, "loss": 0.0055, "step": 17010 }, { "epoch": 9.524342473419138, "grad_norm": 0.09693833440542221, "learning_rate": 5.947856562792925e-06, "loss": 0.0048, "step": 17020 }, { "epoch": 9.52993844432009, "grad_norm": 0.13180632889270782, "learning_rate": 5.908809226127054e-06, "loss": 0.0052, "step": 17030 }, { "epoch": 9.53553441522104, "grad_norm": 0.18198780715465546, "learning_rate": 5.869882433093155e-06, "loss": 0.0053, "step": 17040 }, { "epoch": 9.541130386121992, "grad_norm": 0.08620735257863998, "learning_rate": 5.831076290115573e-06, "loss": 0.0047, "step": 17050 }, { "epoch": 9.546726357022944, "grad_norm": 0.18070462346076965, "learning_rate": 5.79239090328883e-06, "loss": 0.005, "step": 17060 }, { "epoch": 9.552322327923894, "grad_norm": 0.13954901695251465, "learning_rate": 5.753826378377286e-06, "loss": 0.0037, "step": 17070 }, { "epoch": 9.557918298824847, "grad_norm": 0.08338068425655365, "learning_rate": 5.715382820814885e-06, "loss": 0.0035, "step": 17080 }, { "epoch": 9.563514269725797, "grad_norm": 0.1206720620393753, "learning_rate": 5.67706033570487e-06, "loss": 0.0071, "step": 17090 }, { "epoch": 9.569110240626749, "grad_norm": 0.1978680044412613, "learning_rate": 5.6388590278194096e-06, "loss": 0.0048, "step": 17100 }, { "epoch": 9.5747062115277, "grad_norm": 0.2190864086151123, "learning_rate": 5.600779001599455e-06, "loss": 0.0043, "step": 17110 }, { "epoch": 9.580302182428651, "grad_norm": 0.0734127014875412, "learning_rate": 5.562820361154314e-06, "loss": 0.0049, "step": 17120 }, { "epoch": 9.585898153329603, "grad_norm": 0.14367960393428802, "learning_rate": 5.524983210261481e-06, "loss": 0.0035, "step": 17130 }, { "epoch": 9.591494124230554, "grad_norm": 0.26178881525993347, "learning_rate": 5.48726765236629e-06, "loss": 0.005, "step": 17140 }, { "epoch": 9.597090095131506, "grad_norm": 0.10900067538022995, "learning_rate": 5.449673790581611e-06, "loss": 0.0065, "step": 17150 }, { "epoch": 9.602686066032456, "grad_norm": 0.16984951496124268, "learning_rate": 5.412201727687644e-06, "loss": 0.0051, "step": 17160 }, { "epoch": 9.608282036933408, "grad_norm": 0.0894961804151535, "learning_rate": 5.374851566131561e-06, "loss": 0.0038, "step": 17170 }, { "epoch": 9.613878007834359, "grad_norm": 0.25771039724349976, "learning_rate": 5.337623408027293e-06, "loss": 0.0073, "step": 17180 }, { "epoch": 9.61947397873531, "grad_norm": 0.14566998183727264, "learning_rate": 5.300517355155215e-06, "loss": 0.0046, "step": 17190 }, { "epoch": 9.625069949636263, "grad_norm": 0.17133091390132904, "learning_rate": 5.263533508961827e-06, "loss": 0.0073, "step": 17200 }, { "epoch": 9.630665920537213, "grad_norm": 0.16593864560127258, "learning_rate": 5.226671970559577e-06, "loss": 0.0053, "step": 17210 }, { "epoch": 9.636261891438165, "grad_norm": 0.11243371665477753, "learning_rate": 5.1899328407264855e-06, "loss": 0.0043, "step": 17220 }, { "epoch": 9.641857862339116, "grad_norm": 0.15767988562583923, "learning_rate": 5.153316219905946e-06, "loss": 0.0072, "step": 17230 }, { "epoch": 9.647453833240068, "grad_norm": 0.2645623981952667, "learning_rate": 5.116822208206396e-06, "loss": 0.0052, "step": 17240 }, { "epoch": 9.653049804141018, "grad_norm": 0.08610297739505768, "learning_rate": 5.080450905401057e-06, "loss": 0.0056, "step": 17250 }, { "epoch": 9.65864577504197, "grad_norm": 0.08036172389984131, "learning_rate": 5.044202410927706e-06, "loss": 0.0036, "step": 17260 }, { "epoch": 9.66424174594292, "grad_norm": 0.18519535660743713, "learning_rate": 5.008076823888319e-06, "loss": 0.0057, "step": 17270 }, { "epoch": 9.669837716843872, "grad_norm": 0.19542230665683746, "learning_rate": 4.972074243048897e-06, "loss": 0.0036, "step": 17280 }, { "epoch": 9.675433687744825, "grad_norm": 0.21911007165908813, "learning_rate": 4.936194766839103e-06, "loss": 0.0039, "step": 17290 }, { "epoch": 9.681029658645775, "grad_norm": 0.14355053007602692, "learning_rate": 4.900438493352055e-06, "loss": 0.0052, "step": 17300 }, { "epoch": 9.686625629546727, "grad_norm": 0.34103378653526306, "learning_rate": 4.864805520344051e-06, "loss": 0.0063, "step": 17310 }, { "epoch": 9.692221600447677, "grad_norm": 0.18420292437076569, "learning_rate": 4.829295945234258e-06, "loss": 0.0046, "step": 17320 }, { "epoch": 9.69781757134863, "grad_norm": 0.11074794083833694, "learning_rate": 4.7939098651045235e-06, "loss": 0.0056, "step": 17330 }, { "epoch": 9.70341354224958, "grad_norm": 0.1706562340259552, "learning_rate": 4.758647376699032e-06, "loss": 0.0038, "step": 17340 }, { "epoch": 9.709009513150532, "grad_norm": 0.16499456763267517, "learning_rate": 4.723508576424062e-06, "loss": 0.0046, "step": 17350 }, { "epoch": 9.714605484051482, "grad_norm": 0.08222458511590958, "learning_rate": 4.688493560347773e-06, "loss": 0.0062, "step": 17360 }, { "epoch": 9.720201454952434, "grad_norm": 0.13518883287906647, "learning_rate": 4.653602424199876e-06, "loss": 0.0086, "step": 17370 }, { "epoch": 9.725797425853386, "grad_norm": 0.16546756029129028, "learning_rate": 4.618835263371396e-06, "loss": 0.0051, "step": 17380 }, { "epoch": 9.731393396754337, "grad_norm": 0.31760314106941223, "learning_rate": 4.5841921729144424e-06, "loss": 0.0056, "step": 17390 }, { "epoch": 9.736989367655289, "grad_norm": 0.11362655460834503, "learning_rate": 4.549673247541875e-06, "loss": 0.0085, "step": 17400 }, { "epoch": 9.742585338556239, "grad_norm": 0.12480427324771881, "learning_rate": 4.515278581627141e-06, "loss": 0.003, "step": 17410 }, { "epoch": 9.748181309457191, "grad_norm": 0.09458563476800919, "learning_rate": 4.48100826920394e-06, "loss": 0.0043, "step": 17420 }, { "epoch": 9.753777280358142, "grad_norm": 0.15045048296451569, "learning_rate": 4.446862403965984e-06, "loss": 0.0035, "step": 17430 }, { "epoch": 9.759373251259094, "grad_norm": 0.10754050314426422, "learning_rate": 4.412841079266777e-06, "loss": 0.0059, "step": 17440 }, { "epoch": 9.764969222160044, "grad_norm": 0.09626353532075882, "learning_rate": 4.378944388119311e-06, "loss": 0.0064, "step": 17450 }, { "epoch": 9.770565193060996, "grad_norm": 0.0682365670800209, "learning_rate": 4.3451724231958644e-06, "loss": 0.0039, "step": 17460 }, { "epoch": 9.776161163961948, "grad_norm": 0.0859832614660263, "learning_rate": 4.311525276827682e-06, "loss": 0.0038, "step": 17470 }, { "epoch": 9.781757134862898, "grad_norm": 0.057302311062812805, "learning_rate": 4.27800304100478e-06, "loss": 0.0061, "step": 17480 }, { "epoch": 9.78735310576385, "grad_norm": 0.30939188599586487, "learning_rate": 4.244605807375679e-06, "loss": 0.0072, "step": 17490 }, { "epoch": 9.7929490766648, "grad_norm": 0.06655000895261765, "learning_rate": 4.2113336672471245e-06, "loss": 0.006, "step": 17500 }, { "epoch": 9.798545047565753, "grad_norm": 0.07795148342847824, "learning_rate": 4.178186711583904e-06, "loss": 0.0064, "step": 17510 }, { "epoch": 9.804141018466703, "grad_norm": 0.06218419224023819, "learning_rate": 4.145165031008508e-06, "loss": 0.0041, "step": 17520 }, { "epoch": 9.809736989367655, "grad_norm": 0.064509816467762, "learning_rate": 4.112268715800943e-06, "loss": 0.0048, "step": 17530 }, { "epoch": 9.815332960268606, "grad_norm": 0.2096703052520752, "learning_rate": 4.079497855898501e-06, "loss": 0.0049, "step": 17540 }, { "epoch": 9.820928931169558, "grad_norm": 0.15621553361415863, "learning_rate": 4.046852540895446e-06, "loss": 0.0046, "step": 17550 }, { "epoch": 9.82652490207051, "grad_norm": 0.089202381670475, "learning_rate": 4.01433286004283e-06, "loss": 0.0078, "step": 17560 }, { "epoch": 9.83212087297146, "grad_norm": 0.11227259039878845, "learning_rate": 3.981938902248222e-06, "loss": 0.0046, "step": 17570 }, { "epoch": 9.837716843872412, "grad_norm": 0.038788773119449615, "learning_rate": 3.949670756075447e-06, "loss": 0.0093, "step": 17580 }, { "epoch": 9.843312814773363, "grad_norm": 0.1287786364555359, "learning_rate": 3.917528509744412e-06, "loss": 0.0041, "step": 17590 }, { "epoch": 9.848908785674315, "grad_norm": 0.04712485149502754, "learning_rate": 3.885512251130763e-06, "loss": 0.0046, "step": 17600 }, { "epoch": 9.854504756575265, "grad_norm": 0.24810890853405, "learning_rate": 3.8536220677657495e-06, "loss": 0.0112, "step": 17610 }, { "epoch": 9.860100727476217, "grad_norm": 0.16745951771736145, "learning_rate": 3.821858046835913e-06, "loss": 0.0038, "step": 17620 }, { "epoch": 9.86569669837717, "grad_norm": 0.10218873620033264, "learning_rate": 3.790220275182854e-06, "loss": 0.0037, "step": 17630 }, { "epoch": 9.87129266927812, "grad_norm": 0.19612161815166473, "learning_rate": 3.75870883930306e-06, "loss": 0.004, "step": 17640 }, { "epoch": 9.876888640179072, "grad_norm": 0.20635591447353363, "learning_rate": 3.7273238253475785e-06, "loss": 0.0081, "step": 17650 }, { "epoch": 9.882484611080022, "grad_norm": 0.154740571975708, "learning_rate": 3.696065319121833e-06, "loss": 0.0049, "step": 17660 }, { "epoch": 9.888080581980974, "grad_norm": 0.046477749943733215, "learning_rate": 3.664933406085402e-06, "loss": 0.0055, "step": 17670 }, { "epoch": 9.893676552881924, "grad_norm": 0.20742470026016235, "learning_rate": 3.6339281713517303e-06, "loss": 0.0027, "step": 17680 }, { "epoch": 9.899272523782876, "grad_norm": 0.07390665262937546, "learning_rate": 3.60304969968796e-06, "loss": 0.0035, "step": 17690 }, { "epoch": 9.904868494683829, "grad_norm": 0.12964075803756714, "learning_rate": 3.5722980755146517e-06, "loss": 0.0066, "step": 17700 }, { "epoch": 9.910464465584779, "grad_norm": 0.05571340024471283, "learning_rate": 3.541673382905558e-06, "loss": 0.008, "step": 17710 }, { "epoch": 9.916060436485731, "grad_norm": 0.12276771664619446, "learning_rate": 3.511175705587433e-06, "loss": 0.0069, "step": 17720 }, { "epoch": 9.921656407386681, "grad_norm": 0.09888763725757599, "learning_rate": 3.4808051269397512e-06, "loss": 0.0036, "step": 17730 }, { "epoch": 9.927252378287633, "grad_norm": 0.08338962495326996, "learning_rate": 3.4505617299945336e-06, "loss": 0.004, "step": 17740 }, { "epoch": 9.932848349188584, "grad_norm": 0.06845631450414658, "learning_rate": 3.420445597436056e-06, "loss": 0.0037, "step": 17750 }, { "epoch": 9.938444320089536, "grad_norm": 0.072002112865448, "learning_rate": 3.390456811600673e-06, "loss": 0.0049, "step": 17760 }, { "epoch": 9.944040290990486, "grad_norm": 0.13706427812576294, "learning_rate": 3.360595454476595e-06, "loss": 0.0067, "step": 17770 }, { "epoch": 9.949636261891438, "grad_norm": 0.14595244824886322, "learning_rate": 3.3308616077036115e-06, "loss": 0.0047, "step": 17780 }, { "epoch": 9.95523223279239, "grad_norm": 0.07961612939834595, "learning_rate": 3.301255352572946e-06, "loss": 0.0035, "step": 17790 }, { "epoch": 9.96082820369334, "grad_norm": 0.10814230144023895, "learning_rate": 3.271776770026963e-06, "loss": 0.0048, "step": 17800 }, { "epoch": 9.966424174594293, "grad_norm": 0.11842755228281021, "learning_rate": 3.2424259406589664e-06, "loss": 0.0095, "step": 17810 }, { "epoch": 9.972020145495243, "grad_norm": 0.21332372725009918, "learning_rate": 3.213202944713023e-06, "loss": 0.003, "step": 17820 }, { "epoch": 9.977616116396195, "grad_norm": 0.06386691331863403, "learning_rate": 3.1841078620836683e-06, "loss": 0.0036, "step": 17830 }, { "epoch": 9.983212087297145, "grad_norm": 0.08316194266080856, "learning_rate": 3.155140772315773e-06, "loss": 0.0042, "step": 17840 }, { "epoch": 9.988808058198098, "grad_norm": 0.16622905433177948, "learning_rate": 3.126301754604233e-06, "loss": 0.0039, "step": 17850 }, { "epoch": 9.994404029099048, "grad_norm": 0.11861821264028549, "learning_rate": 3.0975908877938277e-06, "loss": 0.0048, "step": 17860 }, { "epoch": 10.0, "grad_norm": 0.1722375601530075, "learning_rate": 3.0690082503789742e-06, "loss": 0.0026, "step": 17870 }, { "epoch": 10.005595970900952, "grad_norm": 0.06653541326522827, "learning_rate": 3.040553920503503e-06, "loss": 0.0048, "step": 17880 }, { "epoch": 10.011191941801902, "grad_norm": 0.16646505892276764, "learning_rate": 3.0122279759604745e-06, "loss": 0.004, "step": 17890 }, { "epoch": 10.016787912702855, "grad_norm": 0.07118295133113861, "learning_rate": 2.9840304941919415e-06, "loss": 0.0066, "step": 17900 }, { "epoch": 10.022383883603805, "grad_norm": 0.15453752875328064, "learning_rate": 2.9559615522887273e-06, "loss": 0.0052, "step": 17910 }, { "epoch": 10.027979854504757, "grad_norm": 0.23914295434951782, "learning_rate": 2.928021226990263e-06, "loss": 0.0042, "step": 17920 }, { "epoch": 10.033575825405707, "grad_norm": 0.09927842766046524, "learning_rate": 2.9002095946843277e-06, "loss": 0.0053, "step": 17930 }, { "epoch": 10.03917179630666, "grad_norm": 0.039526671171188354, "learning_rate": 2.8725267314068495e-06, "loss": 0.0029, "step": 17940 }, { "epoch": 10.04476776720761, "grad_norm": 0.1683174967765808, "learning_rate": 2.844972712841737e-06, "loss": 0.0042, "step": 17950 }, { "epoch": 10.050363738108562, "grad_norm": 0.10315953940153122, "learning_rate": 2.817547614320615e-06, "loss": 0.0096, "step": 17960 }, { "epoch": 10.055959709009514, "grad_norm": 0.17959141731262207, "learning_rate": 2.790251510822661e-06, "loss": 0.0048, "step": 17970 }, { "epoch": 10.061555679910464, "grad_norm": 0.18458683788776398, "learning_rate": 2.7630844769743757e-06, "loss": 0.0051, "step": 17980 }, { "epoch": 10.067151650811416, "grad_norm": 0.19159017503261566, "learning_rate": 2.73604658704939e-06, "loss": 0.0054, "step": 17990 }, { "epoch": 10.072747621712367, "grad_norm": 0.08318327367305756, "learning_rate": 2.7091379149682685e-06, "loss": 0.0053, "step": 18000 }, { "epoch": 10.078343592613319, "grad_norm": 0.07472005486488342, "learning_rate": 2.682358534298285e-06, "loss": 0.006, "step": 18010 }, { "epoch": 10.083939563514269, "grad_norm": 0.09040942043066025, "learning_rate": 2.6557085182532582e-06, "loss": 0.004, "step": 18020 }, { "epoch": 10.089535534415221, "grad_norm": 0.037220001220703125, "learning_rate": 2.6291879396933004e-06, "loss": 0.0038, "step": 18030 }, { "epoch": 10.095131505316173, "grad_norm": 0.11240635067224503, "learning_rate": 2.602796871124663e-06, "loss": 0.0031, "step": 18040 }, { "epoch": 10.100727476217124, "grad_norm": 0.12259605526924133, "learning_rate": 2.57653538469953e-06, "loss": 0.0049, "step": 18050 }, { "epoch": 10.106323447118076, "grad_norm": 0.16758129000663757, "learning_rate": 2.5504035522157854e-06, "loss": 0.0066, "step": 18060 }, { "epoch": 10.111919418019026, "grad_norm": 0.10704974085092545, "learning_rate": 2.5244014451168863e-06, "loss": 0.0021, "step": 18070 }, { "epoch": 10.117515388919978, "grad_norm": 0.19684171676635742, "learning_rate": 2.4985291344915674e-06, "loss": 0.0035, "step": 18080 }, { "epoch": 10.123111359820928, "grad_norm": 0.25069093704223633, "learning_rate": 2.4727866910737583e-06, "loss": 0.0038, "step": 18090 }, { "epoch": 10.12870733072188, "grad_norm": 0.15888355672359467, "learning_rate": 2.4471741852423237e-06, "loss": 0.0055, "step": 18100 }, { "epoch": 10.13430330162283, "grad_norm": 0.1355513483285904, "learning_rate": 2.421691687020855e-06, "loss": 0.0032, "step": 18110 }, { "epoch": 10.139899272523783, "grad_norm": 0.09521888941526413, "learning_rate": 2.3963392660775575e-06, "loss": 0.0072, "step": 18120 }, { "epoch": 10.145495243424735, "grad_norm": 0.18774038553237915, "learning_rate": 2.371116991724953e-06, "loss": 0.0028, "step": 18130 }, { "epoch": 10.151091214325685, "grad_norm": 0.06293562054634094, "learning_rate": 2.3460249329197824e-06, "loss": 0.0032, "step": 18140 }, { "epoch": 10.156687185226637, "grad_norm": 0.25169095396995544, "learning_rate": 2.321063158262793e-06, "loss": 0.0092, "step": 18150 }, { "epoch": 10.162283156127588, "grad_norm": 0.08376752585172653, "learning_rate": 2.296231735998511e-06, "loss": 0.0021, "step": 18160 }, { "epoch": 10.16787912702854, "grad_norm": 0.06758670508861542, "learning_rate": 2.271530734015104e-06, "loss": 0.0036, "step": 18170 }, { "epoch": 10.17347509792949, "grad_norm": 0.06193256378173828, "learning_rate": 2.2469602198441573e-06, "loss": 0.0036, "step": 18180 }, { "epoch": 10.179071068830442, "grad_norm": 0.21087805926799774, "learning_rate": 2.222520260660521e-06, "loss": 0.0043, "step": 18190 }, { "epoch": 10.184667039731393, "grad_norm": 0.09581877291202545, "learning_rate": 2.1982109232821178e-06, "loss": 0.0048, "step": 18200 }, { "epoch": 10.190263010632345, "grad_norm": 0.23187117278575897, "learning_rate": 2.174032274169746e-06, "loss": 0.0068, "step": 18210 }, { "epoch": 10.195858981533297, "grad_norm": 0.1904383897781372, "learning_rate": 2.149984379426906e-06, "loss": 0.0036, "step": 18220 }, { "epoch": 10.201454952434247, "grad_norm": 0.04588289558887482, "learning_rate": 2.1260673047996227e-06, "loss": 0.0075, "step": 18230 }, { "epoch": 10.2070509233352, "grad_norm": 0.05446457862854004, "learning_rate": 2.102281115676258e-06, "loss": 0.0036, "step": 18240 }, { "epoch": 10.21264689423615, "grad_norm": 0.12907229363918304, "learning_rate": 2.0786258770873647e-06, "loss": 0.0043, "step": 18250 }, { "epoch": 10.218242865137102, "grad_norm": 0.0724627822637558, "learning_rate": 2.0551016537054493e-06, "loss": 0.0024, "step": 18260 }, { "epoch": 10.223838836038052, "grad_norm": 0.11797565221786499, "learning_rate": 2.0317085098448372e-06, "loss": 0.0032, "step": 18270 }, { "epoch": 10.229434806939004, "grad_norm": 0.1239556148648262, "learning_rate": 2.008446509461498e-06, "loss": 0.0038, "step": 18280 }, { "epoch": 10.235030777839956, "grad_norm": 0.05614084377884865, "learning_rate": 1.985315716152847e-06, "loss": 0.0041, "step": 18290 }, { "epoch": 10.240626748740906, "grad_norm": 0.2968387007713318, "learning_rate": 1.962316193157593e-06, "loss": 0.0092, "step": 18300 }, { "epoch": 10.246222719641858, "grad_norm": 0.11529407650232315, "learning_rate": 1.939448003355554e-06, "loss": 0.0059, "step": 18310 }, { "epoch": 10.251818690542809, "grad_norm": 0.24037353694438934, "learning_rate": 1.91671120926748e-06, "loss": 0.0045, "step": 18320 }, { "epoch": 10.257414661443761, "grad_norm": 0.20346900820732117, "learning_rate": 1.8941058730549132e-06, "loss": 0.0047, "step": 18330 }, { "epoch": 10.263010632344711, "grad_norm": 0.27883380651474, "learning_rate": 1.8716320565199618e-06, "loss": 0.0049, "step": 18340 }, { "epoch": 10.268606603245663, "grad_norm": 0.12232355028390884, "learning_rate": 1.849289821105199e-06, "loss": 0.0077, "step": 18350 }, { "epoch": 10.274202574146614, "grad_norm": 0.09397400170564651, "learning_rate": 1.8270792278934302e-06, "loss": 0.0039, "step": 18360 }, { "epoch": 10.279798545047566, "grad_norm": 0.13843244314193726, "learning_rate": 1.8050003376075707e-06, "loss": 0.0059, "step": 18370 }, { "epoch": 10.285394515948518, "grad_norm": 0.04927824065089226, "learning_rate": 1.7830532106104747e-06, "loss": 0.003, "step": 18380 }, { "epoch": 10.290990486849468, "grad_norm": 0.2848436236381531, "learning_rate": 1.7612379069047335e-06, "loss": 0.004, "step": 18390 }, { "epoch": 10.29658645775042, "grad_norm": 0.10808296501636505, "learning_rate": 1.7395544861325718e-06, "loss": 0.0072, "step": 18400 }, { "epoch": 10.30218242865137, "grad_norm": 0.08363109827041626, "learning_rate": 1.7180030075756136e-06, "loss": 0.0029, "step": 18410 }, { "epoch": 10.307778399552323, "grad_norm": 0.07970738410949707, "learning_rate": 1.696583530154794e-06, "loss": 0.0058, "step": 18420 }, { "epoch": 10.313374370453273, "grad_norm": 0.06155739724636078, "learning_rate": 1.6752961124301415e-06, "loss": 0.0042, "step": 18430 }, { "epoch": 10.318970341354225, "grad_norm": 0.15518154203891754, "learning_rate": 1.6541408126006463e-06, "loss": 0.006, "step": 18440 }, { "epoch": 10.324566312255175, "grad_norm": 0.06478218734264374, "learning_rate": 1.6331176885040878e-06, "loss": 0.0083, "step": 18450 }, { "epoch": 10.330162283156128, "grad_norm": 0.11871203780174255, "learning_rate": 1.6122267976168781e-06, "loss": 0.0046, "step": 18460 }, { "epoch": 10.33575825405708, "grad_norm": 0.13164940476417542, "learning_rate": 1.5914681970539192e-06, "loss": 0.0055, "step": 18470 }, { "epoch": 10.34135422495803, "grad_norm": 0.08165992051362991, "learning_rate": 1.5708419435684462e-06, "loss": 0.0065, "step": 18480 }, { "epoch": 10.346950195858982, "grad_norm": 0.06479761004447937, "learning_rate": 1.550348093551829e-06, "loss": 0.0044, "step": 18490 }, { "epoch": 10.352546166759932, "grad_norm": 0.24080127477645874, "learning_rate": 1.5299867030334814e-06, "loss": 0.0085, "step": 18500 }, { "epoch": 10.358142137660884, "grad_norm": 0.1411421000957489, "learning_rate": 1.5097578276806633e-06, "loss": 0.0045, "step": 18510 }, { "epoch": 10.363738108561835, "grad_norm": 0.058580052107572556, "learning_rate": 1.4896615227983468e-06, "loss": 0.0041, "step": 18520 }, { "epoch": 10.369334079462787, "grad_norm": 0.1638147383928299, "learning_rate": 1.4696978433290653e-06, "loss": 0.0054, "step": 18530 }, { "epoch": 10.374930050363739, "grad_norm": 0.05566524341702461, "learning_rate": 1.4498668438527597e-06, "loss": 0.004, "step": 18540 }, { "epoch": 10.38052602126469, "grad_norm": 0.07601140439510345, "learning_rate": 1.4301685785866214e-06, "loss": 0.0034, "step": 18550 }, { "epoch": 10.386121992165641, "grad_norm": 0.10449633747339249, "learning_rate": 1.4106031013849496e-06, "loss": 0.0041, "step": 18560 }, { "epoch": 10.391717963066592, "grad_norm": 0.15937356650829315, "learning_rate": 1.3911704657390113e-06, "loss": 0.0039, "step": 18570 }, { "epoch": 10.397313933967544, "grad_norm": 0.059475306421518326, "learning_rate": 1.3718707247769135e-06, "loss": 0.006, "step": 18580 }, { "epoch": 10.402909904868494, "grad_norm": 0.24354378879070282, "learning_rate": 1.3527039312633827e-06, "loss": 0.0042, "step": 18590 }, { "epoch": 10.408505875769446, "grad_norm": 0.20878778398036957, "learning_rate": 1.333670137599713e-06, "loss": 0.0107, "step": 18600 }, { "epoch": 10.414101846670397, "grad_norm": 0.1909496784210205, "learning_rate": 1.3147693958235618e-06, "loss": 0.0034, "step": 18610 }, { "epoch": 10.419697817571349, "grad_norm": 0.13632823526859283, "learning_rate": 1.2960017576088446e-06, "loss": 0.0066, "step": 18620 }, { "epoch": 10.4252937884723, "grad_norm": 0.10793755203485489, "learning_rate": 1.2773672742655784e-06, "loss": 0.0037, "step": 18630 }, { "epoch": 10.430889759373251, "grad_norm": 0.10346037149429321, "learning_rate": 1.2588659967397e-06, "loss": 0.0044, "step": 18640 }, { "epoch": 10.436485730274203, "grad_norm": 0.08834080398082733, "learning_rate": 1.2404979756130142e-06, "loss": 0.0037, "step": 18650 }, { "epoch": 10.442081701175153, "grad_norm": 0.09045784175395966, "learning_rate": 1.222263261102985e-06, "loss": 0.0052, "step": 18660 }, { "epoch": 10.447677672076106, "grad_norm": 0.07731129229068756, "learning_rate": 1.2041619030626284e-06, "loss": 0.0071, "step": 18670 }, { "epoch": 10.453273642977056, "grad_norm": 0.08769071102142334, "learning_rate": 1.1861939509803687e-06, "loss": 0.0044, "step": 18680 }, { "epoch": 10.458869613878008, "grad_norm": 0.15766629576683044, "learning_rate": 1.1683594539798893e-06, "loss": 0.0063, "step": 18690 }, { "epoch": 10.46446558477896, "grad_norm": 0.11048921942710876, "learning_rate": 1.1506584608200367e-06, "loss": 0.0033, "step": 18700 }, { "epoch": 10.47006155567991, "grad_norm": 0.25674813985824585, "learning_rate": 1.1330910198946442e-06, "loss": 0.0047, "step": 18710 }, { "epoch": 10.475657526580862, "grad_norm": 0.09696432203054428, "learning_rate": 1.1156571792324211e-06, "loss": 0.0038, "step": 18720 }, { "epoch": 10.481253497481813, "grad_norm": 0.17716100811958313, "learning_rate": 1.0983569864968346e-06, "loss": 0.0085, "step": 18730 }, { "epoch": 10.486849468382765, "grad_norm": 0.18763263523578644, "learning_rate": 1.0811904889859336e-06, "loss": 0.009, "step": 18740 }, { "epoch": 10.492445439283715, "grad_norm": 0.047968145459890366, "learning_rate": 1.064157733632276e-06, "loss": 0.0051, "step": 18750 }, { "epoch": 10.498041410184667, "grad_norm": 0.1565999537706375, "learning_rate": 1.0472587670027678e-06, "loss": 0.0062, "step": 18760 }, { "epoch": 10.503637381085618, "grad_norm": 0.06519567221403122, "learning_rate": 1.030493635298535e-06, "loss": 0.0073, "step": 18770 }, { "epoch": 10.50923335198657, "grad_norm": 0.10364692658185959, "learning_rate": 1.0138623843548078e-06, "loss": 0.0051, "step": 18780 }, { "epoch": 10.514829322887522, "grad_norm": 0.036633651703596115, "learning_rate": 9.97365059640787e-07, "loss": 0.0062, "step": 18790 }, { "epoch": 10.520425293788472, "grad_norm": 0.2015930861234665, "learning_rate": 9.810017062595322e-07, "loss": 0.0037, "step": 18800 }, { "epoch": 10.526021264689424, "grad_norm": 0.1180974468588829, "learning_rate": 9.647723689478305e-07, "loss": 0.0039, "step": 18810 }, { "epoch": 10.531617235590375, "grad_norm": 0.07416771352291107, "learning_rate": 9.486770920760668e-07, "loss": 0.0041, "step": 18820 }, { "epoch": 10.537213206491327, "grad_norm": 0.05668334290385246, "learning_rate": 9.327159196481138e-07, "loss": 0.0059, "step": 18830 }, { "epoch": 10.542809177392277, "grad_norm": 0.07584750652313232, "learning_rate": 9.168888953011989e-07, "loss": 0.0054, "step": 18840 }, { "epoch": 10.548405148293229, "grad_norm": 0.06703902035951614, "learning_rate": 9.011960623058202e-07, "loss": 0.0039, "step": 18850 }, { "epoch": 10.55400111919418, "grad_norm": 0.06538796424865723, "learning_rate": 8.856374635655695e-07, "loss": 0.0035, "step": 18860 }, { "epoch": 10.559597090095131, "grad_norm": 0.09234767407178879, "learning_rate": 8.702131416170656e-07, "loss": 0.0047, "step": 18870 }, { "epoch": 10.565193060996084, "grad_norm": 0.09068552404642105, "learning_rate": 8.549231386298151e-07, "loss": 0.0032, "step": 18880 }, { "epoch": 10.570789031897034, "grad_norm": 0.2574044466018677, "learning_rate": 8.397674964061075e-07, "loss": 0.0123, "step": 18890 }, { "epoch": 10.576385002797986, "grad_norm": 0.1742398738861084, "learning_rate": 8.247462563808817e-07, "loss": 0.005, "step": 18900 }, { "epoch": 10.581980973698936, "grad_norm": 0.19498533010482788, "learning_rate": 8.098594596216424e-07, "loss": 0.0051, "step": 18910 }, { "epoch": 10.587576944599888, "grad_norm": 0.1093849390745163, "learning_rate": 7.951071468283167e-07, "loss": 0.0062, "step": 18920 }, { "epoch": 10.593172915500839, "grad_norm": 0.05242215842008591, "learning_rate": 7.804893583331696e-07, "loss": 0.0049, "step": 18930 }, { "epoch": 10.59876888640179, "grad_norm": 0.06830724328756332, "learning_rate": 7.66006134100672e-07, "loss": 0.0031, "step": 18940 }, { "epoch": 10.604364857302741, "grad_norm": 0.08541436493396759, "learning_rate": 7.516575137274162e-07, "loss": 0.0044, "step": 18950 }, { "epoch": 10.609960828203693, "grad_norm": 0.042029768228530884, "learning_rate": 7.374435364419674e-07, "loss": 0.0043, "step": 18960 }, { "epoch": 10.615556799104645, "grad_norm": 0.12100391089916229, "learning_rate": 7.233642411048014e-07, "loss": 0.0032, "step": 18970 }, { "epoch": 10.621152770005596, "grad_norm": 0.04842936620116234, "learning_rate": 7.094196662081831e-07, "loss": 0.0052, "step": 18980 }, { "epoch": 10.626748740906548, "grad_norm": 0.13397961854934692, "learning_rate": 6.956098498760389e-07, "loss": 0.0056, "step": 18990 }, { "epoch": 10.632344711807498, "grad_norm": 0.19486455619335175, "learning_rate": 6.819348298638839e-07, "loss": 0.0029, "step": 19000 }, { "epoch": 10.63794068270845, "grad_norm": 0.1525876224040985, "learning_rate": 6.683946435586952e-07, "loss": 0.0142, "step": 19010 }, { "epoch": 10.6435366536094, "grad_norm": 0.09059377759695053, "learning_rate": 6.549893279788277e-07, "loss": 0.0057, "step": 19020 }, { "epoch": 10.649132624510353, "grad_norm": 0.08628048002719879, "learning_rate": 6.417189197739093e-07, "loss": 0.0059, "step": 19030 }, { "epoch": 10.654728595411305, "grad_norm": 0.34853503108024597, "learning_rate": 6.285834552247128e-07, "loss": 0.0041, "step": 19040 }, { "epoch": 10.660324566312255, "grad_norm": 0.1580825001001358, "learning_rate": 6.15582970243117e-07, "loss": 0.0059, "step": 19050 }, { "epoch": 10.665920537213207, "grad_norm": 0.2064519226551056, "learning_rate": 6.027175003719354e-07, "loss": 0.0065, "step": 19060 }, { "epoch": 10.671516508114157, "grad_norm": 0.1656566709280014, "learning_rate": 5.899870807848762e-07, "loss": 0.0045, "step": 19070 }, { "epoch": 10.67711247901511, "grad_norm": 0.06346923857927322, "learning_rate": 5.773917462864264e-07, "loss": 0.0108, "step": 19080 }, { "epoch": 10.68270844991606, "grad_norm": 0.0746588408946991, "learning_rate": 5.64931531311741e-07, "loss": 0.0038, "step": 19090 }, { "epoch": 10.688304420817012, "grad_norm": 0.10566951334476471, "learning_rate": 5.526064699265753e-07, "loss": 0.0084, "step": 19100 }, { "epoch": 10.693900391717962, "grad_norm": 0.061587151139974594, "learning_rate": 5.404165958271811e-07, "loss": 0.0042, "step": 19110 }, { "epoch": 10.699496362618914, "grad_norm": 0.27593472599983215, "learning_rate": 5.283619423401998e-07, "loss": 0.005, "step": 19120 }, { "epoch": 10.705092333519866, "grad_norm": 0.37827596068382263, "learning_rate": 5.164425424226016e-07, "loss": 0.0068, "step": 19130 }, { "epoch": 10.710688304420817, "grad_norm": 0.2789309322834015, "learning_rate": 5.046584286615697e-07, "loss": 0.0054, "step": 19140 }, { "epoch": 10.716284275321769, "grad_norm": 0.08417310565710068, "learning_rate": 4.930096332744105e-07, "loss": 0.0043, "step": 19150 }, { "epoch": 10.72188024622272, "grad_norm": 0.13277283310890198, "learning_rate": 4.814961881085045e-07, "loss": 0.007, "step": 19160 }, { "epoch": 10.727476217123671, "grad_norm": 0.029057292267680168, "learning_rate": 4.701181246411501e-07, "loss": 0.0077, "step": 19170 }, { "epoch": 10.733072188024622, "grad_norm": 0.07132174074649811, "learning_rate": 4.5887547397955864e-07, "loss": 0.0044, "step": 19180 }, { "epoch": 10.738668158925574, "grad_norm": 0.05213991925120354, "learning_rate": 4.4776826686069305e-07, "loss": 0.0022, "step": 19190 }, { "epoch": 10.744264129826526, "grad_norm": 0.092039555311203, "learning_rate": 4.367965336512403e-07, "loss": 0.0032, "step": 19200 }, { "epoch": 10.749860100727476, "grad_norm": 0.17352578043937683, "learning_rate": 4.259603043475002e-07, "loss": 0.0064, "step": 19210 }, { "epoch": 10.755456071628428, "grad_norm": 0.15915948152542114, "learning_rate": 4.1525960857530243e-07, "loss": 0.0075, "step": 19220 }, { "epoch": 10.761052042529379, "grad_norm": 0.21297423541545868, "learning_rate": 4.0469447558995065e-07, "loss": 0.0057, "step": 19230 }, { "epoch": 10.76664801343033, "grad_norm": 0.17462663352489471, "learning_rate": 3.9426493427611177e-07, "loss": 0.0056, "step": 19240 }, { "epoch": 10.772243984331281, "grad_norm": 0.10657753050327301, "learning_rate": 3.839710131477492e-07, "loss": 0.0089, "step": 19250 }, { "epoch": 10.777839955232233, "grad_norm": 0.07254552841186523, "learning_rate": 3.738127403480507e-07, "loss": 0.003, "step": 19260 }, { "epoch": 10.783435926133183, "grad_norm": 0.27843359112739563, "learning_rate": 3.637901436493507e-07, "loss": 0.0067, "step": 19270 }, { "epoch": 10.789031897034135, "grad_norm": 0.17431190609931946, "learning_rate": 3.5390325045304706e-07, "loss": 0.0042, "step": 19280 }, { "epoch": 10.794627867935088, "grad_norm": 0.11761761456727982, "learning_rate": 3.441520877895288e-07, "loss": 0.0036, "step": 19290 }, { "epoch": 10.800223838836038, "grad_norm": 0.1055087074637413, "learning_rate": 3.3453668231809286e-07, "loss": 0.0049, "step": 19300 }, { "epoch": 10.80581980973699, "grad_norm": 0.05716053023934364, "learning_rate": 3.250570603268943e-07, "loss": 0.0057, "step": 19310 }, { "epoch": 10.81141578063794, "grad_norm": 0.06227661669254303, "learning_rate": 3.157132477328628e-07, "loss": 0.0047, "step": 19320 }, { "epoch": 10.817011751538892, "grad_norm": 0.07587496936321259, "learning_rate": 3.0650527008162513e-07, "loss": 0.0058, "step": 19330 }, { "epoch": 10.822607722439843, "grad_norm": 0.12384708225727081, "learning_rate": 2.9743315254743833e-07, "loss": 0.0044, "step": 19340 }, { "epoch": 10.828203693340795, "grad_norm": 0.130027636885643, "learning_rate": 2.8849691993311777e-07, "loss": 0.0048, "step": 19350 }, { "epoch": 10.833799664241745, "grad_norm": 0.03498604893684387, "learning_rate": 2.796965966699927e-07, "loss": 0.0076, "step": 19360 }, { "epoch": 10.839395635142697, "grad_norm": 0.06795532256364822, "learning_rate": 2.7103220681780615e-07, "loss": 0.0046, "step": 19370 }, { "epoch": 10.84499160604365, "grad_norm": 0.15649089217185974, "learning_rate": 2.625037740646763e-07, "loss": 0.0041, "step": 19380 }, { "epoch": 10.8505875769446, "grad_norm": 0.19872230291366577, "learning_rate": 2.5411132172700194e-07, "loss": 0.0045, "step": 19390 }, { "epoch": 10.856183547845552, "grad_norm": 0.1986837238073349, "learning_rate": 2.458548727494292e-07, "loss": 0.0034, "step": 19400 }, { "epoch": 10.861779518746502, "grad_norm": 0.34645870327949524, "learning_rate": 2.3773444970477955e-07, "loss": 0.0059, "step": 19410 }, { "epoch": 10.867375489647454, "grad_norm": 0.043271441012620926, "learning_rate": 2.2975007479397738e-07, "loss": 0.0042, "step": 19420 }, { "epoch": 10.872971460548404, "grad_norm": 0.10621374845504761, "learning_rate": 2.219017698460002e-07, "loss": 0.0107, "step": 19430 }, { "epoch": 10.878567431449357, "grad_norm": 0.038412097841501236, "learning_rate": 2.1418955631781202e-07, "loss": 0.0025, "step": 19440 }, { "epoch": 10.884163402350307, "grad_norm": 0.14375977218151093, "learning_rate": 2.0661345529430775e-07, "loss": 0.0063, "step": 19450 }, { "epoch": 10.889759373251259, "grad_norm": 0.28644490242004395, "learning_rate": 1.9917348748826335e-07, "loss": 0.0037, "step": 19460 }, { "epoch": 10.895355344152211, "grad_norm": 0.19371145963668823, "learning_rate": 1.918696732402636e-07, "loss": 0.0071, "step": 19470 }, { "epoch": 10.900951315053161, "grad_norm": 0.11907006055116653, "learning_rate": 1.847020325186577e-07, "loss": 0.0049, "step": 19480 }, { "epoch": 10.906547285954113, "grad_norm": 0.10020023584365845, "learning_rate": 1.776705849195037e-07, "loss": 0.0036, "step": 19490 }, { "epoch": 10.912143256855064, "grad_norm": 0.12778791785240173, "learning_rate": 1.7077534966650766e-07, "loss": 0.0057, "step": 19500 }, { "epoch": 10.917739227756016, "grad_norm": 0.06359223276376724, "learning_rate": 1.6401634561098444e-07, "loss": 0.0036, "step": 19510 }, { "epoch": 10.923335198656966, "grad_norm": 0.07983513921499252, "learning_rate": 1.5739359123178587e-07, "loss": 0.0037, "step": 19520 }, { "epoch": 10.928931169557918, "grad_norm": 0.12060696631669998, "learning_rate": 1.5090710463527836e-07, "loss": 0.0031, "step": 19530 }, { "epoch": 10.93452714045887, "grad_norm": 0.10252276062965393, "learning_rate": 1.4455690355525964e-07, "loss": 0.0052, "step": 19540 }, { "epoch": 10.94012311135982, "grad_norm": 0.10586907714605331, "learning_rate": 1.383430053529422e-07, "loss": 0.0025, "step": 19550 }, { "epoch": 10.945719082260773, "grad_norm": 0.05571618303656578, "learning_rate": 1.3226542701689215e-07, "loss": 0.0045, "step": 19560 }, { "epoch": 10.951315053161723, "grad_norm": 0.07698628306388855, "learning_rate": 1.2632418516296262e-07, "loss": 0.0039, "step": 19570 }, { "epoch": 10.956911024062675, "grad_norm": 0.3049318790435791, "learning_rate": 1.2051929603428825e-07, "loss": 0.0036, "step": 19580 }, { "epoch": 10.962506994963626, "grad_norm": 0.04247491434216499, "learning_rate": 1.1485077550122402e-07, "loss": 0.0086, "step": 19590 }, { "epoch": 10.968102965864578, "grad_norm": 0.13998843729496002, "learning_rate": 1.0931863906127327e-07, "loss": 0.0032, "step": 19600 }, { "epoch": 10.973698936765528, "grad_norm": 0.18532228469848633, "learning_rate": 1.0392290183909304e-07, "loss": 0.0053, "step": 19610 }, { "epoch": 10.97929490766648, "grad_norm": 0.24849370121955872, "learning_rate": 9.866357858642205e-08, "loss": 0.0031, "step": 19620 }, { "epoch": 10.984890878567432, "grad_norm": 0.04739070311188698, "learning_rate": 9.354068368204739e-08, "loss": 0.0055, "step": 19630 }, { "epoch": 10.990486849468383, "grad_norm": 0.13325341045856476, "learning_rate": 8.855423113177664e-08, "loss": 0.0027, "step": 19640 }, { "epoch": 10.996082820369335, "grad_norm": 0.15442515909671783, "learning_rate": 8.37042345683714e-08, "loss": 0.009, "step": 19650 }, { "epoch": 11.001678791270285, "grad_norm": 0.20657239854335785, "learning_rate": 7.899070725153613e-08, "loss": 0.0063, "step": 19660 }, { "epoch": 11.007274762171237, "grad_norm": 0.16029535233974457, "learning_rate": 7.44136620678848e-08, "loss": 0.0044, "step": 19670 }, { "epoch": 11.012870733072187, "grad_norm": 0.16476546227931976, "learning_rate": 6.997311153086883e-08, "loss": 0.0066, "step": 19680 }, { "epoch": 11.01846670397314, "grad_norm": 0.12683425843715668, "learning_rate": 6.566906778079917e-08, "loss": 0.0052, "step": 19690 }, { "epoch": 11.024062674874092, "grad_norm": 0.23135153949260712, "learning_rate": 6.150154258476315e-08, "loss": 0.0043, "step": 19700 }, { "epoch": 11.029658645775042, "grad_norm": 0.1939716786146164, "learning_rate": 5.747054733660773e-08, "loss": 0.0077, "step": 19710 }, { "epoch": 11.035254616675994, "grad_norm": 0.11450741440057755, "learning_rate": 5.3576093056922906e-08, "loss": 0.0079, "step": 19720 }, { "epoch": 11.040850587576944, "grad_norm": 0.06929726153612137, "learning_rate": 4.981819039300284e-08, "loss": 0.0039, "step": 19730 }, { "epoch": 11.046446558477896, "grad_norm": 0.11268885433673859, "learning_rate": 4.619684961881254e-08, "loss": 0.0047, "step": 19740 }, { "epoch": 11.052042529378847, "grad_norm": 0.07555661350488663, "learning_rate": 4.2712080634949024e-08, "loss": 0.0038, "step": 19750 }, { "epoch": 11.057638500279799, "grad_norm": 0.07180225849151611, "learning_rate": 3.936389296864129e-08, "loss": 0.0066, "step": 19760 }, { "epoch": 11.063234471180749, "grad_norm": 0.2635197937488556, "learning_rate": 3.615229577371149e-08, "loss": 0.0047, "step": 19770 }, { "epoch": 11.068830442081701, "grad_norm": 0.03527739644050598, "learning_rate": 3.3077297830541584e-08, "loss": 0.0047, "step": 19780 }, { "epoch": 11.074426412982653, "grad_norm": 0.061606280505657196, "learning_rate": 3.01389075460512e-08, "loss": 0.0069, "step": 19790 }, { "epoch": 11.080022383883604, "grad_norm": 0.14764872193336487, "learning_rate": 2.7337132953697554e-08, "loss": 0.0063, "step": 19800 }, { "epoch": 11.085618354784556, "grad_norm": 0.13825170695781708, "learning_rate": 2.467198171342e-08, "loss": 0.0047, "step": 19810 }, { "epoch": 11.091214325685506, "grad_norm": 0.40132373571395874, "learning_rate": 2.214346111164556e-08, "loss": 0.0058, "step": 19820 }, { "epoch": 11.096810296586458, "grad_norm": 0.06293044239282608, "learning_rate": 1.9751578061244504e-08, "loss": 0.0093, "step": 19830 }, { "epoch": 11.102406267487408, "grad_norm": 0.08641501516103745, "learning_rate": 1.749633910153592e-08, "loss": 0.0061, "step": 19840 }, { "epoch": 11.10800223838836, "grad_norm": 0.06543342024087906, "learning_rate": 1.5377750398265502e-08, "loss": 0.0034, "step": 19850 }, { "epoch": 11.11359820928931, "grad_norm": 0.0463268905878067, "learning_rate": 1.3395817743561134e-08, "loss": 0.0031, "step": 19860 }, { "epoch": 11.119194180190263, "grad_norm": 0.18889687955379486, "learning_rate": 1.1550546555960662e-08, "loss": 0.0049, "step": 19870 }, { "epoch": 11.124790151091215, "grad_norm": 0.33526870608329773, "learning_rate": 9.841941880361916e-09, "loss": 0.0068, "step": 19880 }, { "epoch": 11.130386121992165, "grad_norm": 0.17259934544563293, "learning_rate": 8.270008388022721e-09, "loss": 0.0047, "step": 19890 }, { "epoch": 11.135982092893117, "grad_norm": 0.24882031977176666, "learning_rate": 6.834750376549792e-09, "loss": 0.0061, "step": 19900 }, { "epoch": 11.141578063794068, "grad_norm": 0.05286456272006035, "learning_rate": 5.536171769887632e-09, "loss": 0.0059, "step": 19910 }, { "epoch": 11.14717403469502, "grad_norm": 0.08882560580968857, "learning_rate": 4.3742761183018784e-09, "loss": 0.0063, "step": 19920 }, { "epoch": 11.15277000559597, "grad_norm": 0.09571769833564758, "learning_rate": 3.349066598362649e-09, "loss": 0.0034, "step": 19930 }, { "epoch": 11.158365976496922, "grad_norm": 0.07795775681734085, "learning_rate": 2.4605460129556445e-09, "loss": 0.0029, "step": 19940 }, { "epoch": 11.163961947397874, "grad_norm": 0.07696644216775894, "learning_rate": 1.7087167912710478e-09, "loss": 0.0083, "step": 19950 }, { "epoch": 11.169557918298825, "grad_norm": 0.26498469710350037, "learning_rate": 1.0935809887702154e-09, "loss": 0.0033, "step": 19960 }, { "epoch": 11.175153889199777, "grad_norm": 0.165630042552948, "learning_rate": 6.151402872134337e-10, "loss": 0.007, "step": 19970 }, { "epoch": 11.180749860100727, "grad_norm": 0.07009857147932053, "learning_rate": 2.7339599464326627e-10, "loss": 0.0037, "step": 19980 }, { "epoch": 11.18634583100168, "grad_norm": 0.1754114180803299, "learning_rate": 6.834904537900144e-11, "loss": 0.0069, "step": 19990 }, { "epoch": 11.19194180190263, "grad_norm": 0.18103741109371185, "learning_rate": 0.0, "loss": 0.0044, "step": 20000 }, { "epoch": 11.19194180190263, "step": 20000, "total_flos": 7.091345386565736e+17, "train_loss": 0.01661205664295703, "train_runtime": 10761.0808, "train_samples_per_second": 29.737, "train_steps_per_second": 1.859 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.091345386565736e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }