diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5272 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 7470, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004017576898932831, + "grad_norm": 5.806783098701457, + "learning_rate": 1.204819277108434e-07, + "loss": 1.0662, + "step": 10 + }, + { + "epoch": 0.008035153797865662, + "grad_norm": 6.1737495782500815, + "learning_rate": 2.5435073627844717e-07, + "loss": 1.0418, + "step": 20 + }, + { + "epoch": 0.012052730696798493, + "grad_norm": 5.869188839514599, + "learning_rate": 3.8821954484605087e-07, + "loss": 1.0476, + "step": 30 + }, + { + "epoch": 0.016070307595731324, + "grad_norm": 4.067430650309112, + "learning_rate": 5.220883534136546e-07, + "loss": 1.0186, + "step": 40 + }, + { + "epoch": 0.020087884494664157, + "grad_norm": 1.9993360792772212, + "learning_rate": 6.559571619812584e-07, + "loss": 0.9457, + "step": 50 + }, + { + "epoch": 0.024105461393596987, + "grad_norm": 1.0897504400503495, + "learning_rate": 7.898259705488621e-07, + "loss": 0.8694, + "step": 60 + }, + { + "epoch": 0.02812303829252982, + "grad_norm": 0.8480278075524572, + "learning_rate": 9.236947791164659e-07, + "loss": 0.8453, + "step": 70 + }, + { + "epoch": 0.03214061519146265, + "grad_norm": 0.5847985812870466, + "learning_rate": 1.0575635876840697e-06, + "loss": 0.8091, + "step": 80 + }, + { + "epoch": 0.03615819209039548, + "grad_norm": 0.5632601125338154, + "learning_rate": 1.1914323962516733e-06, + "loss": 0.7981, + "step": 90 + }, + { + "epoch": 0.040175768989328314, + "grad_norm": 0.4998168225005524, + "learning_rate": 1.3253012048192773e-06, + "loss": 0.7646, + "step": 100 + }, + { + "epoch": 0.04419334588826114, + "grad_norm": 0.6903822114162349, + "learning_rate": 1.4591700133868811e-06, + "loss": 0.7619, + "step": 110 + }, + { + "epoch": 0.04821092278719397, + "grad_norm": 0.4815863955585346, + "learning_rate": 1.593038821954485e-06, + "loss": 0.7679, + "step": 120 + }, + { + "epoch": 0.052228499686126806, + "grad_norm": 0.4828754779519579, + "learning_rate": 1.7269076305220885e-06, + "loss": 0.7406, + "step": 130 + }, + { + "epoch": 0.05624607658505964, + "grad_norm": 0.5272829904340466, + "learning_rate": 1.8607764390896923e-06, + "loss": 0.7451, + "step": 140 + }, + { + "epoch": 0.060263653483992465, + "grad_norm": 0.48100813622874905, + "learning_rate": 1.994645247657296e-06, + "loss": 0.7053, + "step": 150 + }, + { + "epoch": 0.0642812303829253, + "grad_norm": 0.49305369454902975, + "learning_rate": 2.1285140562248997e-06, + "loss": 0.7217, + "step": 160 + }, + { + "epoch": 0.06829880728185812, + "grad_norm": 0.4711986293293931, + "learning_rate": 2.2623828647925037e-06, + "loss": 0.7368, + "step": 170 + }, + { + "epoch": 0.07231638418079096, + "grad_norm": 0.5025185766261334, + "learning_rate": 2.3962516733601073e-06, + "loss": 0.7179, + "step": 180 + }, + { + "epoch": 0.07633396107972379, + "grad_norm": 0.5264169172616818, + "learning_rate": 2.530120481927711e-06, + "loss": 0.7374, + "step": 190 + }, + { + "epoch": 0.08035153797865663, + "grad_norm": 0.46948548462020134, + "learning_rate": 2.6639892904953145e-06, + "loss": 0.72, + "step": 200 + }, + { + "epoch": 0.08436911487758945, + "grad_norm": 0.463248294175632, + "learning_rate": 2.7978580990629185e-06, + "loss": 0.7115, + "step": 210 + }, + { + "epoch": 0.08838669177652228, + "grad_norm": 0.45484667965794356, + "learning_rate": 2.931726907630522e-06, + "loss": 0.7034, + "step": 220 + }, + { + "epoch": 0.09240426867545512, + "grad_norm": 0.4357030076439862, + "learning_rate": 3.0655957161981257e-06, + "loss": 0.7126, + "step": 230 + }, + { + "epoch": 0.09642184557438795, + "grad_norm": 0.6453731431637157, + "learning_rate": 3.1994645247657297e-06, + "loss": 0.7097, + "step": 240 + }, + { + "epoch": 0.10043942247332077, + "grad_norm": 0.4849489606416127, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.7023, + "step": 250 + }, + { + "epoch": 0.10445699937225361, + "grad_norm": 0.42549798364358465, + "learning_rate": 3.4672021419009373e-06, + "loss": 0.6924, + "step": 260 + }, + { + "epoch": 0.10847457627118644, + "grad_norm": 0.4773000774187341, + "learning_rate": 3.601070950468541e-06, + "loss": 0.6968, + "step": 270 + }, + { + "epoch": 0.11249215317011928, + "grad_norm": 0.5240441729923269, + "learning_rate": 3.7349397590361445e-06, + "loss": 0.7045, + "step": 280 + }, + { + "epoch": 0.1165097300690521, + "grad_norm": 0.4910140491378646, + "learning_rate": 3.8688085676037485e-06, + "loss": 0.6933, + "step": 290 + }, + { + "epoch": 0.12052730696798493, + "grad_norm": 0.4580543272940571, + "learning_rate": 4.002677376171352e-06, + "loss": 0.6762, + "step": 300 + }, + { + "epoch": 0.12454488386691777, + "grad_norm": 0.4638793548670153, + "learning_rate": 4.136546184738956e-06, + "loss": 0.6807, + "step": 310 + }, + { + "epoch": 0.1285624607658506, + "grad_norm": 0.48226078090248237, + "learning_rate": 4.270414993306559e-06, + "loss": 0.6836, + "step": 320 + }, + { + "epoch": 0.13258003766478343, + "grad_norm": 0.5205992514175658, + "learning_rate": 4.404283801874164e-06, + "loss": 0.7006, + "step": 330 + }, + { + "epoch": 0.13659761456371625, + "grad_norm": 0.5157658854686866, + "learning_rate": 4.538152610441767e-06, + "loss": 0.6777, + "step": 340 + }, + { + "epoch": 0.1406151914626491, + "grad_norm": 0.5186893129183067, + "learning_rate": 4.672021419009371e-06, + "loss": 0.6863, + "step": 350 + }, + { + "epoch": 0.14463276836158193, + "grad_norm": 0.48228160242697454, + "learning_rate": 4.8058902275769745e-06, + "loss": 0.692, + "step": 360 + }, + { + "epoch": 0.14865034526051477, + "grad_norm": 0.48252742197899023, + "learning_rate": 4.939759036144578e-06, + "loss": 0.6866, + "step": 370 + }, + { + "epoch": 0.15266792215944758, + "grad_norm": 0.4457961570378318, + "learning_rate": 5.0736278447121826e-06, + "loss": 0.6786, + "step": 380 + }, + { + "epoch": 0.15668549905838042, + "grad_norm": 0.475655993068204, + "learning_rate": 5.207496653279787e-06, + "loss": 0.6899, + "step": 390 + }, + { + "epoch": 0.16070307595731326, + "grad_norm": 0.44137190174512886, + "learning_rate": 5.34136546184739e-06, + "loss": 0.6673, + "step": 400 + }, + { + "epoch": 0.16472065285624607, + "grad_norm": 0.49163743462647375, + "learning_rate": 5.475234270414994e-06, + "loss": 0.6696, + "step": 410 + }, + { + "epoch": 0.1687382297551789, + "grad_norm": 0.5274225683716718, + "learning_rate": 5.609103078982597e-06, + "loss": 0.6604, + "step": 420 + }, + { + "epoch": 0.17275580665411175, + "grad_norm": 0.4371201249615473, + "learning_rate": 5.742971887550201e-06, + "loss": 0.6623, + "step": 430 + }, + { + "epoch": 0.17677338355304456, + "grad_norm": 0.496866788031066, + "learning_rate": 5.876840696117805e-06, + "loss": 0.6733, + "step": 440 + }, + { + "epoch": 0.1807909604519774, + "grad_norm": 0.5029900924311191, + "learning_rate": 6.010709504685409e-06, + "loss": 0.6593, + "step": 450 + }, + { + "epoch": 0.18480853735091024, + "grad_norm": 0.5057790072232172, + "learning_rate": 6.144578313253012e-06, + "loss": 0.684, + "step": 460 + }, + { + "epoch": 0.18882611424984305, + "grad_norm": 1.210542334820464, + "learning_rate": 6.2784471218206166e-06, + "loss": 0.6743, + "step": 470 + }, + { + "epoch": 0.1928436911487759, + "grad_norm": 0.4979782025477802, + "learning_rate": 6.41231593038822e-06, + "loss": 0.6645, + "step": 480 + }, + { + "epoch": 0.19686126804770873, + "grad_norm": 0.4671679965356099, + "learning_rate": 6.546184738955825e-06, + "loss": 0.6593, + "step": 490 + }, + { + "epoch": 0.20087884494664154, + "grad_norm": 0.47107607146399, + "learning_rate": 6.680053547523427e-06, + "loss": 0.653, + "step": 500 + }, + { + "epoch": 0.20489642184557438, + "grad_norm": 0.5355141556215653, + "learning_rate": 6.813922356091032e-06, + "loss": 0.6555, + "step": 510 + }, + { + "epoch": 0.20891399874450722, + "grad_norm": 0.4770424360600091, + "learning_rate": 6.9477911646586345e-06, + "loss": 0.6426, + "step": 520 + }, + { + "epoch": 0.21293157564344006, + "grad_norm": 0.4799836242336395, + "learning_rate": 7.081659973226239e-06, + "loss": 0.6565, + "step": 530 + }, + { + "epoch": 0.21694915254237288, + "grad_norm": 0.5167694867415858, + "learning_rate": 7.2155287817938426e-06, + "loss": 0.6609, + "step": 540 + }, + { + "epoch": 0.22096672944130571, + "grad_norm": 0.5086826009179188, + "learning_rate": 7.349397590361447e-06, + "loss": 0.6494, + "step": 550 + }, + { + "epoch": 0.22498430634023855, + "grad_norm": 0.4922827776447584, + "learning_rate": 7.48326639892905e-06, + "loss": 0.6558, + "step": 560 + }, + { + "epoch": 0.22900188323917137, + "grad_norm": 0.48344938995436526, + "learning_rate": 7.617135207496654e-06, + "loss": 0.6481, + "step": 570 + }, + { + "epoch": 0.2330194601381042, + "grad_norm": 0.507502223485786, + "learning_rate": 7.751004016064258e-06, + "loss": 0.6529, + "step": 580 + }, + { + "epoch": 0.23703703703703705, + "grad_norm": 0.49027674360195406, + "learning_rate": 7.884872824631861e-06, + "loss": 0.6475, + "step": 590 + }, + { + "epoch": 0.24105461393596986, + "grad_norm": 0.47662111002737323, + "learning_rate": 8.018741633199465e-06, + "loss": 0.6606, + "step": 600 + }, + { + "epoch": 0.2450721908349027, + "grad_norm": 0.5178759139169029, + "learning_rate": 8.152610441767069e-06, + "loss": 0.6507, + "step": 610 + }, + { + "epoch": 0.24908976773383554, + "grad_norm": 0.4913437654692537, + "learning_rate": 8.286479250334672e-06, + "loss": 0.6676, + "step": 620 + }, + { + "epoch": 0.25310734463276835, + "grad_norm": 0.4983838425660273, + "learning_rate": 8.420348058902277e-06, + "loss": 0.6431, + "step": 630 + }, + { + "epoch": 0.2571249215317012, + "grad_norm": 0.5296861218593591, + "learning_rate": 8.55421686746988e-06, + "loss": 0.6469, + "step": 640 + }, + { + "epoch": 0.26114249843063403, + "grad_norm": 0.49470648462227873, + "learning_rate": 8.688085676037485e-06, + "loss": 0.653, + "step": 650 + }, + { + "epoch": 0.26516007532956687, + "grad_norm": 0.4977505266791689, + "learning_rate": 8.821954484605088e-06, + "loss": 0.6506, + "step": 660 + }, + { + "epoch": 0.2691776522284997, + "grad_norm": 0.5181442610979835, + "learning_rate": 8.955823293172692e-06, + "loss": 0.6389, + "step": 670 + }, + { + "epoch": 0.2731952291274325, + "grad_norm": 0.5401626680115751, + "learning_rate": 9.089692101740295e-06, + "loss": 0.644, + "step": 680 + }, + { + "epoch": 0.27721280602636533, + "grad_norm": 0.5294876220592581, + "learning_rate": 9.223560910307899e-06, + "loss": 0.6476, + "step": 690 + }, + { + "epoch": 0.2812303829252982, + "grad_norm": 0.5001749586085277, + "learning_rate": 9.357429718875503e-06, + "loss": 0.6469, + "step": 700 + }, + { + "epoch": 0.285247959824231, + "grad_norm": 0.49514793732187534, + "learning_rate": 9.491298527443106e-06, + "loss": 0.6412, + "step": 710 + }, + { + "epoch": 0.28926553672316385, + "grad_norm": 0.5451601544807843, + "learning_rate": 9.62516733601071e-06, + "loss": 0.6342, + "step": 720 + }, + { + "epoch": 0.2932831136220967, + "grad_norm": 0.5386128345367395, + "learning_rate": 9.759036144578315e-06, + "loss": 0.6422, + "step": 730 + }, + { + "epoch": 0.29730069052102953, + "grad_norm": 0.5295404573044942, + "learning_rate": 9.892904953145917e-06, + "loss": 0.6418, + "step": 740 + }, + { + "epoch": 0.3013182674199623, + "grad_norm": 0.48815626124299877, + "learning_rate": 9.999997816397962e-06, + "loss": 0.649, + "step": 750 + }, + { + "epoch": 0.30533584431889516, + "grad_norm": 0.5070827630467095, + "learning_rate": 9.999921390526839e-06, + "loss": 0.6453, + "step": 760 + }, + { + "epoch": 0.309353421217828, + "grad_norm": 0.5070143645241298, + "learning_rate": 9.999735786460982e-06, + "loss": 0.6302, + "step": 770 + }, + { + "epoch": 0.31337099811676083, + "grad_norm": 0.4790968559270035, + "learning_rate": 9.999441008253238e-06, + "loss": 0.632, + "step": 780 + }, + { + "epoch": 0.3173885750156937, + "grad_norm": 0.48852106728621275, + "learning_rate": 9.999037062340376e-06, + "loss": 0.6436, + "step": 790 + }, + { + "epoch": 0.3214061519146265, + "grad_norm": 0.4972110031536491, + "learning_rate": 9.998523957542955e-06, + "loss": 0.6411, + "step": 800 + }, + { + "epoch": 0.3254237288135593, + "grad_norm": 0.533109673228536, + "learning_rate": 9.997901705065118e-06, + "loss": 0.6422, + "step": 810 + }, + { + "epoch": 0.32944130571249214, + "grad_norm": 0.528721261444126, + "learning_rate": 9.997170318494362e-06, + "loss": 0.6457, + "step": 820 + }, + { + "epoch": 0.333458882611425, + "grad_norm": 0.7706123304017218, + "learning_rate": 9.996329813801233e-06, + "loss": 0.6479, + "step": 830 + }, + { + "epoch": 0.3374764595103578, + "grad_norm": 0.6304185980981377, + "learning_rate": 9.995380209338973e-06, + "loss": 0.639, + "step": 840 + }, + { + "epoch": 0.34149403640929066, + "grad_norm": 0.47993658132408695, + "learning_rate": 9.99432152584313e-06, + "loss": 0.6232, + "step": 850 + }, + { + "epoch": 0.3455116133082235, + "grad_norm": 0.5189489672799563, + "learning_rate": 9.993153786431098e-06, + "loss": 0.6457, + "step": 860 + }, + { + "epoch": 0.3495291902071563, + "grad_norm": 0.490298021772909, + "learning_rate": 9.991877016601612e-06, + "loss": 0.6489, + "step": 870 + }, + { + "epoch": 0.3535467671060891, + "grad_norm": 0.4733551505574991, + "learning_rate": 9.990491244234197e-06, + "loss": 0.6327, + "step": 880 + }, + { + "epoch": 0.35756434400502196, + "grad_norm": 0.4812850798556578, + "learning_rate": 9.988996499588556e-06, + "loss": 0.6325, + "step": 890 + }, + { + "epoch": 0.3615819209039548, + "grad_norm": 0.5139202954592238, + "learning_rate": 9.987392815303903e-06, + "loss": 0.6302, + "step": 900 + }, + { + "epoch": 0.36559949780288764, + "grad_norm": 0.4950874659702236, + "learning_rate": 9.985680226398261e-06, + "loss": 0.641, + "step": 910 + }, + { + "epoch": 0.3696170747018205, + "grad_norm": 0.5079828516156794, + "learning_rate": 9.98385877026769e-06, + "loss": 0.6384, + "step": 920 + }, + { + "epoch": 0.3736346516007533, + "grad_norm": 0.49658973438025256, + "learning_rate": 9.981928486685477e-06, + "loss": 0.6365, + "step": 930 + }, + { + "epoch": 0.3776522284996861, + "grad_norm": 0.46532869938151694, + "learning_rate": 9.979889417801257e-06, + "loss": 0.64, + "step": 940 + }, + { + "epoch": 0.38166980539861894, + "grad_norm": 0.5187436115265832, + "learning_rate": 9.9777416081401e-06, + "loss": 0.6268, + "step": 950 + }, + { + "epoch": 0.3856873822975518, + "grad_norm": 0.46664875039431214, + "learning_rate": 9.975485104601544e-06, + "loss": 0.6302, + "step": 960 + }, + { + "epoch": 0.3897049591964846, + "grad_norm": 0.47880441558880193, + "learning_rate": 9.973119956458558e-06, + "loss": 0.6238, + "step": 970 + }, + { + "epoch": 0.39372253609541746, + "grad_norm": 0.4691906979131477, + "learning_rate": 9.970646215356477e-06, + "loss": 0.6422, + "step": 980 + }, + { + "epoch": 0.3977401129943503, + "grad_norm": 0.5000299551978495, + "learning_rate": 9.968063935311865e-06, + "loss": 0.6329, + "step": 990 + }, + { + "epoch": 0.4017576898932831, + "grad_norm": 0.5484904742238748, + "learning_rate": 9.965373172711343e-06, + "loss": 0.6317, + "step": 1000 + }, + { + "epoch": 0.4057752667922159, + "grad_norm": 0.517325966851464, + "learning_rate": 9.96257398631036e-06, + "loss": 0.6404, + "step": 1010 + }, + { + "epoch": 0.40979284369114877, + "grad_norm": 0.4507627813502968, + "learning_rate": 9.959666437231895e-06, + "loss": 0.6303, + "step": 1020 + }, + { + "epoch": 0.4138104205900816, + "grad_norm": 0.5071861791885447, + "learning_rate": 9.95665058896514e-06, + "loss": 0.6135, + "step": 1030 + }, + { + "epoch": 0.41782799748901445, + "grad_norm": 0.45341474710439855, + "learning_rate": 9.953526507364106e-06, + "loss": 0.619, + "step": 1040 + }, + { + "epoch": 0.4218455743879473, + "grad_norm": 0.46797706523025123, + "learning_rate": 9.95029426064618e-06, + "loss": 0.6253, + "step": 1050 + }, + { + "epoch": 0.4258631512868801, + "grad_norm": 0.5018403383029635, + "learning_rate": 9.946953919390648e-06, + "loss": 0.6363, + "step": 1060 + }, + { + "epoch": 0.4298807281858129, + "grad_norm": 0.4563294653990798, + "learning_rate": 9.94350555653714e-06, + "loss": 0.6223, + "step": 1070 + }, + { + "epoch": 0.43389830508474575, + "grad_norm": 0.5142438260172338, + "learning_rate": 9.939949247384046e-06, + "loss": 0.636, + "step": 1080 + }, + { + "epoch": 0.4379158819836786, + "grad_norm": 0.4974669445120328, + "learning_rate": 9.93628506958687e-06, + "loss": 0.6242, + "step": 1090 + }, + { + "epoch": 0.44193345888261143, + "grad_norm": 0.5980709429827091, + "learning_rate": 9.932513103156532e-06, + "loss": 0.6408, + "step": 1100 + }, + { + "epoch": 0.44595103578154427, + "grad_norm": 0.5581732511219715, + "learning_rate": 9.928633430457628e-06, + "loss": 0.6139, + "step": 1110 + }, + { + "epoch": 0.4499686126804771, + "grad_norm": 0.5080904502728457, + "learning_rate": 9.924646136206617e-06, + "loss": 0.628, + "step": 1120 + }, + { + "epoch": 0.4539861895794099, + "grad_norm": 0.5539298313516415, + "learning_rate": 9.920551307469987e-06, + "loss": 0.6212, + "step": 1130 + }, + { + "epoch": 0.45800376647834273, + "grad_norm": 0.5135075682143369, + "learning_rate": 9.916349033662349e-06, + "loss": 0.6207, + "step": 1140 + }, + { + "epoch": 0.4620213433772756, + "grad_norm": 0.5431528274267999, + "learning_rate": 9.912039406544477e-06, + "loss": 0.6268, + "step": 1150 + }, + { + "epoch": 0.4660389202762084, + "grad_norm": 0.46293456683258105, + "learning_rate": 9.907622520221312e-06, + "loss": 0.6168, + "step": 1160 + }, + { + "epoch": 0.47005649717514125, + "grad_norm": 0.5265783067042777, + "learning_rate": 9.903098471139903e-06, + "loss": 0.611, + "step": 1170 + }, + { + "epoch": 0.4740740740740741, + "grad_norm": 0.4964165005454209, + "learning_rate": 9.89846735808731e-06, + "loss": 0.6209, + "step": 1180 + }, + { + "epoch": 0.47809165097300693, + "grad_norm": 0.44720379185402914, + "learning_rate": 9.893729282188433e-06, + "loss": 0.6274, + "step": 1190 + }, + { + "epoch": 0.4821092278719397, + "grad_norm": 0.5160429448873454, + "learning_rate": 9.888884346903813e-06, + "loss": 0.618, + "step": 1200 + }, + { + "epoch": 0.48612680477087256, + "grad_norm": 0.483511156492776, + "learning_rate": 9.883932658027374e-06, + "loss": 0.621, + "step": 1210 + }, + { + "epoch": 0.4901443816698054, + "grad_norm": 0.5158419978797311, + "learning_rate": 9.8788743236841e-06, + "loss": 0.6295, + "step": 1220 + }, + { + "epoch": 0.49416195856873824, + "grad_norm": 0.5784261169598502, + "learning_rate": 9.873709454327697e-06, + "loss": 0.6215, + "step": 1230 + }, + { + "epoch": 0.4981795354676711, + "grad_norm": 0.5198565153146563, + "learning_rate": 9.868438162738154e-06, + "loss": 0.6264, + "step": 1240 + }, + { + "epoch": 0.5021971123666039, + "grad_norm": 0.510434585062711, + "learning_rate": 9.863060564019305e-06, + "loss": 0.6149, + "step": 1250 + }, + { + "epoch": 0.5062146892655367, + "grad_norm": 0.4593962120443787, + "learning_rate": 9.8575767755963e-06, + "loss": 0.625, + "step": 1260 + }, + { + "epoch": 0.5102322661644696, + "grad_norm": 0.498172586068187, + "learning_rate": 9.851986917213044e-06, + "loss": 0.6143, + "step": 1270 + }, + { + "epoch": 0.5142498430634024, + "grad_norm": 0.4620438331915131, + "learning_rate": 9.846291110929586e-06, + "loss": 0.6313, + "step": 1280 + }, + { + "epoch": 0.5182674199623352, + "grad_norm": 0.5343634450415627, + "learning_rate": 9.840489481119452e-06, + "loss": 0.6182, + "step": 1290 + }, + { + "epoch": 0.5222849968612681, + "grad_norm": 0.48909140457697287, + "learning_rate": 9.834582154466927e-06, + "loss": 0.6325, + "step": 1300 + }, + { + "epoch": 0.5263025737602008, + "grad_norm": 0.4678287985367856, + "learning_rate": 9.828569259964291e-06, + "loss": 0.6307, + "step": 1310 + }, + { + "epoch": 0.5303201506591337, + "grad_norm": 0.4920124660634079, + "learning_rate": 9.822450928909e-06, + "loss": 0.6108, + "step": 1320 + }, + { + "epoch": 0.5343377275580665, + "grad_norm": 1.4526222205367336, + "learning_rate": 9.816227294900822e-06, + "loss": 0.6213, + "step": 1330 + }, + { + "epoch": 0.5383553044569994, + "grad_norm": 0.471616985395006, + "learning_rate": 9.809898493838923e-06, + "loss": 0.6169, + "step": 1340 + }, + { + "epoch": 0.5423728813559322, + "grad_norm": 0.4421867425241258, + "learning_rate": 9.803464663918886e-06, + "loss": 0.6093, + "step": 1350 + }, + { + "epoch": 0.546390458254865, + "grad_norm": 0.5351347284551249, + "learning_rate": 9.796925945629711e-06, + "loss": 0.6143, + "step": 1360 + }, + { + "epoch": 0.5504080351537979, + "grad_norm": 0.4877248363865589, + "learning_rate": 9.79028248175073e-06, + "loss": 0.6192, + "step": 1370 + }, + { + "epoch": 0.5544256120527307, + "grad_norm": 0.5009708952846684, + "learning_rate": 9.783534417348507e-06, + "loss": 0.6143, + "step": 1380 + }, + { + "epoch": 0.5584431889516636, + "grad_norm": 0.5178945086806888, + "learning_rate": 9.776681899773652e-06, + "loss": 0.6205, + "step": 1390 + }, + { + "epoch": 0.5624607658505963, + "grad_norm": 0.4762670817917884, + "learning_rate": 9.769725078657622e-06, + "loss": 0.6173, + "step": 1400 + }, + { + "epoch": 0.5664783427495292, + "grad_norm": 0.4613547006315256, + "learning_rate": 9.762664105909434e-06, + "loss": 0.6251, + "step": 1410 + }, + { + "epoch": 0.570495919648462, + "grad_norm": 0.4702754249747577, + "learning_rate": 9.755499135712368e-06, + "loss": 0.6183, + "step": 1420 + }, + { + "epoch": 0.5745134965473948, + "grad_norm": 0.5373232333646776, + "learning_rate": 9.748230324520585e-06, + "loss": 0.6132, + "step": 1430 + }, + { + "epoch": 0.5785310734463277, + "grad_norm": 0.5626349604991109, + "learning_rate": 9.740857831055715e-06, + "loss": 0.621, + "step": 1440 + }, + { + "epoch": 0.5825486503452605, + "grad_norm": 0.5658213472247587, + "learning_rate": 9.733381816303395e-06, + "loss": 0.6138, + "step": 1450 + }, + { + "epoch": 0.5865662272441934, + "grad_norm": 0.45081662608826584, + "learning_rate": 9.725802443509753e-06, + "loss": 0.616, + "step": 1460 + }, + { + "epoch": 0.5905838041431262, + "grad_norm": 0.44740848559688695, + "learning_rate": 9.718119878177837e-06, + "loss": 0.6129, + "step": 1470 + }, + { + "epoch": 0.5946013810420591, + "grad_norm": 0.5237403744825302, + "learning_rate": 9.710334288064007e-06, + "loss": 0.6136, + "step": 1480 + }, + { + "epoch": 0.5986189579409918, + "grad_norm": 0.48924900162227497, + "learning_rate": 9.702445843174274e-06, + "loss": 0.6196, + "step": 1490 + }, + { + "epoch": 0.6026365348399246, + "grad_norm": 0.49368923400575526, + "learning_rate": 9.694454715760573e-06, + "loss": 0.6187, + "step": 1500 + }, + { + "epoch": 0.6066541117388575, + "grad_norm": 0.487126510015843, + "learning_rate": 9.686361080317029e-06, + "loss": 0.6172, + "step": 1510 + }, + { + "epoch": 0.6106716886377903, + "grad_norm": 0.45477094460777745, + "learning_rate": 9.678165113576114e-06, + "loss": 0.6056, + "step": 1520 + }, + { + "epoch": 0.6146892655367232, + "grad_norm": 0.49141680431275075, + "learning_rate": 9.669866994504818e-06, + "loss": 0.6043, + "step": 1530 + }, + { + "epoch": 0.618706842435656, + "grad_norm": 0.7376167208477882, + "learning_rate": 9.66146690430072e-06, + "loss": 0.6208, + "step": 1540 + }, + { + "epoch": 0.6227244193345888, + "grad_norm": 0.4931059951127201, + "learning_rate": 9.652965026388039e-06, + "loss": 0.6097, + "step": 1550 + }, + { + "epoch": 0.6267419962335217, + "grad_norm": 0.48305824703353156, + "learning_rate": 9.644361546413635e-06, + "loss": 0.6081, + "step": 1560 + }, + { + "epoch": 0.6307595731324545, + "grad_norm": 0.46866086349351754, + "learning_rate": 9.635656652242938e-06, + "loss": 0.6187, + "step": 1570 + }, + { + "epoch": 0.6347771500313873, + "grad_norm": 0.48653033771244636, + "learning_rate": 9.626850533955864e-06, + "loss": 0.6039, + "step": 1580 + }, + { + "epoch": 0.6387947269303201, + "grad_norm": 0.4839769721956582, + "learning_rate": 9.617943383842659e-06, + "loss": 0.617, + "step": 1590 + }, + { + "epoch": 0.642812303829253, + "grad_norm": 0.4887686378157136, + "learning_rate": 9.608935396399692e-06, + "loss": 0.6043, + "step": 1600 + }, + { + "epoch": 0.6468298807281858, + "grad_norm": 0.46040947244346264, + "learning_rate": 9.599826768325218e-06, + "loss": 0.6088, + "step": 1610 + }, + { + "epoch": 0.6508474576271186, + "grad_norm": 0.4882314027569112, + "learning_rate": 9.590617698515077e-06, + "loss": 0.6084, + "step": 1620 + }, + { + "epoch": 0.6548650345260515, + "grad_norm": 0.46985181649935615, + "learning_rate": 9.581308388058354e-06, + "loss": 0.6029, + "step": 1630 + }, + { + "epoch": 0.6588826114249843, + "grad_norm": 0.48794194189773543, + "learning_rate": 9.571899040232989e-06, + "loss": 0.6088, + "step": 1640 + }, + { + "epoch": 0.6629001883239172, + "grad_norm": 0.46766933522748133, + "learning_rate": 9.56238986050133e-06, + "loss": 0.6149, + "step": 1650 + }, + { + "epoch": 0.66691776522285, + "grad_norm": 0.48282374325088395, + "learning_rate": 9.552781056505662e-06, + "loss": 0.6101, + "step": 1660 + }, + { + "epoch": 0.6709353421217829, + "grad_norm": 0.5292742570379373, + "learning_rate": 9.543072838063655e-06, + "loss": 0.6128, + "step": 1670 + }, + { + "epoch": 0.6749529190207156, + "grad_norm": 0.5260262320491007, + "learning_rate": 9.533265417163793e-06, + "loss": 0.6234, + "step": 1680 + }, + { + "epoch": 0.6789704959196484, + "grad_norm": 0.48584608333045604, + "learning_rate": 9.523359007960748e-06, + "loss": 0.6116, + "step": 1690 + }, + { + "epoch": 0.6829880728185813, + "grad_norm": 0.4743066347549614, + "learning_rate": 9.513353826770695e-06, + "loss": 0.5959, + "step": 1700 + }, + { + "epoch": 0.6870056497175141, + "grad_norm": 0.4856299177120577, + "learning_rate": 9.503250092066592e-06, + "loss": 0.6204, + "step": 1710 + }, + { + "epoch": 0.691023226616447, + "grad_norm": 0.5046231694012181, + "learning_rate": 9.493048024473413e-06, + "loss": 0.6126, + "step": 1720 + }, + { + "epoch": 0.6950408035153798, + "grad_norm": 0.4928237045269149, + "learning_rate": 9.48274784676332e-06, + "loss": 0.6089, + "step": 1730 + }, + { + "epoch": 0.6990583804143126, + "grad_norm": 0.4766754598686215, + "learning_rate": 9.472349783850815e-06, + "loss": 0.6061, + "step": 1740 + }, + { + "epoch": 0.7030759573132455, + "grad_norm": 0.4631337703939171, + "learning_rate": 9.461854062787812e-06, + "loss": 0.6121, + "step": 1750 + }, + { + "epoch": 0.7070935342121782, + "grad_norm": 0.4961256195106074, + "learning_rate": 9.451260912758695e-06, + "loss": 0.6037, + "step": 1760 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.511885819703456, + "learning_rate": 9.440570565075295e-06, + "loss": 0.6145, + "step": 1770 + }, + { + "epoch": 0.7151286880100439, + "grad_norm": 0.48629463779046733, + "learning_rate": 9.429783253171855e-06, + "loss": 0.5966, + "step": 1780 + }, + { + "epoch": 0.7191462649089768, + "grad_norm": 0.4792036473525835, + "learning_rate": 9.418899212599928e-06, + "loss": 0.608, + "step": 1790 + }, + { + "epoch": 0.7231638418079096, + "grad_norm": 0.5083453945346657, + "learning_rate": 9.407918681023229e-06, + "loss": 0.6095, + "step": 1800 + }, + { + "epoch": 0.7271814187068424, + "grad_norm": 0.4685060218692666, + "learning_rate": 9.396841898212452e-06, + "loss": 0.6111, + "step": 1810 + }, + { + "epoch": 0.7311989956057753, + "grad_norm": 0.5007535253551997, + "learning_rate": 9.38566910604003e-06, + "loss": 0.6175, + "step": 1820 + }, + { + "epoch": 0.7352165725047081, + "grad_norm": 0.4721707017133051, + "learning_rate": 9.374400548474853e-06, + "loss": 0.6105, + "step": 1830 + }, + { + "epoch": 0.739234149403641, + "grad_norm": 0.45194793890207013, + "learning_rate": 9.363036471576945e-06, + "loss": 0.5976, + "step": 1840 + }, + { + "epoch": 0.7432517263025737, + "grad_norm": 0.48023710070315084, + "learning_rate": 9.351577123492087e-06, + "loss": 0.606, + "step": 1850 + }, + { + "epoch": 0.7472693032015066, + "grad_norm": 0.4562292402628347, + "learning_rate": 9.3400227544464e-06, + "loss": 0.6204, + "step": 1860 + }, + { + "epoch": 0.7512868801004394, + "grad_norm": 0.5199018316758173, + "learning_rate": 9.328373616740884e-06, + "loss": 0.6061, + "step": 1870 + }, + { + "epoch": 0.7553044569993722, + "grad_norm": 0.45403478079118165, + "learning_rate": 9.3166299647459e-06, + "loss": 0.5977, + "step": 1880 + }, + { + "epoch": 0.7593220338983051, + "grad_norm": 0.48026121094218754, + "learning_rate": 9.304792054895627e-06, + "loss": 0.6046, + "step": 1890 + }, + { + "epoch": 0.7633396107972379, + "grad_norm": 0.5273730174734627, + "learning_rate": 9.292860145682451e-06, + "loss": 0.6016, + "step": 1900 + }, + { + "epoch": 0.7673571876961708, + "grad_norm": 0.45635254897692096, + "learning_rate": 9.280834497651334e-06, + "loss": 0.6049, + "step": 1910 + }, + { + "epoch": 0.7713747645951036, + "grad_norm": 0.4952893916325239, + "learning_rate": 9.26871537339411e-06, + "loss": 0.6108, + "step": 1920 + }, + { + "epoch": 0.7753923414940365, + "grad_norm": 0.5020795358241874, + "learning_rate": 9.25650303754376e-06, + "loss": 0.6066, + "step": 1930 + }, + { + "epoch": 0.7794099183929692, + "grad_norm": 0.4519568302601775, + "learning_rate": 9.244197756768638e-06, + "loss": 0.6048, + "step": 1940 + }, + { + "epoch": 0.783427495291902, + "grad_norm": 0.4730934017356477, + "learning_rate": 9.231799799766633e-06, + "loss": 0.6205, + "step": 1950 + }, + { + "epoch": 0.7874450721908349, + "grad_norm": 0.43205824907881557, + "learning_rate": 9.219309437259312e-06, + "loss": 0.6094, + "step": 1960 + }, + { + "epoch": 0.7914626490897677, + "grad_norm": 0.49712695410471086, + "learning_rate": 9.206726941986012e-06, + "loss": 0.6177, + "step": 1970 + }, + { + "epoch": 0.7954802259887006, + "grad_norm": 0.5220660443409905, + "learning_rate": 9.194052588697877e-06, + "loss": 0.6101, + "step": 1980 + }, + { + "epoch": 0.7994978028876334, + "grad_norm": 0.5154062082113726, + "learning_rate": 9.18128665415186e-06, + "loss": 0.5928, + "step": 1990 + }, + { + "epoch": 0.8035153797865662, + "grad_norm": 0.466497275783982, + "learning_rate": 9.16842941710468e-06, + "loss": 0.5976, + "step": 2000 + }, + { + "epoch": 0.8075329566854991, + "grad_norm": 0.4710841642629808, + "learning_rate": 9.155481158306736e-06, + "loss": 0.5989, + "step": 2010 + }, + { + "epoch": 0.8115505335844319, + "grad_norm": 0.5011683049021086, + "learning_rate": 9.142442160495981e-06, + "loss": 0.602, + "step": 2020 + }, + { + "epoch": 0.8155681104833647, + "grad_norm": 0.48222917749993743, + "learning_rate": 9.129312708391735e-06, + "loss": 0.5991, + "step": 2030 + }, + { + "epoch": 0.8195856873822975, + "grad_norm": 0.5023833093181953, + "learning_rate": 9.116093088688486e-06, + "loss": 0.603, + "step": 2040 + }, + { + "epoch": 0.8236032642812304, + "grad_norm": 0.5057506213682262, + "learning_rate": 9.102783590049613e-06, + "loss": 0.6074, + "step": 2050 + }, + { + "epoch": 0.8276208411801632, + "grad_norm": 0.4465714041839266, + "learning_rate": 9.08938450310109e-06, + "loss": 0.6117, + "step": 2060 + }, + { + "epoch": 0.831638418079096, + "grad_norm": 0.45774188769330276, + "learning_rate": 9.075896120425144e-06, + "loss": 0.5982, + "step": 2070 + }, + { + "epoch": 0.8356559949780289, + "grad_norm": 0.4956125400496556, + "learning_rate": 9.06231873655386e-06, + "loss": 0.6131, + "step": 2080 + }, + { + "epoch": 0.8396735718769617, + "grad_norm": 0.503936504338912, + "learning_rate": 9.04865264796275e-06, + "loss": 0.6067, + "step": 2090 + }, + { + "epoch": 0.8436911487758946, + "grad_norm": 0.4966401695744208, + "learning_rate": 9.034898153064281e-06, + "loss": 0.5982, + "step": 2100 + }, + { + "epoch": 0.8477087256748274, + "grad_norm": 0.48606930633171735, + "learning_rate": 9.021055552201364e-06, + "loss": 0.6015, + "step": 2110 + }, + { + "epoch": 0.8517263025737603, + "grad_norm": 0.5102880215310355, + "learning_rate": 9.00712514764078e-06, + "loss": 0.6084, + "step": 2120 + }, + { + "epoch": 0.855743879472693, + "grad_norm": 0.5874497994476533, + "learning_rate": 8.993107243566599e-06, + "loss": 0.6014, + "step": 2130 + }, + { + "epoch": 0.8597614563716258, + "grad_norm": 0.45526275583074516, + "learning_rate": 8.979002146073526e-06, + "loss": 0.6047, + "step": 2140 + }, + { + "epoch": 0.8637790332705587, + "grad_norm": 0.44057367739611536, + "learning_rate": 8.964810163160218e-06, + "loss": 0.6023, + "step": 2150 + }, + { + "epoch": 0.8677966101694915, + "grad_norm": 0.47276989533109426, + "learning_rate": 8.95053160472256e-06, + "loss": 0.5996, + "step": 2160 + }, + { + "epoch": 0.8718141870684244, + "grad_norm": 0.47310493449075497, + "learning_rate": 8.936166782546907e-06, + "loss": 0.6053, + "step": 2170 + }, + { + "epoch": 0.8758317639673572, + "grad_norm": 0.4417798231937385, + "learning_rate": 8.921716010303255e-06, + "loss": 0.6075, + "step": 2180 + }, + { + "epoch": 0.87984934086629, + "grad_norm": 0.47514268261185605, + "learning_rate": 8.907179603538411e-06, + "loss": 0.5892, + "step": 2190 + }, + { + "epoch": 0.8838669177652229, + "grad_norm": 0.4830403204501155, + "learning_rate": 8.892557879669097e-06, + "loss": 0.5962, + "step": 2200 + }, + { + "epoch": 0.8878844946641556, + "grad_norm": 0.46322055953346064, + "learning_rate": 8.877851157975017e-06, + "loss": 0.6027, + "step": 2210 + }, + { + "epoch": 0.8919020715630885, + "grad_norm": 0.4722416444611542, + "learning_rate": 8.86305975959188e-06, + "loss": 0.5949, + "step": 2220 + }, + { + "epoch": 0.8959196484620213, + "grad_norm": 0.5039262176010645, + "learning_rate": 8.848184007504404e-06, + "loss": 0.5983, + "step": 2230 + }, + { + "epoch": 0.8999372253609542, + "grad_norm": 0.5011682351198766, + "learning_rate": 8.833224226539246e-06, + "loss": 0.5902, + "step": 2240 + }, + { + "epoch": 0.903954802259887, + "grad_norm": 0.46884988549973994, + "learning_rate": 8.818180743357915e-06, + "loss": 0.6043, + "step": 2250 + }, + { + "epoch": 0.9079723791588198, + "grad_norm": 0.45627355824791144, + "learning_rate": 8.803053886449644e-06, + "loss": 0.609, + "step": 2260 + }, + { + "epoch": 0.9119899560577527, + "grad_norm": 0.4489855391892701, + "learning_rate": 8.787843986124214e-06, + "loss": 0.5945, + "step": 2270 + }, + { + "epoch": 0.9160075329566855, + "grad_norm": 0.5195815180128369, + "learning_rate": 8.772551374504736e-06, + "loss": 0.6032, + "step": 2280 + }, + { + "epoch": 0.9200251098556184, + "grad_norm": 0.4547182251787525, + "learning_rate": 8.757176385520406e-06, + "loss": 0.6071, + "step": 2290 + }, + { + "epoch": 0.9240426867545511, + "grad_norm": 0.4672155983783131, + "learning_rate": 8.741719354899214e-06, + "loss": 0.6026, + "step": 2300 + }, + { + "epoch": 0.928060263653484, + "grad_norm": 0.44811195796882736, + "learning_rate": 8.7261806201606e-06, + "loss": 0.5903, + "step": 2310 + }, + { + "epoch": 0.9320778405524168, + "grad_norm": 0.4609033648332187, + "learning_rate": 8.710560520608106e-06, + "loss": 0.5954, + "step": 2320 + }, + { + "epoch": 0.9360954174513496, + "grad_norm": 0.5031025381027067, + "learning_rate": 8.694859397321947e-06, + "loss": 0.5971, + "step": 2330 + }, + { + "epoch": 0.9401129943502825, + "grad_norm": 0.45508717131932036, + "learning_rate": 8.67907759315157e-06, + "loss": 0.6009, + "step": 2340 + }, + { + "epoch": 0.9441305712492153, + "grad_norm": 0.46492655160451346, + "learning_rate": 8.663215452708173e-06, + "loss": 0.5971, + "step": 2350 + }, + { + "epoch": 0.9481481481481482, + "grad_norm": 0.4891914193609098, + "learning_rate": 8.647273322357174e-06, + "loss": 0.5854, + "step": 2360 + }, + { + "epoch": 0.952165725047081, + "grad_norm": 0.45300916670077845, + "learning_rate": 8.631251550210645e-06, + "loss": 0.6073, + "step": 2370 + }, + { + "epoch": 0.9561833019460139, + "grad_norm": 0.46574627999413143, + "learning_rate": 8.61515048611972e-06, + "loss": 0.5973, + "step": 2380 + }, + { + "epoch": 0.9602008788449466, + "grad_norm": 0.46780579054245386, + "learning_rate": 8.598970481666949e-06, + "loss": 0.5903, + "step": 2390 + }, + { + "epoch": 0.9642184557438794, + "grad_norm": 0.49368155945672554, + "learning_rate": 8.582711890158622e-06, + "loss": 0.5918, + "step": 2400 + }, + { + "epoch": 0.9682360326428123, + "grad_norm": 0.4981441902973779, + "learning_rate": 8.566375066617056e-06, + "loss": 0.5849, + "step": 2410 + }, + { + "epoch": 0.9722536095417451, + "grad_norm": 0.4940426996715437, + "learning_rate": 8.549960367772836e-06, + "loss": 0.5983, + "step": 2420 + }, + { + "epoch": 0.976271186440678, + "grad_norm": 0.4785729651530905, + "learning_rate": 8.533468152057037e-06, + "loss": 0.5886, + "step": 2430 + }, + { + "epoch": 0.9802887633396108, + "grad_norm": 0.47987909811753693, + "learning_rate": 8.51689877959339e-06, + "loss": 0.5934, + "step": 2440 + }, + { + "epoch": 0.9843063402385436, + "grad_norm": 0.5163558607429957, + "learning_rate": 8.500252612190416e-06, + "loss": 0.5996, + "step": 2450 + }, + { + "epoch": 0.9883239171374765, + "grad_norm": 0.51101675304883, + "learning_rate": 8.48353001333353e-06, + "loss": 0.5914, + "step": 2460 + }, + { + "epoch": 0.9923414940364093, + "grad_norm": 0.5029778228075064, + "learning_rate": 8.466731348177106e-06, + "loss": 0.5941, + "step": 2470 + }, + { + "epoch": 0.9963590709353422, + "grad_norm": 0.4522455217876264, + "learning_rate": 8.4498569835365e-06, + "loss": 0.597, + "step": 2480 + }, + { + "epoch": 1.0, + "grad_norm": 0.4817425446921033, + "learning_rate": 8.432907287880033e-06, + "loss": 0.6044, + "step": 2490 + }, + { + "epoch": 1.0040175768989328, + "grad_norm": 0.5586942369932902, + "learning_rate": 8.415882631320963e-06, + "loss": 0.5356, + "step": 2500 + }, + { + "epoch": 1.0080351537978656, + "grad_norm": 0.4841445380910929, + "learning_rate": 8.398783385609386e-06, + "loss": 0.5324, + "step": 2510 + }, + { + "epoch": 1.0120527306967986, + "grad_norm": 0.5199870071061111, + "learning_rate": 8.38160992412413e-06, + "loss": 0.5408, + "step": 2520 + }, + { + "epoch": 1.0160703075957314, + "grad_norm": 0.4803423902226154, + "learning_rate": 8.364362621864595e-06, + "loss": 0.5454, + "step": 2530 + }, + { + "epoch": 1.0200878844946641, + "grad_norm": 0.5170354608787521, + "learning_rate": 8.347041855442565e-06, + "loss": 0.5438, + "step": 2540 + }, + { + "epoch": 1.024105461393597, + "grad_norm": 0.48879934808926573, + "learning_rate": 8.329648003073991e-06, + "loss": 0.5409, + "step": 2550 + }, + { + "epoch": 1.0281230382925297, + "grad_norm": 0.5352031093199895, + "learning_rate": 8.312181444570722e-06, + "loss": 0.5379, + "step": 2560 + }, + { + "epoch": 1.0321406151914627, + "grad_norm": 0.503404555579561, + "learning_rate": 8.29464256133222e-06, + "loss": 0.5339, + "step": 2570 + }, + { + "epoch": 1.0361581920903955, + "grad_norm": 0.5412849211631844, + "learning_rate": 8.277031736337229e-06, + "loss": 0.537, + "step": 2580 + }, + { + "epoch": 1.0401757689893283, + "grad_norm": 0.49624211963806875, + "learning_rate": 8.259349354135408e-06, + "loss": 0.5365, + "step": 2590 + }, + { + "epoch": 1.044193345888261, + "grad_norm": 0.47456260927212846, + "learning_rate": 8.241595800838945e-06, + "loss": 0.5331, + "step": 2600 + }, + { + "epoch": 1.048210922787194, + "grad_norm": 0.5253188216147088, + "learning_rate": 8.223771464114114e-06, + "loss": 0.5407, + "step": 2610 + }, + { + "epoch": 1.0522284996861269, + "grad_norm": 0.48560865787460045, + "learning_rate": 8.205876733172813e-06, + "loss": 0.5358, + "step": 2620 + }, + { + "epoch": 1.0562460765850596, + "grad_norm": 0.4552759788335426, + "learning_rate": 8.187911998764073e-06, + "loss": 0.5383, + "step": 2630 + }, + { + "epoch": 1.0602636534839924, + "grad_norm": 0.5238408302939632, + "learning_rate": 8.169877653165512e-06, + "loss": 0.5432, + "step": 2640 + }, + { + "epoch": 1.0642812303829252, + "grad_norm": 0.5143828197718291, + "learning_rate": 8.15177409017478e-06, + "loss": 0.5449, + "step": 2650 + }, + { + "epoch": 1.0682988072818582, + "grad_norm": 0.5003350510607426, + "learning_rate": 8.13360170510096e-06, + "loss": 0.5379, + "step": 2660 + }, + { + "epoch": 1.072316384180791, + "grad_norm": 0.4862183359830462, + "learning_rate": 8.115360894755928e-06, + "loss": 0.5313, + "step": 2670 + }, + { + "epoch": 1.0763339610797238, + "grad_norm": 0.525472490342403, + "learning_rate": 8.097052057445696e-06, + "loss": 0.5324, + "step": 2680 + }, + { + "epoch": 1.0803515379786566, + "grad_norm": 0.4609085774775871, + "learning_rate": 8.07867559296171e-06, + "loss": 0.5339, + "step": 2690 + }, + { + "epoch": 1.0843691148775894, + "grad_norm": 0.47498380053763667, + "learning_rate": 8.060231902572123e-06, + "loss": 0.5416, + "step": 2700 + }, + { + "epoch": 1.0883866917765224, + "grad_norm": 0.4774259144620562, + "learning_rate": 8.041721389013029e-06, + "loss": 0.5315, + "step": 2710 + }, + { + "epoch": 1.0924042686754551, + "grad_norm": 0.4535031254083697, + "learning_rate": 8.023144456479677e-06, + "loss": 0.5337, + "step": 2720 + }, + { + "epoch": 1.096421845574388, + "grad_norm": 0.45369844007905547, + "learning_rate": 8.004501510617631e-06, + "loss": 0.5286, + "step": 2730 + }, + { + "epoch": 1.1004394224733207, + "grad_norm": 0.47676753007555456, + "learning_rate": 7.985792958513932e-06, + "loss": 0.5316, + "step": 2740 + }, + { + "epoch": 1.1044569993722537, + "grad_norm": 0.49166946653996263, + "learning_rate": 7.967019208688187e-06, + "loss": 0.534, + "step": 2750 + }, + { + "epoch": 1.1084745762711865, + "grad_norm": 0.47818247256990665, + "learning_rate": 7.948180671083665e-06, + "loss": 0.5372, + "step": 2760 + }, + { + "epoch": 1.1124921531701193, + "grad_norm": 0.49991506616495146, + "learning_rate": 7.92927775705834e-06, + "loss": 0.5497, + "step": 2770 + }, + { + "epoch": 1.116509730069052, + "grad_norm": 0.5015448494254134, + "learning_rate": 7.910310879375906e-06, + "loss": 0.5335, + "step": 2780 + }, + { + "epoch": 1.1205273069679849, + "grad_norm": 0.5009679314406517, + "learning_rate": 7.891280452196767e-06, + "loss": 0.5349, + "step": 2790 + }, + { + "epoch": 1.1245448838669179, + "grad_norm": 0.46077740198691347, + "learning_rate": 7.872186891068997e-06, + "loss": 0.5474, + "step": 2800 + }, + { + "epoch": 1.1285624607658507, + "grad_norm": 0.47629174007493424, + "learning_rate": 7.85303061291925e-06, + "loss": 0.5352, + "step": 2810 + }, + { + "epoch": 1.1325800376647834, + "grad_norm": 0.5001826254949262, + "learning_rate": 7.833812036043684e-06, + "loss": 0.5253, + "step": 2820 + }, + { + "epoch": 1.1365976145637162, + "grad_norm": 0.48877881868647444, + "learning_rate": 7.814531580098799e-06, + "loss": 0.5405, + "step": 2830 + }, + { + "epoch": 1.140615191462649, + "grad_norm": 0.4525660243228381, + "learning_rate": 7.795189666092286e-06, + "loss": 0.5392, + "step": 2840 + }, + { + "epoch": 1.144632768361582, + "grad_norm": 0.5016062315339999, + "learning_rate": 7.77578671637384e-06, + "loss": 0.5392, + "step": 2850 + }, + { + "epoch": 1.1486503452605148, + "grad_norm": 0.46664182486781586, + "learning_rate": 7.756323154625927e-06, + "loss": 0.5307, + "step": 2860 + }, + { + "epoch": 1.1526679221594476, + "grad_norm": 0.4851667799865368, + "learning_rate": 7.736799405854531e-06, + "loss": 0.5249, + "step": 2870 + }, + { + "epoch": 1.1566854990583804, + "grad_norm": 0.44756010484495995, + "learning_rate": 7.71721589637989e-06, + "loss": 0.5423, + "step": 2880 + }, + { + "epoch": 1.1607030759573131, + "grad_norm": 0.4810612674475816, + "learning_rate": 7.697573053827163e-06, + "loss": 0.5346, + "step": 2890 + }, + { + "epoch": 1.1647206528562462, + "grad_norm": 0.5005415716085619, + "learning_rate": 7.677871307117117e-06, + "loss": 0.5277, + "step": 2900 + }, + { + "epoch": 1.168738229755179, + "grad_norm": 0.48046892345205033, + "learning_rate": 7.658111086456738e-06, + "loss": 0.5372, + "step": 2910 + }, + { + "epoch": 1.1727558066541117, + "grad_norm": 0.5231466496543029, + "learning_rate": 7.638292823329861e-06, + "loss": 0.5349, + "step": 2920 + }, + { + "epoch": 1.1767733835530445, + "grad_norm": 0.47426409377347806, + "learning_rate": 7.6184169504877195e-06, + "loss": 0.5335, + "step": 2930 + }, + { + "epoch": 1.1807909604519775, + "grad_norm": 0.4644152310984778, + "learning_rate": 7.598483901939525e-06, + "loss": 0.5375, + "step": 2940 + }, + { + "epoch": 1.1848085373509103, + "grad_norm": 0.5016059422510154, + "learning_rate": 7.5784941129429715e-06, + "loss": 0.5336, + "step": 2950 + }, + { + "epoch": 1.188826114249843, + "grad_norm": 0.4893646800410941, + "learning_rate": 7.558448019994733e-06, + "loss": 0.5427, + "step": 2960 + }, + { + "epoch": 1.1928436911487759, + "grad_norm": 0.4964262727258161, + "learning_rate": 7.5383460608209444e-06, + "loss": 0.5362, + "step": 2970 + }, + { + "epoch": 1.1968612680477086, + "grad_norm": 0.542942008787974, + "learning_rate": 7.518188674367628e-06, + "loss": 0.5474, + "step": 2980 + }, + { + "epoch": 1.2008788449466414, + "grad_norm": 0.5299626906544336, + "learning_rate": 7.497976300791114e-06, + "loss": 0.5431, + "step": 2990 + }, + { + "epoch": 1.2048964218455744, + "grad_norm": 0.45657822017276745, + "learning_rate": 7.477709381448436e-06, + "loss": 0.5207, + "step": 3000 + }, + { + "epoch": 1.2089139987445072, + "grad_norm": 0.5192739282525728, + "learning_rate": 7.457388358887682e-06, + "loss": 0.5389, + "step": 3010 + }, + { + "epoch": 1.21293157564344, + "grad_norm": 0.5108336845381785, + "learning_rate": 7.437013676838345e-06, + "loss": 0.5427, + "step": 3020 + }, + { + "epoch": 1.2169491525423728, + "grad_norm": 0.4490173262151658, + "learning_rate": 7.416585780201615e-06, + "loss": 0.541, + "step": 3030 + }, + { + "epoch": 1.2209667294413058, + "grad_norm": 0.4823167719289656, + "learning_rate": 7.396105115040684e-06, + "loss": 0.5396, + "step": 3040 + }, + { + "epoch": 1.2249843063402386, + "grad_norm": 0.4926557447124054, + "learning_rate": 7.37557212857099e-06, + "loss": 0.5413, + "step": 3050 + }, + { + "epoch": 1.2290018832391714, + "grad_norm": 0.4846891664452503, + "learning_rate": 7.3549872691504646e-06, + "loss": 0.5448, + "step": 3060 + }, + { + "epoch": 1.2330194601381042, + "grad_norm": 0.5250489652016472, + "learning_rate": 7.3343509862697295e-06, + "loss": 0.5402, + "step": 3070 + }, + { + "epoch": 1.237037037037037, + "grad_norm": 0.4588507155594117, + "learning_rate": 7.313663730542295e-06, + "loss": 0.5404, + "step": 3080 + }, + { + "epoch": 1.24105461393597, + "grad_norm": 0.5009891938060933, + "learning_rate": 7.292925953694705e-06, + "loss": 0.5363, + "step": 3090 + }, + { + "epoch": 1.2450721908349027, + "grad_norm": 0.5090944716207311, + "learning_rate": 7.272138108556691e-06, + "loss": 0.5284, + "step": 3100 + }, + { + "epoch": 1.2490897677338355, + "grad_norm": 0.449393206268348, + "learning_rate": 7.25130064905127e-06, + "loss": 0.5296, + "step": 3110 + }, + { + "epoch": 1.2531073446327683, + "grad_norm": 0.5002200548002171, + "learning_rate": 7.230414030184835e-06, + "loss": 0.531, + "step": 3120 + }, + { + "epoch": 1.2571249215317013, + "grad_norm": 0.5114004159834465, + "learning_rate": 7.209478708037225e-06, + "loss": 0.5458, + "step": 3130 + }, + { + "epoch": 1.261142498430634, + "grad_norm": 0.5166427993736156, + "learning_rate": 7.1884951397517664e-06, + "loss": 0.5309, + "step": 3140 + }, + { + "epoch": 1.2651600753295669, + "grad_norm": 0.4851353630236827, + "learning_rate": 7.167463783525282e-06, + "loss": 0.5375, + "step": 3150 + }, + { + "epoch": 1.2691776522284997, + "grad_norm": 0.508201816800996, + "learning_rate": 7.146385098598092e-06, + "loss": 0.5356, + "step": 3160 + }, + { + "epoch": 1.2731952291274324, + "grad_norm": 0.47326774384605347, + "learning_rate": 7.12525954524399e-06, + "loss": 0.5281, + "step": 3170 + }, + { + "epoch": 1.2772128060263652, + "grad_norm": 0.4964394626079393, + "learning_rate": 7.1040875847601775e-06, + "loss": 0.5339, + "step": 3180 + }, + { + "epoch": 1.2812303829252982, + "grad_norm": 0.4965199644569434, + "learning_rate": 7.082869679457214e-06, + "loss": 0.5373, + "step": 3190 + }, + { + "epoch": 1.285247959824231, + "grad_norm": 0.48647921670810396, + "learning_rate": 7.061606292648899e-06, + "loss": 0.5368, + "step": 3200 + }, + { + "epoch": 1.2892655367231638, + "grad_norm": 0.4709672064019513, + "learning_rate": 7.040297888642172e-06, + "loss": 0.5401, + "step": 3210 + }, + { + "epoch": 1.2932831136220968, + "grad_norm": 0.4710864047305743, + "learning_rate": 7.018944932726963e-06, + "loss": 0.538, + "step": 3220 + }, + { + "epoch": 1.2973006905210296, + "grad_norm": 0.507624692636882, + "learning_rate": 6.997547891166041e-06, + "loss": 0.5333, + "step": 3230 + }, + { + "epoch": 1.3013182674199624, + "grad_norm": 0.4692990883467198, + "learning_rate": 6.976107231184823e-06, + "loss": 0.5412, + "step": 3240 + }, + { + "epoch": 1.3053358443188952, + "grad_norm": 0.5133848969883468, + "learning_rate": 6.954623420961179e-06, + "loss": 0.5254, + "step": 3250 + }, + { + "epoch": 1.309353421217828, + "grad_norm": 0.5377405201588225, + "learning_rate": 6.933096929615211e-06, + "loss": 0.5304, + "step": 3260 + }, + { + "epoch": 1.3133709981167607, + "grad_norm": 0.4449757913251056, + "learning_rate": 6.911528227199e-06, + "loss": 0.5345, + "step": 3270 + }, + { + "epoch": 1.3173885750156937, + "grad_norm": 0.48594757234752356, + "learning_rate": 6.88991778468635e-06, + "loss": 0.5313, + "step": 3280 + }, + { + "epoch": 1.3214061519146265, + "grad_norm": 0.44216301043487133, + "learning_rate": 6.868266073962497e-06, + "loss": 0.5301, + "step": 3290 + }, + { + "epoch": 1.3254237288135593, + "grad_norm": 0.46671237233856316, + "learning_rate": 6.846573567813819e-06, + "loss": 0.5414, + "step": 3300 + }, + { + "epoch": 1.329441305712492, + "grad_norm": 0.4812496937883229, + "learning_rate": 6.8248407399174865e-06, + "loss": 0.5364, + "step": 3310 + }, + { + "epoch": 1.333458882611425, + "grad_norm": 0.5038329328787501, + "learning_rate": 6.803068064831149e-06, + "loss": 0.5425, + "step": 3320 + }, + { + "epoch": 1.3374764595103579, + "grad_norm": 0.45630407505785825, + "learning_rate": 6.781256017982555e-06, + "loss": 0.5367, + "step": 3330 + }, + { + "epoch": 1.3414940364092907, + "grad_norm": 0.49858693203976323, + "learning_rate": 6.759405075659165e-06, + "loss": 0.539, + "step": 3340 + }, + { + "epoch": 1.3455116133082234, + "grad_norm": 0.4890924664342059, + "learning_rate": 6.7375157149977755e-06, + "loss": 0.5206, + "step": 3350 + }, + { + "epoch": 1.3495291902071562, + "grad_norm": 0.4844858973506766, + "learning_rate": 6.715588413974073e-06, + "loss": 0.533, + "step": 3360 + }, + { + "epoch": 1.353546767106089, + "grad_norm": 0.456108199064706, + "learning_rate": 6.693623651392216e-06, + "loss": 0.54, + "step": 3370 + }, + { + "epoch": 1.357564344005022, + "grad_norm": 0.44821643945703865, + "learning_rate": 6.671621906874366e-06, + "loss": 0.5313, + "step": 3380 + }, + { + "epoch": 1.3615819209039548, + "grad_norm": 0.44366483367693854, + "learning_rate": 6.649583660850232e-06, + "loss": 0.5445, + "step": 3390 + }, + { + "epoch": 1.3655994978028876, + "grad_norm": 0.5007318603356307, + "learning_rate": 6.627509394546558e-06, + "loss": 0.5253, + "step": 3400 + }, + { + "epoch": 1.3696170747018206, + "grad_norm": 0.47701484999347654, + "learning_rate": 6.605399589976631e-06, + "loss": 0.5432, + "step": 3410 + }, + { + "epoch": 1.3736346516007534, + "grad_norm": 0.533932151873456, + "learning_rate": 6.583254729929756e-06, + "loss": 0.5362, + "step": 3420 + }, + { + "epoch": 1.3776522284996862, + "grad_norm": 0.4885239761359488, + "learning_rate": 6.5610752979607e-06, + "loss": 0.5393, + "step": 3430 + }, + { + "epoch": 1.381669805398619, + "grad_norm": 0.534453189520691, + "learning_rate": 6.538861778379147e-06, + "loss": 0.538, + "step": 3440 + }, + { + "epoch": 1.3856873822975517, + "grad_norm": 0.55879808060491, + "learning_rate": 6.516614656239115e-06, + "loss": 0.5379, + "step": 3450 + }, + { + "epoch": 1.3897049591964845, + "grad_norm": 0.5105051602697208, + "learning_rate": 6.49433441732837e-06, + "loss": 0.5434, + "step": 3460 + }, + { + "epoch": 1.3937225360954175, + "grad_norm": 0.5849851114309839, + "learning_rate": 6.472021548157812e-06, + "loss": 0.5309, + "step": 3470 + }, + { + "epoch": 1.3977401129943503, + "grad_norm": 0.4862484792197367, + "learning_rate": 6.4496765359508575e-06, + "loss": 0.5403, + "step": 3480 + }, + { + "epoch": 1.401757689893283, + "grad_norm": 0.4785944624308064, + "learning_rate": 6.427299868632795e-06, + "loss": 0.5315, + "step": 3490 + }, + { + "epoch": 1.4057752667922159, + "grad_norm": 0.5206432193262734, + "learning_rate": 6.404892034820134e-06, + "loss": 0.5363, + "step": 3500 + }, + { + "epoch": 1.4097928436911489, + "grad_norm": 0.5223412310453764, + "learning_rate": 6.382453523809939e-06, + "loss": 0.5409, + "step": 3510 + }, + { + "epoch": 1.4138104205900817, + "grad_norm": 0.5251307070269818, + "learning_rate": 6.359984825569138e-06, + "loss": 0.5286, + "step": 3520 + }, + { + "epoch": 1.4178279974890144, + "grad_norm": 0.512721201250328, + "learning_rate": 6.3374864307238235e-06, + "loss": 0.5261, + "step": 3530 + }, + { + "epoch": 1.4218455743879472, + "grad_norm": 0.5026376332917479, + "learning_rate": 6.3149588305485475e-06, + "loss": 0.5208, + "step": 3540 + }, + { + "epoch": 1.42586315128688, + "grad_norm": 0.4829796384358809, + "learning_rate": 6.2924025169555916e-06, + "loss": 0.5433, + "step": 3550 + }, + { + "epoch": 1.4298807281858128, + "grad_norm": 0.4957733273389861, + "learning_rate": 6.269817982484212e-06, + "loss": 0.529, + "step": 3560 + }, + { + "epoch": 1.4338983050847458, + "grad_norm": 0.45279662345918176, + "learning_rate": 6.247205720289907e-06, + "loss": 0.5292, + "step": 3570 + }, + { + "epoch": 1.4379158819836786, + "grad_norm": 0.45766889598494964, + "learning_rate": 6.224566224133632e-06, + "loss": 0.5358, + "step": 3580 + }, + { + "epoch": 1.4419334588826114, + "grad_norm": 0.4809643743051477, + "learning_rate": 6.201899988371022e-06, + "loss": 0.544, + "step": 3590 + }, + { + "epoch": 1.4459510357815444, + "grad_norm": 0.5019231578335686, + "learning_rate": 6.1792075079416e-06, + "loss": 0.5357, + "step": 3600 + }, + { + "epoch": 1.4499686126804772, + "grad_norm": 0.4966841368294494, + "learning_rate": 6.156489278357967e-06, + "loss": 0.5315, + "step": 3610 + }, + { + "epoch": 1.45398618957941, + "grad_norm": 0.5111737253668558, + "learning_rate": 6.1337457956949774e-06, + "loss": 0.5231, + "step": 3620 + }, + { + "epoch": 1.4580037664783427, + "grad_norm": 0.5050504504365466, + "learning_rate": 6.1109775565789164e-06, + "loss": 0.5354, + "step": 3630 + }, + { + "epoch": 1.4620213433772755, + "grad_norm": 0.49236484658873236, + "learning_rate": 6.0881850581766515e-06, + "loss": 0.5243, + "step": 3640 + }, + { + "epoch": 1.4660389202762083, + "grad_norm": 0.4748265818153898, + "learning_rate": 6.065368798184771e-06, + "loss": 0.5391, + "step": 3650 + }, + { + "epoch": 1.4700564971751413, + "grad_norm": 0.5182967229817997, + "learning_rate": 6.042529274818724e-06, + "loss": 0.5294, + "step": 3660 + }, + { + "epoch": 1.474074074074074, + "grad_norm": 0.4752424911930453, + "learning_rate": 6.019666986801936e-06, + "loss": 0.5281, + "step": 3670 + }, + { + "epoch": 1.4780916509730069, + "grad_norm": 0.47120824092970764, + "learning_rate": 5.996782433354923e-06, + "loss": 0.5253, + "step": 3680 + }, + { + "epoch": 1.4821092278719397, + "grad_norm": 0.46585234442500195, + "learning_rate": 5.973876114184388e-06, + "loss": 0.5202, + "step": 3690 + }, + { + "epoch": 1.4861268047708727, + "grad_norm": 0.47339586038582876, + "learning_rate": 5.95094852947231e-06, + "loss": 0.5288, + "step": 3700 + }, + { + "epoch": 1.4901443816698055, + "grad_norm": 0.44170633595442677, + "learning_rate": 5.928000179865024e-06, + "loss": 0.531, + "step": 3710 + }, + { + "epoch": 1.4941619585687382, + "grad_norm": 0.509631075940108, + "learning_rate": 5.905031566462279e-06, + "loss": 0.5371, + "step": 3720 + }, + { + "epoch": 1.498179535467671, + "grad_norm": 0.4730072164920337, + "learning_rate": 5.882043190806314e-06, + "loss": 0.5275, + "step": 3730 + }, + { + "epoch": 1.5021971123666038, + "grad_norm": 0.4821028115884746, + "learning_rate": 5.859035554870893e-06, + "loss": 0.5337, + "step": 3740 + }, + { + "epoch": 1.5062146892655366, + "grad_norm": 0.4834994282281791, + "learning_rate": 5.836009161050342e-06, + "loss": 0.5289, + "step": 3750 + }, + { + "epoch": 1.5102322661644696, + "grad_norm": 0.45785972034921696, + "learning_rate": 5.812964512148589e-06, + "loss": 0.5399, + "step": 3760 + }, + { + "epoch": 1.5142498430634024, + "grad_norm": 0.4766787260315672, + "learning_rate": 5.78990211136818e-06, + "loss": 0.538, + "step": 3770 + }, + { + "epoch": 1.5182674199623352, + "grad_norm": 0.4892932237467062, + "learning_rate": 5.766822462299286e-06, + "loss": 0.5393, + "step": 3780 + }, + { + "epoch": 1.5222849968612682, + "grad_norm": 0.4837638271264737, + "learning_rate": 5.743726068908717e-06, + "loss": 0.5229, + "step": 3790 + }, + { + "epoch": 1.526302573760201, + "grad_norm": 0.4868620820757227, + "learning_rate": 5.72061343552891e-06, + "loss": 0.5353, + "step": 3800 + }, + { + "epoch": 1.5303201506591337, + "grad_norm": 0.49287861664200744, + "learning_rate": 5.697485066846914e-06, + "loss": 0.5407, + "step": 3810 + }, + { + "epoch": 1.5343377275580665, + "grad_norm": 0.5070216858712217, + "learning_rate": 5.674341467893378e-06, + "loss": 0.5322, + "step": 3820 + }, + { + "epoch": 1.5383553044569993, + "grad_norm": 0.48075109583598735, + "learning_rate": 5.6511831440315215e-06, + "loss": 0.5318, + "step": 3830 + }, + { + "epoch": 1.542372881355932, + "grad_norm": 0.487828149268802, + "learning_rate": 5.628010600946088e-06, + "loss": 0.5367, + "step": 3840 + }, + { + "epoch": 1.5463904582548649, + "grad_norm": 0.4434100223228771, + "learning_rate": 5.604824344632319e-06, + "loss": 0.5413, + "step": 3850 + }, + { + "epoch": 1.5504080351537979, + "grad_norm": 0.46224887215867433, + "learning_rate": 5.581624881384897e-06, + "loss": 0.5287, + "step": 3860 + }, + { + "epoch": 1.5544256120527307, + "grad_norm": 0.5122729795251854, + "learning_rate": 5.55841271778689e-06, + "loss": 0.5365, + "step": 3870 + }, + { + "epoch": 1.5584431889516637, + "grad_norm": 0.690970740866929, + "learning_rate": 5.535188360698687e-06, + "loss": 0.5467, + "step": 3880 + }, + { + "epoch": 1.5624607658505965, + "grad_norm": 0.4794120185813089, + "learning_rate": 5.511952317246941e-06, + "loss": 0.5348, + "step": 3890 + }, + { + "epoch": 1.5664783427495292, + "grad_norm": 0.4818371908690834, + "learning_rate": 5.4887050948134825e-06, + "loss": 0.5412, + "step": 3900 + }, + { + "epoch": 1.570495919648462, + "grad_norm": 0.486538488375387, + "learning_rate": 5.465447201024248e-06, + "loss": 0.5362, + "step": 3910 + }, + { + "epoch": 1.5745134965473948, + "grad_norm": 0.5061137169976885, + "learning_rate": 5.442179143738193e-06, + "loss": 0.5363, + "step": 3920 + }, + { + "epoch": 1.5785310734463276, + "grad_norm": 0.46226895825091646, + "learning_rate": 5.418901431036205e-06, + "loss": 0.5277, + "step": 3930 + }, + { + "epoch": 1.5825486503452604, + "grad_norm": 0.49850901564672195, + "learning_rate": 5.395614571210004e-06, + "loss": 0.5253, + "step": 3940 + }, + { + "epoch": 1.5865662272441934, + "grad_norm": 0.49839262038652726, + "learning_rate": 5.372319072751046e-06, + "loss": 0.5217, + "step": 3950 + }, + { + "epoch": 1.5905838041431262, + "grad_norm": 0.4540519023429122, + "learning_rate": 5.349015444339429e-06, + "loss": 0.5174, + "step": 3960 + }, + { + "epoch": 1.5946013810420592, + "grad_norm": 0.4615246890403801, + "learning_rate": 5.325704194832759e-06, + "loss": 0.5399, + "step": 3970 + }, + { + "epoch": 1.598618957940992, + "grad_norm": 0.5069766547949516, + "learning_rate": 5.302385833255076e-06, + "loss": 0.5377, + "step": 3980 + }, + { + "epoch": 1.6026365348399247, + "grad_norm": 0.5034072911043822, + "learning_rate": 5.2790608687857034e-06, + "loss": 0.5312, + "step": 3990 + }, + { + "epoch": 1.6066541117388575, + "grad_norm": 0.478063165462391, + "learning_rate": 5.2557298107481536e-06, + "loss": 0.5235, + "step": 4000 + }, + { + "epoch": 1.6106716886377903, + "grad_norm": 0.5051927530264109, + "learning_rate": 5.2323931685989945e-06, + "loss": 0.5282, + "step": 4010 + }, + { + "epoch": 1.614689265536723, + "grad_norm": 0.449944668715227, + "learning_rate": 5.209051451916733e-06, + "loss": 0.5391, + "step": 4020 + }, + { + "epoch": 1.6187068424356559, + "grad_norm": 0.4987609704482517, + "learning_rate": 5.185705170390677e-06, + "loss": 0.5401, + "step": 4030 + }, + { + "epoch": 1.6227244193345887, + "grad_norm": 0.5129818470578882, + "learning_rate": 5.162354833809815e-06, + "loss": 0.5389, + "step": 4040 + }, + { + "epoch": 1.6267419962335217, + "grad_norm": 0.46834889576653455, + "learning_rate": 5.139000952051686e-06, + "loss": 0.551, + "step": 4050 + }, + { + "epoch": 1.6307595731324545, + "grad_norm": 0.5100548420871484, + "learning_rate": 5.115644035071234e-06, + "loss": 0.5353, + "step": 4060 + }, + { + "epoch": 1.6347771500313875, + "grad_norm": 0.5091440448579789, + "learning_rate": 5.0922845928896865e-06, + "loss": 0.5312, + "step": 4070 + }, + { + "epoch": 1.6387947269303202, + "grad_norm": 0.5011348467216399, + "learning_rate": 5.068923135583405e-06, + "loss": 0.5379, + "step": 4080 + }, + { + "epoch": 1.642812303829253, + "grad_norm": 0.4879211850299191, + "learning_rate": 5.04556017327276e-06, + "loss": 0.5259, + "step": 4090 + }, + { + "epoch": 1.6468298807281858, + "grad_norm": 0.47580521291496164, + "learning_rate": 5.022196216110978e-06, + "loss": 0.5264, + "step": 4100 + }, + { + "epoch": 1.6508474576271186, + "grad_norm": 0.4836039036319484, + "learning_rate": 4.998831774273016e-06, + "loss": 0.5245, + "step": 4110 + }, + { + "epoch": 1.6548650345260514, + "grad_norm": 0.45734991734522173, + "learning_rate": 4.975467357944412e-06, + "loss": 0.5347, + "step": 4120 + }, + { + "epoch": 1.6588826114249842, + "grad_norm": 0.45580879464211926, + "learning_rate": 4.9521034773101405e-06, + "loss": 0.5281, + "step": 4130 + }, + { + "epoch": 1.6629001883239172, + "grad_norm": 0.5088749078327436, + "learning_rate": 4.928740642543491e-06, + "loss": 0.5203, + "step": 4140 + }, + { + "epoch": 1.66691776522285, + "grad_norm": 0.5023597172357365, + "learning_rate": 4.905379363794907e-06, + "loss": 0.5323, + "step": 4150 + }, + { + "epoch": 1.670935342121783, + "grad_norm": 0.5160005322831623, + "learning_rate": 4.882020151180852e-06, + "loss": 0.5354, + "step": 4160 + }, + { + "epoch": 1.6749529190207157, + "grad_norm": 0.48241576610764997, + "learning_rate": 4.858663514772684e-06, + "loss": 0.5256, + "step": 4170 + }, + { + "epoch": 1.6789704959196485, + "grad_norm": 0.4350265055622632, + "learning_rate": 4.8353099645855e-06, + "loss": 0.5343, + "step": 4180 + }, + { + "epoch": 1.6829880728185813, + "grad_norm": 0.5156247900477684, + "learning_rate": 4.811960010567005e-06, + "loss": 0.5235, + "step": 4190 + }, + { + "epoch": 1.687005649717514, + "grad_norm": 0.49845097220709156, + "learning_rate": 4.788614162586379e-06, + "loss": 0.5311, + "step": 4200 + }, + { + "epoch": 1.6910232266164469, + "grad_norm": 0.4684264084420117, + "learning_rate": 4.76527293042315e-06, + "loss": 0.5361, + "step": 4210 + }, + { + "epoch": 1.6950408035153797, + "grad_norm": 0.5161852081555511, + "learning_rate": 4.741936823756046e-06, + "loss": 0.5207, + "step": 4220 + }, + { + "epoch": 1.6990583804143125, + "grad_norm": 0.6119074989682534, + "learning_rate": 4.718606352151874e-06, + "loss": 0.5221, + "step": 4230 + }, + { + "epoch": 1.7030759573132455, + "grad_norm": 0.4419977523098354, + "learning_rate": 4.695282025054406e-06, + "loss": 0.5336, + "step": 4240 + }, + { + "epoch": 1.7070935342121782, + "grad_norm": 0.4776310585207038, + "learning_rate": 4.671964351773229e-06, + "loss": 0.5254, + "step": 4250 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 0.4362671430805064, + "learning_rate": 4.648653841472643e-06, + "loss": 0.5368, + "step": 4260 + }, + { + "epoch": 1.715128688010044, + "grad_norm": 0.4926985303907698, + "learning_rate": 4.625351003160539e-06, + "loss": 0.529, + "step": 4270 + }, + { + "epoch": 1.7191462649089768, + "grad_norm": 0.5037843279607946, + "learning_rate": 4.60205634567728e-06, + "loss": 0.5266, + "step": 4280 + }, + { + "epoch": 1.7231638418079096, + "grad_norm": 0.48300010587173975, + "learning_rate": 4.578770377684593e-06, + "loss": 0.5308, + "step": 4290 + }, + { + "epoch": 1.7271814187068424, + "grad_norm": 0.4952905848146038, + "learning_rate": 4.555493607654463e-06, + "loss": 0.5348, + "step": 4300 + }, + { + "epoch": 1.7311989956057752, + "grad_norm": 0.509232593416316, + "learning_rate": 4.532226543858025e-06, + "loss": 0.5363, + "step": 4310 + }, + { + "epoch": 1.735216572504708, + "grad_norm": 0.5186412413734403, + "learning_rate": 4.508969694354472e-06, + "loss": 0.5158, + "step": 4320 + }, + { + "epoch": 1.739234149403641, + "grad_norm": 0.4648730018824965, + "learning_rate": 4.485723566979959e-06, + "loss": 0.5205, + "step": 4330 + }, + { + "epoch": 1.7432517263025737, + "grad_norm": 0.487919260567548, + "learning_rate": 4.462488669336507e-06, + "loss": 0.5292, + "step": 4340 + }, + { + "epoch": 1.7472693032015068, + "grad_norm": 0.4741644363249272, + "learning_rate": 4.439265508780932e-06, + "loss": 0.5283, + "step": 4350 + }, + { + "epoch": 1.7512868801004395, + "grad_norm": 0.49035056338707034, + "learning_rate": 4.416054592413755e-06, + "loss": 0.538, + "step": 4360 + }, + { + "epoch": 1.7553044569993723, + "grad_norm": 0.4755513975974018, + "learning_rate": 4.392856427068132e-06, + "loss": 0.5297, + "step": 4370 + }, + { + "epoch": 1.759322033898305, + "grad_norm": 0.46435677929151326, + "learning_rate": 4.3696715192987904e-06, + "loss": 0.5247, + "step": 4380 + }, + { + "epoch": 1.7633396107972379, + "grad_norm": 0.48979753506487095, + "learning_rate": 4.346500375370966e-06, + "loss": 0.5165, + "step": 4390 + }, + { + "epoch": 1.7673571876961707, + "grad_norm": 0.4487673109128978, + "learning_rate": 4.323343501249346e-06, + "loss": 0.5317, + "step": 4400 + }, + { + "epoch": 1.7713747645951035, + "grad_norm": 0.5113864337117118, + "learning_rate": 4.300201402587019e-06, + "loss": 0.5382, + "step": 4410 + }, + { + "epoch": 1.7753923414940365, + "grad_norm": 0.483652205814584, + "learning_rate": 4.277074584714447e-06, + "loss": 0.5311, + "step": 4420 + }, + { + "epoch": 1.7794099183929692, + "grad_norm": 0.4759761657301343, + "learning_rate": 4.253963552628411e-06, + "loss": 0.5351, + "step": 4430 + }, + { + "epoch": 1.783427495291902, + "grad_norm": 0.4821299331080685, + "learning_rate": 4.230868810980997e-06, + "loss": 0.5342, + "step": 4440 + }, + { + "epoch": 1.787445072190835, + "grad_norm": 0.5245032360028585, + "learning_rate": 4.207790864068573e-06, + "loss": 0.5237, + "step": 4450 + }, + { + "epoch": 1.7914626490897678, + "grad_norm": 0.49328875619112256, + "learning_rate": 4.184730215820782e-06, + "loss": 0.5317, + "step": 4460 + }, + { + "epoch": 1.7954802259887006, + "grad_norm": 0.5008680438944126, + "learning_rate": 4.161687369789526e-06, + "loss": 0.517, + "step": 4470 + }, + { + "epoch": 1.7994978028876334, + "grad_norm": 0.47287999035048983, + "learning_rate": 4.138662829137984e-06, + "loss": 0.5327, + "step": 4480 + }, + { + "epoch": 1.8035153797865662, + "grad_norm": 0.49099568575427033, + "learning_rate": 4.115657096629615e-06, + "loss": 0.5302, + "step": 4490 + }, + { + "epoch": 1.807532956685499, + "grad_norm": 0.4518440123720032, + "learning_rate": 4.092670674617187e-06, + "loss": 0.5153, + "step": 4500 + }, + { + "epoch": 1.8115505335844317, + "grad_norm": 0.4893643353710452, + "learning_rate": 4.069704065031804e-06, + "loss": 0.5354, + "step": 4510 + }, + { + "epoch": 1.8155681104833647, + "grad_norm": 0.4938753269976269, + "learning_rate": 4.0467577693719436e-06, + "loss": 0.5304, + "step": 4520 + }, + { + "epoch": 1.8195856873822975, + "grad_norm": 0.48759036445701953, + "learning_rate": 4.023832288692512e-06, + "loss": 0.5333, + "step": 4530 + }, + { + "epoch": 1.8236032642812305, + "grad_norm": 0.48751563543723775, + "learning_rate": 4.000928123593898e-06, + "loss": 0.5385, + "step": 4540 + }, + { + "epoch": 1.8276208411801633, + "grad_norm": 0.512128536559346, + "learning_rate": 3.978045774211043e-06, + "loss": 0.5438, + "step": 4550 + }, + { + "epoch": 1.831638418079096, + "grad_norm": 0.48148117588240913, + "learning_rate": 3.9551857402025215e-06, + "loss": 0.5321, + "step": 4560 + }, + { + "epoch": 1.835655994978029, + "grad_norm": 0.5036763909061966, + "learning_rate": 3.932348520739633e-06, + "loss": 0.5321, + "step": 4570 + }, + { + "epoch": 1.8396735718769617, + "grad_norm": 0.4850093548850179, + "learning_rate": 3.909534614495495e-06, + "loss": 0.5212, + "step": 4580 + }, + { + "epoch": 1.8436911487758945, + "grad_norm": 0.5089724945679945, + "learning_rate": 3.886744519634157e-06, + "loss": 0.526, + "step": 4590 + }, + { + "epoch": 1.8477087256748272, + "grad_norm": 0.4736889376864286, + "learning_rate": 3.86397873379973e-06, + "loss": 0.5355, + "step": 4600 + }, + { + "epoch": 1.8517263025737603, + "grad_norm": 0.48275206618553307, + "learning_rate": 3.841237754105508e-06, + "loss": 0.5375, + "step": 4610 + }, + { + "epoch": 1.855743879472693, + "grad_norm": 0.5070260134880437, + "learning_rate": 3.818522077123119e-06, + "loss": 0.5256, + "step": 4620 + }, + { + "epoch": 1.8597614563716258, + "grad_norm": 0.48382223029891325, + "learning_rate": 3.795832198871682e-06, + "loss": 0.5272, + "step": 4630 + }, + { + "epoch": 1.8637790332705588, + "grad_norm": 0.4533070548630681, + "learning_rate": 3.7731686148069768e-06, + "loss": 0.529, + "step": 4640 + }, + { + "epoch": 1.8677966101694916, + "grad_norm": 0.5136994231340827, + "learning_rate": 3.7505318198106226e-06, + "loss": 0.5259, + "step": 4650 + }, + { + "epoch": 1.8718141870684244, + "grad_norm": 0.4891813686506932, + "learning_rate": 3.727922308179275e-06, + "loss": 0.528, + "step": 4660 + }, + { + "epoch": 1.8758317639673572, + "grad_norm": 0.4784464468836338, + "learning_rate": 3.7053405736138228e-06, + "loss": 0.5239, + "step": 4670 + }, + { + "epoch": 1.87984934086629, + "grad_norm": 0.4439894514511087, + "learning_rate": 3.6827871092086283e-06, + "loss": 0.5278, + "step": 4680 + }, + { + "epoch": 1.8838669177652227, + "grad_norm": 0.42133586965144204, + "learning_rate": 3.6602624074407354e-06, + "loss": 0.525, + "step": 4690 + }, + { + "epoch": 1.8878844946641555, + "grad_norm": 0.4452525569153129, + "learning_rate": 3.6377669601591314e-06, + "loss": 0.5271, + "step": 4700 + }, + { + "epoch": 1.8919020715630885, + "grad_norm": 0.4781861294441116, + "learning_rate": 3.615301258574009e-06, + "loss": 0.5244, + "step": 4710 + }, + { + "epoch": 1.8959196484620213, + "grad_norm": 0.442730967082675, + "learning_rate": 3.5928657932460252e-06, + "loss": 0.5245, + "step": 4720 + }, + { + "epoch": 1.8999372253609543, + "grad_norm": 0.45356827607495354, + "learning_rate": 3.5704610540756035e-06, + "loss": 0.5226, + "step": 4730 + }, + { + "epoch": 1.9039548022598871, + "grad_norm": 0.44691195704566716, + "learning_rate": 3.5480875302922296e-06, + "loss": 0.5383, + "step": 4740 + }, + { + "epoch": 1.90797237915882, + "grad_norm": 0.48059730824092567, + "learning_rate": 3.525745710443774e-06, + "loss": 0.5224, + "step": 4750 + }, + { + "epoch": 1.9119899560577527, + "grad_norm": 0.447518168596057, + "learning_rate": 3.503436082385817e-06, + "loss": 0.529, + "step": 4760 + }, + { + "epoch": 1.9160075329566855, + "grad_norm": 0.45980480004977614, + "learning_rate": 3.4811591332710003e-06, + "loss": 0.5283, + "step": 4770 + }, + { + "epoch": 1.9200251098556183, + "grad_norm": 0.4641653971039642, + "learning_rate": 3.4589153495383916e-06, + "loss": 0.524, + "step": 4780 + }, + { + "epoch": 1.924042686754551, + "grad_norm": 0.4608734211286836, + "learning_rate": 3.4367052169028557e-06, + "loss": 0.5154, + "step": 4790 + }, + { + "epoch": 1.928060263653484, + "grad_norm": 0.45628159677386826, + "learning_rate": 3.414529220344455e-06, + "loss": 0.5246, + "step": 4800 + }, + { + "epoch": 1.9320778405524168, + "grad_norm": 0.4571910097492241, + "learning_rate": 3.3923878440978563e-06, + "loss": 0.5355, + "step": 4810 + }, + { + "epoch": 1.9360954174513496, + "grad_norm": 0.4701204963184072, + "learning_rate": 3.370281571641759e-06, + "loss": 0.519, + "step": 4820 + }, + { + "epoch": 1.9401129943502826, + "grad_norm": 0.4644809253665488, + "learning_rate": 3.348210885688337e-06, + "loss": 0.5444, + "step": 4830 + }, + { + "epoch": 1.9441305712492154, + "grad_norm": 0.4505954165735258, + "learning_rate": 3.3261762681726955e-06, + "loss": 0.5288, + "step": 4840 + }, + { + "epoch": 1.9481481481481482, + "grad_norm": 0.5016011741209488, + "learning_rate": 3.304178200242351e-06, + "loss": 0.5279, + "step": 4850 + }, + { + "epoch": 1.952165725047081, + "grad_norm": 0.48172105905267193, + "learning_rate": 3.282217162246726e-06, + "loss": 0.5331, + "step": 4860 + }, + { + "epoch": 1.9561833019460138, + "grad_norm": 0.45055441596952966, + "learning_rate": 3.260293633726656e-06, + "loss": 0.5312, + "step": 4870 + }, + { + "epoch": 1.9602008788449465, + "grad_norm": 0.4553856152933495, + "learning_rate": 3.2384080934039193e-06, + "loss": 0.5301, + "step": 4880 + }, + { + "epoch": 1.9642184557438793, + "grad_norm": 0.468059642341407, + "learning_rate": 3.2165610191707872e-06, + "loss": 0.5265, + "step": 4890 + }, + { + "epoch": 1.9682360326428123, + "grad_norm": 0.42050027272205776, + "learning_rate": 3.194752888079585e-06, + "loss": 0.5212, + "step": 4900 + }, + { + "epoch": 1.9722536095417451, + "grad_norm": 0.49679149221807994, + "learning_rate": 3.1729841763322776e-06, + "loss": 0.5298, + "step": 4910 + }, + { + "epoch": 1.9762711864406781, + "grad_norm": 0.4716285847340891, + "learning_rate": 3.1512553592700622e-06, + "loss": 0.5203, + "step": 4920 + }, + { + "epoch": 1.980288763339611, + "grad_norm": 0.4740722369561679, + "learning_rate": 3.129566911363009e-06, + "loss": 0.5208, + "step": 4930 + }, + { + "epoch": 1.9843063402385437, + "grad_norm": 0.4553213513296392, + "learning_rate": 3.1079193061996803e-06, + "loss": 0.5241, + "step": 4940 + }, + { + "epoch": 1.9883239171374765, + "grad_norm": 0.48169953080880973, + "learning_rate": 3.086313016476794e-06, + "loss": 0.5418, + "step": 4950 + }, + { + "epoch": 1.9923414940364093, + "grad_norm": 0.5764706666287279, + "learning_rate": 3.0647485139889145e-06, + "loss": 0.5259, + "step": 4960 + }, + { + "epoch": 1.996359070935342, + "grad_norm": 0.43220459247826987, + "learning_rate": 3.0432262696181336e-06, + "loss": 0.522, + "step": 4970 + }, + { + "epoch": 2.0, + "grad_norm": 0.4710113996508926, + "learning_rate": 3.0217467533237956e-06, + "loss": 0.5142, + "step": 4980 + }, + { + "epoch": 2.004017576898933, + "grad_norm": 0.5202233442297398, + "learning_rate": 3.000310434132237e-06, + "loss": 0.4811, + "step": 4990 + }, + { + "epoch": 2.0080351537978656, + "grad_norm": 0.5384401881895943, + "learning_rate": 2.9789177801265455e-06, + "loss": 0.4769, + "step": 5000 + }, + { + "epoch": 2.0120527306967984, + "grad_norm": 0.4756295523176075, + "learning_rate": 2.9575692584363337e-06, + "loss": 0.4755, + "step": 5010 + }, + { + "epoch": 2.016070307595731, + "grad_norm": 0.5172206673937627, + "learning_rate": 2.9362653352275405e-06, + "loss": 0.4813, + "step": 5020 + }, + { + "epoch": 2.0200878844946644, + "grad_norm": 0.49483001522371817, + "learning_rate": 2.915006475692256e-06, + "loss": 0.472, + "step": 5030 + }, + { + "epoch": 2.024105461393597, + "grad_norm": 0.4704791978578869, + "learning_rate": 2.89379314403856e-06, + "loss": 0.4747, + "step": 5040 + }, + { + "epoch": 2.02812303829253, + "grad_norm": 0.5050782711725152, + "learning_rate": 2.8726258034803866e-06, + "loss": 0.4794, + "step": 5050 + }, + { + "epoch": 2.0321406151914627, + "grad_norm": 0.4712269294430662, + "learning_rate": 2.8515049162274057e-06, + "loss": 0.4722, + "step": 5060 + }, + { + "epoch": 2.0361581920903955, + "grad_norm": 0.48881220535426967, + "learning_rate": 2.83043094347494e-06, + "loss": 0.4678, + "step": 5070 + }, + { + "epoch": 2.0401757689893283, + "grad_norm": 0.48771842977610164, + "learning_rate": 2.8094043453938844e-06, + "loss": 0.4665, + "step": 5080 + }, + { + "epoch": 2.044193345888261, + "grad_norm": 0.5202925253687711, + "learning_rate": 2.7884255811206584e-06, + "loss": 0.4763, + "step": 5090 + }, + { + "epoch": 2.048210922787194, + "grad_norm": 0.46874669055521, + "learning_rate": 2.7674951087471858e-06, + "loss": 0.4833, + "step": 5100 + }, + { + "epoch": 2.0522284996861266, + "grad_norm": 0.47453927342332336, + "learning_rate": 2.7466133853108935e-06, + "loss": 0.4598, + "step": 5110 + }, + { + "epoch": 2.0562460765850594, + "grad_norm": 0.45817909400058926, + "learning_rate": 2.725780866784722e-06, + "loss": 0.4719, + "step": 5120 + }, + { + "epoch": 2.0602636534839927, + "grad_norm": 0.5261764803454092, + "learning_rate": 2.704998008067177e-06, + "loss": 0.4634, + "step": 5130 + }, + { + "epoch": 2.0642812303829254, + "grad_norm": 0.47433900887045005, + "learning_rate": 2.6842652629723907e-06, + "loss": 0.4785, + "step": 5140 + }, + { + "epoch": 2.068298807281858, + "grad_norm": 0.4711156322673265, + "learning_rate": 2.6635830842202182e-06, + "loss": 0.4625, + "step": 5150 + }, + { + "epoch": 2.072316384180791, + "grad_norm": 0.4620663705804988, + "learning_rate": 2.642951923426348e-06, + "loss": 0.4775, + "step": 5160 + }, + { + "epoch": 2.076333961079724, + "grad_norm": 0.4617355126174571, + "learning_rate": 2.622372231092437e-06, + "loss": 0.4817, + "step": 5170 + }, + { + "epoch": 2.0803515379786566, + "grad_norm": 0.5411857216699084, + "learning_rate": 2.6018444565962885e-06, + "loss": 0.4731, + "step": 5180 + }, + { + "epoch": 2.0843691148775894, + "grad_norm": 0.4943277164316023, + "learning_rate": 2.5813690481820184e-06, + "loss": 0.4693, + "step": 5190 + }, + { + "epoch": 2.088386691776522, + "grad_norm": 0.5102055113637535, + "learning_rate": 2.5609464529502815e-06, + "loss": 0.4805, + "step": 5200 + }, + { + "epoch": 2.092404268675455, + "grad_norm": 0.49132645013613546, + "learning_rate": 2.540577116848505e-06, + "loss": 0.4694, + "step": 5210 + }, + { + "epoch": 2.096421845574388, + "grad_norm": 0.49577954933166757, + "learning_rate": 2.52026148466115e-06, + "loss": 0.4825, + "step": 5220 + }, + { + "epoch": 2.100439422473321, + "grad_norm": 0.4840304988230105, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.4775, + "step": 5230 + }, + { + "epoch": 2.1044569993722537, + "grad_norm": 0.5036218210876587, + "learning_rate": 2.4797931052944755e-06, + "loss": 0.472, + "step": 5240 + }, + { + "epoch": 2.1084745762711865, + "grad_norm": 0.4502304265079634, + "learning_rate": 2.4596412417819708e-06, + "loss": 0.4685, + "step": 5250 + }, + { + "epoch": 2.1124921531701193, + "grad_norm": 0.5218870877367079, + "learning_rate": 2.4395448494982198e-06, + "loss": 0.4817, + "step": 5260 + }, + { + "epoch": 2.116509730069052, + "grad_norm": 0.49197993587084365, + "learning_rate": 2.419504367267689e-06, + "loss": 0.4744, + "step": 5270 + }, + { + "epoch": 2.120527306967985, + "grad_norm": 0.5349529463351869, + "learning_rate": 2.3995202326939866e-06, + "loss": 0.4872, + "step": 5280 + }, + { + "epoch": 2.1245448838669176, + "grad_norm": 0.48741903300666545, + "learning_rate": 2.3795928821503275e-06, + "loss": 0.4688, + "step": 5290 + }, + { + "epoch": 2.1285624607658504, + "grad_norm": 0.4999227078942744, + "learning_rate": 2.359722750769981e-06, + "loss": 0.4793, + "step": 5300 + }, + { + "epoch": 2.132580037664783, + "grad_norm": 0.45470893654183736, + "learning_rate": 2.339910272436782e-06, + "loss": 0.4755, + "step": 5310 + }, + { + "epoch": 2.1365976145637164, + "grad_norm": 0.4824984736023203, + "learning_rate": 2.3201558797756602e-06, + "loss": 0.472, + "step": 5320 + }, + { + "epoch": 2.1406151914626492, + "grad_norm": 0.47042980851352273, + "learning_rate": 2.300460004143182e-06, + "loss": 0.477, + "step": 5330 + }, + { + "epoch": 2.144632768361582, + "grad_norm": 0.47770309689595364, + "learning_rate": 2.2808230756181344e-06, + "loss": 0.4678, + "step": 5340 + }, + { + "epoch": 2.148650345260515, + "grad_norm": 0.5137315970001541, + "learning_rate": 2.261245522992141e-06, + "loss": 0.4718, + "step": 5350 + }, + { + "epoch": 2.1526679221594476, + "grad_norm": 0.47959163673834015, + "learning_rate": 2.2417277737602967e-06, + "loss": 0.4777, + "step": 5360 + }, + { + "epoch": 2.1566854990583804, + "grad_norm": 0.5938128736434867, + "learning_rate": 2.222270254111825e-06, + "loss": 0.4573, + "step": 5370 + }, + { + "epoch": 2.160703075957313, + "grad_norm": 0.4891138485781613, + "learning_rate": 2.2028733889207787e-06, + "loss": 0.4767, + "step": 5380 + }, + { + "epoch": 2.164720652856246, + "grad_norm": 0.501211033879534, + "learning_rate": 2.1835376017367665e-06, + "loss": 0.4735, + "step": 5390 + }, + { + "epoch": 2.1687382297551787, + "grad_norm": 0.5297335059398043, + "learning_rate": 2.1642633147756894e-06, + "loss": 0.4824, + "step": 5400 + }, + { + "epoch": 2.172755806654112, + "grad_norm": 0.4658949552573407, + "learning_rate": 2.145050948910536e-06, + "loss": 0.4757, + "step": 5410 + }, + { + "epoch": 2.1767733835530447, + "grad_norm": 0.5130444470120785, + "learning_rate": 2.1259009236621857e-06, + "loss": 0.4804, + "step": 5420 + }, + { + "epoch": 2.1807909604519775, + "grad_norm": 0.5328006329035961, + "learning_rate": 2.1068136571902527e-06, + "loss": 0.4714, + "step": 5430 + }, + { + "epoch": 2.1848085373509103, + "grad_norm": 0.49974994460658273, + "learning_rate": 2.0877895662839477e-06, + "loss": 0.4661, + "step": 5440 + }, + { + "epoch": 2.188826114249843, + "grad_norm": 0.504181506688872, + "learning_rate": 2.0688290663529813e-06, + "loss": 0.469, + "step": 5450 + }, + { + "epoch": 2.192843691148776, + "grad_norm": 0.5249615841875294, + "learning_rate": 2.049932571418494e-06, + "loss": 0.4784, + "step": 5460 + }, + { + "epoch": 2.1968612680477086, + "grad_norm": 0.4718284348814741, + "learning_rate": 2.031100494104014e-06, + "loss": 0.4784, + "step": 5470 + }, + { + "epoch": 2.2008788449466414, + "grad_norm": 0.5193890352090972, + "learning_rate": 2.0123332456264473e-06, + "loss": 0.4818, + "step": 5480 + }, + { + "epoch": 2.204896421845574, + "grad_norm": 0.48046763032522966, + "learning_rate": 1.9936312357870962e-06, + "loss": 0.4802, + "step": 5490 + }, + { + "epoch": 2.2089139987445074, + "grad_norm": 0.47752926213638963, + "learning_rate": 1.9749948729627188e-06, + "loss": 0.4686, + "step": 5500 + }, + { + "epoch": 2.2129315756434402, + "grad_norm": 0.4800555208436017, + "learning_rate": 1.956424564096602e-06, + "loss": 0.482, + "step": 5510 + }, + { + "epoch": 2.216949152542373, + "grad_norm": 0.443284090953342, + "learning_rate": 1.9379207146896827e-06, + "loss": 0.4733, + "step": 5520 + }, + { + "epoch": 2.220966729441306, + "grad_norm": 0.5002931907484188, + "learning_rate": 1.9194837287916817e-06, + "loss": 0.4776, + "step": 5530 + }, + { + "epoch": 2.2249843063402386, + "grad_norm": 0.45028976156882144, + "learning_rate": 1.9011140089923013e-06, + "loss": 0.4785, + "step": 5540 + }, + { + "epoch": 2.2290018832391714, + "grad_norm": 0.4619774988789297, + "learning_rate": 1.8828119564124159e-06, + "loss": 0.475, + "step": 5550 + }, + { + "epoch": 2.233019460138104, + "grad_norm": 0.47661490970644393, + "learning_rate": 1.8645779706953188e-06, + "loss": 0.4824, + "step": 5560 + }, + { + "epoch": 2.237037037037037, + "grad_norm": 0.49398577636680097, + "learning_rate": 1.8464124499980013e-06, + "loss": 0.4719, + "step": 5570 + }, + { + "epoch": 2.2410546139359697, + "grad_norm": 0.4799156749603329, + "learning_rate": 1.8283157909824517e-06, + "loss": 0.4787, + "step": 5580 + }, + { + "epoch": 2.2450721908349025, + "grad_norm": 0.4903571200855578, + "learning_rate": 1.8102883888069917e-06, + "loss": 0.481, + "step": 5590 + }, + { + "epoch": 2.2490897677338357, + "grad_norm": 0.4595319743138882, + "learning_rate": 1.7923306371176542e-06, + "loss": 0.4722, + "step": 5600 + }, + { + "epoch": 2.2531073446327685, + "grad_norm": 0.4815148560520921, + "learning_rate": 1.7744429280395903e-06, + "loss": 0.4804, + "step": 5610 + }, + { + "epoch": 2.2571249215317013, + "grad_norm": 0.49411596599772084, + "learning_rate": 1.7566256521684966e-06, + "loss": 0.4837, + "step": 5620 + }, + { + "epoch": 2.261142498430634, + "grad_norm": 0.5331117323575773, + "learning_rate": 1.7388791985620922e-06, + "loss": 0.4705, + "step": 5630 + }, + { + "epoch": 2.265160075329567, + "grad_norm": 0.52376875529828, + "learning_rate": 1.721203954731624e-06, + "loss": 0.4723, + "step": 5640 + }, + { + "epoch": 2.2691776522284997, + "grad_norm": 0.48551462166212467, + "learning_rate": 1.7036003066334012e-06, + "loss": 0.4853, + "step": 5650 + }, + { + "epoch": 2.2731952291274324, + "grad_norm": 0.5014457542958235, + "learning_rate": 1.6860686386603719e-06, + "loss": 0.4733, + "step": 5660 + }, + { + "epoch": 2.277212806026365, + "grad_norm": 0.4996867895329777, + "learning_rate": 1.6686093336337256e-06, + "loss": 0.4741, + "step": 5670 + }, + { + "epoch": 2.281230382925298, + "grad_norm": 0.48719109638855057, + "learning_rate": 1.6512227727945391e-06, + "loss": 0.4831, + "step": 5680 + }, + { + "epoch": 2.285247959824231, + "grad_norm": 0.4856082508478335, + "learning_rate": 1.6339093357954455e-06, + "loss": 0.4833, + "step": 5690 + }, + { + "epoch": 2.289265536723164, + "grad_norm": 0.5118289081317942, + "learning_rate": 1.6166694006923479e-06, + "loss": 0.4845, + "step": 5700 + }, + { + "epoch": 2.293283113622097, + "grad_norm": 0.5338262164693043, + "learning_rate": 1.5995033439361623e-06, + "loss": 0.4725, + "step": 5710 + }, + { + "epoch": 2.2973006905210296, + "grad_norm": 0.5147815344933698, + "learning_rate": 1.5824115403646e-06, + "loss": 0.471, + "step": 5720 + }, + { + "epoch": 2.3013182674199624, + "grad_norm": 0.4925335147001341, + "learning_rate": 1.5653943631939806e-06, + "loss": 0.4748, + "step": 5730 + }, + { + "epoch": 2.305335844318895, + "grad_norm": 0.5223852201900643, + "learning_rate": 1.5484521840110812e-06, + "loss": 0.4799, + "step": 5740 + }, + { + "epoch": 2.309353421217828, + "grad_norm": 0.5102348798882654, + "learning_rate": 1.5315853727650283e-06, + "loss": 0.4734, + "step": 5750 + }, + { + "epoch": 2.3133709981167607, + "grad_norm": 0.4806877999022709, + "learning_rate": 1.5147942977592111e-06, + "loss": 0.4793, + "step": 5760 + }, + { + "epoch": 2.3173885750156935, + "grad_norm": 0.47394343796609417, + "learning_rate": 1.4980793256432474e-06, + "loss": 0.4778, + "step": 5770 + }, + { + "epoch": 2.3214061519146263, + "grad_norm": 0.5169128981837072, + "learning_rate": 1.4814408214049674e-06, + "loss": 0.4756, + "step": 5780 + }, + { + "epoch": 2.3254237288135595, + "grad_norm": 0.4653905202296831, + "learning_rate": 1.4648791483624586e-06, + "loss": 0.476, + "step": 5790 + }, + { + "epoch": 2.3294413057124923, + "grad_norm": 0.49306875398820615, + "learning_rate": 1.4483946681561178e-06, + "loss": 0.4686, + "step": 5800 + }, + { + "epoch": 2.333458882611425, + "grad_norm": 0.48126484624398314, + "learning_rate": 1.4319877407407623e-06, + "loss": 0.4757, + "step": 5810 + }, + { + "epoch": 2.337476459510358, + "grad_norm": 0.4857946344608446, + "learning_rate": 1.415658724377767e-06, + "loss": 0.4707, + "step": 5820 + }, + { + "epoch": 2.3414940364092907, + "grad_norm": 0.508812865037136, + "learning_rate": 1.3994079756272467e-06, + "loss": 0.4716, + "step": 5830 + }, + { + "epoch": 2.3455116133082234, + "grad_norm": 0.4924478822718041, + "learning_rate": 1.3832358493402591e-06, + "loss": 0.4788, + "step": 5840 + }, + { + "epoch": 2.3495291902071562, + "grad_norm": 0.5636558365101663, + "learning_rate": 1.3671426986510667e-06, + "loss": 0.4791, + "step": 5850 + }, + { + "epoch": 2.353546767106089, + "grad_norm": 0.4787221048450779, + "learning_rate": 1.3511288749694245e-06, + "loss": 0.4774, + "step": 5860 + }, + { + "epoch": 2.357564344005022, + "grad_norm": 0.509596234617302, + "learning_rate": 1.3351947279729016e-06, + "loss": 0.4738, + "step": 5870 + }, + { + "epoch": 2.361581920903955, + "grad_norm": 0.48376004159048386, + "learning_rate": 1.3193406055992485e-06, + "loss": 0.4828, + "step": 5880 + }, + { + "epoch": 2.365599497802888, + "grad_norm": 0.5038494271293078, + "learning_rate": 1.3035668540388002e-06, + "loss": 0.4864, + "step": 5890 + }, + { + "epoch": 2.3696170747018206, + "grad_norm": 0.5316770270179247, + "learning_rate": 1.2878738177269156e-06, + "loss": 0.4785, + "step": 5900 + }, + { + "epoch": 2.3736346516007534, + "grad_norm": 0.5142512043455278, + "learning_rate": 1.2722618393364572e-06, + "loss": 0.4817, + "step": 5910 + }, + { + "epoch": 2.377652228499686, + "grad_norm": 0.468789477239248, + "learning_rate": 1.2567312597703063e-06, + "loss": 0.4735, + "step": 5920 + }, + { + "epoch": 2.381669805398619, + "grad_norm": 0.4736300406705453, + "learning_rate": 1.2412824181539256e-06, + "loss": 0.467, + "step": 5930 + }, + { + "epoch": 2.3856873822975517, + "grad_norm": 0.49508031018022314, + "learning_rate": 1.2259156518279452e-06, + "loss": 0.4854, + "step": 5940 + }, + { + "epoch": 2.3897049591964845, + "grad_norm": 0.4889357358622553, + "learning_rate": 1.2106312963408024e-06, + "loss": 0.4683, + "step": 5950 + }, + { + "epoch": 2.3937225360954173, + "grad_norm": 0.5186581775849328, + "learning_rate": 1.1954296854414111e-06, + "loss": 0.4743, + "step": 5960 + }, + { + "epoch": 2.3977401129943505, + "grad_norm": 0.534928567712559, + "learning_rate": 1.1803111510718774e-06, + "loss": 0.4713, + "step": 5970 + }, + { + "epoch": 2.401757689893283, + "grad_norm": 0.46937598704982375, + "learning_rate": 1.1652760233602495e-06, + "loss": 0.4773, + "step": 5980 + }, + { + "epoch": 2.405775266792216, + "grad_norm": 0.49187943461524203, + "learning_rate": 1.1503246306133099e-06, + "loss": 0.48, + "step": 5990 + }, + { + "epoch": 2.409792843691149, + "grad_norm": 0.4699512441083907, + "learning_rate": 1.1354572993094031e-06, + "loss": 0.4752, + "step": 6000 + }, + { + "epoch": 2.4138104205900817, + "grad_norm": 0.514478387098126, + "learning_rate": 1.1206743540913144e-06, + "loss": 0.4735, + "step": 6010 + }, + { + "epoch": 2.4178279974890144, + "grad_norm": 0.4912678185053618, + "learning_rate": 1.1059761177591727e-06, + "loss": 0.4738, + "step": 6020 + }, + { + "epoch": 2.4218455743879472, + "grad_norm": 0.45184357921612245, + "learning_rate": 1.0913629112634045e-06, + "loss": 0.4764, + "step": 6030 + }, + { + "epoch": 2.42586315128688, + "grad_norm": 0.45539851634790796, + "learning_rate": 1.076835053697728e-06, + "loss": 0.4758, + "step": 6040 + }, + { + "epoch": 2.429880728185813, + "grad_norm": 0.5152318496750359, + "learning_rate": 1.0623928622921825e-06, + "loss": 0.4732, + "step": 6050 + }, + { + "epoch": 2.4338983050847456, + "grad_norm": 0.5016552767543895, + "learning_rate": 1.0480366524062041e-06, + "loss": 0.483, + "step": 6060 + }, + { + "epoch": 2.4379158819836784, + "grad_norm": 0.47698174699672474, + "learning_rate": 1.0337667375217353e-06, + "loss": 0.4737, + "step": 6070 + }, + { + "epoch": 2.4419334588826116, + "grad_norm": 0.5215110019193456, + "learning_rate": 1.0195834292363881e-06, + "loss": 0.4717, + "step": 6080 + }, + { + "epoch": 2.4459510357815444, + "grad_norm": 0.5003135573061556, + "learning_rate": 1.0054870372566273e-06, + "loss": 0.4711, + "step": 6090 + }, + { + "epoch": 2.449968612680477, + "grad_norm": 0.5154412726359013, + "learning_rate": 9.914778693910165e-07, + "loss": 0.4738, + "step": 6100 + }, + { + "epoch": 2.45398618957941, + "grad_norm": 0.5155157663249713, + "learning_rate": 9.775562315435005e-07, + "loss": 0.481, + "step": 6110 + }, + { + "epoch": 2.4580037664783427, + "grad_norm": 0.4896545010247639, + "learning_rate": 9.637224277067142e-07, + "loss": 0.4869, + "step": 6120 + }, + { + "epoch": 2.4620213433772755, + "grad_norm": 0.4603343227039726, + "learning_rate": 9.499767599553528e-07, + "loss": 0.4817, + "step": 6130 + }, + { + "epoch": 2.4660389202762083, + "grad_norm": 0.4600261678419497, + "learning_rate": 9.363195284395732e-07, + "loss": 0.4679, + "step": 6140 + }, + { + "epoch": 2.470056497175141, + "grad_norm": 0.4989876951619158, + "learning_rate": 9.227510313784405e-07, + "loss": 0.4805, + "step": 6150 + }, + { + "epoch": 2.474074074074074, + "grad_norm": 0.49701651082881143, + "learning_rate": 9.092715650534162e-07, + "loss": 0.474, + "step": 6160 + }, + { + "epoch": 2.478091650973007, + "grad_norm": 0.47689581346361015, + "learning_rate": 8.958814238018864e-07, + "loss": 0.4735, + "step": 6170 + }, + { + "epoch": 2.48210922787194, + "grad_norm": 0.48466102861465504, + "learning_rate": 8.825809000107382e-07, + "loss": 0.4823, + "step": 6180 + }, + { + "epoch": 2.4861268047708727, + "grad_norm": 0.43949823990630094, + "learning_rate": 8.693702841099744e-07, + "loss": 0.468, + "step": 6190 + }, + { + "epoch": 2.4901443816698055, + "grad_norm": 0.4734599690753599, + "learning_rate": 8.56249864566368e-07, + "loss": 0.4716, + "step": 6200 + }, + { + "epoch": 2.4941619585687382, + "grad_norm": 0.4837246488118121, + "learning_rate": 8.432199278771679e-07, + "loss": 0.4727, + "step": 6210 + }, + { + "epoch": 2.498179535467671, + "grad_norm": 0.5212860772248176, + "learning_rate": 8.302807585638401e-07, + "loss": 0.4781, + "step": 6220 + }, + { + "epoch": 2.502197112366604, + "grad_norm": 0.4952468033104653, + "learning_rate": 8.174326391658561e-07, + "loss": 0.4742, + "step": 6230 + }, + { + "epoch": 2.5062146892655366, + "grad_norm": 0.4867979964745898, + "learning_rate": 8.04675850234523e-07, + "loss": 0.4731, + "step": 6240 + }, + { + "epoch": 2.5102322661644694, + "grad_norm": 0.48350910576661243, + "learning_rate": 7.92010670326856e-07, + "loss": 0.4793, + "step": 6250 + }, + { + "epoch": 2.5142498430634026, + "grad_norm": 0.48063080647615003, + "learning_rate": 7.794373759995017e-07, + "loss": 0.4814, + "step": 6260 + }, + { + "epoch": 2.518267419962335, + "grad_norm": 0.5071850927755938, + "learning_rate": 7.669562418026905e-07, + "loss": 0.4726, + "step": 6270 + }, + { + "epoch": 2.522284996861268, + "grad_norm": 0.5007552952463317, + "learning_rate": 7.545675402742464e-07, + "loss": 0.4701, + "step": 6280 + }, + { + "epoch": 2.526302573760201, + "grad_norm": 0.49193921591367956, + "learning_rate": 7.422715419336374e-07, + "loss": 0.4798, + "step": 6290 + }, + { + "epoch": 2.5303201506591337, + "grad_norm": 0.4631014183740996, + "learning_rate": 7.30068515276064e-07, + "loss": 0.4783, + "step": 6300 + }, + { + "epoch": 2.5343377275580665, + "grad_norm": 0.4461109021704027, + "learning_rate": 7.179587267665999e-07, + "loss": 0.4807, + "step": 6310 + }, + { + "epoch": 2.5383553044569993, + "grad_norm": 0.5615597716996266, + "learning_rate": 7.059424408343713e-07, + "loss": 0.476, + "step": 6320 + }, + { + "epoch": 2.542372881355932, + "grad_norm": 0.4681517030493916, + "learning_rate": 6.940199198667863e-07, + "loss": 0.4746, + "step": 6330 + }, + { + "epoch": 2.546390458254865, + "grad_norm": 0.4270578942801569, + "learning_rate": 6.821914242038013e-07, + "loss": 0.47, + "step": 6340 + }, + { + "epoch": 2.550408035153798, + "grad_norm": 0.5120970629852971, + "learning_rate": 6.704572121322356e-07, + "loss": 0.4661, + "step": 6350 + }, + { + "epoch": 2.5544256120527304, + "grad_norm": 0.4577342016167032, + "learning_rate": 6.588175398801356e-07, + "loss": 0.4778, + "step": 6360 + }, + { + "epoch": 2.5584431889516637, + "grad_norm": 0.5112843701769388, + "learning_rate": 6.472726616111797e-07, + "loss": 0.4774, + "step": 6370 + }, + { + "epoch": 2.5624607658505965, + "grad_norm": 0.5002523546947816, + "learning_rate": 6.358228294191248e-07, + "loss": 0.4745, + "step": 6380 + }, + { + "epoch": 2.5664783427495292, + "grad_norm": 0.4844209522259677, + "learning_rate": 6.244682933223023e-07, + "loss": 0.4743, + "step": 6390 + }, + { + "epoch": 2.570495919648462, + "grad_norm": 0.5054412286540757, + "learning_rate": 6.13209301258162e-07, + "loss": 0.4689, + "step": 6400 + }, + { + "epoch": 2.574513496547395, + "grad_norm": 0.4613555569290718, + "learning_rate": 6.020460990778537e-07, + "loss": 0.4711, + "step": 6410 + }, + { + "epoch": 2.5785310734463276, + "grad_norm": 0.47785054223407225, + "learning_rate": 5.909789305408631e-07, + "loss": 0.476, + "step": 6420 + }, + { + "epoch": 2.5825486503452604, + "grad_norm": 0.5097922646263033, + "learning_rate": 5.800080373096839e-07, + "loss": 0.4628, + "step": 6430 + }, + { + "epoch": 2.5865662272441936, + "grad_norm": 0.460211806295128, + "learning_rate": 5.691336589445485e-07, + "loss": 0.4693, + "step": 6440 + }, + { + "epoch": 2.590583804143126, + "grad_norm": 0.47963475329302363, + "learning_rate": 5.583560328981885e-07, + "loss": 0.4741, + "step": 6450 + }, + { + "epoch": 2.594601381042059, + "grad_norm": 0.521520019445044, + "learning_rate": 5.476753945106556e-07, + "loss": 0.4763, + "step": 6460 + }, + { + "epoch": 2.598618957940992, + "grad_norm": 0.4716378386067206, + "learning_rate": 5.370919770041799e-07, + "loss": 0.4742, + "step": 6470 + }, + { + "epoch": 2.6026365348399247, + "grad_norm": 0.5095512360791891, + "learning_rate": 5.266060114780774e-07, + "loss": 0.4769, + "step": 6480 + }, + { + "epoch": 2.6066541117388575, + "grad_norm": 0.49008584458884724, + "learning_rate": 5.162177269037061e-07, + "loss": 0.4695, + "step": 6490 + }, + { + "epoch": 2.6106716886377903, + "grad_norm": 0.4430564461046631, + "learning_rate": 5.059273501194622e-07, + "loss": 0.4738, + "step": 6500 + }, + { + "epoch": 2.614689265536723, + "grad_norm": 0.48041503000167013, + "learning_rate": 4.95735105825833e-07, + "loss": 0.4671, + "step": 6510 + }, + { + "epoch": 2.618706842435656, + "grad_norm": 0.47058785476417425, + "learning_rate": 4.856412165804824e-07, + "loss": 0.4656, + "step": 6520 + }, + { + "epoch": 2.6227244193345887, + "grad_norm": 0.5186436231128471, + "learning_rate": 4.756459027933974e-07, + "loss": 0.4795, + "step": 6530 + }, + { + "epoch": 2.6267419962335214, + "grad_norm": 0.4971208460235255, + "learning_rate": 4.657493827220705e-07, + "loss": 0.4745, + "step": 6540 + }, + { + "epoch": 2.6307595731324547, + "grad_norm": 0.46196697310421064, + "learning_rate": 4.559518724667411e-07, + "loss": 0.4788, + "step": 6550 + }, + { + "epoch": 2.6347771500313875, + "grad_norm": 0.516003177023342, + "learning_rate": 4.462535859656675e-07, + "loss": 0.476, + "step": 6560 + }, + { + "epoch": 2.6387947269303202, + "grad_norm": 0.5015505649590103, + "learning_rate": 4.36654734990461e-07, + "loss": 0.4818, + "step": 6570 + }, + { + "epoch": 2.642812303829253, + "grad_norm": 0.517201945888056, + "learning_rate": 4.271555291414636e-07, + "loss": 0.4564, + "step": 6580 + }, + { + "epoch": 2.646829880728186, + "grad_norm": 0.4978218856523734, + "learning_rate": 4.1775617584316476e-07, + "loss": 0.4713, + "step": 6590 + }, + { + "epoch": 2.6508474576271186, + "grad_norm": 0.4876775749761943, + "learning_rate": 4.0845688033967435e-07, + "loss": 0.4753, + "step": 6600 + }, + { + "epoch": 2.6548650345260514, + "grad_norm": 0.49046863554031045, + "learning_rate": 3.992578456902452e-07, + "loss": 0.4719, + "step": 6610 + }, + { + "epoch": 2.658882611424984, + "grad_norm": 0.4828102602774462, + "learning_rate": 3.901592727648351e-07, + "loss": 0.471, + "step": 6620 + }, + { + "epoch": 2.662900188323917, + "grad_norm": 0.5128903958409182, + "learning_rate": 3.811613602397202e-07, + "loss": 0.4799, + "step": 6630 + }, + { + "epoch": 2.66691776522285, + "grad_norm": 0.485862558152459, + "learning_rate": 3.7226430459315957e-07, + "loss": 0.4682, + "step": 6640 + }, + { + "epoch": 2.670935342121783, + "grad_norm": 0.48320591671258245, + "learning_rate": 3.634683001011019e-07, + "loss": 0.4802, + "step": 6650 + }, + { + "epoch": 2.6749529190207157, + "grad_norm": 0.4382446479240708, + "learning_rate": 3.547735388329443e-07, + "loss": 0.4728, + "step": 6660 + }, + { + "epoch": 2.6789704959196485, + "grad_norm": 0.4939760563538167, + "learning_rate": 3.461802106473411e-07, + "loss": 0.4811, + "step": 6670 + }, + { + "epoch": 2.6829880728185813, + "grad_norm": 0.50894968077572, + "learning_rate": 3.3768850318805224e-07, + "loss": 0.4666, + "step": 6680 + }, + { + "epoch": 2.687005649717514, + "grad_norm": 0.5297225230888177, + "learning_rate": 3.2929860187985216e-07, + "loss": 0.4712, + "step": 6690 + }, + { + "epoch": 2.691023226616447, + "grad_norm": 0.49359696533604985, + "learning_rate": 3.210106899244775e-07, + "loss": 0.4808, + "step": 6700 + }, + { + "epoch": 2.6950408035153797, + "grad_norm": 0.47433607917767673, + "learning_rate": 3.1282494829662556e-07, + "loss": 0.4676, + "step": 6710 + }, + { + "epoch": 2.6990583804143125, + "grad_norm": 0.4777730701958091, + "learning_rate": 3.047415557400057e-07, + "loss": 0.4777, + "step": 6720 + }, + { + "epoch": 2.7030759573132457, + "grad_norm": 0.4944173220055023, + "learning_rate": 2.967606887634344e-07, + "loss": 0.4736, + "step": 6730 + }, + { + "epoch": 2.707093534212178, + "grad_norm": 0.5904235702447377, + "learning_rate": 2.888825216369806e-07, + "loss": 0.4772, + "step": 6740 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 0.4952766389802285, + "learning_rate": 2.811072263881615e-07, + "loss": 0.485, + "step": 6750 + }, + { + "epoch": 2.715128688010044, + "grad_norm": 1.0088921588490039, + "learning_rate": 2.7343497279818833e-07, + "loss": 0.4695, + "step": 6760 + }, + { + "epoch": 2.719146264908977, + "grad_norm": 0.4788614551696112, + "learning_rate": 2.658659283982523e-07, + "loss": 0.4737, + "step": 6770 + }, + { + "epoch": 2.7231638418079096, + "grad_norm": 0.5299426999271306, + "learning_rate": 2.58400258465874e-07, + "loss": 0.4835, + "step": 6780 + }, + { + "epoch": 2.7271814187068424, + "grad_norm": 0.496725834314719, + "learning_rate": 2.510381260212874e-07, + "loss": 0.4714, + "step": 6790 + }, + { + "epoch": 2.731198995605775, + "grad_norm": 0.5015477715189429, + "learning_rate": 2.4377969182388774e-07, + "loss": 0.4692, + "step": 6800 + }, + { + "epoch": 2.735216572504708, + "grad_norm": 0.5250384332220221, + "learning_rate": 2.3662511436871538e-07, + "loss": 0.4749, + "step": 6810 + }, + { + "epoch": 2.739234149403641, + "grad_norm": 0.4422929805667402, + "learning_rate": 2.295745498829949e-07, + "loss": 0.475, + "step": 6820 + }, + { + "epoch": 2.7432517263025735, + "grad_norm": 0.5083194750205056, + "learning_rate": 2.2262815232272916e-07, + "loss": 0.4683, + "step": 6830 + }, + { + "epoch": 2.7472693032015068, + "grad_norm": 0.459034326672265, + "learning_rate": 2.1578607336933177e-07, + "loss": 0.4776, + "step": 6840 + }, + { + "epoch": 2.7512868801004395, + "grad_norm": 0.4944061747494303, + "learning_rate": 2.090484624263167e-07, + "loss": 0.4686, + "step": 6850 + }, + { + "epoch": 2.7553044569993723, + "grad_norm": 0.5465184475852697, + "learning_rate": 2.0241546661603605e-07, + "loss": 0.4694, + "step": 6860 + }, + { + "epoch": 2.759322033898305, + "grad_norm": 0.4897764666579114, + "learning_rate": 1.9588723077646976e-07, + "loss": 0.4711, + "step": 6870 + }, + { + "epoch": 2.763339610797238, + "grad_norm": 0.49639087360500034, + "learning_rate": 1.8946389745805983e-07, + "loss": 0.4747, + "step": 6880 + }, + { + "epoch": 2.7673571876961707, + "grad_norm": 0.5022906703230215, + "learning_rate": 1.8314560692059836e-07, + "loss": 0.4735, + "step": 6890 + }, + { + "epoch": 2.7713747645951035, + "grad_norm": 0.48564181193501227, + "learning_rate": 1.7693249713016558e-07, + "loss": 0.466, + "step": 6900 + }, + { + "epoch": 2.7753923414940367, + "grad_norm": 0.5530530885750234, + "learning_rate": 1.7082470375611614e-07, + "loss": 0.4815, + "step": 6910 + }, + { + "epoch": 2.779409918392969, + "grad_norm": 0.48580647047733116, + "learning_rate": 1.648223601681176e-07, + "loss": 0.4858, + "step": 6920 + }, + { + "epoch": 2.7834274952919023, + "grad_norm": 0.4942106873900172, + "learning_rate": 1.589255974332382e-07, + "loss": 0.4755, + "step": 6930 + }, + { + "epoch": 2.787445072190835, + "grad_norm": 0.5077413435223396, + "learning_rate": 1.5313454431308494e-07, + "loss": 0.4762, + "step": 6940 + }, + { + "epoch": 2.791462649089768, + "grad_norm": 0.49459375051900323, + "learning_rate": 1.4744932726099005e-07, + "loss": 0.4678, + "step": 6950 + }, + { + "epoch": 2.7954802259887006, + "grad_norm": 0.5103917447607162, + "learning_rate": 1.4187007041925328e-07, + "loss": 0.4734, + "step": 6960 + }, + { + "epoch": 2.7994978028876334, + "grad_norm": 0.47776810942745174, + "learning_rate": 1.363968956164269e-07, + "loss": 0.4736, + "step": 6970 + }, + { + "epoch": 2.803515379786566, + "grad_norm": 0.5055359246952604, + "learning_rate": 1.310299223646594e-07, + "loss": 0.4675, + "step": 6980 + }, + { + "epoch": 2.807532956685499, + "grad_norm": 0.5159323211802336, + "learning_rate": 1.2576926785708321e-07, + "loss": 0.4796, + "step": 6990 + }, + { + "epoch": 2.8115505335844317, + "grad_norm": 0.4717892098536317, + "learning_rate": 1.2061504696525617e-07, + "loss": 0.4752, + "step": 7000 + }, + { + "epoch": 2.8155681104833645, + "grad_norm": 0.47314656128582044, + "learning_rate": 1.1556737223665515e-07, + "loss": 0.4715, + "step": 7010 + }, + { + "epoch": 2.8195856873822978, + "grad_norm": 0.47908474784592364, + "learning_rate": 1.1062635389221588e-07, + "loss": 0.4865, + "step": 7020 + }, + { + "epoch": 2.8236032642812305, + "grad_norm": 0.4460236716075369, + "learning_rate": 1.0579209982392757e-07, + "loss": 0.4692, + "step": 7030 + }, + { + "epoch": 2.8276208411801633, + "grad_norm": 0.5320629331138326, + "learning_rate": 1.0106471559247433e-07, + "loss": 0.4692, + "step": 7040 + }, + { + "epoch": 2.831638418079096, + "grad_norm": 0.5095821610850613, + "learning_rate": 9.644430442493636e-08, + "loss": 0.4635, + "step": 7050 + }, + { + "epoch": 2.835655994978029, + "grad_norm": 0.45919702634359383, + "learning_rate": 9.193096721252903e-08, + "loss": 0.4623, + "step": 7060 + }, + { + "epoch": 2.8396735718769617, + "grad_norm": 0.5030328915586799, + "learning_rate": 8.752480250840411e-08, + "loss": 0.4738, + "step": 7070 + }, + { + "epoch": 2.8436911487758945, + "grad_norm": 0.5239356661209714, + "learning_rate": 8.322590652549478e-08, + "loss": 0.4717, + "step": 7080 + }, + { + "epoch": 2.8477087256748272, + "grad_norm": 0.5003917496304626, + "learning_rate": 7.903437313441842e-08, + "loss": 0.4857, + "step": 7090 + }, + { + "epoch": 2.85172630257376, + "grad_norm": 0.4708502707470253, + "learning_rate": 7.495029386142382e-08, + "loss": 0.4724, + "step": 7100 + }, + { + "epoch": 2.8557438794726933, + "grad_norm": 0.5171601825808555, + "learning_rate": 7.097375788639227e-08, + "loss": 0.4655, + "step": 7110 + }, + { + "epoch": 2.8597614563716256, + "grad_norm": 0.49349820404810807, + "learning_rate": 6.710485204089456e-08, + "loss": 0.4701, + "step": 7120 + }, + { + "epoch": 2.863779033270559, + "grad_norm": 0.5150356271171084, + "learning_rate": 6.334366080628873e-08, + "loss": 0.482, + "step": 7130 + }, + { + "epoch": 2.8677966101694916, + "grad_norm": 0.45255498690118884, + "learning_rate": 5.96902663118798e-08, + "loss": 0.4696, + "step": 7140 + }, + { + "epoch": 2.8718141870684244, + "grad_norm": 0.4833344072840721, + "learning_rate": 5.614474833312622e-08, + "loss": 0.4686, + "step": 7150 + }, + { + "epoch": 2.875831763967357, + "grad_norm": 0.44938418775925026, + "learning_rate": 5.270718428989463e-08, + "loss": 0.4671, + "step": 7160 + }, + { + "epoch": 2.87984934086629, + "grad_norm": 0.5005428826464499, + "learning_rate": 4.937764924477284e-08, + "loss": 0.4757, + "step": 7170 + }, + { + "epoch": 2.8838669177652227, + "grad_norm": 0.511470087270601, + "learning_rate": 4.615621590142838e-08, + "loss": 0.488, + "step": 7180 + }, + { + "epoch": 2.8878844946641555, + "grad_norm": 0.5016274179565011, + "learning_rate": 4.3042954603023655e-08, + "loss": 0.4717, + "step": 7190 + }, + { + "epoch": 2.8919020715630888, + "grad_norm": 0.46282357124816725, + "learning_rate": 4.003793333067607e-08, + "loss": 0.47, + "step": 7200 + }, + { + "epoch": 2.895919648462021, + "grad_norm": 0.5009456286050996, + "learning_rate": 3.714121770197754e-08, + "loss": 0.467, + "step": 7210 + }, + { + "epoch": 2.8999372253609543, + "grad_norm": 0.49427390713109254, + "learning_rate": 3.435287096955897e-08, + "loss": 0.4703, + "step": 7220 + }, + { + "epoch": 2.903954802259887, + "grad_norm": 0.4756544687033634, + "learning_rate": 3.167295401970971e-08, + "loss": 0.475, + "step": 7230 + }, + { + "epoch": 2.90797237915882, + "grad_norm": 0.4729939555426576, + "learning_rate": 2.9101525371049154e-08, + "loss": 0.4851, + "step": 7240 + }, + { + "epoch": 2.9119899560577527, + "grad_norm": 0.4655382923368333, + "learning_rate": 2.663864117324777e-08, + "loss": 0.4755, + "step": 7250 + }, + { + "epoch": 2.9160075329566855, + "grad_norm": 0.47231627841883606, + "learning_rate": 2.42843552058003e-08, + "loss": 0.4677, + "step": 7260 + }, + { + "epoch": 2.9200251098556183, + "grad_norm": 0.48596292370132727, + "learning_rate": 2.203871887685449e-08, + "loss": 0.4744, + "step": 7270 + }, + { + "epoch": 2.924042686754551, + "grad_norm": 0.4968533928087242, + "learning_rate": 1.9901781222084192e-08, + "loss": 0.4755, + "step": 7280 + }, + { + "epoch": 2.9280602636534843, + "grad_norm": 0.5186135008077664, + "learning_rate": 1.7873588903623006e-08, + "loss": 0.479, + "step": 7290 + }, + { + "epoch": 2.9320778405524166, + "grad_norm": 0.5196627425255889, + "learning_rate": 1.5954186209042323e-08, + "loss": 0.4684, + "step": 7300 + }, + { + "epoch": 2.93609541745135, + "grad_norm": 0.4875506685262467, + "learning_rate": 1.4143615050384862e-08, + "loss": 0.4619, + "step": 7310 + }, + { + "epoch": 2.9401129943502826, + "grad_norm": 0.4803641492705183, + "learning_rate": 1.2441914963250423e-08, + "loss": 0.4753, + "step": 7320 + }, + { + "epoch": 2.9441305712492154, + "grad_norm": 0.4438429839046257, + "learning_rate": 1.0849123105931558e-08, + "loss": 0.4772, + "step": 7330 + }, + { + "epoch": 2.948148148148148, + "grad_norm": 0.47157812956867295, + "learning_rate": 9.365274258604229e-09, + "loss": 0.4743, + "step": 7340 + }, + { + "epoch": 2.952165725047081, + "grad_norm": 0.5144283649963357, + "learning_rate": 7.990400822564525e-09, + "loss": 0.4898, + "step": 7350 + }, + { + "epoch": 2.9561833019460138, + "grad_norm": 0.4994406655154466, + "learning_rate": 6.7245328195247875e-09, + "loss": 0.4807, + "step": 7360 + }, + { + "epoch": 2.9602008788449465, + "grad_norm": 0.4789005591934425, + "learning_rate": 5.567697890955792e-09, + "loss": 0.4809, + "step": 7370 + }, + { + "epoch": 2.9642184557438793, + "grad_norm": 0.4556018148546147, + "learning_rate": 4.519921297484464e-09, + "loss": 0.4687, + "step": 7380 + }, + { + "epoch": 2.968236032642812, + "grad_norm": 0.5209374897753106, + "learning_rate": 3.5812259183426457e-09, + "loss": 0.47, + "step": 7390 + }, + { + "epoch": 2.9722536095417453, + "grad_norm": 0.4460089771421974, + "learning_rate": 2.751632250865832e-09, + "loss": 0.4778, + "step": 7400 + }, + { + "epoch": 2.976271186440678, + "grad_norm": 0.4783096492145474, + "learning_rate": 2.0311584100457526e-09, + "loss": 0.4753, + "step": 7410 + }, + { + "epoch": 2.980288763339611, + "grad_norm": 0.4882741296424901, + "learning_rate": 1.4198201281373503e-09, + "loss": 0.484, + "step": 7420 + }, + { + "epoch": 2.9843063402385437, + "grad_norm": 0.4605012097308224, + "learning_rate": 9.17630754312393e-10, + "loss": 0.4795, + "step": 7430 + }, + { + "epoch": 2.9883239171374765, + "grad_norm": 0.7503308211144039, + "learning_rate": 5.246012543680401e-10, + "loss": 0.4806, + "step": 7440 + }, + { + "epoch": 2.9923414940364093, + "grad_norm": 0.4845828672261576, + "learning_rate": 2.4074021049091954e-10, + "loss": 0.4753, + "step": 7450 + }, + { + "epoch": 2.996359070935342, + "grad_norm": 0.525299815132595, + "learning_rate": 6.605382106505964e-11, + "loss": 0.4796, + "step": 7460 + }, + { + "epoch": 3.0, + "grad_norm": 0.5136490807064724, + "learning_rate": 5.459005397723261e-13, + "loss": 0.4664, + "step": 7470 + }, + { + "epoch": 3.0, + "step": 7470, + "total_flos": 4.845415158505275e+18, + "train_loss": 0.5501844393999541, + "train_runtime": 257337.3533, + "train_samples_per_second": 3.714, + "train_steps_per_second": 0.029 + } + ], + "logging_steps": 10, + "max_steps": 7470, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 24890, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.845415158505275e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}