{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.8648734680884926, "eval_steps": 500, "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.0, "learning_rate": 0, "loss": 7.7169, "step": 10 }, { "epoch": 0.01, "grad_norm": 10.55540657043457, "learning_rate": 9.997877083112197e-05, "loss": 9.0438, "step": 20 }, { "epoch": 0.01, "grad_norm": 6.060225009918213, "learning_rate": 9.987262498673178e-05, "loss": 3.211, "step": 30 }, { "epoch": 0.01, "grad_norm": 2.5255496501922607, "learning_rate": 9.976647914234159e-05, "loss": 0.6387, "step": 40 }, { "epoch": 0.02, "grad_norm": 2.976543664932251, "learning_rate": 9.966033329795139e-05, "loss": 0.5633, "step": 50 }, { "epoch": 0.02, "grad_norm": 2.2680673599243164, "learning_rate": 9.95541874535612e-05, "loss": 0.474, "step": 60 }, { "epoch": 0.02, "grad_norm": 3.136930465698242, "learning_rate": 9.944804160917101e-05, "loss": 0.3379, "step": 70 }, { "epoch": 0.03, "grad_norm": 4.159604072570801, "learning_rate": 9.935251034921983e-05, "loss": 0.4444, "step": 80 }, { "epoch": 0.03, "grad_norm": 1.704042911529541, "learning_rate": 9.924636450482963e-05, "loss": 0.4925, "step": 90 }, { "epoch": 0.03, "grad_norm": 3.9414522647857666, "learning_rate": 9.914021866043945e-05, "loss": 0.4583, "step": 100 }, { "epoch": 0.04, "grad_norm": 2.938662052154541, "learning_rate": 9.903407281604927e-05, "loss": 0.3838, "step": 110 }, { "epoch": 0.04, "grad_norm": 1.8753790855407715, "learning_rate": 9.892792697165907e-05, "loss": 0.3247, "step": 120 }, { "epoch": 0.04, "grad_norm": 1.75948965549469, "learning_rate": 9.882178112726887e-05, "loss": 0.3609, "step": 130 }, { "epoch": 0.04, "grad_norm": 1.9066141843795776, "learning_rate": 9.871563528287868e-05, "loss": 0.3453, "step": 140 }, { "epoch": 0.05, "grad_norm": 1.7767695188522339, "learning_rate": 9.86094894384885e-05, "loss": 0.5076, "step": 150 }, { "epoch": 0.05, "grad_norm": 2.5219664573669434, "learning_rate": 9.85033435940983e-05, "loss": 0.4999, "step": 160 }, { "epoch": 0.05, "grad_norm": 2.0505383014678955, "learning_rate": 9.83971977497081e-05, "loss": 0.5429, "step": 170 }, { "epoch": 0.06, "grad_norm": 6.132015705108643, "learning_rate": 9.82910519053179e-05, "loss": 0.5099, "step": 180 }, { "epoch": 0.06, "grad_norm": 1.057868480682373, "learning_rate": 9.818490606092772e-05, "loss": 0.4416, "step": 190 }, { "epoch": 0.06, "grad_norm": 2.6155290603637695, "learning_rate": 9.807876021653753e-05, "loss": 0.3986, "step": 200 }, { "epoch": 0.07, "grad_norm": 2.1468820571899414, "learning_rate": 9.797261437214733e-05, "loss": 0.3216, "step": 210 }, { "epoch": 0.07, "grad_norm": 0.6600925326347351, "learning_rate": 9.786646852775713e-05, "loss": 0.3552, "step": 220 }, { "epoch": 0.07, "grad_norm": 5.129382133483887, "learning_rate": 9.776032268336695e-05, "loss": 0.3221, "step": 230 }, { "epoch": 0.08, "grad_norm": 0.3891478180885315, "learning_rate": 9.765417683897677e-05, "loss": 0.4073, "step": 240 }, { "epoch": 0.08, "grad_norm": 3.254958391189575, "learning_rate": 9.754803099458657e-05, "loss": 0.4212, "step": 250 }, { "epoch": 0.08, "grad_norm": 3.34332013130188, "learning_rate": 9.744188515019638e-05, "loss": 0.2167, "step": 260 }, { "epoch": 0.09, "grad_norm": 3.801086902618408, "learning_rate": 9.733573930580618e-05, "loss": 0.5605, "step": 270 }, { "epoch": 0.09, "grad_norm": 5.026745796203613, "learning_rate": 9.7229593461416e-05, "loss": 0.3527, "step": 280 }, { "epoch": 0.09, "grad_norm": 3.8389620780944824, "learning_rate": 9.71234476170258e-05, "loss": 0.295, "step": 290 }, { "epoch": 0.1, "grad_norm": 2.0584566593170166, "learning_rate": 9.70173017726356e-05, "loss": 0.2759, "step": 300 }, { "epoch": 0.1, "grad_norm": 3.132164239883423, "learning_rate": 9.691115592824541e-05, "loss": 0.3888, "step": 310 }, { "epoch": 0.1, "grad_norm": 0.5387492179870605, "learning_rate": 9.680501008385522e-05, "loss": 0.2285, "step": 320 }, { "epoch": 0.11, "grad_norm": 3.0382373332977295, "learning_rate": 9.669886423946503e-05, "loss": 0.2549, "step": 330 }, { "epoch": 0.11, "grad_norm": 6.465576648712158, "learning_rate": 9.659271839507483e-05, "loss": 0.7377, "step": 340 }, { "epoch": 0.11, "grad_norm": 4.1156134605407715, "learning_rate": 9.648657255068465e-05, "loss": 0.3387, "step": 350 }, { "epoch": 0.11, "grad_norm": 4.147655963897705, "learning_rate": 9.638042670629445e-05, "loss": 0.2605, "step": 360 }, { "epoch": 0.12, "grad_norm": 1.4572869539260864, "learning_rate": 9.627428086190427e-05, "loss": 0.3024, "step": 370 }, { "epoch": 0.12, "grad_norm": 1.906175971031189, "learning_rate": 9.616813501751407e-05, "loss": 0.3728, "step": 380 }, { "epoch": 0.12, "grad_norm": 1.169878363609314, "learning_rate": 9.606198917312388e-05, "loss": 0.3961, "step": 390 }, { "epoch": 0.13, "grad_norm": 1.2084730863571167, "learning_rate": 9.595584332873368e-05, "loss": 0.4887, "step": 400 }, { "epoch": 0.13, "grad_norm": 0.7927988171577454, "learning_rate": 9.58496974843435e-05, "loss": 0.4519, "step": 410 }, { "epoch": 0.13, "grad_norm": 6.37067985534668, "learning_rate": 9.57435516399533e-05, "loss": 0.237, "step": 420 }, { "epoch": 0.14, "grad_norm": 2.9806203842163086, "learning_rate": 9.56374057955631e-05, "loss": 0.2917, "step": 430 }, { "epoch": 0.14, "grad_norm": 5.05634880065918, "learning_rate": 9.553125995117291e-05, "loss": 0.2794, "step": 440 }, { "epoch": 0.14, "grad_norm": 3.0483241081237793, "learning_rate": 9.542511410678273e-05, "loss": 0.3182, "step": 450 }, { "epoch": 0.15, "grad_norm": 3.2123796939849854, "learning_rate": 9.531896826239253e-05, "loss": 0.2872, "step": 460 }, { "epoch": 0.15, "grad_norm": 1.532020092010498, "learning_rate": 9.521282241800233e-05, "loss": 0.3258, "step": 470 }, { "epoch": 0.15, "grad_norm": 1.1242539882659912, "learning_rate": 9.510667657361215e-05, "loss": 0.3356, "step": 480 }, { "epoch": 0.16, "grad_norm": 4.846567153930664, "learning_rate": 9.500053072922196e-05, "loss": 0.3551, "step": 490 }, { "epoch": 0.16, "grad_norm": 3.233238458633423, "learning_rate": 9.489438488483177e-05, "loss": 0.3971, "step": 500 }, { "epoch": 0.16, "grad_norm": 1.7334824800491333, "learning_rate": 9.478823904044158e-05, "loss": 0.1896, "step": 510 }, { "epoch": 0.17, "grad_norm": 7.36009407043457, "learning_rate": 9.468209319605138e-05, "loss": 0.3338, "step": 520 }, { "epoch": 0.17, "grad_norm": 2.7838549613952637, "learning_rate": 9.457594735166118e-05, "loss": 0.3331, "step": 530 }, { "epoch": 0.17, "grad_norm": 2.643627405166626, "learning_rate": 9.4469801507271e-05, "loss": 0.4575, "step": 540 }, { "epoch": 0.18, "grad_norm": 5.420917510986328, "learning_rate": 9.43636556628808e-05, "loss": 0.37, "step": 550 }, { "epoch": 0.18, "grad_norm": 2.1689090728759766, "learning_rate": 9.425750981849061e-05, "loss": 0.3551, "step": 560 }, { "epoch": 0.18, "grad_norm": 0.7210526466369629, "learning_rate": 9.415136397410041e-05, "loss": 0.4028, "step": 570 }, { "epoch": 0.18, "grad_norm": 0.3214457929134369, "learning_rate": 9.404521812971022e-05, "loss": 0.2391, "step": 580 }, { "epoch": 0.19, "grad_norm": 3.8258142471313477, "learning_rate": 9.393907228532003e-05, "loss": 0.3399, "step": 590 }, { "epoch": 0.19, "grad_norm": 1.5249234437942505, "learning_rate": 9.383292644092985e-05, "loss": 0.4449, "step": 600 }, { "epoch": 0.19, "grad_norm": 0.22292350232601166, "learning_rate": 9.372678059653965e-05, "loss": 0.2156, "step": 610 }, { "epoch": 0.2, "grad_norm": 1.3040258884429932, "learning_rate": 9.362063475214946e-05, "loss": 0.4175, "step": 620 }, { "epoch": 0.2, "grad_norm": 1.3762481212615967, "learning_rate": 9.351448890775926e-05, "loss": 0.3191, "step": 630 }, { "epoch": 0.2, "grad_norm": 2.706467866897583, "learning_rate": 9.340834306336908e-05, "loss": 0.5163, "step": 640 }, { "epoch": 0.21, "grad_norm": 1.8577134609222412, "learning_rate": 9.330219721897888e-05, "loss": 0.1832, "step": 650 }, { "epoch": 0.21, "grad_norm": 5.450695037841797, "learning_rate": 9.319605137458869e-05, "loss": 0.269, "step": 660 }, { "epoch": 0.21, "grad_norm": 3.1967124938964844, "learning_rate": 9.308990553019849e-05, "loss": 0.3387, "step": 670 }, { "epoch": 0.22, "grad_norm": 2.2148098945617676, "learning_rate": 9.29837596858083e-05, "loss": 0.3407, "step": 680 }, { "epoch": 0.22, "grad_norm": 2.2693583965301514, "learning_rate": 9.287761384141811e-05, "loss": 0.2758, "step": 690 }, { "epoch": 0.22, "grad_norm": 4.460744857788086, "learning_rate": 9.277146799702791e-05, "loss": 0.2493, "step": 700 }, { "epoch": 0.23, "grad_norm": 8.331945419311523, "learning_rate": 9.266532215263772e-05, "loss": 0.2264, "step": 710 }, { "epoch": 0.23, "grad_norm": 2.7469747066497803, "learning_rate": 9.255917630824753e-05, "loss": 0.3038, "step": 720 }, { "epoch": 0.23, "grad_norm": 3.013535737991333, "learning_rate": 9.245303046385735e-05, "loss": 0.3136, "step": 730 }, { "epoch": 0.24, "grad_norm": 3.508979558944702, "learning_rate": 9.234688461946716e-05, "loss": 0.3502, "step": 740 }, { "epoch": 0.24, "grad_norm": 5.0464301109313965, "learning_rate": 9.224073877507696e-05, "loss": 0.1776, "step": 750 }, { "epoch": 0.24, "grad_norm": 1.6929841041564941, "learning_rate": 9.213459293068676e-05, "loss": 0.2984, "step": 760 }, { "epoch": 0.25, "grad_norm": 1.1452223062515259, "learning_rate": 9.202844708629658e-05, "loss": 0.2503, "step": 770 }, { "epoch": 0.25, "grad_norm": 1.3975647687911987, "learning_rate": 9.192230124190638e-05, "loss": 0.2423, "step": 780 }, { "epoch": 0.25, "grad_norm": 1.8630661964416504, "learning_rate": 9.181615539751619e-05, "loss": 0.327, "step": 790 }, { "epoch": 0.25, "grad_norm": 5.333163261413574, "learning_rate": 9.171000955312599e-05, "loss": 0.4495, "step": 800 }, { "epoch": 0.26, "grad_norm": 1.6478999853134155, "learning_rate": 9.160386370873581e-05, "loss": 0.2546, "step": 810 }, { "epoch": 0.26, "grad_norm": 1.2132633924484253, "learning_rate": 9.149771786434561e-05, "loss": 0.2439, "step": 820 }, { "epoch": 0.26, "grad_norm": 2.2123448848724365, "learning_rate": 9.139157201995542e-05, "loss": 0.3715, "step": 830 }, { "epoch": 0.27, "grad_norm": 2.148674726486206, "learning_rate": 9.128542617556523e-05, "loss": 0.252, "step": 840 }, { "epoch": 0.27, "grad_norm": 3.6980788707733154, "learning_rate": 9.117928033117504e-05, "loss": 0.4487, "step": 850 }, { "epoch": 0.27, "grad_norm": 6.548594951629639, "learning_rate": 9.107313448678485e-05, "loss": 0.2199, "step": 860 }, { "epoch": 0.28, "grad_norm": 3.5746383666992188, "learning_rate": 9.096698864239466e-05, "loss": 0.2728, "step": 870 }, { "epoch": 0.28, "grad_norm": 0.9120383858680725, "learning_rate": 9.086084279800446e-05, "loss": 0.2737, "step": 880 }, { "epoch": 0.28, "grad_norm": 4.220329761505127, "learning_rate": 9.075469695361427e-05, "loss": 0.4124, "step": 890 }, { "epoch": 0.29, "grad_norm": 2.5000956058502197, "learning_rate": 9.064855110922408e-05, "loss": 0.302, "step": 900 }, { "epoch": 0.29, "grad_norm": 5.3845906257629395, "learning_rate": 9.054240526483389e-05, "loss": 0.4177, "step": 910 }, { "epoch": 0.29, "grad_norm": 1.0533277988433838, "learning_rate": 9.043625942044369e-05, "loss": 0.3834, "step": 920 }, { "epoch": 0.3, "grad_norm": 2.482363224029541, "learning_rate": 9.03301135760535e-05, "loss": 0.3497, "step": 930 }, { "epoch": 0.3, "grad_norm": 2.785825729370117, "learning_rate": 9.022396773166331e-05, "loss": 0.2696, "step": 940 }, { "epoch": 0.3, "grad_norm": 0.9899762868881226, "learning_rate": 9.011782188727311e-05, "loss": 0.3139, "step": 950 }, { "epoch": 0.31, "grad_norm": 3.0521786212921143, "learning_rate": 9.001167604288293e-05, "loss": 0.4116, "step": 960 }, { "epoch": 0.31, "grad_norm": 1.1553211212158203, "learning_rate": 8.990553019849274e-05, "loss": 0.3239, "step": 970 }, { "epoch": 0.31, "grad_norm": 2.973958730697632, "learning_rate": 8.979938435410254e-05, "loss": 0.297, "step": 980 }, { "epoch": 0.32, "grad_norm": 1.3011306524276733, "learning_rate": 8.969323850971236e-05, "loss": 0.3136, "step": 990 }, { "epoch": 0.32, "grad_norm": 2.6845755577087402, "learning_rate": 8.958709266532216e-05, "loss": 0.3207, "step": 1000 }, { "epoch": 0.32, "grad_norm": 0.33025118708610535, "learning_rate": 8.948094682093196e-05, "loss": 0.1847, "step": 1010 }, { "epoch": 0.32, "grad_norm": 1.9631307125091553, "learning_rate": 8.937480097654177e-05, "loss": 0.2798, "step": 1020 }, { "epoch": 0.33, "grad_norm": 1.952580451965332, "learning_rate": 8.926865513215158e-05, "loss": 0.2184, "step": 1030 }, { "epoch": 0.33, "grad_norm": 5.541811466217041, "learning_rate": 8.916250928776139e-05, "loss": 0.2649, "step": 1040 }, { "epoch": 0.33, "grad_norm": 1.0800001621246338, "learning_rate": 8.905636344337119e-05, "loss": 0.3064, "step": 1050 }, { "epoch": 0.34, "grad_norm": 4.908554553985596, "learning_rate": 8.8950217598981e-05, "loss": 0.2, "step": 1060 }, { "epoch": 0.34, "grad_norm": 0.08677980303764343, "learning_rate": 8.884407175459081e-05, "loss": 0.1262, "step": 1070 }, { "epoch": 0.34, "grad_norm": 1.9461978673934937, "learning_rate": 8.873792591020062e-05, "loss": 0.3098, "step": 1080 }, { "epoch": 0.35, "grad_norm": 0.11714805662631989, "learning_rate": 8.863178006581043e-05, "loss": 0.3596, "step": 1090 }, { "epoch": 0.35, "grad_norm": 2.0041699409484863, "learning_rate": 8.852563422142024e-05, "loss": 0.2518, "step": 1100 }, { "epoch": 0.35, "grad_norm": 5.036510467529297, "learning_rate": 8.841948837703004e-05, "loss": 0.3654, "step": 1110 }, { "epoch": 0.36, "grad_norm": 2.267143726348877, "learning_rate": 8.831334253263986e-05, "loss": 0.2812, "step": 1120 }, { "epoch": 0.36, "grad_norm": 3.063321113586426, "learning_rate": 8.820719668824966e-05, "loss": 0.3135, "step": 1130 }, { "epoch": 0.36, "grad_norm": 4.012215614318848, "learning_rate": 8.810105084385947e-05, "loss": 0.2423, "step": 1140 }, { "epoch": 0.37, "grad_norm": 1.7306702136993408, "learning_rate": 8.799490499946927e-05, "loss": 0.187, "step": 1150 }, { "epoch": 0.37, "grad_norm": 1.7319563627243042, "learning_rate": 8.788875915507909e-05, "loss": 0.3792, "step": 1160 }, { "epoch": 0.37, "grad_norm": 4.382763862609863, "learning_rate": 8.778261331068889e-05, "loss": 0.483, "step": 1170 }, { "epoch": 0.38, "grad_norm": 1.3643946647644043, "learning_rate": 8.76764674662987e-05, "loss": 0.1497, "step": 1180 }, { "epoch": 0.38, "grad_norm": 5.549211025238037, "learning_rate": 8.75703216219085e-05, "loss": 0.2628, "step": 1190 }, { "epoch": 0.38, "grad_norm": 2.2046520709991455, "learning_rate": 8.747479036195734e-05, "loss": 0.3474, "step": 1200 }, { "epoch": 0.39, "grad_norm": 3.313180446624756, "learning_rate": 8.736864451756715e-05, "loss": 0.3096, "step": 1210 }, { "epoch": 0.39, "grad_norm": 2.811859130859375, "learning_rate": 8.726249867317695e-05, "loss": 0.1371, "step": 1220 }, { "epoch": 0.39, "grad_norm": 0.43377700448036194, "learning_rate": 8.715635282878675e-05, "loss": 0.2461, "step": 1230 }, { "epoch": 0.39, "grad_norm": 2.7710583209991455, "learning_rate": 8.705020698439657e-05, "loss": 0.3332, "step": 1240 }, { "epoch": 0.4, "grad_norm": 0.4188406467437744, "learning_rate": 8.694406114000637e-05, "loss": 0.3196, "step": 1250 }, { "epoch": 0.4, "grad_norm": 0.7705641388893127, "learning_rate": 8.683791529561618e-05, "loss": 0.1709, "step": 1260 }, { "epoch": 0.4, "grad_norm": 2.6247994899749756, "learning_rate": 8.673176945122598e-05, "loss": 0.3033, "step": 1270 }, { "epoch": 0.41, "grad_norm": 1.033170461654663, "learning_rate": 8.66256236068358e-05, "loss": 0.2506, "step": 1280 }, { "epoch": 0.41, "grad_norm": 4.289760112762451, "learning_rate": 8.65194777624456e-05, "loss": 0.2839, "step": 1290 }, { "epoch": 0.41, "grad_norm": 1.3554538488388062, "learning_rate": 8.64133319180554e-05, "loss": 0.2703, "step": 1300 }, { "epoch": 0.42, "grad_norm": 1.9523005485534668, "learning_rate": 8.630718607366522e-05, "loss": 0.1133, "step": 1310 }, { "epoch": 0.42, "grad_norm": 5.332389831542969, "learning_rate": 8.620104022927503e-05, "loss": 0.3579, "step": 1320 }, { "epoch": 0.42, "grad_norm": 5.874100208282471, "learning_rate": 8.609489438488484e-05, "loss": 0.4038, "step": 1330 }, { "epoch": 0.43, "grad_norm": 1.4143377542495728, "learning_rate": 8.598874854049465e-05, "loss": 0.2451, "step": 1340 }, { "epoch": 0.43, "grad_norm": 0.5176362991333008, "learning_rate": 8.588260269610445e-05, "loss": 0.2561, "step": 1350 }, { "epoch": 0.43, "grad_norm": 1.5968561172485352, "learning_rate": 8.577645685171426e-05, "loss": 0.3456, "step": 1360 }, { "epoch": 0.44, "grad_norm": 1.039812445640564, "learning_rate": 8.567031100732407e-05, "loss": 0.2792, "step": 1370 }, { "epoch": 0.44, "grad_norm": 5.390068531036377, "learning_rate": 8.556416516293388e-05, "loss": 0.398, "step": 1380 }, { "epoch": 0.44, "grad_norm": 1.3645654916763306, "learning_rate": 8.545801931854368e-05, "loss": 0.4537, "step": 1390 }, { "epoch": 0.45, "grad_norm": 2.444027900695801, "learning_rate": 8.535187347415348e-05, "loss": 0.218, "step": 1400 }, { "epoch": 0.45, "grad_norm": 4.201082229614258, "learning_rate": 8.52457276297633e-05, "loss": 0.3146, "step": 1410 }, { "epoch": 0.45, "grad_norm": 4.080310344696045, "learning_rate": 8.51395817853731e-05, "loss": 0.2769, "step": 1420 }, { "epoch": 0.46, "grad_norm": 2.712216377258301, "learning_rate": 8.503343594098292e-05, "loss": 0.2795, "step": 1430 }, { "epoch": 0.46, "grad_norm": 3.2429492473602295, "learning_rate": 8.492729009659273e-05, "loss": 0.2956, "step": 1440 }, { "epoch": 0.46, "grad_norm": 6.107478618621826, "learning_rate": 8.482114425220253e-05, "loss": 0.3381, "step": 1450 }, { "epoch": 0.46, "grad_norm": 0.9037106037139893, "learning_rate": 8.471499840781235e-05, "loss": 0.4196, "step": 1460 }, { "epoch": 0.47, "grad_norm": 1.2487717866897583, "learning_rate": 8.460885256342215e-05, "loss": 0.2471, "step": 1470 }, { "epoch": 0.47, "grad_norm": 2.8922715187072754, "learning_rate": 8.450270671903195e-05, "loss": 0.2664, "step": 1480 }, { "epoch": 0.47, "grad_norm": 0.6493813991546631, "learning_rate": 8.439656087464176e-05, "loss": 0.206, "step": 1490 }, { "epoch": 0.48, "grad_norm": 0.11327870935201645, "learning_rate": 8.429041503025157e-05, "loss": 0.2593, "step": 1500 }, { "epoch": 0.48, "grad_norm": 4.4462690353393555, "learning_rate": 8.418426918586138e-05, "loss": 0.4474, "step": 1510 }, { "epoch": 0.48, "grad_norm": 2.0405867099761963, "learning_rate": 8.407812334147118e-05, "loss": 0.1657, "step": 1520 }, { "epoch": 0.49, "grad_norm": 0.3047516942024231, "learning_rate": 8.397197749708099e-05, "loss": 0.1691, "step": 1530 }, { "epoch": 0.49, "grad_norm": 6.330657958984375, "learning_rate": 8.386583165269079e-05, "loss": 0.2041, "step": 1540 }, { "epoch": 0.49, "grad_norm": 2.403702974319458, "learning_rate": 8.375968580830062e-05, "loss": 0.3408, "step": 1550 }, { "epoch": 0.5, "grad_norm": 3.2958528995513916, "learning_rate": 8.365353996391042e-05, "loss": 0.3271, "step": 1560 }, { "epoch": 0.5, "grad_norm": 3.2511487007141113, "learning_rate": 8.354739411952023e-05, "loss": 0.1719, "step": 1570 }, { "epoch": 0.5, "grad_norm": 2.447939872741699, "learning_rate": 8.344124827513003e-05, "loss": 0.2823, "step": 1580 }, { "epoch": 0.51, "grad_norm": 1.9992095232009888, "learning_rate": 8.333510243073985e-05, "loss": 0.2479, "step": 1590 }, { "epoch": 0.51, "grad_norm": 3.8574376106262207, "learning_rate": 8.322895658634965e-05, "loss": 0.2539, "step": 1600 }, { "epoch": 0.51, "grad_norm": 3.184896230697632, "learning_rate": 8.312281074195946e-05, "loss": 0.2826, "step": 1610 }, { "epoch": 0.52, "grad_norm": 0.6027563810348511, "learning_rate": 8.301666489756926e-05, "loss": 0.1404, "step": 1620 }, { "epoch": 0.52, "grad_norm": 1.0776386260986328, "learning_rate": 8.291051905317906e-05, "loss": 0.3887, "step": 1630 }, { "epoch": 0.52, "grad_norm": 2.386305093765259, "learning_rate": 8.280437320878888e-05, "loss": 0.4232, "step": 1640 }, { "epoch": 0.53, "grad_norm": 1.299332618713379, "learning_rate": 8.269822736439868e-05, "loss": 0.2855, "step": 1650 }, { "epoch": 0.53, "grad_norm": 1.3506910800933838, "learning_rate": 8.259208152000849e-05, "loss": 0.2412, "step": 1660 }, { "epoch": 0.53, "grad_norm": 2.2037456035614014, "learning_rate": 8.24859356756183e-05, "loss": 0.2399, "step": 1670 }, { "epoch": 0.53, "grad_norm": 2.2852354049682617, "learning_rate": 8.237978983122812e-05, "loss": 0.202, "step": 1680 }, { "epoch": 0.54, "grad_norm": 0.2693609297275543, "learning_rate": 8.227364398683793e-05, "loss": 0.3235, "step": 1690 }, { "epoch": 0.54, "grad_norm": 3.526648998260498, "learning_rate": 8.216749814244773e-05, "loss": 0.3102, "step": 1700 }, { "epoch": 0.54, "grad_norm": 1.9742597341537476, "learning_rate": 8.206135229805753e-05, "loss": 0.3293, "step": 1710 }, { "epoch": 0.55, "grad_norm": 2.933436155319214, "learning_rate": 8.195520645366734e-05, "loss": 0.207, "step": 1720 }, { "epoch": 0.55, "grad_norm": 0.5870353579521179, "learning_rate": 8.184906060927715e-05, "loss": 0.3731, "step": 1730 }, { "epoch": 0.55, "grad_norm": 1.7825034856796265, "learning_rate": 8.174291476488696e-05, "loss": 0.1747, "step": 1740 }, { "epoch": 0.56, "grad_norm": 4.706550598144531, "learning_rate": 8.163676892049676e-05, "loss": 0.2143, "step": 1750 }, { "epoch": 0.56, "grad_norm": 3.326359748840332, "learning_rate": 8.153062307610657e-05, "loss": 0.363, "step": 1760 }, { "epoch": 0.56, "grad_norm": 1.3437646627426147, "learning_rate": 8.142447723171638e-05, "loss": 0.2806, "step": 1770 }, { "epoch": 0.57, "grad_norm": 4.6950249671936035, "learning_rate": 8.131833138732619e-05, "loss": 0.2547, "step": 1780 }, { "epoch": 0.57, "grad_norm": 1.557305097579956, "learning_rate": 8.1212185542936e-05, "loss": 0.277, "step": 1790 }, { "epoch": 0.57, "grad_norm": 1.5373164415359497, "learning_rate": 8.110603969854581e-05, "loss": 0.2878, "step": 1800 }, { "epoch": 0.58, "grad_norm": 1.3761144876480103, "learning_rate": 8.099989385415561e-05, "loss": 0.4071, "step": 1810 }, { "epoch": 0.58, "grad_norm": 0.7141520977020264, "learning_rate": 8.089374800976543e-05, "loss": 0.2002, "step": 1820 }, { "epoch": 0.58, "grad_norm": 0.6471810340881348, "learning_rate": 8.078760216537523e-05, "loss": 0.1962, "step": 1830 }, { "epoch": 0.59, "grad_norm": 1.8333234786987305, "learning_rate": 8.068145632098504e-05, "loss": 0.23, "step": 1840 }, { "epoch": 0.59, "grad_norm": 0.7382714152336121, "learning_rate": 8.057531047659484e-05, "loss": 0.1602, "step": 1850 }, { "epoch": 0.59, "grad_norm": 2.2624874114990234, "learning_rate": 8.046916463220466e-05, "loss": 0.3355, "step": 1860 }, { "epoch": 0.6, "grad_norm": 1.3432509899139404, "learning_rate": 8.036301878781446e-05, "loss": 0.1226, "step": 1870 }, { "epoch": 0.6, "grad_norm": 1.3153080940246582, "learning_rate": 8.025687294342426e-05, "loss": 0.2797, "step": 1880 }, { "epoch": 0.6, "grad_norm": 0.13998636603355408, "learning_rate": 8.015072709903407e-05, "loss": 0.3126, "step": 1890 }, { "epoch": 0.6, "grad_norm": 7.6837382316589355, "learning_rate": 8.004458125464388e-05, "loss": 0.348, "step": 1900 }, { "epoch": 0.61, "grad_norm": 2.536726236343384, "learning_rate": 7.993843541025369e-05, "loss": 0.2518, "step": 1910 }, { "epoch": 0.61, "grad_norm": 2.798586130142212, "learning_rate": 7.98322895658635e-05, "loss": 0.187, "step": 1920 }, { "epoch": 0.61, "grad_norm": 2.047030210494995, "learning_rate": 7.972614372147331e-05, "loss": 0.1801, "step": 1930 }, { "epoch": 0.62, "grad_norm": 2.5127789974212646, "learning_rate": 7.961999787708311e-05, "loss": 0.2613, "step": 1940 }, { "epoch": 0.62, "grad_norm": 5.015801429748535, "learning_rate": 7.951385203269293e-05, "loss": 0.4155, "step": 1950 }, { "epoch": 0.62, "grad_norm": 4.095780849456787, "learning_rate": 7.940770618830273e-05, "loss": 0.2413, "step": 1960 }, { "epoch": 0.63, "grad_norm": 0.575307309627533, "learning_rate": 7.930156034391254e-05, "loss": 0.2799, "step": 1970 }, { "epoch": 0.63, "grad_norm": 0.26382434368133545, "learning_rate": 7.919541449952234e-05, "loss": 0.1894, "step": 1980 }, { "epoch": 0.63, "grad_norm": 1.7955100536346436, "learning_rate": 7.908926865513216e-05, "loss": 0.199, "step": 1990 }, { "epoch": 0.64, "grad_norm": 0.4029354453086853, "learning_rate": 7.898312281074196e-05, "loss": 0.2465, "step": 2000 }, { "epoch": 0.64, "grad_norm": 1.4386157989501953, "learning_rate": 7.887697696635177e-05, "loss": 0.2603, "step": 2010 }, { "epoch": 0.64, "grad_norm": 4.048315525054932, "learning_rate": 7.877083112196157e-05, "loss": 0.3663, "step": 2020 }, { "epoch": 0.65, "grad_norm": 4.0357255935668945, "learning_rate": 7.866468527757139e-05, "loss": 0.2365, "step": 2030 }, { "epoch": 0.65, "grad_norm": 0.6603661775588989, "learning_rate": 7.85585394331812e-05, "loss": 0.2848, "step": 2040 }, { "epoch": 0.65, "grad_norm": 2.005911111831665, "learning_rate": 7.845239358879101e-05, "loss": 0.316, "step": 2050 }, { "epoch": 0.66, "grad_norm": 1.5447591543197632, "learning_rate": 7.834624774440081e-05, "loss": 0.2741, "step": 2060 }, { "epoch": 0.66, "grad_norm": 3.2413675785064697, "learning_rate": 7.824010190001062e-05, "loss": 0.4234, "step": 2070 }, { "epoch": 0.66, "grad_norm": 2.6230356693267822, "learning_rate": 7.813395605562043e-05, "loss": 0.1797, "step": 2080 }, { "epoch": 0.67, "grad_norm": 1.5376132726669312, "learning_rate": 7.802781021123024e-05, "loss": 0.3815, "step": 2090 }, { "epoch": 0.67, "grad_norm": 1.4491734504699707, "learning_rate": 7.792166436684004e-05, "loss": 0.3153, "step": 2100 }, { "epoch": 0.67, "grad_norm": 1.949112057685852, "learning_rate": 7.781551852244984e-05, "loss": 0.2751, "step": 2110 }, { "epoch": 0.67, "grad_norm": 0.3488381803035736, "learning_rate": 7.770937267805966e-05, "loss": 0.3558, "step": 2120 }, { "epoch": 0.68, "grad_norm": 1.4437161684036255, "learning_rate": 7.760322683366946e-05, "loss": 0.2827, "step": 2130 }, { "epoch": 0.68, "grad_norm": 1.1105573177337646, "learning_rate": 7.749708098927927e-05, "loss": 0.1867, "step": 2140 }, { "epoch": 0.68, "grad_norm": 2.1235313415527344, "learning_rate": 7.739093514488907e-05, "loss": 0.1689, "step": 2150 }, { "epoch": 0.69, "grad_norm": 1.60935378074646, "learning_rate": 7.728478930049889e-05, "loss": 0.3198, "step": 2160 }, { "epoch": 0.69, "grad_norm": 1.3222334384918213, "learning_rate": 7.71786434561087e-05, "loss": 0.1978, "step": 2170 }, { "epoch": 0.69, "grad_norm": 1.4521784782409668, "learning_rate": 7.707249761171851e-05, "loss": 0.3276, "step": 2180 }, { "epoch": 0.7, "grad_norm": 0.4480780363082886, "learning_rate": 7.696635176732831e-05, "loss": 0.2151, "step": 2190 }, { "epoch": 0.7, "grad_norm": 1.5750231742858887, "learning_rate": 7.686020592293812e-05, "loss": 0.1659, "step": 2200 }, { "epoch": 0.7, "grad_norm": 2.5736334323883057, "learning_rate": 7.675406007854793e-05, "loss": 0.3704, "step": 2210 }, { "epoch": 0.71, "grad_norm": 3.719284772872925, "learning_rate": 7.664791423415774e-05, "loss": 0.1645, "step": 2220 }, { "epoch": 0.71, "grad_norm": 3.429244041442871, "learning_rate": 7.654176838976754e-05, "loss": 0.3323, "step": 2230 }, { "epoch": 0.71, "grad_norm": 2.801398277282715, "learning_rate": 7.643562254537735e-05, "loss": 0.2805, "step": 2240 }, { "epoch": 0.72, "grad_norm": 2.050607204437256, "learning_rate": 7.632947670098716e-05, "loss": 0.2308, "step": 2250 }, { "epoch": 0.72, "grad_norm": 3.164123773574829, "learning_rate": 7.622333085659697e-05, "loss": 0.2401, "step": 2260 }, { "epoch": 0.72, "grad_norm": 3.276832342147827, "learning_rate": 7.611718501220677e-05, "loss": 0.2399, "step": 2270 }, { "epoch": 0.73, "grad_norm": 2.8366944789886475, "learning_rate": 7.601103916781659e-05, "loss": 0.4004, "step": 2280 }, { "epoch": 0.73, "grad_norm": 2.4258265495300293, "learning_rate": 7.590489332342639e-05, "loss": 0.3202, "step": 2290 }, { "epoch": 0.73, "grad_norm": 1.4008164405822754, "learning_rate": 7.579874747903621e-05, "loss": 0.1952, "step": 2300 }, { "epoch": 0.74, "grad_norm": 1.1098754405975342, "learning_rate": 7.569260163464601e-05, "loss": 0.1867, "step": 2310 }, { "epoch": 0.74, "grad_norm": 0.15033583343029022, "learning_rate": 7.558645579025582e-05, "loss": 0.1995, "step": 2320 }, { "epoch": 0.74, "grad_norm": 0.9557719230651855, "learning_rate": 7.548030994586562e-05, "loss": 0.2475, "step": 2330 }, { "epoch": 0.74, "grad_norm": 8.91406536102295, "learning_rate": 7.537416410147544e-05, "loss": 0.2756, "step": 2340 }, { "epoch": 0.75, "grad_norm": 1.9521056413650513, "learning_rate": 7.526801825708524e-05, "loss": 0.2595, "step": 2350 }, { "epoch": 0.75, "grad_norm": 3.3855483531951904, "learning_rate": 7.516187241269504e-05, "loss": 0.2948, "step": 2360 }, { "epoch": 0.75, "grad_norm": 1.6990065574645996, "learning_rate": 7.506634115274387e-05, "loss": 0.2755, "step": 2370 }, { "epoch": 0.76, "grad_norm": 2.098942518234253, "learning_rate": 7.496019530835369e-05, "loss": 0.175, "step": 2380 }, { "epoch": 0.76, "grad_norm": 0.9781967997550964, "learning_rate": 7.48540494639635e-05, "loss": 0.4592, "step": 2390 }, { "epoch": 0.76, "grad_norm": 0.4728473722934723, "learning_rate": 7.47479036195733e-05, "loss": 0.3847, "step": 2400 }, { "epoch": 0.77, "grad_norm": 3.3047373294830322, "learning_rate": 7.46417577751831e-05, "loss": 0.1848, "step": 2410 }, { "epoch": 0.77, "grad_norm": 2.424025535583496, "learning_rate": 7.453561193079292e-05, "loss": 0.2197, "step": 2420 }, { "epoch": 0.77, "grad_norm": 2.697960376739502, "learning_rate": 7.442946608640272e-05, "loss": 0.2314, "step": 2430 }, { "epoch": 0.78, "grad_norm": 0.496898353099823, "learning_rate": 7.432332024201253e-05, "loss": 0.3299, "step": 2440 }, { "epoch": 0.78, "grad_norm": 1.4845099449157715, "learning_rate": 7.421717439762233e-05, "loss": 0.2832, "step": 2450 }, { "epoch": 0.78, "grad_norm": 3.8896942138671875, "learning_rate": 7.411102855323215e-05, "loss": 0.2837, "step": 2460 }, { "epoch": 0.79, "grad_norm": 4.288979530334473, "learning_rate": 7.400488270884195e-05, "loss": 0.1653, "step": 2470 }, { "epoch": 0.79, "grad_norm": 3.0013909339904785, "learning_rate": 7.389873686445176e-05, "loss": 0.3207, "step": 2480 }, { "epoch": 0.79, "grad_norm": 0.38008421659469604, "learning_rate": 7.379259102006156e-05, "loss": 0.2916, "step": 2490 }, { "epoch": 0.8, "grad_norm": 3.843106985092163, "learning_rate": 7.368644517567138e-05, "loss": 0.4216, "step": 2500 }, { "epoch": 0.8, "grad_norm": 0.46844518184661865, "learning_rate": 7.35802993312812e-05, "loss": 0.3038, "step": 2510 }, { "epoch": 0.8, "grad_norm": 0.5063233375549316, "learning_rate": 7.3474153486891e-05, "loss": 0.2392, "step": 2520 }, { "epoch": 0.81, "grad_norm": 6.260082721710205, "learning_rate": 7.33680076425008e-05, "loss": 0.317, "step": 2530 }, { "epoch": 0.81, "grad_norm": 1.771292805671692, "learning_rate": 7.32618617981106e-05, "loss": 0.2229, "step": 2540 }, { "epoch": 0.81, "grad_norm": 5.619741439819336, "learning_rate": 7.315571595372042e-05, "loss": 0.1364, "step": 2550 }, { "epoch": 0.81, "grad_norm": 2.196967363357544, "learning_rate": 7.304957010933023e-05, "loss": 0.2732, "step": 2560 }, { "epoch": 0.82, "grad_norm": 0.6409101486206055, "learning_rate": 7.294342426494003e-05, "loss": 0.2754, "step": 2570 }, { "epoch": 0.82, "grad_norm": 1.4790414571762085, "learning_rate": 7.283727842054983e-05, "loss": 0.2017, "step": 2580 }, { "epoch": 0.82, "grad_norm": 2.013932943344116, "learning_rate": 7.273113257615965e-05, "loss": 0.24, "step": 2590 }, { "epoch": 0.83, "grad_norm": 3.7832634449005127, "learning_rate": 7.262498673176945e-05, "loss": 0.3675, "step": 2600 }, { "epoch": 0.83, "grad_norm": 0.3102867007255554, "learning_rate": 7.251884088737926e-05, "loss": 0.379, "step": 2610 }, { "epoch": 0.83, "grad_norm": 2.4098093509674072, "learning_rate": 7.241269504298906e-05, "loss": 0.381, "step": 2620 }, { "epoch": 0.84, "grad_norm": 2.3519186973571777, "learning_rate": 7.230654919859888e-05, "loss": 0.2574, "step": 2630 }, { "epoch": 0.84, "grad_norm": 1.1589571237564087, "learning_rate": 7.22004033542087e-05, "loss": 0.1603, "step": 2640 }, { "epoch": 0.84, "grad_norm": 3.823918342590332, "learning_rate": 7.20942575098185e-05, "loss": 0.2485, "step": 2650 }, { "epoch": 0.85, "grad_norm": 1.778441071510315, "learning_rate": 7.19881116654283e-05, "loss": 0.234, "step": 2660 }, { "epoch": 0.85, "grad_norm": 2.2710683345794678, "learning_rate": 7.188196582103811e-05, "loss": 0.1746, "step": 2670 }, { "epoch": 0.85, "grad_norm": 6.078259468078613, "learning_rate": 7.177581997664792e-05, "loss": 0.3255, "step": 2680 }, { "epoch": 0.86, "grad_norm": 0.585472583770752, "learning_rate": 7.166967413225773e-05, "loss": 0.3718, "step": 2690 }, { "epoch": 0.86, "grad_norm": 1.9394687414169312, "learning_rate": 7.156352828786753e-05, "loss": 0.3181, "step": 2700 }, { "epoch": 0.86, "grad_norm": 1.6753870248794556, "learning_rate": 7.145738244347734e-05, "loss": 0.2424, "step": 2710 }, { "epoch": 0.87, "grad_norm": 0.37682977318763733, "learning_rate": 7.135123659908714e-05, "loss": 0.2963, "step": 2720 }, { "epoch": 0.87, "grad_norm": 3.564805507659912, "learning_rate": 7.124509075469696e-05, "loss": 0.2822, "step": 2730 }, { "epoch": 0.87, "grad_norm": 0.22953364253044128, "learning_rate": 7.113894491030676e-05, "loss": 0.3489, "step": 2740 }, { "epoch": 0.88, "grad_norm": 4.16074275970459, "learning_rate": 7.103279906591658e-05, "loss": 0.405, "step": 2750 }, { "epoch": 0.88, "grad_norm": 1.4540446996688843, "learning_rate": 7.092665322152638e-05, "loss": 0.2634, "step": 2760 }, { "epoch": 0.88, "grad_norm": 1.9992202520370483, "learning_rate": 7.082050737713618e-05, "loss": 0.2762, "step": 2770 }, { "epoch": 0.88, "grad_norm": 1.3939869403839111, "learning_rate": 7.0714361532746e-05, "loss": 0.3462, "step": 2780 }, { "epoch": 0.89, "grad_norm": 0.6099751591682434, "learning_rate": 7.06082156883558e-05, "loss": 0.367, "step": 2790 }, { "epoch": 0.89, "grad_norm": 6.303842067718506, "learning_rate": 7.050206984396561e-05, "loss": 0.2596, "step": 2800 }, { "epoch": 0.89, "grad_norm": 1.5723298788070679, "learning_rate": 7.039592399957541e-05, "loss": 0.3136, "step": 2810 }, { "epoch": 0.9, "grad_norm": 1.3614245653152466, "learning_rate": 7.028977815518523e-05, "loss": 0.2983, "step": 2820 }, { "epoch": 0.9, "grad_norm": 2.220656633377075, "learning_rate": 7.018363231079503e-05, "loss": 0.3549, "step": 2830 }, { "epoch": 0.9, "grad_norm": 2.8158984184265137, "learning_rate": 7.007748646640484e-05, "loss": 0.2431, "step": 2840 }, { "epoch": 0.91, "grad_norm": 0.46454083919525146, "learning_rate": 6.997134062201464e-05, "loss": 0.204, "step": 2850 }, { "epoch": 0.91, "grad_norm": 2.5426604747772217, "learning_rate": 6.986519477762446e-05, "loss": 0.1241, "step": 2860 }, { "epoch": 0.91, "grad_norm": 2.6442790031433105, "learning_rate": 6.975904893323428e-05, "loss": 0.2026, "step": 2870 }, { "epoch": 0.92, "grad_norm": 0.07216634601354599, "learning_rate": 6.965290308884408e-05, "loss": 0.1619, "step": 2880 }, { "epoch": 0.92, "grad_norm": 1.6410995721817017, "learning_rate": 6.954675724445388e-05, "loss": 0.309, "step": 2890 }, { "epoch": 0.92, "grad_norm": 1.0634126663208008, "learning_rate": 6.944061140006369e-05, "loss": 0.2269, "step": 2900 }, { "epoch": 0.93, "grad_norm": 1.272518277168274, "learning_rate": 6.93344655556735e-05, "loss": 0.2748, "step": 2910 }, { "epoch": 0.93, "grad_norm": 8.030739784240723, "learning_rate": 6.922831971128331e-05, "loss": 0.2386, "step": 2920 }, { "epoch": 0.93, "grad_norm": 1.0459538698196411, "learning_rate": 6.912217386689311e-05, "loss": 0.2162, "step": 2930 }, { "epoch": 0.94, "grad_norm": 2.7766873836517334, "learning_rate": 6.901602802250292e-05, "loss": 0.18, "step": 2940 }, { "epoch": 0.94, "grad_norm": 1.345751166343689, "learning_rate": 6.890988217811273e-05, "loss": 0.1927, "step": 2950 }, { "epoch": 0.94, "grad_norm": 3.475550889968872, "learning_rate": 6.880373633372254e-05, "loss": 0.1593, "step": 2960 }, { "epoch": 0.95, "grad_norm": 4.3208088874816895, "learning_rate": 6.869759048933234e-05, "loss": 0.3782, "step": 2970 }, { "epoch": 0.95, "grad_norm": 0.5283639430999756, "learning_rate": 6.859144464494214e-05, "loss": 0.2065, "step": 2980 }, { "epoch": 0.95, "grad_norm": 0.3912002444267273, "learning_rate": 6.848529880055196e-05, "loss": 0.2094, "step": 2990 }, { "epoch": 0.95, "grad_norm": 5.560369968414307, "learning_rate": 6.837915295616178e-05, "loss": 0.2598, "step": 3000 }, { "epoch": 0.96, "grad_norm": 2.0859804153442383, "learning_rate": 6.827300711177158e-05, "loss": 0.2396, "step": 3010 }, { "epoch": 0.96, "grad_norm": 1.9198240041732788, "learning_rate": 6.816686126738139e-05, "loss": 0.326, "step": 3020 }, { "epoch": 0.96, "grad_norm": 2.559525728225708, "learning_rate": 6.806071542299119e-05, "loss": 0.2846, "step": 3030 }, { "epoch": 0.97, "grad_norm": 8.122730255126953, "learning_rate": 6.7954569578601e-05, "loss": 0.3404, "step": 3040 }, { "epoch": 0.97, "grad_norm": 1.4377597570419312, "learning_rate": 6.784842373421081e-05, "loss": 0.3534, "step": 3050 }, { "epoch": 0.97, "grad_norm": 1.3202710151672363, "learning_rate": 6.774227788982061e-05, "loss": 0.3151, "step": 3060 }, { "epoch": 0.98, "grad_norm": 1.2933627367019653, "learning_rate": 6.763613204543042e-05, "loss": 0.1983, "step": 3070 }, { "epoch": 0.98, "grad_norm": 0.8253432512283325, "learning_rate": 6.752998620104023e-05, "loss": 0.1989, "step": 3080 }, { "epoch": 0.98, "grad_norm": 1.008435606956482, "learning_rate": 6.742384035665004e-05, "loss": 0.2045, "step": 3090 }, { "epoch": 0.99, "grad_norm": 4.022599220275879, "learning_rate": 6.731769451225984e-05, "loss": 0.2166, "step": 3100 }, { "epoch": 0.99, "grad_norm": 0.5018757581710815, "learning_rate": 6.721154866786966e-05, "loss": 0.1841, "step": 3110 }, { "epoch": 0.99, "grad_norm": 1.1110012531280518, "learning_rate": 6.710540282347946e-05, "loss": 0.208, "step": 3120 }, { "epoch": 1.0, "grad_norm": 4.160871505737305, "learning_rate": 6.699925697908928e-05, "loss": 0.2853, "step": 3130 }, { "epoch": 1.0, "grad_norm": 3.1839327812194824, "learning_rate": 6.689311113469908e-05, "loss": 0.239, "step": 3140 }, { "epoch": 1.0, "grad_norm": 1.2867355346679688, "learning_rate": 6.678696529030889e-05, "loss": 0.1678, "step": 3150 }, { "epoch": 1.01, "grad_norm": 0.3853776454925537, "learning_rate": 6.668081944591869e-05, "loss": 0.1119, "step": 3160 }, { "epoch": 1.01, "grad_norm": 0.9403756856918335, "learning_rate": 6.657467360152851e-05, "loss": 0.1772, "step": 3170 }, { "epoch": 1.01, "grad_norm": 2.8056976795196533, "learning_rate": 6.646852775713831e-05, "loss": 0.1438, "step": 3180 }, { "epoch": 1.02, "grad_norm": 0.9233602285385132, "learning_rate": 6.636238191274812e-05, "loss": 0.2491, "step": 3190 }, { "epoch": 1.02, "grad_norm": 2.179743766784668, "learning_rate": 6.625623606835792e-05, "loss": 0.1493, "step": 3200 }, { "epoch": 1.02, "grad_norm": 1.8002713918685913, "learning_rate": 6.615009022396774e-05, "loss": 0.1557, "step": 3210 }, { "epoch": 1.02, "grad_norm": 1.0567578077316284, "learning_rate": 6.604394437957754e-05, "loss": 0.1573, "step": 3220 }, { "epoch": 1.03, "grad_norm": 1.7498853206634521, "learning_rate": 6.593779853518734e-05, "loss": 0.2639, "step": 3230 }, { "epoch": 1.03, "grad_norm": 0.14960238337516785, "learning_rate": 6.583165269079716e-05, "loss": 0.2314, "step": 3240 }, { "epoch": 1.03, "grad_norm": 0.858378529548645, "learning_rate": 6.572550684640697e-05, "loss": 0.1898, "step": 3250 }, { "epoch": 1.04, "grad_norm": 4.104907989501953, "learning_rate": 6.561936100201678e-05, "loss": 0.2381, "step": 3260 }, { "epoch": 1.04, "grad_norm": 0.1154847964644432, "learning_rate": 6.551321515762659e-05, "loss": 0.0987, "step": 3270 }, { "epoch": 1.04, "grad_norm": 1.8907705545425415, "learning_rate": 6.540706931323639e-05, "loss": 0.125, "step": 3280 }, { "epoch": 1.05, "grad_norm": 1.2750372886657715, "learning_rate": 6.53009234688462e-05, "loss": 0.234, "step": 3290 }, { "epoch": 1.05, "grad_norm": 1.584429144859314, "learning_rate": 6.519477762445601e-05, "loss": 0.1328, "step": 3300 }, { "epoch": 1.05, "grad_norm": 2.3900089263916016, "learning_rate": 6.508863178006581e-05, "loss": 0.2681, "step": 3310 }, { "epoch": 1.06, "grad_norm": 1.9859068393707275, "learning_rate": 6.498248593567562e-05, "loss": 0.4136, "step": 3320 }, { "epoch": 1.06, "grad_norm": 3.4652695655822754, "learning_rate": 6.487634009128542e-05, "loss": 0.2059, "step": 3330 }, { "epoch": 1.06, "grad_norm": 4.06072473526001, "learning_rate": 6.477019424689524e-05, "loss": 0.2378, "step": 3340 }, { "epoch": 1.07, "grad_norm": 1.2823538780212402, "learning_rate": 6.466404840250504e-05, "loss": 0.1772, "step": 3350 }, { "epoch": 1.07, "grad_norm": 0.545313835144043, "learning_rate": 6.455790255811486e-05, "loss": 0.1587, "step": 3360 }, { "epoch": 1.07, "grad_norm": 5.666371822357178, "learning_rate": 6.445175671372466e-05, "loss": 0.1486, "step": 3370 }, { "epoch": 1.08, "grad_norm": 0.3175773620605469, "learning_rate": 6.434561086933447e-05, "loss": 0.2295, "step": 3380 }, { "epoch": 1.08, "grad_norm": 3.88968563079834, "learning_rate": 6.423946502494428e-05, "loss": 0.16, "step": 3390 }, { "epoch": 1.08, "grad_norm": 2.4445409774780273, "learning_rate": 6.413331918055409e-05, "loss": 0.1766, "step": 3400 }, { "epoch": 1.09, "grad_norm": 0.5478050708770752, "learning_rate": 6.402717333616389e-05, "loss": 0.1299, "step": 3410 }, { "epoch": 1.09, "grad_norm": 4.029285907745361, "learning_rate": 6.393164207621272e-05, "loss": 0.3463, "step": 3420 }, { "epoch": 1.09, "grad_norm": 0.3899819552898407, "learning_rate": 6.382549623182253e-05, "loss": 0.1214, "step": 3430 }, { "epoch": 1.1, "grad_norm": 0.7180734276771545, "learning_rate": 6.371935038743233e-05, "loss": 0.2756, "step": 3440 }, { "epoch": 1.1, "grad_norm": 3.6423099040985107, "learning_rate": 6.361320454304213e-05, "loss": 0.2059, "step": 3450 }, { "epoch": 1.1, "grad_norm": 3.006516933441162, "learning_rate": 6.350705869865195e-05, "loss": 0.2151, "step": 3460 }, { "epoch": 1.1, "grad_norm": 2.1426503658294678, "learning_rate": 6.340091285426177e-05, "loss": 0.2644, "step": 3470 }, { "epoch": 1.11, "grad_norm": 1.4418883323669434, "learning_rate": 6.329476700987157e-05, "loss": 0.1675, "step": 3480 }, { "epoch": 1.11, "grad_norm": 1.2576738595962524, "learning_rate": 6.318862116548138e-05, "loss": 0.1612, "step": 3490 }, { "epoch": 1.11, "grad_norm": 3.26369309425354, "learning_rate": 6.308247532109118e-05, "loss": 0.2346, "step": 3500 }, { "epoch": 1.12, "grad_norm": 0.9214788675308228, "learning_rate": 6.2976329476701e-05, "loss": 0.1714, "step": 3510 }, { "epoch": 1.12, "grad_norm": 1.696925163269043, "learning_rate": 6.28701836323108e-05, "loss": 0.1306, "step": 3520 }, { "epoch": 1.12, "grad_norm": 1.1808693408966064, "learning_rate": 6.27640377879206e-05, "loss": 0.1135, "step": 3530 }, { "epoch": 1.13, "grad_norm": 4.710297107696533, "learning_rate": 6.265789194353041e-05, "loss": 0.158, "step": 3540 }, { "epoch": 1.13, "grad_norm": 0.5521005988121033, "learning_rate": 6.255174609914022e-05, "loss": 0.3224, "step": 3550 }, { "epoch": 1.13, "grad_norm": 2.172825336456299, "learning_rate": 6.244560025475003e-05, "loss": 0.0946, "step": 3560 }, { "epoch": 1.14, "grad_norm": 1.8690552711486816, "learning_rate": 6.233945441035983e-05, "loss": 0.1972, "step": 3570 }, { "epoch": 1.14, "grad_norm": 0.059970393776893616, "learning_rate": 6.223330856596965e-05, "loss": 0.0601, "step": 3580 }, { "epoch": 1.14, "grad_norm": 0.0773802176117897, "learning_rate": 6.212716272157945e-05, "loss": 0.2881, "step": 3590 }, { "epoch": 1.15, "grad_norm": 1.320061206817627, "learning_rate": 6.202101687718927e-05, "loss": 0.1966, "step": 3600 }, { "epoch": 1.15, "grad_norm": 2.4339261054992676, "learning_rate": 6.191487103279907e-05, "loss": 0.1808, "step": 3610 }, { "epoch": 1.15, "grad_norm": 5.3104729652404785, "learning_rate": 6.180872518840888e-05, "loss": 0.1737, "step": 3620 }, { "epoch": 1.16, "grad_norm": 3.9139719009399414, "learning_rate": 6.170257934401868e-05, "loss": 0.239, "step": 3630 }, { "epoch": 1.16, "grad_norm": 0.9480198621749878, "learning_rate": 6.15964334996285e-05, "loss": 0.1556, "step": 3640 }, { "epoch": 1.16, "grad_norm": 0.807107150554657, "learning_rate": 6.14902876552383e-05, "loss": 0.131, "step": 3650 }, { "epoch": 1.17, "grad_norm": 0.059983473271131516, "learning_rate": 6.13841418108481e-05, "loss": 0.1479, "step": 3660 }, { "epoch": 1.17, "grad_norm": 0.7000637650489807, "learning_rate": 6.127799596645791e-05, "loss": 0.0861, "step": 3670 }, { "epoch": 1.17, "grad_norm": 0.43273600935935974, "learning_rate": 6.117185012206771e-05, "loss": 0.1848, "step": 3680 }, { "epoch": 1.17, "grad_norm": 0.056298673152923584, "learning_rate": 6.106570427767753e-05, "loss": 0.1313, "step": 3690 }, { "epoch": 1.18, "grad_norm": 0.6714267134666443, "learning_rate": 6.095955843328735e-05, "loss": 0.2817, "step": 3700 }, { "epoch": 1.18, "grad_norm": 2.8052423000335693, "learning_rate": 6.085341258889715e-05, "loss": 0.2095, "step": 3710 }, { "epoch": 1.18, "grad_norm": 3.0490353107452393, "learning_rate": 6.074726674450696e-05, "loss": 0.2707, "step": 3720 }, { "epoch": 1.19, "grad_norm": 2.3823633193969727, "learning_rate": 6.0641120900116766e-05, "loss": 0.1918, "step": 3730 }, { "epoch": 1.19, "grad_norm": 5.9893293380737305, "learning_rate": 6.0534975055726576e-05, "loss": 0.1855, "step": 3740 }, { "epoch": 1.19, "grad_norm": 5.253934383392334, "learning_rate": 6.042882921133638e-05, "loss": 0.1286, "step": 3750 }, { "epoch": 1.2, "grad_norm": 3.3353893756866455, "learning_rate": 6.0322683366946183e-05, "loss": 0.1656, "step": 3760 }, { "epoch": 1.2, "grad_norm": 1.5391966104507446, "learning_rate": 6.0216537522555994e-05, "loss": 0.1783, "step": 3770 }, { "epoch": 1.2, "grad_norm": 3.3716678619384766, "learning_rate": 6.01103916781658e-05, "loss": 0.1025, "step": 3780 }, { "epoch": 1.21, "grad_norm": 0.8058392405509949, "learning_rate": 6.000424583377561e-05, "loss": 0.1224, "step": 3790 }, { "epoch": 1.21, "grad_norm": 1.5231162309646606, "learning_rate": 5.989809998938541e-05, "loss": 0.0579, "step": 3800 }, { "epoch": 1.21, "grad_norm": 3.7527573108673096, "learning_rate": 5.979195414499522e-05, "loss": 0.3109, "step": 3810 }, { "epoch": 1.22, "grad_norm": 1.884722113609314, "learning_rate": 5.968580830060504e-05, "loss": 0.2569, "step": 3820 }, { "epoch": 1.22, "grad_norm": 1.2949138879776, "learning_rate": 5.957966245621484e-05, "loss": 0.2067, "step": 3830 }, { "epoch": 1.22, "grad_norm": 1.9406439065933228, "learning_rate": 5.9473516611824654e-05, "loss": 0.1397, "step": 3840 }, { "epoch": 1.23, "grad_norm": 3.048089027404785, "learning_rate": 5.936737076743446e-05, "loss": 0.1903, "step": 3850 }, { "epoch": 1.23, "grad_norm": 2.7827141284942627, "learning_rate": 5.926122492304427e-05, "loss": 0.2375, "step": 3860 }, { "epoch": 1.23, "grad_norm": 0.30664700269699097, "learning_rate": 5.915507907865407e-05, "loss": 0.2605, "step": 3870 }, { "epoch": 1.24, "grad_norm": 5.038077354431152, "learning_rate": 5.904893323426388e-05, "loss": 0.2249, "step": 3880 }, { "epoch": 1.24, "grad_norm": 0.5563170313835144, "learning_rate": 5.8942787389873686e-05, "loss": 0.1407, "step": 3890 }, { "epoch": 1.24, "grad_norm": 3.5176491737365723, "learning_rate": 5.8836641545483496e-05, "loss": 0.1955, "step": 3900 }, { "epoch": 1.24, "grad_norm": 0.16444259881973267, "learning_rate": 5.87304957010933e-05, "loss": 0.2973, "step": 3910 }, { "epoch": 1.25, "grad_norm": 2.3163607120513916, "learning_rate": 5.862434985670311e-05, "loss": 0.1388, "step": 3920 }, { "epoch": 1.25, "grad_norm": 2.4921140670776367, "learning_rate": 5.8518204012312914e-05, "loss": 0.2844, "step": 3930 }, { "epoch": 1.25, "grad_norm": 6.664550304412842, "learning_rate": 5.841205816792273e-05, "loss": 0.5434, "step": 3940 }, { "epoch": 1.26, "grad_norm": 0.27615758776664734, "learning_rate": 5.830591232353254e-05, "loss": 0.2716, "step": 3950 }, { "epoch": 1.26, "grad_norm": 7.205143451690674, "learning_rate": 5.8199766479142345e-05, "loss": 0.1927, "step": 3960 }, { "epoch": 1.26, "grad_norm": 2.423842191696167, "learning_rate": 5.8093620634752156e-05, "loss": 0.2013, "step": 3970 }, { "epoch": 1.27, "grad_norm": 0.6563037037849426, "learning_rate": 5.798747479036196e-05, "loss": 0.2597, "step": 3980 }, { "epoch": 1.27, "grad_norm": 2.216214418411255, "learning_rate": 5.788132894597177e-05, "loss": 0.1484, "step": 3990 }, { "epoch": 1.27, "grad_norm": 0.21049724519252777, "learning_rate": 5.7775183101581574e-05, "loss": 0.1205, "step": 4000 }, { "epoch": 1.28, "grad_norm": 1.838711142539978, "learning_rate": 5.7669037257191384e-05, "loss": 0.1806, "step": 4010 }, { "epoch": 1.28, "grad_norm": 4.584275245666504, "learning_rate": 5.756289141280119e-05, "loss": 0.1459, "step": 4020 }, { "epoch": 1.28, "grad_norm": 3.7076704502105713, "learning_rate": 5.7456745568411e-05, "loss": 0.2119, "step": 4030 }, { "epoch": 1.29, "grad_norm": 4.600487232208252, "learning_rate": 5.73505997240208e-05, "loss": 0.1846, "step": 4040 }, { "epoch": 1.29, "grad_norm": 2.9479613304138184, "learning_rate": 5.724445387963061e-05, "loss": 0.1373, "step": 4050 }, { "epoch": 1.29, "grad_norm": 2.7824301719665527, "learning_rate": 5.7138308035240416e-05, "loss": 0.1573, "step": 4060 }, { "epoch": 1.3, "grad_norm": 1.3697668313980103, "learning_rate": 5.703216219085023e-05, "loss": 0.1067, "step": 4070 }, { "epoch": 1.3, "grad_norm": 4.134962558746338, "learning_rate": 5.6926016346460044e-05, "loss": 0.3154, "step": 4080 }, { "epoch": 1.3, "grad_norm": 1.986623764038086, "learning_rate": 5.681987050206985e-05, "loss": 0.162, "step": 4090 }, { "epoch": 1.31, "grad_norm": 1.7553232908248901, "learning_rate": 5.671372465767966e-05, "loss": 0.2197, "step": 4100 }, { "epoch": 1.31, "grad_norm": 1.666942834854126, "learning_rate": 5.660757881328946e-05, "loss": 0.2144, "step": 4110 }, { "epoch": 1.31, "grad_norm": 1.3620635271072388, "learning_rate": 5.650143296889927e-05, "loss": 0.2823, "step": 4120 }, { "epoch": 1.31, "grad_norm": 3.4056193828582764, "learning_rate": 5.6395287124509076e-05, "loss": 0.3223, "step": 4130 }, { "epoch": 1.32, "grad_norm": 0.8397992253303528, "learning_rate": 5.6289141280118886e-05, "loss": 0.1297, "step": 4140 }, { "epoch": 1.32, "grad_norm": 0.09627294540405273, "learning_rate": 5.618299543572869e-05, "loss": 0.1154, "step": 4150 }, { "epoch": 1.32, "grad_norm": 2.1529462337493896, "learning_rate": 5.60768495913385e-05, "loss": 0.1903, "step": 4160 }, { "epoch": 1.33, "grad_norm": 0.42282378673553467, "learning_rate": 5.5970703746948304e-05, "loss": 0.0992, "step": 4170 }, { "epoch": 1.33, "grad_norm": 0.34097906947135925, "learning_rate": 5.5864557902558115e-05, "loss": 0.2193, "step": 4180 }, { "epoch": 1.33, "grad_norm": 0.11647669225931168, "learning_rate": 5.575841205816793e-05, "loss": 0.1511, "step": 4190 }, { "epoch": 1.34, "grad_norm": 7.489476680755615, "learning_rate": 5.5652266213777736e-05, "loss": 0.182, "step": 4200 }, { "epoch": 1.34, "grad_norm": 0.0627538189291954, "learning_rate": 5.5546120369387546e-05, "loss": 0.2056, "step": 4210 }, { "epoch": 1.34, "grad_norm": 1.6038990020751953, "learning_rate": 5.543997452499735e-05, "loss": 0.317, "step": 4220 }, { "epoch": 1.35, "grad_norm": 2.0296130180358887, "learning_rate": 5.533382868060716e-05, "loss": 0.221, "step": 4230 }, { "epoch": 1.35, "grad_norm": 3.08427357673645, "learning_rate": 5.5227682836216964e-05, "loss": 0.309, "step": 4240 }, { "epoch": 1.35, "grad_norm": 6.700926303863525, "learning_rate": 5.5121536991826774e-05, "loss": 0.3862, "step": 4250 }, { "epoch": 1.36, "grad_norm": 3.3283987045288086, "learning_rate": 5.501539114743658e-05, "loss": 0.1449, "step": 4260 }, { "epoch": 1.36, "grad_norm": 2.7718186378479004, "learning_rate": 5.490924530304639e-05, "loss": 0.1237, "step": 4270 }, { "epoch": 1.36, "grad_norm": 1.7264149188995361, "learning_rate": 5.480309945865619e-05, "loss": 0.0537, "step": 4280 }, { "epoch": 1.37, "grad_norm": 2.8292267322540283, "learning_rate": 5.4696953614266e-05, "loss": 0.1139, "step": 4290 }, { "epoch": 1.37, "grad_norm": 2.6377663612365723, "learning_rate": 5.4590807769875806e-05, "loss": 0.1632, "step": 4300 }, { "epoch": 1.37, "grad_norm": 0.1827862560749054, "learning_rate": 5.4484661925485624e-05, "loss": 0.1809, "step": 4310 }, { "epoch": 1.38, "grad_norm": 5.187005996704102, "learning_rate": 5.4378516081095434e-05, "loss": 0.1735, "step": 4320 }, { "epoch": 1.38, "grad_norm": 2.064953327178955, "learning_rate": 5.427237023670524e-05, "loss": 0.3226, "step": 4330 }, { "epoch": 1.38, "grad_norm": 0.03769757226109505, "learning_rate": 5.416622439231505e-05, "loss": 0.1563, "step": 4340 }, { "epoch": 1.38, "grad_norm": 5.220246315002441, "learning_rate": 5.406007854792485e-05, "loss": 0.2403, "step": 4350 }, { "epoch": 1.39, "grad_norm": 0.1891440451145172, "learning_rate": 5.395393270353466e-05, "loss": 0.1741, "step": 4360 }, { "epoch": 1.39, "grad_norm": 5.661322116851807, "learning_rate": 5.3847786859144466e-05, "loss": 0.1514, "step": 4370 }, { "epoch": 1.39, "grad_norm": 8.325531005859375, "learning_rate": 5.3741641014754277e-05, "loss": 0.1954, "step": 4380 }, { "epoch": 1.4, "grad_norm": 3.1849327087402344, "learning_rate": 5.363549517036408e-05, "loss": 0.2667, "step": 4390 }, { "epoch": 1.4, "grad_norm": 4.426061153411865, "learning_rate": 5.352934932597389e-05, "loss": 0.1621, "step": 4400 }, { "epoch": 1.4, "grad_norm": 0.08511369675397873, "learning_rate": 5.3423203481583694e-05, "loss": 0.2384, "step": 4410 }, { "epoch": 1.41, "grad_norm": 2.6035985946655273, "learning_rate": 5.3317057637193505e-05, "loss": 0.2029, "step": 4420 }, { "epoch": 1.41, "grad_norm": 3.637746810913086, "learning_rate": 5.321091179280332e-05, "loss": 0.2054, "step": 4430 }, { "epoch": 1.41, "grad_norm": 2.6887290477752686, "learning_rate": 5.3104765948413126e-05, "loss": 0.194, "step": 4440 }, { "epoch": 1.42, "grad_norm": 0.5362237691879272, "learning_rate": 5.2998620104022936e-05, "loss": 0.1243, "step": 4450 }, { "epoch": 1.42, "grad_norm": 6.602662086486816, "learning_rate": 5.289247425963274e-05, "loss": 0.1005, "step": 4460 }, { "epoch": 1.42, "grad_norm": 0.16585449874401093, "learning_rate": 5.278632841524255e-05, "loss": 0.116, "step": 4470 }, { "epoch": 1.43, "grad_norm": 3.062458038330078, "learning_rate": 5.2690797155291374e-05, "loss": 0.2236, "step": 4480 }, { "epoch": 1.43, "grad_norm": 3.1578338146209717, "learning_rate": 5.258465131090118e-05, "loss": 0.1248, "step": 4490 }, { "epoch": 1.43, "grad_norm": 6.487752914428711, "learning_rate": 5.247850546651099e-05, "loss": 0.2268, "step": 4500 }, { "epoch": 1.44, "grad_norm": 4.561209678649902, "learning_rate": 5.237235962212079e-05, "loss": 0.3183, "step": 4510 }, { "epoch": 1.44, "grad_norm": 1.6614716053009033, "learning_rate": 5.22662137777306e-05, "loss": 0.2555, "step": 4520 }, { "epoch": 1.44, "grad_norm": 2.4814791679382324, "learning_rate": 5.216006793334042e-05, "loss": 0.1524, "step": 4530 }, { "epoch": 1.45, "grad_norm": 0.17691956460475922, "learning_rate": 5.205392208895022e-05, "loss": 0.1934, "step": 4540 }, { "epoch": 1.45, "grad_norm": 5.082562446594238, "learning_rate": 5.1947776244560033e-05, "loss": 0.4279, "step": 4550 }, { "epoch": 1.45, "grad_norm": 3.106387138366699, "learning_rate": 5.184163040016984e-05, "loss": 0.1194, "step": 4560 }, { "epoch": 1.45, "grad_norm": 7.02073335647583, "learning_rate": 5.173548455577965e-05, "loss": 0.1109, "step": 4570 }, { "epoch": 1.46, "grad_norm": 0.2526942193508148, "learning_rate": 5.162933871138945e-05, "loss": 0.1913, "step": 4580 }, { "epoch": 1.46, "grad_norm": 4.575504302978516, "learning_rate": 5.152319286699926e-05, "loss": 0.2151, "step": 4590 }, { "epoch": 1.46, "grad_norm": 2.3890509605407715, "learning_rate": 5.1417047022609066e-05, "loss": 0.2336, "step": 4600 }, { "epoch": 1.47, "grad_norm": 0.8267619013786316, "learning_rate": 5.1310901178218876e-05, "loss": 0.0856, "step": 4610 }, { "epoch": 1.47, "grad_norm": 4.056538105010986, "learning_rate": 5.120475533382868e-05, "loss": 0.1947, "step": 4620 }, { "epoch": 1.47, "grad_norm": 6.964923858642578, "learning_rate": 5.109860948943849e-05, "loss": 0.1195, "step": 4630 }, { "epoch": 1.48, "grad_norm": 2.813004970550537, "learning_rate": 5.100307822948732e-05, "loss": 0.1225, "step": 4640 }, { "epoch": 1.48, "grad_norm": 2.654339075088501, "learning_rate": 5.089693238509713e-05, "loss": 0.1006, "step": 4650 }, { "epoch": 1.48, "grad_norm": 6.5991644859313965, "learning_rate": 5.0790786540706934e-05, "loss": 0.2646, "step": 4660 }, { "epoch": 1.49, "grad_norm": 5.099368572235107, "learning_rate": 5.0684640696316745e-05, "loss": 0.2748, "step": 4670 }, { "epoch": 1.49, "grad_norm": 5.0444655418396, "learning_rate": 5.057849485192655e-05, "loss": 0.2295, "step": 4680 }, { "epoch": 1.49, "grad_norm": 0.07431354373693466, "learning_rate": 5.047234900753636e-05, "loss": 0.1348, "step": 4690 }, { "epoch": 1.5, "grad_norm": 0.1366661787033081, "learning_rate": 5.036620316314616e-05, "loss": 0.1164, "step": 4700 }, { "epoch": 1.5, "grad_norm": 4.550073146820068, "learning_rate": 5.026005731875597e-05, "loss": 0.2377, "step": 4710 }, { "epoch": 1.5, "grad_norm": 0.12663549184799194, "learning_rate": 5.015391147436578e-05, "loss": 0.0871, "step": 4720 }, { "epoch": 1.51, "grad_norm": 5.191462993621826, "learning_rate": 5.004776562997559e-05, "loss": 0.2778, "step": 4730 }, { "epoch": 1.51, "grad_norm": 2.7582337856292725, "learning_rate": 4.99416197855854e-05, "loss": 0.203, "step": 4740 }, { "epoch": 1.51, "grad_norm": 7.114481449127197, "learning_rate": 4.98354739411952e-05, "loss": 0.1426, "step": 4750 }, { "epoch": 1.52, "grad_norm": 0.41717416048049927, "learning_rate": 4.972932809680501e-05, "loss": 0.2009, "step": 4760 }, { "epoch": 1.52, "grad_norm": 1.8175145387649536, "learning_rate": 4.9623182252414816e-05, "loss": 0.1152, "step": 4770 }, { "epoch": 1.52, "grad_norm": 3.585702419281006, "learning_rate": 4.951703640802463e-05, "loss": 0.1615, "step": 4780 }, { "epoch": 1.52, "grad_norm": 0.385105699300766, "learning_rate": 4.9410890563634437e-05, "loss": 0.1569, "step": 4790 }, { "epoch": 1.53, "grad_norm": 2.8163392543792725, "learning_rate": 4.930474471924425e-05, "loss": 0.0942, "step": 4800 }, { "epoch": 1.53, "grad_norm": 5.181662082672119, "learning_rate": 4.919859887485405e-05, "loss": 0.2076, "step": 4810 }, { "epoch": 1.53, "grad_norm": 0.15229104459285736, "learning_rate": 4.909245303046386e-05, "loss": 0.2249, "step": 4820 }, { "epoch": 1.54, "grad_norm": 3.2373440265655518, "learning_rate": 4.8986307186073665e-05, "loss": 0.5439, "step": 4830 }, { "epoch": 1.54, "grad_norm": 1.7857202291488647, "learning_rate": 4.8880161341683475e-05, "loss": 0.1806, "step": 4840 }, { "epoch": 1.54, "grad_norm": 1.1035951375961304, "learning_rate": 4.8774015497293286e-05, "loss": 0.1309, "step": 4850 }, { "epoch": 1.55, "grad_norm": 7.660123825073242, "learning_rate": 4.866786965290309e-05, "loss": 0.1587, "step": 4860 }, { "epoch": 1.55, "grad_norm": 0.20227286219596863, "learning_rate": 4.85617238085129e-05, "loss": 0.3051, "step": 4870 }, { "epoch": 1.55, "grad_norm": 6.558931827545166, "learning_rate": 4.8455577964122704e-05, "loss": 0.2137, "step": 4880 }, { "epoch": 1.56, "grad_norm": 2.683018922805786, "learning_rate": 4.8349432119732514e-05, "loss": 0.1528, "step": 4890 }, { "epoch": 1.56, "grad_norm": 1.2843786478042603, "learning_rate": 4.8243286275342325e-05, "loss": 0.1525, "step": 4900 }, { "epoch": 1.56, "grad_norm": 0.9824750423431396, "learning_rate": 4.8137140430952135e-05, "loss": 0.1682, "step": 4910 }, { "epoch": 1.57, "grad_norm": 1.0165822505950928, "learning_rate": 4.803099458656194e-05, "loss": 0.2397, "step": 4920 }, { "epoch": 1.57, "grad_norm": 2.0921578407287598, "learning_rate": 4.792484874217175e-05, "loss": 0.2342, "step": 4930 }, { "epoch": 1.57, "grad_norm": 2.5232343673706055, "learning_rate": 4.781870289778155e-05, "loss": 0.2216, "step": 4940 }, { "epoch": 1.58, "grad_norm": 5.7156782150268555, "learning_rate": 4.7712557053391363e-05, "loss": 0.2342, "step": 4950 }, { "epoch": 1.58, "grad_norm": 3.128016233444214, "learning_rate": 4.760641120900117e-05, "loss": 0.1759, "step": 4960 }, { "epoch": 1.58, "grad_norm": 2.2040598392486572, "learning_rate": 4.750026536461098e-05, "loss": 0.1414, "step": 4970 }, { "epoch": 1.59, "grad_norm": 2.1795644760131836, "learning_rate": 4.739411952022079e-05, "loss": 0.1648, "step": 4980 }, { "epoch": 1.59, "grad_norm": 5.399777412414551, "learning_rate": 4.728797367583059e-05, "loss": 0.1344, "step": 4990 }, { "epoch": 1.59, "grad_norm": 0.06098851189017296, "learning_rate": 4.71818278314404e-05, "loss": 0.1188, "step": 5000 }, { "epoch": 1.59, "grad_norm": 3.174159049987793, "learning_rate": 4.7075681987050206e-05, "loss": 0.3419, "step": 5010 }, { "epoch": 1.6, "grad_norm": 4.566168308258057, "learning_rate": 4.6969536142660016e-05, "loss": 0.2582, "step": 5020 }, { "epoch": 1.6, "grad_norm": 0.5227226614952087, "learning_rate": 4.686339029826983e-05, "loss": 0.1691, "step": 5030 }, { "epoch": 1.6, "grad_norm": 5.8460869789123535, "learning_rate": 4.675724445387963e-05, "loss": 0.1399, "step": 5040 }, { "epoch": 1.61, "grad_norm": 2.2399487495422363, "learning_rate": 4.665109860948944e-05, "loss": 0.1549, "step": 5050 }, { "epoch": 1.61, "grad_norm": 2.9508166313171387, "learning_rate": 4.6544952765099245e-05, "loss": 0.1665, "step": 5060 }, { "epoch": 1.61, "grad_norm": 2.5230746269226074, "learning_rate": 4.6438806920709055e-05, "loss": 0.2108, "step": 5070 }, { "epoch": 1.62, "grad_norm": 0.5516650080680847, "learning_rate": 4.633266107631886e-05, "loss": 0.2275, "step": 5080 }, { "epoch": 1.62, "grad_norm": 8.398303985595703, "learning_rate": 4.6226515231928676e-05, "loss": 0.1801, "step": 5090 }, { "epoch": 1.62, "grad_norm": 0.2512928247451782, "learning_rate": 4.612036938753848e-05, "loss": 0.2654, "step": 5100 }, { "epoch": 1.63, "grad_norm": 5.312344551086426, "learning_rate": 4.601422354314829e-05, "loss": 0.2992, "step": 5110 }, { "epoch": 1.63, "grad_norm": 1.728023648262024, "learning_rate": 4.5908077698758094e-05, "loss": 0.1638, "step": 5120 }, { "epoch": 1.63, "grad_norm": 1.6222649812698364, "learning_rate": 4.5801931854367904e-05, "loss": 0.2216, "step": 5130 }, { "epoch": 1.64, "grad_norm": 0.5581383109092712, "learning_rate": 4.569578600997771e-05, "loss": 0.2467, "step": 5140 }, { "epoch": 1.64, "grad_norm": 3.051811456680298, "learning_rate": 4.558964016558752e-05, "loss": 0.1486, "step": 5150 }, { "epoch": 1.64, "grad_norm": 0.6013765931129456, "learning_rate": 4.548349432119733e-05, "loss": 0.123, "step": 5160 }, { "epoch": 1.65, "grad_norm": 3.8984789848327637, "learning_rate": 4.537734847680713e-05, "loss": 0.3698, "step": 5170 }, { "epoch": 1.65, "grad_norm": 1.3346749544143677, "learning_rate": 4.527120263241694e-05, "loss": 0.1814, "step": 5180 }, { "epoch": 1.65, "grad_norm": 11.491423606872559, "learning_rate": 4.516505678802675e-05, "loss": 0.1745, "step": 5190 }, { "epoch": 1.66, "grad_norm": 2.358656883239746, "learning_rate": 4.505891094363656e-05, "loss": 0.2734, "step": 5200 }, { "epoch": 1.66, "grad_norm": 3.3352041244506836, "learning_rate": 4.495276509924637e-05, "loss": 0.2054, "step": 5210 }, { "epoch": 1.66, "grad_norm": 0.052441373467445374, "learning_rate": 4.484661925485618e-05, "loss": 0.1389, "step": 5220 }, { "epoch": 1.66, "grad_norm": 0.20047003030776978, "learning_rate": 4.474047341046598e-05, "loss": 0.1197, "step": 5230 }, { "epoch": 1.67, "grad_norm": 1.4837030172348022, "learning_rate": 4.463432756607579e-05, "loss": 0.2446, "step": 5240 }, { "epoch": 1.67, "grad_norm": 0.3104861378669739, "learning_rate": 4.4528181721685596e-05, "loss": 0.1842, "step": 5250 }, { "epoch": 1.67, "grad_norm": 7.954286098480225, "learning_rate": 4.442203587729541e-05, "loss": 0.1221, "step": 5260 }, { "epoch": 1.68, "grad_norm": 0.03400198742747307, "learning_rate": 4.431589003290522e-05, "loss": 0.1513, "step": 5270 }, { "epoch": 1.68, "grad_norm": 0.08371475338935852, "learning_rate": 4.420974418851502e-05, "loss": 0.2098, "step": 5280 }, { "epoch": 1.68, "grad_norm": 1.2470760345458984, "learning_rate": 4.410359834412483e-05, "loss": 0.117, "step": 5290 }, { "epoch": 1.69, "grad_norm": 1.5426656007766724, "learning_rate": 4.3997452499734635e-05, "loss": 0.1826, "step": 5300 }, { "epoch": 1.69, "grad_norm": 3.978109121322632, "learning_rate": 4.3891306655344445e-05, "loss": 0.1103, "step": 5310 }, { "epoch": 1.69, "grad_norm": 1.6321693658828735, "learning_rate": 4.378516081095425e-05, "loss": 0.151, "step": 5320 }, { "epoch": 1.7, "grad_norm": 2.555723190307617, "learning_rate": 4.3679014966564066e-05, "loss": 0.1786, "step": 5330 }, { "epoch": 1.7, "grad_norm": 0.2461155354976654, "learning_rate": 4.357286912217387e-05, "loss": 0.1914, "step": 5340 }, { "epoch": 1.7, "grad_norm": 0.41670894622802734, "learning_rate": 4.346672327778368e-05, "loss": 0.2582, "step": 5350 }, { "epoch": 1.71, "grad_norm": 4.785902976989746, "learning_rate": 4.3360577433393484e-05, "loss": 0.0911, "step": 5360 }, { "epoch": 1.71, "grad_norm": 4.179080963134766, "learning_rate": 4.3254431589003295e-05, "loss": 0.2264, "step": 5370 }, { "epoch": 1.71, "grad_norm": 0.9344226717948914, "learning_rate": 4.31482857446131e-05, "loss": 0.2003, "step": 5380 }, { "epoch": 1.72, "grad_norm": 0.3643859624862671, "learning_rate": 4.304213990022291e-05, "loss": 0.1, "step": 5390 }, { "epoch": 1.72, "grad_norm": 2.3688154220581055, "learning_rate": 4.293599405583272e-05, "loss": 0.2461, "step": 5400 }, { "epoch": 1.72, "grad_norm": 4.223112106323242, "learning_rate": 4.282984821144252e-05, "loss": 0.1316, "step": 5410 }, { "epoch": 1.73, "grad_norm": 1.52751886844635, "learning_rate": 4.2723702367052333e-05, "loss": 0.162, "step": 5420 }, { "epoch": 1.73, "grad_norm": 0.06534834951162338, "learning_rate": 4.261755652266214e-05, "loss": 0.1787, "step": 5430 }, { "epoch": 1.73, "grad_norm": 0.0435919463634491, "learning_rate": 4.251141067827195e-05, "loss": 0.2196, "step": 5440 }, { "epoch": 1.73, "grad_norm": 1.0877362489700317, "learning_rate": 4.240526483388176e-05, "loss": 0.2829, "step": 5450 }, { "epoch": 1.74, "grad_norm": 1.7220368385314941, "learning_rate": 4.229911898949156e-05, "loss": 0.211, "step": 5460 }, { "epoch": 1.74, "grad_norm": 1.6200969219207764, "learning_rate": 4.219297314510137e-05, "loss": 0.2046, "step": 5470 }, { "epoch": 1.74, "grad_norm": 2.376384735107422, "learning_rate": 4.2086827300711176e-05, "loss": 0.2518, "step": 5480 }, { "epoch": 1.75, "grad_norm": 1.6646453142166138, "learning_rate": 4.1980681456320986e-05, "loss": 0.1542, "step": 5490 }, { "epoch": 1.75, "grad_norm": 0.580792248249054, "learning_rate": 4.187453561193079e-05, "loss": 0.1503, "step": 5500 }, { "epoch": 1.75, "grad_norm": 2.325477123260498, "learning_rate": 4.176838976754061e-05, "loss": 0.1867, "step": 5510 }, { "epoch": 1.76, "grad_norm": 3.004499673843384, "learning_rate": 4.166224392315041e-05, "loss": 0.1816, "step": 5520 }, { "epoch": 1.76, "grad_norm": 1.7592769861221313, "learning_rate": 4.155609807876022e-05, "loss": 0.2155, "step": 5530 }, { "epoch": 1.76, "grad_norm": 0.4255143105983734, "learning_rate": 4.1449952234370025e-05, "loss": 0.2298, "step": 5540 }, { "epoch": 1.77, "grad_norm": 4.217332363128662, "learning_rate": 4.1343806389979836e-05, "loss": 0.1263, "step": 5550 }, { "epoch": 1.77, "grad_norm": 1.6670517921447754, "learning_rate": 4.123766054558964e-05, "loss": 0.1993, "step": 5560 }, { "epoch": 1.77, "grad_norm": 0.2432798445224762, "learning_rate": 4.113151470119945e-05, "loss": 0.1992, "step": 5570 }, { "epoch": 1.78, "grad_norm": 5.0905070304870605, "learning_rate": 4.102536885680926e-05, "loss": 0.1381, "step": 5580 }, { "epoch": 1.78, "grad_norm": 12.299093246459961, "learning_rate": 4.0919223012419064e-05, "loss": 0.2233, "step": 5590 }, { "epoch": 1.78, "grad_norm": 0.27092546224594116, "learning_rate": 4.0813077168028874e-05, "loss": 0.1675, "step": 5600 }, { "epoch": 1.79, "grad_norm": 3.4481306076049805, "learning_rate": 4.070693132363868e-05, "loss": 0.3113, "step": 5610 }, { "epoch": 1.79, "grad_norm": 12.642804145812988, "learning_rate": 4.060078547924849e-05, "loss": 0.1557, "step": 5620 }, { "epoch": 1.79, "grad_norm": 4.341307163238525, "learning_rate": 4.049463963485829e-05, "loss": 0.0825, "step": 5630 }, { "epoch": 1.8, "grad_norm": 0.728386402130127, "learning_rate": 4.038849379046811e-05, "loss": 0.1589, "step": 5640 }, { "epoch": 1.8, "grad_norm": 4.2692084312438965, "learning_rate": 4.028234794607791e-05, "loss": 0.0908, "step": 5650 }, { "epoch": 1.8, "grad_norm": 3.5218265056610107, "learning_rate": 4.0176202101687724e-05, "loss": 0.2008, "step": 5660 }, { "epoch": 1.8, "grad_norm": 0.6934779286384583, "learning_rate": 4.007005625729753e-05, "loss": 0.1652, "step": 5670 }, { "epoch": 1.81, "grad_norm": 7.079185485839844, "learning_rate": 3.996391041290734e-05, "loss": 0.1854, "step": 5680 }, { "epoch": 1.81, "grad_norm": 2.6828112602233887, "learning_rate": 3.985776456851714e-05, "loss": 0.0911, "step": 5690 }, { "epoch": 1.81, "grad_norm": 5.049779891967773, "learning_rate": 3.975161872412695e-05, "loss": 0.1191, "step": 5700 }, { "epoch": 1.82, "grad_norm": 2.4732673168182373, "learning_rate": 3.9656087464175775e-05, "loss": 0.2192, "step": 5710 }, { "epoch": 1.82, "grad_norm": 0.11808130145072937, "learning_rate": 3.9549941619785586e-05, "loss": 0.1782, "step": 5720 }, { "epoch": 1.82, "grad_norm": 3.8879833221435547, "learning_rate": 3.944379577539539e-05, "loss": 0.1692, "step": 5730 }, { "epoch": 1.83, "grad_norm": 3.667048931121826, "learning_rate": 3.933764993100521e-05, "loss": 0.1236, "step": 5740 }, { "epoch": 1.83, "grad_norm": 4.494665622711182, "learning_rate": 3.923150408661501e-05, "loss": 0.2373, "step": 5750 }, { "epoch": 1.83, "grad_norm": 0.3976966440677643, "learning_rate": 3.912535824222482e-05, "loss": 0.2805, "step": 5760 }, { "epoch": 1.84, "grad_norm": 2.046142578125, "learning_rate": 3.9019212397834625e-05, "loss": 0.1198, "step": 5770 }, { "epoch": 1.84, "grad_norm": 0.27937573194503784, "learning_rate": 3.8913066553444435e-05, "loss": 0.1443, "step": 5780 }, { "epoch": 1.84, "grad_norm": 6.109045028686523, "learning_rate": 3.880692070905424e-05, "loss": 0.3341, "step": 5790 }, { "epoch": 1.85, "grad_norm": 0.7306396961212158, "learning_rate": 3.870077486466405e-05, "loss": 0.1208, "step": 5800 }, { "epoch": 1.85, "grad_norm": 1.7087950706481934, "learning_rate": 3.859462902027386e-05, "loss": 0.1464, "step": 5810 }, { "epoch": 1.85, "grad_norm": 0.5200537443161011, "learning_rate": 3.8488483175883663e-05, "loss": 0.1639, "step": 5820 }, { "epoch": 1.86, "grad_norm": 6.455096244812012, "learning_rate": 3.8382337331493474e-05, "loss": 0.1885, "step": 5830 }, { "epoch": 1.86, "grad_norm": 7.437272548675537, "learning_rate": 3.827619148710328e-05, "loss": 0.1916, "step": 5840 }, { "epoch": 1.86, "grad_norm": 6.395534515380859, "learning_rate": 3.817004564271309e-05, "loss": 0.2988, "step": 5850 }, { "epoch": 1.87, "grad_norm": 20.61446762084961, "learning_rate": 3.80638997983229e-05, "loss": 0.0853, "step": 5860 }, { "epoch": 1.87, "grad_norm": 1.0395785570144653, "learning_rate": 3.795775395393271e-05, "loss": 0.2113, "step": 5870 }, { "epoch": 1.87, "grad_norm": 8.83860969543457, "learning_rate": 3.785160810954251e-05, "loss": 0.1904, "step": 5880 }, { "epoch": 1.87, "grad_norm": 5.42601203918457, "learning_rate": 3.774546226515232e-05, "loss": 0.3887, "step": 5890 }, { "epoch": 1.88, "grad_norm": 3.3505442142486572, "learning_rate": 3.763931642076213e-05, "loss": 0.1397, "step": 5900 }, { "epoch": 1.88, "grad_norm": 4.929141521453857, "learning_rate": 3.753317057637194e-05, "loss": 0.2773, "step": 5910 }, { "epoch": 1.88, "grad_norm": 2.1540703773498535, "learning_rate": 3.742702473198175e-05, "loss": 0.1679, "step": 5920 }, { "epoch": 1.89, "grad_norm": 10.82689094543457, "learning_rate": 3.732087888759155e-05, "loss": 0.1776, "step": 5930 }, { "epoch": 1.89, "grad_norm": 3.0525174140930176, "learning_rate": 3.721473304320136e-05, "loss": 0.1619, "step": 5940 }, { "epoch": 1.89, "grad_norm": 5.296212196350098, "learning_rate": 3.7108587198811166e-05, "loss": 0.3294, "step": 5950 }, { "epoch": 1.9, "grad_norm": 3.4226958751678467, "learning_rate": 3.7002441354420976e-05, "loss": 0.3229, "step": 5960 }, { "epoch": 1.9, "grad_norm": 0.4734908938407898, "learning_rate": 3.689629551003078e-05, "loss": 0.1179, "step": 5970 }, { "epoch": 1.9, "grad_norm": 5.436024188995361, "learning_rate": 3.67901496656406e-05, "loss": 0.1892, "step": 5980 }, { "epoch": 1.91, "grad_norm": 5.233070373535156, "learning_rate": 3.66840038212504e-05, "loss": 0.2054, "step": 5990 }, { "epoch": 1.91, "grad_norm": 0.5661432147026062, "learning_rate": 3.657785797686021e-05, "loss": 0.2202, "step": 6000 }, { "epoch": 1.91, "grad_norm": 0.23524077236652374, "learning_rate": 3.6471712132470015e-05, "loss": 0.2318, "step": 6010 }, { "epoch": 1.92, "grad_norm": 0.05953243002295494, "learning_rate": 3.6365566288079825e-05, "loss": 0.2486, "step": 6020 }, { "epoch": 1.92, "grad_norm": 1.3823449611663818, "learning_rate": 3.625942044368963e-05, "loss": 0.1171, "step": 6030 }, { "epoch": 1.92, "grad_norm": 7.733388423919678, "learning_rate": 3.615327459929944e-05, "loss": 0.2469, "step": 6040 }, { "epoch": 1.93, "grad_norm": 1.4917621612548828, "learning_rate": 3.604712875490925e-05, "loss": 0.2045, "step": 6050 }, { "epoch": 1.93, "grad_norm": 7.689728736877441, "learning_rate": 3.5940982910519054e-05, "loss": 0.1648, "step": 6060 }, { "epoch": 1.93, "grad_norm": 2.2216577529907227, "learning_rate": 3.5834837066128864e-05, "loss": 0.2779, "step": 6070 }, { "epoch": 1.94, "grad_norm": 1.7362425327301025, "learning_rate": 3.572869122173867e-05, "loss": 0.1664, "step": 6080 }, { "epoch": 1.94, "grad_norm": 4.933811187744141, "learning_rate": 3.562254537734848e-05, "loss": 0.293, "step": 6090 }, { "epoch": 1.94, "grad_norm": 4.054910182952881, "learning_rate": 3.551639953295829e-05, "loss": 0.1539, "step": 6100 }, { "epoch": 1.94, "grad_norm": 0.9219651222229004, "learning_rate": 3.541025368856809e-05, "loss": 0.1111, "step": 6110 }, { "epoch": 1.95, "grad_norm": 4.558506488800049, "learning_rate": 3.53041078441779e-05, "loss": 0.1783, "step": 6120 }, { "epoch": 1.95, "grad_norm": 2.6951773166656494, "learning_rate": 3.5197961999787707e-05, "loss": 0.2916, "step": 6130 }, { "epoch": 1.95, "grad_norm": 0.9989050626754761, "learning_rate": 3.509181615539752e-05, "loss": 0.2099, "step": 6140 }, { "epoch": 1.96, "grad_norm": 0.08494656533002853, "learning_rate": 3.498567031100732e-05, "loss": 0.1255, "step": 6150 }, { "epoch": 1.96, "grad_norm": 0.20273062586784363, "learning_rate": 3.487952446661714e-05, "loss": 0.1523, "step": 6160 }, { "epoch": 1.96, "grad_norm": 0.2878829538822174, "learning_rate": 3.477337862222694e-05, "loss": 0.1732, "step": 6170 }, { "epoch": 1.97, "grad_norm": 2.026616096496582, "learning_rate": 3.466723277783675e-05, "loss": 0.1037, "step": 6180 }, { "epoch": 1.97, "grad_norm": 0.8350101709365845, "learning_rate": 3.4561086933446556e-05, "loss": 0.1169, "step": 6190 }, { "epoch": 1.97, "grad_norm": 0.6492775082588196, "learning_rate": 3.4454941089056366e-05, "loss": 0.1758, "step": 6200 }, { "epoch": 1.98, "grad_norm": 4.830353736877441, "learning_rate": 3.434879524466617e-05, "loss": 0.3367, "step": 6210 }, { "epoch": 1.98, "grad_norm": 5.267330169677734, "learning_rate": 3.424264940027598e-05, "loss": 0.1753, "step": 6220 }, { "epoch": 1.98, "grad_norm": 0.11368358880281448, "learning_rate": 3.413650355588579e-05, "loss": 0.2409, "step": 6230 }, { "epoch": 1.99, "grad_norm": 0.10408168286085129, "learning_rate": 3.4030357711495595e-05, "loss": 0.1407, "step": 6240 }, { "epoch": 1.99, "grad_norm": 4.495917320251465, "learning_rate": 3.3924211867105405e-05, "loss": 0.1504, "step": 6250 }, { "epoch": 1.99, "grad_norm": 0.16925585269927979, "learning_rate": 3.381806602271521e-05, "loss": 0.1323, "step": 6260 }, { "epoch": 2.0, "grad_norm": 2.5475289821624756, "learning_rate": 3.371192017832502e-05, "loss": 0.1902, "step": 6270 }, { "epoch": 2.0, "grad_norm": 2.21207332611084, "learning_rate": 3.360577433393483e-05, "loss": 0.1019, "step": 6280 }, { "epoch": 2.0, "grad_norm": 2.7308425903320312, "learning_rate": 3.349962848954464e-05, "loss": 0.1368, "step": 6290 }, { "epoch": 2.01, "grad_norm": 0.8695929646492004, "learning_rate": 3.3393482645154444e-05, "loss": 0.1979, "step": 6300 }, { "epoch": 2.01, "grad_norm": 5.150228977203369, "learning_rate": 3.3287336800764254e-05, "loss": 0.1237, "step": 6310 }, { "epoch": 2.01, "grad_norm": 0.1432078331708908, "learning_rate": 3.318119095637406e-05, "loss": 0.1547, "step": 6320 }, { "epoch": 2.01, "grad_norm": 3.952962875366211, "learning_rate": 3.307504511198387e-05, "loss": 0.1682, "step": 6330 }, { "epoch": 2.02, "grad_norm": 0.044416822493076324, "learning_rate": 3.296889926759367e-05, "loss": 0.0388, "step": 6340 }, { "epoch": 2.02, "grad_norm": 6.307524681091309, "learning_rate": 3.286275342320348e-05, "loss": 0.1418, "step": 6350 }, { "epoch": 2.02, "grad_norm": 0.1354295015335083, "learning_rate": 3.275660757881329e-05, "loss": 0.2588, "step": 6360 }, { "epoch": 2.03, "grad_norm": 3.275066614151001, "learning_rate": 3.26504617344231e-05, "loss": 0.1091, "step": 6370 }, { "epoch": 2.03, "grad_norm": 0.0923081785440445, "learning_rate": 3.254431589003291e-05, "loss": 0.1384, "step": 6380 }, { "epoch": 2.03, "grad_norm": 3.508528232574463, "learning_rate": 3.243817004564271e-05, "loss": 0.217, "step": 6390 }, { "epoch": 2.04, "grad_norm": 2.36240291595459, "learning_rate": 3.233202420125252e-05, "loss": 0.0337, "step": 6400 }, { "epoch": 2.04, "grad_norm": 0.20124652981758118, "learning_rate": 3.222587835686233e-05, "loss": 0.0982, "step": 6410 }, { "epoch": 2.04, "grad_norm": 0.8248081803321838, "learning_rate": 3.211973251247214e-05, "loss": 0.2217, "step": 6420 }, { "epoch": 2.05, "grad_norm": 1.1201878786087036, "learning_rate": 3.2013586668081946e-05, "loss": 0.0651, "step": 6430 }, { "epoch": 2.05, "grad_norm": 1.6418076753616333, "learning_rate": 3.1907440823691757e-05, "loss": 0.0738, "step": 6440 }, { "epoch": 2.05, "grad_norm": 2.1913180351257324, "learning_rate": 3.180129497930156e-05, "loss": 0.0863, "step": 6450 }, { "epoch": 2.06, "grad_norm": 1.3282325267791748, "learning_rate": 3.1695149134911364e-05, "loss": 0.0582, "step": 6460 }, { "epoch": 2.06, "grad_norm": 2.451772451400757, "learning_rate": 3.158900329052118e-05, "loss": 0.1187, "step": 6470 }, { "epoch": 2.06, "grad_norm": 0.1372409611940384, "learning_rate": 3.1482857446130985e-05, "loss": 0.0618, "step": 6480 }, { "epoch": 2.07, "grad_norm": 0.08469751477241516, "learning_rate": 3.1376711601740795e-05, "loss": 0.0316, "step": 6490 }, { "epoch": 2.07, "grad_norm": 0.1473696529865265, "learning_rate": 3.12705657573506e-05, "loss": 0.0954, "step": 6500 }, { "epoch": 2.07, "grad_norm": 0.06819278746843338, "learning_rate": 3.116441991296041e-05, "loss": 0.1365, "step": 6510 }, { "epoch": 2.08, "grad_norm": 8.832886695861816, "learning_rate": 3.105827406857021e-05, "loss": 0.1828, "step": 6520 }, { "epoch": 2.08, "grad_norm": 0.043228354305028915, "learning_rate": 3.0952128224180024e-05, "loss": 0.1541, "step": 6530 }, { "epoch": 2.08, "grad_norm": 0.1457592248916626, "learning_rate": 3.0845982379789834e-05, "loss": 0.0291, "step": 6540 }, { "epoch": 2.08, "grad_norm": 1.5548399686813354, "learning_rate": 3.073983653539964e-05, "loss": 0.123, "step": 6550 }, { "epoch": 2.09, "grad_norm": 5.61803674697876, "learning_rate": 3.063369069100945e-05, "loss": 0.1871, "step": 6560 }, { "epoch": 2.09, "grad_norm": 0.020372767001390457, "learning_rate": 3.052754484661925e-05, "loss": 0.0865, "step": 6570 }, { "epoch": 2.09, "grad_norm": 5.178860664367676, "learning_rate": 3.0421399002229062e-05, "loss": 0.1568, "step": 6580 }, { "epoch": 2.1, "grad_norm": 4.118620872497559, "learning_rate": 3.0315253157838873e-05, "loss": 0.0729, "step": 6590 }, { "epoch": 2.1, "grad_norm": 3.9899566173553467, "learning_rate": 3.020910731344868e-05, "loss": 0.2327, "step": 6600 }, { "epoch": 2.1, "grad_norm": 1.3902517557144165, "learning_rate": 3.0102961469058487e-05, "loss": 0.1305, "step": 6610 }, { "epoch": 2.11, "grad_norm": 5.5835957527160645, "learning_rate": 2.9996815624668294e-05, "loss": 0.1032, "step": 6620 }, { "epoch": 2.11, "grad_norm": 1.521474003791809, "learning_rate": 2.98906697802781e-05, "loss": 0.1188, "step": 6630 }, { "epoch": 2.11, "grad_norm": 0.19501766562461853, "learning_rate": 2.978452393588791e-05, "loss": 0.0989, "step": 6640 }, { "epoch": 2.12, "grad_norm": 0.03989823907613754, "learning_rate": 2.9678378091497722e-05, "loss": 0.0736, "step": 6650 }, { "epoch": 2.12, "grad_norm": 3.9346630573272705, "learning_rate": 2.957223224710753e-05, "loss": 0.0347, "step": 6660 }, { "epoch": 2.12, "grad_norm": 0.05866791680455208, "learning_rate": 2.9466086402717336e-05, "loss": 0.1317, "step": 6670 }, { "epoch": 2.13, "grad_norm": 0.660900890827179, "learning_rate": 2.9359940558327143e-05, "loss": 0.1365, "step": 6680 }, { "epoch": 2.13, "grad_norm": 0.20864763855934143, "learning_rate": 2.925379471393695e-05, "loss": 0.2221, "step": 6690 }, { "epoch": 2.13, "grad_norm": 2.8652963638305664, "learning_rate": 2.9147648869546758e-05, "loss": 0.0355, "step": 6700 }, { "epoch": 2.14, "grad_norm": 3.0343375205993652, "learning_rate": 2.9041503025156568e-05, "loss": 0.2081, "step": 6710 }, { "epoch": 2.14, "grad_norm": 2.393002510070801, "learning_rate": 2.8935357180766375e-05, "loss": 0.1076, "step": 6720 }, { "epoch": 2.14, "grad_norm": 0.08225111663341522, "learning_rate": 2.8829211336376182e-05, "loss": 0.1367, "step": 6730 }, { "epoch": 2.15, "grad_norm": 4.09624719619751, "learning_rate": 2.872306549198599e-05, "loss": 0.2712, "step": 6740 }, { "epoch": 2.15, "grad_norm": 0.667273998260498, "learning_rate": 2.8616919647595796e-05, "loss": 0.152, "step": 6750 }, { "epoch": 2.15, "grad_norm": 1.4781357049942017, "learning_rate": 2.8510773803205603e-05, "loss": 0.0949, "step": 6760 }, { "epoch": 2.16, "grad_norm": 4.563651084899902, "learning_rate": 2.8404627958815417e-05, "loss": 0.0873, "step": 6770 }, { "epoch": 2.16, "grad_norm": 3.7740418910980225, "learning_rate": 2.830909669886424e-05, "loss": 0.1207, "step": 6780 }, { "epoch": 2.16, "grad_norm": 10.370115280151367, "learning_rate": 2.8202950854474048e-05, "loss": 0.1369, "step": 6790 }, { "epoch": 2.16, "grad_norm": 0.13098500669002533, "learning_rate": 2.8096805010083855e-05, "loss": 0.1016, "step": 6800 }, { "epoch": 2.17, "grad_norm": 9.170578956604004, "learning_rate": 2.7990659165693665e-05, "loss": 0.0463, "step": 6810 }, { "epoch": 2.17, "grad_norm": 10.379976272583008, "learning_rate": 2.7884513321303472e-05, "loss": 0.0798, "step": 6820 }, { "epoch": 2.17, "grad_norm": 0.10993140935897827, "learning_rate": 2.777836747691328e-05, "loss": 0.084, "step": 6830 }, { "epoch": 2.18, "grad_norm": 0.4707590937614441, "learning_rate": 2.7672221632523087e-05, "loss": 0.1232, "step": 6840 }, { "epoch": 2.18, "grad_norm": 4.587014198303223, "learning_rate": 2.7566075788132894e-05, "loss": 0.1474, "step": 6850 }, { "epoch": 2.18, "grad_norm": 10.61086654663086, "learning_rate": 2.74599299437427e-05, "loss": 0.1282, "step": 6860 }, { "epoch": 2.19, "grad_norm": 0.2299477756023407, "learning_rate": 2.7353784099352515e-05, "loss": 0.0667, "step": 6870 }, { "epoch": 2.19, "grad_norm": 5.911661624908447, "learning_rate": 2.724763825496232e-05, "loss": 0.1222, "step": 6880 }, { "epoch": 2.19, "grad_norm": 0.1657014936208725, "learning_rate": 2.714149241057213e-05, "loss": 0.072, "step": 6890 }, { "epoch": 2.2, "grad_norm": 0.04870441555976868, "learning_rate": 2.7035346566181936e-05, "loss": 0.1132, "step": 6900 }, { "epoch": 2.2, "grad_norm": 0.7382871508598328, "learning_rate": 2.6929200721791743e-05, "loss": 0.0272, "step": 6910 }, { "epoch": 2.2, "grad_norm": 1.1875141859054565, "learning_rate": 2.682305487740155e-05, "loss": 0.0833, "step": 6920 }, { "epoch": 2.21, "grad_norm": 0.070220448076725, "learning_rate": 2.671690903301136e-05, "loss": 0.1321, "step": 6930 }, { "epoch": 2.21, "grad_norm": 3.7514150142669678, "learning_rate": 2.6610763188621167e-05, "loss": 0.0971, "step": 6940 }, { "epoch": 2.21, "grad_norm": 0.04383459314703941, "learning_rate": 2.6504617344230975e-05, "loss": 0.0878, "step": 6950 }, { "epoch": 2.22, "grad_norm": 0.11518880724906921, "learning_rate": 2.639847149984078e-05, "loss": 0.0679, "step": 6960 }, { "epoch": 2.22, "grad_norm": 5.474330902099609, "learning_rate": 2.629232565545059e-05, "loss": 0.0497, "step": 6970 }, { "epoch": 2.22, "grad_norm": 0.03785128891468048, "learning_rate": 2.6186179811060396e-05, "loss": 0.1183, "step": 6980 }, { "epoch": 2.23, "grad_norm": 0.050687942653894424, "learning_rate": 2.608003396667021e-05, "loss": 0.1141, "step": 6990 }, { "epoch": 2.23, "grad_norm": 5.501091003417969, "learning_rate": 2.5973888122280017e-05, "loss": 0.1175, "step": 7000 }, { "epoch": 2.23, "grad_norm": 1.3896145820617676, "learning_rate": 2.5867742277889824e-05, "loss": 0.1665, "step": 7010 }, { "epoch": 2.23, "grad_norm": 5.888062000274658, "learning_rate": 2.576159643349963e-05, "loss": 0.1868, "step": 7020 }, { "epoch": 2.24, "grad_norm": 0.3350411653518677, "learning_rate": 2.5655450589109438e-05, "loss": 0.0262, "step": 7030 }, { "epoch": 2.24, "grad_norm": 0.12134930491447449, "learning_rate": 2.5549304744719245e-05, "loss": 0.1391, "step": 7040 }, { "epoch": 2.24, "grad_norm": 2.653724193572998, "learning_rate": 2.5443158900329056e-05, "loss": 0.044, "step": 7050 }, { "epoch": 2.25, "grad_norm": 1.480675458908081, "learning_rate": 2.5337013055938863e-05, "loss": 0.125, "step": 7060 }, { "epoch": 2.25, "grad_norm": 2.112579584121704, "learning_rate": 2.523086721154867e-05, "loss": 0.0774, "step": 7070 }, { "epoch": 2.25, "grad_norm": 0.03731192275881767, "learning_rate": 2.5124721367158477e-05, "loss": 0.0703, "step": 7080 }, { "epoch": 2.26, "grad_norm": 0.06327365338802338, "learning_rate": 2.5018575522768284e-05, "loss": 0.1557, "step": 7090 }, { "epoch": 2.26, "grad_norm": 0.10991324484348297, "learning_rate": 2.4912429678378094e-05, "loss": 0.0682, "step": 7100 }, { "epoch": 2.26, "grad_norm": 0.03156714513897896, "learning_rate": 2.48062838339879e-05, "loss": 0.1716, "step": 7110 }, { "epoch": 2.27, "grad_norm": 9.979147911071777, "learning_rate": 2.470013798959771e-05, "loss": 0.1797, "step": 7120 }, { "epoch": 2.27, "grad_norm": 3.263706684112549, "learning_rate": 2.459399214520752e-05, "loss": 0.0659, "step": 7130 }, { "epoch": 2.27, "grad_norm": 6.261413097381592, "learning_rate": 2.4487846300817323e-05, "loss": 0.0968, "step": 7140 }, { "epoch": 2.28, "grad_norm": 1.550948143005371, "learning_rate": 2.438170045642713e-05, "loss": 0.1336, "step": 7150 }, { "epoch": 2.28, "grad_norm": 1.1487703323364258, "learning_rate": 2.427555461203694e-05, "loss": 0.0647, "step": 7160 }, { "epoch": 2.28, "grad_norm": 6.673706531524658, "learning_rate": 2.4169408767646747e-05, "loss": 0.1567, "step": 7170 }, { "epoch": 2.29, "grad_norm": 0.17169363796710968, "learning_rate": 2.4063262923256554e-05, "loss": 0.1096, "step": 7180 }, { "epoch": 2.29, "grad_norm": 8.660694122314453, "learning_rate": 2.3957117078866365e-05, "loss": 0.1589, "step": 7190 }, { "epoch": 2.29, "grad_norm": 1.5906010866165161, "learning_rate": 2.3850971234476172e-05, "loss": 0.1224, "step": 7200 }, { "epoch": 2.3, "grad_norm": 0.8341835141181946, "learning_rate": 2.374482539008598e-05, "loss": 0.02, "step": 7210 }, { "epoch": 2.3, "grad_norm": 10.785898208618164, "learning_rate": 2.363867954569579e-05, "loss": 0.1153, "step": 7220 }, { "epoch": 2.3, "grad_norm": 5.174521446228027, "learning_rate": 2.3532533701305597e-05, "loss": 0.0843, "step": 7230 }, { "epoch": 2.3, "grad_norm": 0.7447335720062256, "learning_rate": 2.3426387856915404e-05, "loss": 0.1465, "step": 7240 }, { "epoch": 2.31, "grad_norm": 3.2618470191955566, "learning_rate": 2.332024201252521e-05, "loss": 0.0874, "step": 7250 }, { "epoch": 2.31, "grad_norm": 1.483594298362732, "learning_rate": 2.3214096168135018e-05, "loss": 0.1061, "step": 7260 }, { "epoch": 2.31, "grad_norm": 0.303654283285141, "learning_rate": 2.3107950323744825e-05, "loss": 0.117, "step": 7270 }, { "epoch": 2.32, "grad_norm": 10.942138671875, "learning_rate": 2.3001804479354635e-05, "loss": 0.2048, "step": 7280 }, { "epoch": 2.32, "grad_norm": 7.95550012588501, "learning_rate": 2.2895658634964442e-05, "loss": 0.1158, "step": 7290 }, { "epoch": 2.32, "grad_norm": 0.05263487249612808, "learning_rate": 2.278951279057425e-05, "loss": 0.0142, "step": 7300 }, { "epoch": 2.33, "grad_norm": 0.04684547707438469, "learning_rate": 2.268336694618406e-05, "loss": 0.15, "step": 7310 }, { "epoch": 2.33, "grad_norm": 6.8654890060424805, "learning_rate": 2.2577221101793867e-05, "loss": 0.1816, "step": 7320 }, { "epoch": 2.33, "grad_norm": 11.469459533691406, "learning_rate": 2.2471075257403674e-05, "loss": 0.1426, "step": 7330 }, { "epoch": 2.34, "grad_norm": 5.302177906036377, "learning_rate": 2.236492941301348e-05, "loss": 0.2248, "step": 7340 }, { "epoch": 2.34, "grad_norm": 2.6794090270996094, "learning_rate": 2.2258783568623288e-05, "loss": 0.157, "step": 7350 }, { "epoch": 2.34, "grad_norm": 1.3895156383514404, "learning_rate": 2.2152637724233095e-05, "loss": 0.159, "step": 7360 }, { "epoch": 2.35, "grad_norm": 0.17077626287937164, "learning_rate": 2.2046491879842906e-05, "loss": 0.1298, "step": 7370 }, { "epoch": 2.35, "grad_norm": 0.14379891753196716, "learning_rate": 2.1940346035452713e-05, "loss": 0.072, "step": 7380 }, { "epoch": 2.35, "grad_norm": 0.946506142616272, "learning_rate": 2.183420019106252e-05, "loss": 0.0414, "step": 7390 }, { "epoch": 2.36, "grad_norm": 0.10742925852537155, "learning_rate": 2.172805434667233e-05, "loss": 0.1991, "step": 7400 }, { "epoch": 2.36, "grad_norm": 4.503111362457275, "learning_rate": 2.1621908502282138e-05, "loss": 0.1018, "step": 7410 }, { "epoch": 2.36, "grad_norm": 0.025181856006383896, "learning_rate": 2.1515762657891945e-05, "loss": 0.2192, "step": 7420 }, { "epoch": 2.37, "grad_norm": 0.2496863454580307, "learning_rate": 2.140961681350175e-05, "loss": 0.1513, "step": 7430 }, { "epoch": 2.37, "grad_norm": 0.18356376886367798, "learning_rate": 2.1303470969111562e-05, "loss": 0.0928, "step": 7440 }, { "epoch": 2.37, "grad_norm": 4.700144290924072, "learning_rate": 2.119732512472137e-05, "loss": 0.1076, "step": 7450 }, { "epoch": 2.37, "grad_norm": 1.5925829410552979, "learning_rate": 2.1091179280331176e-05, "loss": 0.0673, "step": 7460 }, { "epoch": 2.38, "grad_norm": 0.5920007228851318, "learning_rate": 2.0985033435940983e-05, "loss": 0.1291, "step": 7470 }, { "epoch": 2.38, "grad_norm": 5.156589508056641, "learning_rate": 2.087888759155079e-05, "loss": 0.3071, "step": 7480 }, { "epoch": 2.38, "grad_norm": 0.03765925392508507, "learning_rate": 2.0772741747160598e-05, "loss": 0.1093, "step": 7490 }, { "epoch": 2.39, "grad_norm": 0.4249335825443268, "learning_rate": 2.0666595902770408e-05, "loss": 0.1279, "step": 7500 }, { "epoch": 2.39, "grad_norm": 0.016695374622941017, "learning_rate": 2.0560450058380215e-05, "loss": 0.2595, "step": 7510 }, { "epoch": 2.39, "grad_norm": 0.8157448768615723, "learning_rate": 2.0454304213990022e-05, "loss": 0.2879, "step": 7520 }, { "epoch": 2.4, "grad_norm": 0.43193209171295166, "learning_rate": 2.0348158369599833e-05, "loss": 0.1147, "step": 7530 }, { "epoch": 2.4, "grad_norm": 1.1754236221313477, "learning_rate": 2.024201252520964e-05, "loss": 0.1228, "step": 7540 }, { "epoch": 2.4, "grad_norm": 0.073044553399086, "learning_rate": 2.0135866680819447e-05, "loss": 0.0953, "step": 7550 }, { "epoch": 2.41, "grad_norm": 6.481806755065918, "learning_rate": 2.0029720836429254e-05, "loss": 0.0925, "step": 7560 }, { "epoch": 2.41, "grad_norm": 3.421597719192505, "learning_rate": 1.992357499203906e-05, "loss": 0.1844, "step": 7570 }, { "epoch": 2.41, "grad_norm": 0.15194571018218994, "learning_rate": 1.9817429147648868e-05, "loss": 0.3675, "step": 7580 }, { "epoch": 2.42, "grad_norm": 0.44171637296676636, "learning_rate": 1.971128330325868e-05, "loss": 0.043, "step": 7590 }, { "epoch": 2.42, "grad_norm": 0.06510256975889206, "learning_rate": 1.9605137458868486e-05, "loss": 0.1354, "step": 7600 }, { "epoch": 2.42, "grad_norm": 2.7437000274658203, "learning_rate": 1.9498991614478293e-05, "loss": 0.0389, "step": 7610 }, { "epoch": 2.43, "grad_norm": 1.2895437479019165, "learning_rate": 1.9392845770088103e-05, "loss": 0.1704, "step": 7620 }, { "epoch": 2.43, "grad_norm": 0.03322044759988785, "learning_rate": 1.928669992569791e-05, "loss": 0.1065, "step": 7630 }, { "epoch": 2.43, "grad_norm": 2.8655242919921875, "learning_rate": 1.9180554081307717e-05, "loss": 0.1504, "step": 7640 }, { "epoch": 2.44, "grad_norm": 0.2032870352268219, "learning_rate": 1.9074408236917528e-05, "loss": 0.0229, "step": 7650 }, { "epoch": 2.44, "grad_norm": 1.7102253437042236, "learning_rate": 1.8968262392527335e-05, "loss": 0.1019, "step": 7660 }, { "epoch": 2.44, "grad_norm": 2.740474224090576, "learning_rate": 1.8862116548137142e-05, "loss": 0.1235, "step": 7670 }, { "epoch": 2.44, "grad_norm": 2.9120683670043945, "learning_rate": 1.875597070374695e-05, "loss": 0.0516, "step": 7680 }, { "epoch": 2.45, "grad_norm": 0.11502601206302643, "learning_rate": 1.8649824859356756e-05, "loss": 0.2791, "step": 7690 }, { "epoch": 2.45, "grad_norm": 0.7027528882026672, "learning_rate": 1.8543679014966563e-05, "loss": 0.0385, "step": 7700 }, { "epoch": 2.45, "grad_norm": 2.4370245933532715, "learning_rate": 1.8437533170576374e-05, "loss": 0.0936, "step": 7710 }, { "epoch": 2.46, "grad_norm": 6.21151876449585, "learning_rate": 1.833138732618618e-05, "loss": 0.0806, "step": 7720 }, { "epoch": 2.46, "grad_norm": 0.052706655114889145, "learning_rate": 1.8225241481795988e-05, "loss": 0.1684, "step": 7730 }, { "epoch": 2.46, "grad_norm": 0.24665802717208862, "learning_rate": 1.8119095637405798e-05, "loss": 0.1383, "step": 7740 }, { "epoch": 2.47, "grad_norm": 3.6017708778381348, "learning_rate": 1.8012949793015605e-05, "loss": 0.1204, "step": 7750 }, { "epoch": 2.47, "grad_norm": 3.1942765712738037, "learning_rate": 1.7906803948625412e-05, "loss": 0.0627, "step": 7760 }, { "epoch": 2.47, "grad_norm": 3.020968437194824, "learning_rate": 1.780065810423522e-05, "loss": 0.1656, "step": 7770 }, { "epoch": 2.48, "grad_norm": 0.13594868779182434, "learning_rate": 1.7694512259845027e-05, "loss": 0.0529, "step": 7780 }, { "epoch": 2.48, "grad_norm": 0.0280010886490345, "learning_rate": 1.7588366415454834e-05, "loss": 0.1539, "step": 7790 }, { "epoch": 2.48, "grad_norm": 8.52804946899414, "learning_rate": 1.7482220571064644e-05, "loss": 0.0498, "step": 7800 }, { "epoch": 2.49, "grad_norm": 0.20770138502120972, "learning_rate": 1.737607472667445e-05, "loss": 0.0903, "step": 7810 }, { "epoch": 2.49, "grad_norm": 0.06971104443073273, "learning_rate": 1.7269928882284258e-05, "loss": 0.2458, "step": 7820 }, { "epoch": 2.49, "grad_norm": 0.022506361827254295, "learning_rate": 1.716378303789407e-05, "loss": 0.0741, "step": 7830 }, { "epoch": 2.5, "grad_norm": 4.818386077880859, "learning_rate": 1.7057637193503876e-05, "loss": 0.0586, "step": 7840 }, { "epoch": 2.5, "grad_norm": 0.05160210281610489, "learning_rate": 1.6951491349113683e-05, "loss": 0.0817, "step": 7850 }, { "epoch": 2.5, "grad_norm": 0.15953780710697174, "learning_rate": 1.6845345504723493e-05, "loss": 0.0905, "step": 7860 }, { "epoch": 2.51, "grad_norm": 0.015429453924298286, "learning_rate": 1.67391996603333e-05, "loss": 0.0719, "step": 7870 }, { "epoch": 2.51, "grad_norm": 3.159700632095337, "learning_rate": 1.6633053815943108e-05, "loss": 0.048, "step": 7880 }, { "epoch": 2.51, "grad_norm": 1.702974796295166, "learning_rate": 1.6526907971552915e-05, "loss": 0.1025, "step": 7890 }, { "epoch": 2.51, "grad_norm": 0.7218146324157715, "learning_rate": 1.6420762127162722e-05, "loss": 0.0534, "step": 7900 }, { "epoch": 2.52, "grad_norm": 4.001716136932373, "learning_rate": 1.632523086721155e-05, "loss": 0.1611, "step": 7910 }, { "epoch": 2.52, "grad_norm": 0.6529110074043274, "learning_rate": 1.6219085022821356e-05, "loss": 0.1739, "step": 7920 }, { "epoch": 2.52, "grad_norm": 3.3086657524108887, "learning_rate": 1.6112939178431166e-05, "loss": 0.1373, "step": 7930 }, { "epoch": 2.53, "grad_norm": 2.368133068084717, "learning_rate": 1.6006793334040973e-05, "loss": 0.0841, "step": 7940 }, { "epoch": 2.53, "grad_norm": 5.263741970062256, "learning_rate": 1.590064748965078e-05, "loss": 0.1226, "step": 7950 }, { "epoch": 2.53, "grad_norm": 10.581872940063477, "learning_rate": 1.579450164526059e-05, "loss": 0.1063, "step": 7960 }, { "epoch": 2.54, "grad_norm": 0.07476484030485153, "learning_rate": 1.5688355800870398e-05, "loss": 0.3208, "step": 7970 }, { "epoch": 2.54, "grad_norm": 0.976747453212738, "learning_rate": 1.5582209956480205e-05, "loss": 0.2336, "step": 7980 }, { "epoch": 2.54, "grad_norm": 0.2981054186820984, "learning_rate": 1.5476064112090012e-05, "loss": 0.0603, "step": 7990 }, { "epoch": 2.55, "grad_norm": 0.032338302582502365, "learning_rate": 1.536991826769982e-05, "loss": 0.0123, "step": 8000 }, { "epoch": 2.55, "grad_norm": 7.625821590423584, "learning_rate": 1.5263772423309626e-05, "loss": 0.1735, "step": 8010 }, { "epoch": 2.55, "grad_norm": 4.120946407318115, "learning_rate": 1.5157626578919436e-05, "loss": 0.1199, "step": 8020 }, { "epoch": 2.56, "grad_norm": 0.04417848959565163, "learning_rate": 1.5051480734529244e-05, "loss": 0.0629, "step": 8030 }, { "epoch": 2.56, "grad_norm": 3.9831886291503906, "learning_rate": 1.494533489013905e-05, "loss": 0.1507, "step": 8040 }, { "epoch": 2.56, "grad_norm": 0.2706742286682129, "learning_rate": 1.4839189045748861e-05, "loss": 0.1195, "step": 8050 }, { "epoch": 2.57, "grad_norm": 0.045659586787223816, "learning_rate": 1.4733043201358668e-05, "loss": 0.0875, "step": 8060 }, { "epoch": 2.57, "grad_norm": 2.9574756622314453, "learning_rate": 1.4626897356968475e-05, "loss": 0.0828, "step": 8070 }, { "epoch": 2.57, "grad_norm": 11.923121452331543, "learning_rate": 1.4520751512578284e-05, "loss": 0.1937, "step": 8080 }, { "epoch": 2.58, "grad_norm": 0.8571139574050903, "learning_rate": 1.4414605668188091e-05, "loss": 0.1385, "step": 8090 }, { "epoch": 2.58, "grad_norm": 11.532151222229004, "learning_rate": 1.4308459823797898e-05, "loss": 0.1644, "step": 8100 }, { "epoch": 2.58, "grad_norm": 0.02608746476471424, "learning_rate": 1.4202313979407709e-05, "loss": 0.0845, "step": 8110 }, { "epoch": 2.58, "grad_norm": 0.3875482976436615, "learning_rate": 1.4096168135017516e-05, "loss": 0.1526, "step": 8120 }, { "epoch": 2.59, "grad_norm": 0.46190938353538513, "learning_rate": 1.3990022290627323e-05, "loss": 0.0656, "step": 8130 }, { "epoch": 2.59, "grad_norm": 0.06178577244281769, "learning_rate": 1.3883876446237132e-05, "loss": 0.0245, "step": 8140 }, { "epoch": 2.59, "grad_norm": 0.41626548767089844, "learning_rate": 1.3777730601846939e-05, "loss": 0.1448, "step": 8150 }, { "epoch": 2.6, "grad_norm": 0.8394218683242798, "learning_rate": 1.3671584757456746e-05, "loss": 0.1566, "step": 8160 }, { "epoch": 2.6, "grad_norm": 0.030064724385738373, "learning_rate": 1.3565438913066556e-05, "loss": 0.1614, "step": 8170 }, { "epoch": 2.6, "grad_norm": 0.7408326864242554, "learning_rate": 1.3459293068676362e-05, "loss": 0.0493, "step": 8180 }, { "epoch": 2.61, "grad_norm": 6.210927486419678, "learning_rate": 1.3353147224286169e-05, "loss": 0.173, "step": 8190 }, { "epoch": 2.61, "grad_norm": 0.3989274501800537, "learning_rate": 1.3247001379895976e-05, "loss": 0.2242, "step": 8200 }, { "epoch": 2.61, "grad_norm": 0.21221469342708588, "learning_rate": 1.3140855535505786e-05, "loss": 0.1021, "step": 8210 }, { "epoch": 2.62, "grad_norm": 0.018684396520256996, "learning_rate": 1.3034709691115593e-05, "loss": 0.1168, "step": 8220 }, { "epoch": 2.62, "grad_norm": 4.258501052856445, "learning_rate": 1.29285638467254e-05, "loss": 0.101, "step": 8230 }, { "epoch": 2.62, "grad_norm": 0.18293698132038116, "learning_rate": 1.282241800233521e-05, "loss": 0.0888, "step": 8240 }, { "epoch": 2.63, "grad_norm": 5.2559685707092285, "learning_rate": 1.2716272157945016e-05, "loss": 0.1593, "step": 8250 }, { "epoch": 2.63, "grad_norm": 0.714055597782135, "learning_rate": 1.2610126313554823e-05, "loss": 0.0548, "step": 8260 }, { "epoch": 2.63, "grad_norm": 5.772704124450684, "learning_rate": 1.2503980469164634e-05, "loss": 0.1758, "step": 8270 }, { "epoch": 2.64, "grad_norm": 0.15256932377815247, "learning_rate": 1.2397834624774441e-05, "loss": 0.1546, "step": 8280 }, { "epoch": 2.64, "grad_norm": 0.17343765497207642, "learning_rate": 1.2291688780384248e-05, "loss": 0.0422, "step": 8290 }, { "epoch": 2.64, "grad_norm": 5.067286491394043, "learning_rate": 1.2185542935994057e-05, "loss": 0.0384, "step": 8300 }, { "epoch": 2.65, "grad_norm": 2.087721109390259, "learning_rate": 1.2079397091603864e-05, "loss": 0.1132, "step": 8310 }, { "epoch": 2.65, "grad_norm": 6.7488017082214355, "learning_rate": 1.1973251247213673e-05, "loss": 0.0729, "step": 8320 }, { "epoch": 2.65, "grad_norm": 10.669734954833984, "learning_rate": 1.1867105402823481e-05, "loss": 0.0677, "step": 8330 }, { "epoch": 2.65, "grad_norm": 4.734282970428467, "learning_rate": 1.1760959558433288e-05, "loss": 0.1741, "step": 8340 }, { "epoch": 2.66, "grad_norm": 1.1807498931884766, "learning_rate": 1.1665428298482115e-05, "loss": 0.1219, "step": 8350 }, { "epoch": 2.66, "grad_norm": 0.1118198037147522, "learning_rate": 1.1559282454091922e-05, "loss": 0.0724, "step": 8360 }, { "epoch": 2.66, "grad_norm": 8.563444137573242, "learning_rate": 1.1453136609701731e-05, "loss": 0.1453, "step": 8370 }, { "epoch": 2.67, "grad_norm": 1.4987778663635254, "learning_rate": 1.1346990765311538e-05, "loss": 0.1087, "step": 8380 }, { "epoch": 2.67, "grad_norm": 6.070169448852539, "learning_rate": 1.1240844920921347e-05, "loss": 0.1372, "step": 8390 }, { "epoch": 2.67, "grad_norm": 3.5378408432006836, "learning_rate": 1.1134699076531156e-05, "loss": 0.1421, "step": 8400 }, { "epoch": 2.68, "grad_norm": 0.18879607319831848, "learning_rate": 1.1028553232140961e-05, "loss": 0.0617, "step": 8410 }, { "epoch": 2.68, "grad_norm": 3.873791217803955, "learning_rate": 1.092240738775077e-05, "loss": 0.1256, "step": 8420 }, { "epoch": 2.68, "grad_norm": 3.0632710456848145, "learning_rate": 1.0816261543360579e-05, "loss": 0.1084, "step": 8430 }, { "epoch": 2.69, "grad_norm": 0.044198133051395416, "learning_rate": 1.0710115698970386e-05, "loss": 0.0972, "step": 8440 }, { "epoch": 2.69, "grad_norm": 0.06533059477806091, "learning_rate": 1.0603969854580194e-05, "loss": 0.0659, "step": 8450 }, { "epoch": 2.69, "grad_norm": 0.024154966697096825, "learning_rate": 1.0497824010190002e-05, "loss": 0.2245, "step": 8460 }, { "epoch": 2.7, "grad_norm": 0.06551453471183777, "learning_rate": 1.0391678165799809e-05, "loss": 0.0679, "step": 8470 }, { "epoch": 2.7, "grad_norm": 2.244358777999878, "learning_rate": 1.0285532321409617e-05, "loss": 0.1138, "step": 8480 }, { "epoch": 2.7, "grad_norm": 1.3429971933364868, "learning_rate": 1.0179386477019426e-05, "loss": 0.1286, "step": 8490 }, { "epoch": 2.71, "grad_norm": 13.364596366882324, "learning_rate": 1.0073240632629233e-05, "loss": 0.1304, "step": 8500 }, { "epoch": 2.71, "grad_norm": 1.5777560472488403, "learning_rate": 9.96709478823904e-06, "loss": 0.0324, "step": 8510 }, { "epoch": 2.71, "grad_norm": 3.5468719005584717, "learning_rate": 9.860948943848847e-06, "loss": 0.1142, "step": 8520 }, { "epoch": 2.72, "grad_norm": 9.198564529418945, "learning_rate": 9.754803099458656e-06, "loss": 0.1208, "step": 8530 }, { "epoch": 2.72, "grad_norm": 0.10464298725128174, "learning_rate": 9.648657255068465e-06, "loss": 0.062, "step": 8540 }, { "epoch": 2.72, "grad_norm": 7.4889702796936035, "learning_rate": 9.542511410678272e-06, "loss": 0.1081, "step": 8550 }, { "epoch": 2.72, "grad_norm": 4.211546897888184, "learning_rate": 9.43636556628808e-06, "loss": 0.122, "step": 8560 }, { "epoch": 2.73, "grad_norm": 5.125463008880615, "learning_rate": 9.330219721897888e-06, "loss": 0.2547, "step": 8570 }, { "epoch": 2.73, "grad_norm": 0.17111606895923615, "learning_rate": 9.224073877507695e-06, "loss": 0.0792, "step": 8580 }, { "epoch": 2.73, "grad_norm": 0.17677658796310425, "learning_rate": 9.117928033117504e-06, "loss": 0.2517, "step": 8590 }, { "epoch": 2.74, "grad_norm": 0.88303542137146, "learning_rate": 9.011782188727312e-06, "loss": 0.1207, "step": 8600 }, { "epoch": 2.74, "grad_norm": 0.934140682220459, "learning_rate": 8.90563634433712e-06, "loss": 0.0874, "step": 8610 }, { "epoch": 2.74, "grad_norm": 0.1124495416879654, "learning_rate": 8.799490499946927e-06, "loss": 0.2207, "step": 8620 }, { "epoch": 2.75, "grad_norm": 1.9301073551177979, "learning_rate": 8.693344655556735e-06, "loss": 0.1351, "step": 8630 }, { "epoch": 2.75, "grad_norm": 0.42326900362968445, "learning_rate": 8.587198811166543e-06, "loss": 0.1563, "step": 8640 }, { "epoch": 2.75, "grad_norm": 0.01322962436825037, "learning_rate": 8.481052966776351e-06, "loss": 0.0387, "step": 8650 }, { "epoch": 2.76, "grad_norm": 3.7665517330169678, "learning_rate": 8.37490712238616e-06, "loss": 0.2157, "step": 8660 }, { "epoch": 2.76, "grad_norm": 0.2205476611852646, "learning_rate": 8.268761277995967e-06, "loss": 0.0491, "step": 8670 }, { "epoch": 2.76, "grad_norm": 0.10910103470087051, "learning_rate": 8.162615433605774e-06, "loss": 0.0924, "step": 8680 }, { "epoch": 2.77, "grad_norm": 0.030913598835468292, "learning_rate": 8.056469589215583e-06, "loss": 0.0553, "step": 8690 }, { "epoch": 2.77, "grad_norm": 0.08986567705869675, "learning_rate": 7.95032374482539e-06, "loss": 0.0613, "step": 8700 }, { "epoch": 2.77, "grad_norm": 0.21952463686466217, "learning_rate": 7.844177900435199e-06, "loss": 0.0898, "step": 8710 }, { "epoch": 2.78, "grad_norm": 0.6068935990333557, "learning_rate": 7.738032056045006e-06, "loss": 0.043, "step": 8720 }, { "epoch": 2.78, "grad_norm": 0.03201749920845032, "learning_rate": 7.631886211654813e-06, "loss": 0.1957, "step": 8730 }, { "epoch": 2.78, "grad_norm": 3.205738067626953, "learning_rate": 7.525740367264622e-06, "loss": 0.1425, "step": 8740 }, { "epoch": 2.79, "grad_norm": 3.265514612197876, "learning_rate": 7.4195945228744306e-06, "loss": 0.1652, "step": 8750 }, { "epoch": 2.79, "grad_norm": 0.11868763715028763, "learning_rate": 7.313448678484238e-06, "loss": 0.0193, "step": 8760 }, { "epoch": 2.79, "grad_norm": 0.03614291548728943, "learning_rate": 7.2073028340940456e-06, "loss": 0.1368, "step": 8770 }, { "epoch": 2.79, "grad_norm": 2.512045383453369, "learning_rate": 7.101156989703854e-06, "loss": 0.0949, "step": 8780 }, { "epoch": 2.8, "grad_norm": 5.77540922164917, "learning_rate": 6.995011145313661e-06, "loss": 0.1639, "step": 8790 }, { "epoch": 2.8, "grad_norm": 7.473822116851807, "learning_rate": 6.888865300923469e-06, "loss": 0.1023, "step": 8800 }, { "epoch": 2.8, "grad_norm": 0.0789722427725792, "learning_rate": 6.782719456533278e-06, "loss": 0.0627, "step": 8810 }, { "epoch": 2.81, "grad_norm": 2.9245636463165283, "learning_rate": 6.676573612143084e-06, "loss": 0.1771, "step": 8820 }, { "epoch": 2.81, "grad_norm": 2.1707448959350586, "learning_rate": 6.570427767752893e-06, "loss": 0.0423, "step": 8830 }, { "epoch": 2.81, "grad_norm": 4.990893363952637, "learning_rate": 6.4642819233627e-06, "loss": 0.073, "step": 8840 }, { "epoch": 2.82, "grad_norm": 6.3620452880859375, "learning_rate": 6.358136078972508e-06, "loss": 0.0627, "step": 8850 }, { "epoch": 2.82, "grad_norm": 0.09669307619333267, "learning_rate": 6.251990234582317e-06, "loss": 0.0879, "step": 8860 }, { "epoch": 2.82, "grad_norm": 5.8794779777526855, "learning_rate": 6.145844390192124e-06, "loss": 0.1667, "step": 8870 }, { "epoch": 2.83, "grad_norm": 0.0750487744808197, "learning_rate": 6.039698545801932e-06, "loss": 0.1538, "step": 8880 }, { "epoch": 2.83, "grad_norm": 4.174580097198486, "learning_rate": 5.933552701411741e-06, "loss": 0.1782, "step": 8890 }, { "epoch": 2.83, "grad_norm": 2.7931034564971924, "learning_rate": 5.827406857021548e-06, "loss": 0.1047, "step": 8900 }, { "epoch": 2.84, "grad_norm": 0.11179756373167038, "learning_rate": 5.721261012631356e-06, "loss": 0.0648, "step": 8910 }, { "epoch": 2.84, "grad_norm": 0.25602421164512634, "learning_rate": 5.615115168241164e-06, "loss": 0.1657, "step": 8920 }, { "epoch": 2.84, "grad_norm": 0.030272111296653748, "learning_rate": 5.5089693238509715e-06, "loss": 0.1344, "step": 8930 }, { "epoch": 2.85, "grad_norm": 1.8802919387817383, "learning_rate": 5.4028234794607795e-06, "loss": 0.1284, "step": 8940 }, { "epoch": 2.85, "grad_norm": 0.9859854578971863, "learning_rate": 5.296677635070587e-06, "loss": 0.0504, "step": 8950 }, { "epoch": 2.85, "grad_norm": 0.5083135962486267, "learning_rate": 5.190531790680395e-06, "loss": 0.0193, "step": 8960 }, { "epoch": 2.86, "grad_norm": 3.466031789779663, "learning_rate": 5.084385946290203e-06, "loss": 0.0459, "step": 8970 }, { "epoch": 2.86, "grad_norm": 8.049098014831543, "learning_rate": 4.97824010190001e-06, "loss": 0.1025, "step": 8980 }, { "epoch": 2.86, "grad_norm": 5.528136730194092, "learning_rate": 4.872094257509819e-06, "loss": 0.109, "step": 8990 }, { "epoch": 2.86, "grad_norm": 0.02654377557337284, "learning_rate": 4.765948413119627e-06, "loss": 0.1655, "step": 9000 } ], "logging_steps": 10, "max_steps": 9423, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1.0453875280157082e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }