{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.36997299197158606, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014798919678863443, "grad_norm": 7.47622537612915, "learning_rate": 7.2e-06, "loss": 1.1232, "step": 10 }, { "epoch": 0.0029597839357726886, "grad_norm": 1.880067229270935, "learning_rate": 1.52e-05, "loss": 0.5006, "step": 20 }, { "epoch": 0.004439675903659033, "grad_norm": 1.9676536321640015, "learning_rate": 2.32e-05, "loss": 0.2438, "step": 30 }, { "epoch": 0.005919567871545377, "grad_norm": 1.628406047821045, "learning_rate": 3.12e-05, "loss": 0.1641, "step": 40 }, { "epoch": 0.007399459839431722, "grad_norm": 2.5903799533843994, "learning_rate": 3.9200000000000004e-05, "loss": 0.1585, "step": 50 }, { "epoch": 0.008879351807318065, "grad_norm": 1.520799160003662, "learning_rate": 4.72e-05, "loss": 0.1282, "step": 60 }, { "epoch": 0.01035924377520441, "grad_norm": 1.7377681732177734, "learning_rate": 5.520000000000001e-05, "loss": 0.1256, "step": 70 }, { "epoch": 0.011839135743090754, "grad_norm": 1.098388433456421, "learning_rate": 6.32e-05, "loss": 0.1031, "step": 80 }, { "epoch": 0.013319027710977099, "grad_norm": 1.4509732723236084, "learning_rate": 7.12e-05, "loss": 0.0973, "step": 90 }, { "epoch": 0.014798919678863444, "grad_norm": 1.9007303714752197, "learning_rate": 7.920000000000001e-05, "loss": 0.1152, "step": 100 }, { "epoch": 0.016278811646749786, "grad_norm": 1.5740575790405273, "learning_rate": 8.72e-05, "loss": 0.1151, "step": 110 }, { "epoch": 0.01775870361463613, "grad_norm": 1.2793081998825073, "learning_rate": 9.52e-05, "loss": 0.1182, "step": 120 }, { "epoch": 0.019238595582522475, "grad_norm": 1.5838185548782349, "learning_rate": 9.999930010724872e-05, "loss": 0.0955, "step": 130 }, { "epoch": 0.02071848755040882, "grad_norm": 1.8031917810440063, "learning_rate": 9.999142653881985e-05, "loss": 0.1141, "step": 140 }, { "epoch": 0.022198379518295164, "grad_norm": 1.5219581127166748, "learning_rate": 9.997480591826183e-05, "loss": 0.1069, "step": 150 }, { "epoch": 0.02367827148618151, "grad_norm": 1.1764310598373413, "learning_rate": 9.994944115370199e-05, "loss": 0.0964, "step": 160 }, { "epoch": 0.025158163454067854, "grad_norm": 1.0616095066070557, "learning_rate": 9.991533668323974e-05, "loss": 0.0761, "step": 170 }, { "epoch": 0.026638055421954198, "grad_norm": 1.1210483312606812, "learning_rate": 9.987249847416987e-05, "loss": 0.087, "step": 180 }, { "epoch": 0.028117947389840543, "grad_norm": 1.1077345609664917, "learning_rate": 9.982093402193857e-05, "loss": 0.0931, "step": 190 }, { "epoch": 0.029597839357726887, "grad_norm": 1.352004051208496, "learning_rate": 9.976065234883193e-05, "loss": 0.083, "step": 200 }, { "epoch": 0.03107773132561323, "grad_norm": 1.0884106159210205, "learning_rate": 9.969166400239726e-05, "loss": 0.0783, "step": 210 }, { "epoch": 0.03255762329349957, "grad_norm": 1.1031562089920044, "learning_rate": 9.961398105359764e-05, "loss": 0.1047, "step": 220 }, { "epoch": 0.03403751526138592, "grad_norm": 1.0741270780563354, "learning_rate": 9.952761709469975e-05, "loss": 0.1017, "step": 230 }, { "epoch": 0.03551740722927226, "grad_norm": 1.2804232835769653, "learning_rate": 9.94325872368957e-05, "loss": 0.0968, "step": 240 }, { "epoch": 0.036997299197158606, "grad_norm": 1.0856305360794067, "learning_rate": 9.932890810765902e-05, "loss": 0.0644, "step": 250 }, { "epoch": 0.03847719116504495, "grad_norm": 1.1068660020828247, "learning_rate": 9.921659784783526e-05, "loss": 0.0644, "step": 260 }, { "epoch": 0.039957083132931295, "grad_norm": 1.043481469154358, "learning_rate": 9.909567610846788e-05, "loss": 0.0667, "step": 270 }, { "epoch": 0.04143697510081764, "grad_norm": 1.3862495422363281, "learning_rate": 9.896616404736001e-05, "loss": 0.0694, "step": 280 }, { "epoch": 0.042916867068703984, "grad_norm": 1.405779480934143, "learning_rate": 9.882808432537224e-05, "loss": 0.0823, "step": 290 }, { "epoch": 0.04439675903659033, "grad_norm": 1.4005481004714966, "learning_rate": 9.86814611024578e-05, "loss": 0.0812, "step": 300 }, { "epoch": 0.04587665100447667, "grad_norm": 0.8486709594726562, "learning_rate": 9.852632003343518e-05, "loss": 0.0966, "step": 310 }, { "epoch": 0.04735654297236302, "grad_norm": 0.6197394132614136, "learning_rate": 9.836268826349933e-05, "loss": 0.0595, "step": 320 }, { "epoch": 0.04883643494024936, "grad_norm": 0.776345431804657, "learning_rate": 9.819059442347193e-05, "loss": 0.0599, "step": 330 }, { "epoch": 0.05031632690813571, "grad_norm": 0.7929945588111877, "learning_rate": 9.801006862479202e-05, "loss": 0.07, "step": 340 }, { "epoch": 0.05179621887602205, "grad_norm": 0.9411725997924805, "learning_rate": 9.782114245424718e-05, "loss": 0.0637, "step": 350 }, { "epoch": 0.053276110843908396, "grad_norm": 0.825547993183136, "learning_rate": 9.762384896844684e-05, "loss": 0.0628, "step": 360 }, { "epoch": 0.05475600281179474, "grad_norm": 1.1270116567611694, "learning_rate": 9.741822268803833e-05, "loss": 0.0702, "step": 370 }, { "epoch": 0.056235894779681085, "grad_norm": 0.947913408279419, "learning_rate": 9.720429959166675e-05, "loss": 0.073, "step": 380 }, { "epoch": 0.05771578674756743, "grad_norm": 1.0599220991134644, "learning_rate": 9.69821171096798e-05, "loss": 0.0678, "step": 390 }, { "epoch": 0.059195678715453774, "grad_norm": 1.0569720268249512, "learning_rate": 9.675171411757842e-05, "loss": 0.0653, "step": 400 }, { "epoch": 0.06067557068334012, "grad_norm": 1.1427547931671143, "learning_rate": 9.65131309292149e-05, "loss": 0.0739, "step": 410 }, { "epoch": 0.06215546265122646, "grad_norm": 0.816104531288147, "learning_rate": 9.626640928973892e-05, "loss": 0.0518, "step": 420 }, { "epoch": 0.06363535461911281, "grad_norm": 0.7173371911048889, "learning_rate": 9.601159236829352e-05, "loss": 0.0687, "step": 430 }, { "epoch": 0.06511524658699915, "grad_norm": 0.8344293236732483, "learning_rate": 9.574872475046166e-05, "loss": 0.059, "step": 440 }, { "epoch": 0.0665951385548855, "grad_norm": 0.6896253824234009, "learning_rate": 9.547785243046505e-05, "loss": 0.046, "step": 450 }, { "epoch": 0.06807503052277183, "grad_norm": 1.0698304176330566, "learning_rate": 9.519902280311653e-05, "loss": 0.0511, "step": 460 }, { "epoch": 0.06955492249065819, "grad_norm": 0.7512239217758179, "learning_rate": 9.491228465552726e-05, "loss": 0.0578, "step": 470 }, { "epoch": 0.07103481445854452, "grad_norm": 1.0382441282272339, "learning_rate": 9.461768815857053e-05, "loss": 0.0618, "step": 480 }, { "epoch": 0.07251470642643087, "grad_norm": 0.7745275497436523, "learning_rate": 9.431528485810316e-05, "loss": 0.069, "step": 490 }, { "epoch": 0.07399459839431721, "grad_norm": 0.7567397952079773, "learning_rate": 9.400512766594659e-05, "loss": 0.047, "step": 500 }, { "epoch": 0.07547449036220356, "grad_norm": 0.7035396695137024, "learning_rate": 9.368727085062872e-05, "loss": 0.0563, "step": 510 }, { "epoch": 0.0769543823300899, "grad_norm": 0.8480479717254639, "learning_rate": 9.336177002788862e-05, "loss": 0.0498, "step": 520 }, { "epoch": 0.07843427429797625, "grad_norm": 0.6571253538131714, "learning_rate": 9.302868215094534e-05, "loss": 0.0579, "step": 530 }, { "epoch": 0.07991416626586259, "grad_norm": 0.6242689490318298, "learning_rate": 9.268806550053264e-05, "loss": 0.0465, "step": 540 }, { "epoch": 0.08139405823374894, "grad_norm": 1.0013155937194824, "learning_rate": 9.233997967470174e-05, "loss": 0.0792, "step": 550 }, { "epoch": 0.08287395020163528, "grad_norm": 1.0697152614593506, "learning_rate": 9.198448557839321e-05, "loss": 0.0707, "step": 560 }, { "epoch": 0.08435384216952163, "grad_norm": 0.9719803333282471, "learning_rate": 9.162164541278051e-05, "loss": 0.0586, "step": 570 }, { "epoch": 0.08583373413740797, "grad_norm": 1.0201226472854614, "learning_rate": 9.125152266438649e-05, "loss": 0.0535, "step": 580 }, { "epoch": 0.08731362610529432, "grad_norm": 0.8870009779930115, "learning_rate": 9.087418209397506e-05, "loss": 0.058, "step": 590 }, { "epoch": 0.08879351807318066, "grad_norm": 0.7464373707771301, "learning_rate": 9.04896897252201e-05, "loss": 0.0676, "step": 600 }, { "epoch": 0.090273410041067, "grad_norm": 0.6516630053520203, "learning_rate": 9.009811283315304e-05, "loss": 0.0506, "step": 610 }, { "epoch": 0.09175330200895335, "grad_norm": 0.6615219712257385, "learning_rate": 8.969951993239177e-05, "loss": 0.0479, "step": 620 }, { "epoch": 0.09323319397683968, "grad_norm": 0.6776785850524902, "learning_rate": 8.929398076515259e-05, "loss": 0.0459, "step": 630 }, { "epoch": 0.09471308594472604, "grad_norm": 0.7998208403587341, "learning_rate": 8.888156628904724e-05, "loss": 0.0493, "step": 640 }, { "epoch": 0.09619297791261237, "grad_norm": 0.8668547868728638, "learning_rate": 8.846234866466747e-05, "loss": 0.0431, "step": 650 }, { "epoch": 0.09767286988049872, "grad_norm": 0.7501780390739441, "learning_rate": 8.803640124295902e-05, "loss": 0.0664, "step": 660 }, { "epoch": 0.09915276184838506, "grad_norm": 0.5983700156211853, "learning_rate": 8.760379855238723e-05, "loss": 0.0424, "step": 670 }, { "epoch": 0.10063265381627141, "grad_norm": 0.7453112006187439, "learning_rate": 8.716461628589683e-05, "loss": 0.0508, "step": 680 }, { "epoch": 0.10211254578415775, "grad_norm": 0.7227299809455872, "learning_rate": 8.671893128766784e-05, "loss": 0.045, "step": 690 }, { "epoch": 0.1035924377520441, "grad_norm": 0.7978153824806213, "learning_rate": 8.626682153967001e-05, "loss": 0.0473, "step": 700 }, { "epoch": 0.10507232971993044, "grad_norm": 0.6989890336990356, "learning_rate": 8.580836614801827e-05, "loss": 0.046, "step": 710 }, { "epoch": 0.10655222168781679, "grad_norm": 0.8369282484054565, "learning_rate": 8.534364532913144e-05, "loss": 0.049, "step": 720 }, { "epoch": 0.10803211365570313, "grad_norm": 0.5447561144828796, "learning_rate": 8.487274039569675e-05, "loss": 0.0428, "step": 730 }, { "epoch": 0.10951200562358948, "grad_norm": 0.7491251826286316, "learning_rate": 8.439573374244237e-05, "loss": 0.0368, "step": 740 }, { "epoch": 0.11099189759147582, "grad_norm": 0.48374220728874207, "learning_rate": 8.391270883172073e-05, "loss": 0.0403, "step": 750 }, { "epoch": 0.11247178955936217, "grad_norm": 0.9518898129463196, "learning_rate": 8.342375017890512e-05, "loss": 0.038, "step": 760 }, { "epoch": 0.11395168152724851, "grad_norm": 0.8046047687530518, "learning_rate": 8.292894333760186e-05, "loss": 0.0414, "step": 770 }, { "epoch": 0.11543157349513486, "grad_norm": 0.6393342614173889, "learning_rate": 8.242837488468087e-05, "loss": 0.0615, "step": 780 }, { "epoch": 0.1169114654630212, "grad_norm": 0.6403966546058655, "learning_rate": 8.192213240512737e-05, "loss": 0.0403, "step": 790 }, { "epoch": 0.11839135743090755, "grad_norm": 0.5846046805381775, "learning_rate": 8.141030447671686e-05, "loss": 0.0376, "step": 800 }, { "epoch": 0.11987124939879389, "grad_norm": 1.1184839010238647, "learning_rate": 8.089298065451672e-05, "loss": 0.042, "step": 810 }, { "epoch": 0.12135114136668024, "grad_norm": 0.804345965385437, "learning_rate": 8.037025145521657e-05, "loss": 0.0448, "step": 820 }, { "epoch": 0.12283103333456658, "grad_norm": 0.40974166989326477, "learning_rate": 7.984220834129052e-05, "loss": 0.0691, "step": 830 }, { "epoch": 0.12431092530245293, "grad_norm": 0.5297871232032776, "learning_rate": 7.93089437049939e-05, "loss": 0.0477, "step": 840 }, { "epoch": 0.12579081727033928, "grad_norm": 0.6452186107635498, "learning_rate": 7.877055085219721e-05, "loss": 0.0505, "step": 850 }, { "epoch": 0.12727070923822562, "grad_norm": 0.6497986316680908, "learning_rate": 7.82271239860604e-05, "loss": 0.0471, "step": 860 }, { "epoch": 0.12875060120611195, "grad_norm": 0.6113291382789612, "learning_rate": 7.767875819054997e-05, "loss": 0.0485, "step": 870 }, { "epoch": 0.1302304931739983, "grad_norm": 0.8844181895256042, "learning_rate": 7.712554941380206e-05, "loss": 0.0429, "step": 880 }, { "epoch": 0.13171038514188466, "grad_norm": 0.7872292995452881, "learning_rate": 7.656759445133428e-05, "loss": 0.0471, "step": 890 }, { "epoch": 0.133190277109771, "grad_norm": 0.8881542682647705, "learning_rate": 7.600499092910934e-05, "loss": 0.0498, "step": 900 }, { "epoch": 0.13467016907765733, "grad_norm": 0.6990298628807068, "learning_rate": 7.543783728645328e-05, "loss": 0.0385, "step": 910 }, { "epoch": 0.13615006104554367, "grad_norm": 0.8354172110557556, "learning_rate": 7.486623275883151e-05, "loss": 0.0347, "step": 920 }, { "epoch": 0.13762995301343, "grad_norm": 0.8185162544250488, "learning_rate": 7.429027736048535e-05, "loss": 0.0409, "step": 930 }, { "epoch": 0.13910984498131637, "grad_norm": 0.9761926531791687, "learning_rate": 7.37100718669326e-05, "loss": 0.0422, "step": 940 }, { "epoch": 0.1405897369492027, "grad_norm": 0.5239071249961853, "learning_rate": 7.312571779733463e-05, "loss": 0.0269, "step": 950 }, { "epoch": 0.14206962891708905, "grad_norm": 0.7226734757423401, "learning_rate": 7.253731739673349e-05, "loss": 0.0366, "step": 960 }, { "epoch": 0.14354952088497538, "grad_norm": 0.6310585141181946, "learning_rate": 7.194497361816196e-05, "loss": 0.0527, "step": 970 }, { "epoch": 0.14502941285286175, "grad_norm": 0.5752474665641785, "learning_rate": 7.134879010462988e-05, "loss": 0.0312, "step": 980 }, { "epoch": 0.1465093048207481, "grad_norm": 0.5619791150093079, "learning_rate": 7.07488711709894e-05, "loss": 0.0314, "step": 990 }, { "epoch": 0.14798919678863443, "grad_norm": 0.6874368190765381, "learning_rate": 7.014532178568314e-05, "loss": 0.0294, "step": 1000 }, { "epoch": 0.14946908875652076, "grad_norm": 0.6269967555999756, "learning_rate": 6.953824755237756e-05, "loss": 0.0357, "step": 1010 }, { "epoch": 0.15094898072440713, "grad_norm": 0.7482361793518066, "learning_rate": 6.892775469148553e-05, "loss": 0.043, "step": 1020 }, { "epoch": 0.15242887269229347, "grad_norm": 0.7447315454483032, "learning_rate": 6.831395002158067e-05, "loss": 0.0319, "step": 1030 }, { "epoch": 0.1539087646601798, "grad_norm": 0.5281906127929688, "learning_rate": 6.76969409407074e-05, "loss": 0.0311, "step": 1040 }, { "epoch": 0.15538865662806614, "grad_norm": 0.5784289240837097, "learning_rate": 6.707683540758915e-05, "loss": 0.0362, "step": 1050 }, { "epoch": 0.1568685485959525, "grad_norm": 0.5917581915855408, "learning_rate": 6.645374192273894e-05, "loss": 0.0406, "step": 1060 }, { "epoch": 0.15834844056383884, "grad_norm": 0.8599121570587158, "learning_rate": 6.582776950947474e-05, "loss": 0.0468, "step": 1070 }, { "epoch": 0.15982833253172518, "grad_norm": 0.5944121479988098, "learning_rate": 6.519902769484368e-05, "loss": 0.0464, "step": 1080 }, { "epoch": 0.16130822449961152, "grad_norm": 0.6295680999755859, "learning_rate": 6.456762649045796e-05, "loss": 0.0371, "step": 1090 }, { "epoch": 0.16278811646749788, "grad_norm": 1.0905753374099731, "learning_rate": 6.393367637324593e-05, "loss": 0.0465, "step": 1100 }, { "epoch": 0.16426800843538422, "grad_norm": 0.7028170228004456, "learning_rate": 6.329728826612192e-05, "loss": 0.0493, "step": 1110 }, { "epoch": 0.16574790040327056, "grad_norm": 0.6126824617385864, "learning_rate": 6.265857351857788e-05, "loss": 0.0369, "step": 1120 }, { "epoch": 0.1672277923711569, "grad_norm": 0.7399412393569946, "learning_rate": 6.201764388720049e-05, "loss": 0.0412, "step": 1130 }, { "epoch": 0.16870768433904326, "grad_norm": 0.5751072764396667, "learning_rate": 6.137461151611692e-05, "loss": 0.0446, "step": 1140 }, { "epoch": 0.1701875763069296, "grad_norm": 0.4830611050128937, "learning_rate": 6.072958891737296e-05, "loss": 0.0396, "step": 1150 }, { "epoch": 0.17166746827481594, "grad_norm": 0.5445456504821777, "learning_rate": 6.00826889512466e-05, "loss": 0.0367, "step": 1160 }, { "epoch": 0.17314736024270228, "grad_norm": 0.7560216188430786, "learning_rate": 5.943402480650071e-05, "loss": 0.0328, "step": 1170 }, { "epoch": 0.17462725221058864, "grad_norm": 0.6100801229476929, "learning_rate": 5.8783709980578414e-05, "loss": 0.031, "step": 1180 }, { "epoch": 0.17610714417847498, "grad_norm": 0.5339049696922302, "learning_rate": 5.813185825974419e-05, "loss": 0.0365, "step": 1190 }, { "epoch": 0.17758703614636132, "grad_norm": 0.7650025486946106, "learning_rate": 5.747858369917465e-05, "loss": 0.034, "step": 1200 }, { "epoch": 0.17906692811424765, "grad_norm": 0.6139253377914429, "learning_rate": 5.682400060300213e-05, "loss": 0.0399, "step": 1210 }, { "epoch": 0.180546820082134, "grad_norm": 0.514707088470459, "learning_rate": 5.6168223504314863e-05, "loss": 0.0371, "step": 1220 }, { "epoch": 0.18202671205002036, "grad_norm": 0.5504834651947021, "learning_rate": 5.551136714511691e-05, "loss": 0.0338, "step": 1230 }, { "epoch": 0.1835066040179067, "grad_norm": 0.6514374017715454, "learning_rate": 5.485354645625167e-05, "loss": 0.0392, "step": 1240 }, { "epoch": 0.18498649598579303, "grad_norm": 0.5081560015678406, "learning_rate": 5.419487653729234e-05, "loss": 0.0332, "step": 1250 }, { "epoch": 0.18646638795367937, "grad_norm": 0.904439389705658, "learning_rate": 5.353547263640273e-05, "loss": 0.0382, "step": 1260 }, { "epoch": 0.18794627992156573, "grad_norm": 0.5332797765731812, "learning_rate": 5.2875450130172324e-05, "loss": 0.0221, "step": 1270 }, { "epoch": 0.18942617188945207, "grad_norm": 0.6516315937042236, "learning_rate": 5.221492450342856e-05, "loss": 0.0371, "step": 1280 }, { "epoch": 0.1909060638573384, "grad_norm": 0.7484680414199829, "learning_rate": 5.155401132903045e-05, "loss": 0.0377, "step": 1290 }, { "epoch": 0.19238595582522475, "grad_norm": 0.9511647820472717, "learning_rate": 5.089282624764654e-05, "loss": 0.0386, "step": 1300 }, { "epoch": 0.1938658477931111, "grad_norm": 0.3802145719528198, "learning_rate": 5.0231484947521336e-05, "loss": 0.0324, "step": 1310 }, { "epoch": 0.19534573976099745, "grad_norm": 0.5430207252502441, "learning_rate": 4.9570103144233024e-05, "loss": 0.0296, "step": 1320 }, { "epoch": 0.1968256317288838, "grad_norm": 0.631252646446228, "learning_rate": 4.890879656044669e-05, "loss": 0.0241, "step": 1330 }, { "epoch": 0.19830552369677013, "grad_norm": 0.4435235261917114, "learning_rate": 4.824768090566618e-05, "loss": 0.0285, "step": 1340 }, { "epoch": 0.1997854156646565, "grad_norm": 0.7757057547569275, "learning_rate": 4.7586871855988326e-05, "loss": 0.0419, "step": 1350 }, { "epoch": 0.20126530763254283, "grad_norm": 0.6761030554771423, "learning_rate": 4.692648503386289e-05, "loss": 0.0338, "step": 1360 }, { "epoch": 0.20274519960042917, "grad_norm": 0.3639812767505646, "learning_rate": 4.6266635987862086e-05, "loss": 0.0244, "step": 1370 }, { "epoch": 0.2042250915683155, "grad_norm": 0.3727477490901947, "learning_rate": 4.560744017246284e-05, "loss": 0.025, "step": 1380 }, { "epoch": 0.20570498353620187, "grad_norm": 0.5406703948974609, "learning_rate": 4.4949012927845676e-05, "loss": 0.031, "step": 1390 }, { "epoch": 0.2071848755040882, "grad_norm": 0.45045414566993713, "learning_rate": 4.429146945971346e-05, "loss": 0.0236, "step": 1400 }, { "epoch": 0.20866476747197454, "grad_norm": 0.549584686756134, "learning_rate": 4.3634924819133746e-05, "loss": 0.0286, "step": 1410 }, { "epoch": 0.21014465943986088, "grad_norm": 0.7233926653862, "learning_rate": 4.297949388240823e-05, "loss": 0.0297, "step": 1420 }, { "epoch": 0.21162455140774725, "grad_norm": 0.5810157656669617, "learning_rate": 4.2325291330972664e-05, "loss": 0.0189, "step": 1430 }, { "epoch": 0.21310444337563358, "grad_norm": 0.5259445905685425, "learning_rate": 4.167243163133094e-05, "loss": 0.0271, "step": 1440 }, { "epoch": 0.21458433534351992, "grad_norm": 0.5740777254104614, "learning_rate": 4.1021029015026736e-05, "loss": 0.0367, "step": 1450 }, { "epoch": 0.21606422731140626, "grad_norm": 0.6542416214942932, "learning_rate": 4.037119745865641e-05, "loss": 0.0323, "step": 1460 }, { "epoch": 0.21754411927929262, "grad_norm": 0.5732387900352478, "learning_rate": 3.972305066392626e-05, "loss": 0.0295, "step": 1470 }, { "epoch": 0.21902401124717896, "grad_norm": 0.8444030284881592, "learning_rate": 3.9076702037758076e-05, "loss": 0.027, "step": 1480 }, { "epoch": 0.2205039032150653, "grad_norm": 0.36182349920272827, "learning_rate": 3.8432264672446293e-05, "loss": 0.0306, "step": 1490 }, { "epoch": 0.22198379518295164, "grad_norm": 0.367953360080719, "learning_rate": 3.778985132586995e-05, "loss": 0.0258, "step": 1500 }, { "epoch": 0.22346368715083798, "grad_norm": 0.43248283863067627, "learning_rate": 3.714957440176345e-05, "loss": 0.0237, "step": 1510 }, { "epoch": 0.22494357911872434, "grad_norm": 0.5691545605659485, "learning_rate": 3.651154593004911e-05, "loss": 0.0257, "step": 1520 }, { "epoch": 0.22642347108661068, "grad_norm": 0.4153839945793152, "learning_rate": 3.587587754723523e-05, "loss": 0.0256, "step": 1530 }, { "epoch": 0.22790336305449702, "grad_norm": 0.4405042827129364, "learning_rate": 3.5242680476882815e-05, "loss": 0.0243, "step": 1540 }, { "epoch": 0.22938325502238335, "grad_norm": 0.5467635989189148, "learning_rate": 3.461206551014481e-05, "loss": 0.0242, "step": 1550 }, { "epoch": 0.23086314699026972, "grad_norm": 0.4557804763317108, "learning_rate": 3.3984142986380764e-05, "loss": 0.0338, "step": 1560 }, { "epoch": 0.23234303895815606, "grad_norm": 0.39460909366607666, "learning_rate": 3.335902277385067e-05, "loss": 0.0205, "step": 1570 }, { "epoch": 0.2338229309260424, "grad_norm": 0.5078433752059937, "learning_rate": 3.2736814250491196e-05, "loss": 0.0248, "step": 1580 }, { "epoch": 0.23530282289392873, "grad_norm": 0.6240681409835815, "learning_rate": 3.211762628477771e-05, "loss": 0.0312, "step": 1590 }, { "epoch": 0.2367827148618151, "grad_norm": 0.37351563572883606, "learning_rate": 3.150156721667547e-05, "loss": 0.0245, "step": 1600 }, { "epoch": 0.23826260682970143, "grad_norm": 0.3536587953567505, "learning_rate": 3.088874483868325e-05, "loss": 0.021, "step": 1610 }, { "epoch": 0.23974249879758777, "grad_norm": 0.30240142345428467, "learning_rate": 3.0279266376972715e-05, "loss": 0.025, "step": 1620 }, { "epoch": 0.2412223907654741, "grad_norm": 0.4821987450122833, "learning_rate": 2.96732384726271e-05, "loss": 0.0427, "step": 1630 }, { "epoch": 0.24270228273336047, "grad_norm": 0.6150904297828674, "learning_rate": 2.907076716298196e-05, "loss": 0.0297, "step": 1640 }, { "epoch": 0.2441821747012468, "grad_norm": 0.49226638674736023, "learning_rate": 2.847195786307174e-05, "loss": 0.0267, "step": 1650 }, { "epoch": 0.24566206666913315, "grad_norm": 0.49682047963142395, "learning_rate": 2.7876915347185227e-05, "loss": 0.0326, "step": 1660 }, { "epoch": 0.2471419586370195, "grad_norm": 0.42303502559661865, "learning_rate": 2.7285743730533143e-05, "loss": 0.0212, "step": 1670 }, { "epoch": 0.24862185060490585, "grad_norm": 0.5483497977256775, "learning_rate": 2.6698546451030826e-05, "loss": 0.0202, "step": 1680 }, { "epoch": 0.25010174257279216, "grad_norm": 0.5056328177452087, "learning_rate": 2.611542625119975e-05, "loss": 0.0228, "step": 1690 }, { "epoch": 0.25158163454067856, "grad_norm": 0.32099300622940063, "learning_rate": 2.5536485160190482e-05, "loss": 0.0224, "step": 1700 }, { "epoch": 0.2530615265085649, "grad_norm": 0.5951581597328186, "learning_rate": 2.496182447593055e-05, "loss": 0.0272, "step": 1710 }, { "epoch": 0.25454141847645123, "grad_norm": 0.45417091250419617, "learning_rate": 2.4391544747400252e-05, "loss": 0.0227, "step": 1720 }, { "epoch": 0.25602131044433757, "grad_norm": 0.5122156739234924, "learning_rate": 2.3825745757039452e-05, "loss": 0.024, "step": 1730 }, { "epoch": 0.2575012024122239, "grad_norm": 0.4841585159301758, "learning_rate": 2.3264526503288642e-05, "loss": 0.0225, "step": 1740 }, { "epoch": 0.25898109438011024, "grad_norm": 0.43821659684181213, "learning_rate": 2.2707985183266978e-05, "loss": 0.0215, "step": 1750 }, { "epoch": 0.2604609863479966, "grad_norm": 0.42495104670524597, "learning_rate": 2.215621917559062e-05, "loss": 0.0213, "step": 1760 }, { "epoch": 0.2619408783158829, "grad_norm": 0.4166795015335083, "learning_rate": 2.1609325023334377e-05, "loss": 0.018, "step": 1770 }, { "epoch": 0.2634207702837693, "grad_norm": 0.29304540157318115, "learning_rate": 2.1067398417139466e-05, "loss": 0.029, "step": 1780 }, { "epoch": 0.26490066225165565, "grad_norm": 0.6335829496383667, "learning_rate": 2.0530534178470322e-05, "loss": 0.021, "step": 1790 }, { "epoch": 0.266380554219542, "grad_norm": 0.35307204723358154, "learning_rate": 1.9998826243023666e-05, "loss": 0.021, "step": 1800 }, { "epoch": 0.2678604461874283, "grad_norm": 0.4115915894508362, "learning_rate": 1.9472367644292457e-05, "loss": 0.0212, "step": 1810 }, { "epoch": 0.26934033815531466, "grad_norm": 0.5536908507347107, "learning_rate": 1.8951250497287716e-05, "loss": 0.0291, "step": 1820 }, { "epoch": 0.270820230123201, "grad_norm": 0.6118968725204468, "learning_rate": 1.843556598242109e-05, "loss": 0.0351, "step": 1830 }, { "epoch": 0.27230012209108734, "grad_norm": 0.5278725624084473, "learning_rate": 1.792540432955087e-05, "loss": 0.0243, "step": 1840 }, { "epoch": 0.2737800140589737, "grad_norm": 0.365327388048172, "learning_rate": 1.742085480219449e-05, "loss": 0.0303, "step": 1850 }, { "epoch": 0.27525990602686, "grad_norm": 0.5491426587104797, "learning_rate": 1.6922005681909843e-05, "loss": 0.0321, "step": 1860 }, { "epoch": 0.2767397979947464, "grad_norm": 0.5193561315536499, "learning_rate": 1.642894425284867e-05, "loss": 0.0207, "step": 1870 }, { "epoch": 0.27821968996263274, "grad_norm": 0.39915916323661804, "learning_rate": 1.5941756786484335e-05, "loss": 0.0197, "step": 1880 }, { "epoch": 0.2796995819305191, "grad_norm": 0.4479086101055145, "learning_rate": 1.5460528526516804e-05, "loss": 0.0233, "step": 1890 }, { "epoch": 0.2811794738984054, "grad_norm": 0.3311406373977661, "learning_rate": 1.498534367395748e-05, "loss": 0.0202, "step": 1900 }, { "epoch": 0.28265936586629176, "grad_norm": 0.2992391586303711, "learning_rate": 1.4516285372396437e-05, "loss": 0.0264, "step": 1910 }, { "epoch": 0.2841392578341781, "grad_norm": 0.480135440826416, "learning_rate": 1.4053435693454775e-05, "loss": 0.0221, "step": 1920 }, { "epoch": 0.28561914980206443, "grad_norm": 0.27029678225517273, "learning_rate": 1.359687562242437e-05, "loss": 0.0153, "step": 1930 }, { "epoch": 0.28709904176995077, "grad_norm": 0.5432844758033752, "learning_rate": 1.314668504409779e-05, "loss": 0.0207, "step": 1940 }, { "epoch": 0.28857893373783716, "grad_norm": 0.4192567765712738, "learning_rate": 1.2702942728790895e-05, "loss": 0.0168, "step": 1950 }, { "epoch": 0.2900588257057235, "grad_norm": 0.5646419525146484, "learning_rate": 1.2265726318560172e-05, "loss": 0.0269, "step": 1960 }, { "epoch": 0.29153871767360984, "grad_norm": 0.47364342212677, "learning_rate": 1.1835112313617697e-05, "loss": 0.0158, "step": 1970 }, { "epoch": 0.2930186096414962, "grad_norm": 0.3461420238018036, "learning_rate": 1.1411176058945771e-05, "loss": 0.0212, "step": 1980 }, { "epoch": 0.2944985016093825, "grad_norm": 0.34054312109947205, "learning_rate": 1.0993991731113817e-05, "loss": 0.0143, "step": 1990 }, { "epoch": 0.29597839357726885, "grad_norm": 0.48517104983329773, "learning_rate": 1.058363232529948e-05, "loss": 0.0256, "step": 2000 }, { "epoch": 0.2974582855451552, "grad_norm": 0.4162018299102783, "learning_rate": 1.0180169642516718e-05, "loss": 0.033, "step": 2010 }, { "epoch": 0.2989381775130415, "grad_norm": 0.4353783130645752, "learning_rate": 9.783674277052667e-06, "loss": 0.018, "step": 2020 }, { "epoch": 0.3004180694809279, "grad_norm": 0.4821432828903198, "learning_rate": 9.394215604115641e-06, "loss": 0.0136, "step": 2030 }, { "epoch": 0.30189796144881426, "grad_norm": 0.3058512806892395, "learning_rate": 9.011861767696522e-06, "loss": 0.0217, "step": 2040 }, { "epoch": 0.3033778534167006, "grad_norm": 0.4392576813697815, "learning_rate": 8.636679668645536e-06, "loss": 0.0182, "step": 2050 }, { "epoch": 0.30485774538458693, "grad_norm": 0.3896658718585968, "learning_rate": 8.268734952966505e-06, "loss": 0.0189, "step": 2060 }, { "epoch": 0.30633763735247327, "grad_norm": 0.4406259059906006, "learning_rate": 7.908092000330747e-06, "loss": 0.0145, "step": 2070 }, { "epoch": 0.3078175293203596, "grad_norm": 0.357083261013031, "learning_rate": 7.5548139128124364e-06, "loss": 0.0177, "step": 2080 }, { "epoch": 0.30929742128824594, "grad_norm": 0.2061689794063568, "learning_rate": 7.2089625038476606e-06, "loss": 0.0173, "step": 2090 }, { "epoch": 0.3107773132561323, "grad_norm": 0.4433155953884125, "learning_rate": 6.87059828741875e-06, "loss": 0.0188, "step": 2100 }, { "epoch": 0.3122572052240186, "grad_norm": 0.31645211577415466, "learning_rate": 6.539780467466172e-06, "loss": 0.0322, "step": 2110 }, { "epoch": 0.313737097191905, "grad_norm": 0.3037683367729187, "learning_rate": 6.216566927529455e-06, "loss": 0.0177, "step": 2120 }, { "epoch": 0.31521698915979135, "grad_norm": 0.2664813995361328, "learning_rate": 5.9010142206194e-06, "loss": 0.018, "step": 2130 }, { "epoch": 0.3166968811276777, "grad_norm": 0.22976835072040558, "learning_rate": 5.593177559322777e-06, "loss": 0.0145, "step": 2140 }, { "epoch": 0.318176773095564, "grad_norm": 0.3369593024253845, "learning_rate": 5.293110806141832e-06, "loss": 0.0218, "step": 2150 }, { "epoch": 0.31965666506345036, "grad_norm": 0.3195848762989044, "learning_rate": 5.000866464069842e-06, "loss": 0.0194, "step": 2160 }, { "epoch": 0.3211365570313367, "grad_norm": 0.19008630514144897, "learning_rate": 4.716495667404691e-06, "loss": 0.0174, "step": 2170 }, { "epoch": 0.32261644899922304, "grad_norm": 0.42658621072769165, "learning_rate": 4.440048172801725e-06, "loss": 0.0247, "step": 2180 }, { "epoch": 0.3240963409671094, "grad_norm": 0.6167186498641968, "learning_rate": 4.171572350567898e-06, "loss": 0.0193, "step": 2190 }, { "epoch": 0.32557623293499577, "grad_norm": 0.30169373750686646, "learning_rate": 3.9111151761983265e-06, "loss": 0.017, "step": 2200 }, { "epoch": 0.3270561249028821, "grad_norm": 0.3697860836982727, "learning_rate": 3.6587222221569075e-06, "loss": 0.0149, "step": 2210 }, { "epoch": 0.32853601687076844, "grad_norm": 0.43350812792778015, "learning_rate": 3.414437649902491e-06, "loss": 0.0142, "step": 2220 }, { "epoch": 0.3300159088386548, "grad_norm": 0.3299703896045685, "learning_rate": 3.1783042021619026e-06, "loss": 0.0161, "step": 2230 }, { "epoch": 0.3314958008065411, "grad_norm": 0.5686686038970947, "learning_rate": 2.9503631954511833e-06, "loss": 0.0204, "step": 2240 }, { "epoch": 0.33297569277442746, "grad_norm": 0.28825682401657104, "learning_rate": 2.7306545128464202e-06, "loss": 0.0225, "step": 2250 }, { "epoch": 0.3344555847423138, "grad_norm": 0.248334601521492, "learning_rate": 2.5192165970053307e-06, "loss": 0.0122, "step": 2260 }, { "epoch": 0.33593547671020013, "grad_norm": 0.4341728985309601, "learning_rate": 2.316086443440962e-06, "loss": 0.0164, "step": 2270 }, { "epoch": 0.3374153686780865, "grad_norm": 0.23554910719394684, "learning_rate": 2.1212995940485036e-06, "loss": 0.0171, "step": 2280 }, { "epoch": 0.33889526064597286, "grad_norm": 0.4276198744773865, "learning_rate": 1.9348901308864796e-06, "loss": 0.0246, "step": 2290 }, { "epoch": 0.3403751526138592, "grad_norm": 0.35541170835494995, "learning_rate": 1.7568906702134124e-06, "loss": 0.0112, "step": 2300 }, { "epoch": 0.34185504458174554, "grad_norm": 0.4070402681827545, "learning_rate": 1.5873323567808963e-06, "loss": 0.0202, "step": 2310 }, { "epoch": 0.3433349365496319, "grad_norm": 0.16610288619995117, "learning_rate": 1.4262448583841793e-06, "loss": 0.0129, "step": 2320 }, { "epoch": 0.3448148285175182, "grad_norm": 0.32411548495292664, "learning_rate": 1.2736563606711382e-06, "loss": 0.0173, "step": 2330 }, { "epoch": 0.34629472048540455, "grad_norm": 0.3122292160987854, "learning_rate": 1.1295935622106513e-06, "loss": 0.0183, "step": 2340 }, { "epoch": 0.3477746124532909, "grad_norm": 0.3919500708580017, "learning_rate": 9.94081669821062e-07, "loss": 0.0223, "step": 2350 }, { "epoch": 0.3492545044211773, "grad_norm": 0.3418029546737671, "learning_rate": 8.671443941597523e-07, "loss": 0.0172, "step": 2360 }, { "epoch": 0.3507343963890636, "grad_norm": 0.28939440846443176, "learning_rate": 7.488039455744611e-07, "loss": 0.0146, "step": 2370 }, { "epoch": 0.35221428835694996, "grad_norm": 0.407928466796875, "learning_rate": 6.390810302171146e-07, "loss": 0.0244, "step": 2380 }, { "epoch": 0.3536941803248363, "grad_norm": 0.32271912693977356, "learning_rate": 5.379948464208418e-07, "loss": 0.0214, "step": 2390 }, { "epoch": 0.35517407229272263, "grad_norm": 0.4125051498413086, "learning_rate": 4.455630813408329e-07, "loss": 0.0166, "step": 2400 }, { "epoch": 0.35665396426060897, "grad_norm": 0.24020278453826904, "learning_rate": 3.61801907859588e-07, "loss": 0.0174, "step": 2410 }, { "epoch": 0.3581338562284953, "grad_norm": 0.2490530014038086, "learning_rate": 2.867259817571355e-07, "loss": 0.0224, "step": 2420 }, { "epoch": 0.35961374819638164, "grad_norm": 0.1842329204082489, "learning_rate": 2.2034843914670588e-07, "loss": 0.0163, "step": 2430 }, { "epoch": 0.361093640164268, "grad_norm": 0.4745646119117737, "learning_rate": 1.626808941762703e-07, "loss": 0.0207, "step": 2440 }, { "epoch": 0.3625735321321544, "grad_norm": 0.4176884889602661, "learning_rate": 1.1373343699642158e-07, "loss": 0.0159, "step": 2450 }, { "epoch": 0.3640534241000407, "grad_norm": 0.280038982629776, "learning_rate": 7.351463199488651e-08, "loss": 0.017, "step": 2460 }, { "epoch": 0.36553331606792705, "grad_norm": 0.4648924767971039, "learning_rate": 4.203151629798563e-08, "loss": 0.0196, "step": 2470 }, { "epoch": 0.3670132080358134, "grad_norm": 0.42389336228370667, "learning_rate": 1.928959853936263e-08, "loss": 0.0275, "step": 2480 }, { "epoch": 0.3684931000036997, "grad_norm": 0.5853281021118164, "learning_rate": 5.292857896133097e-09, "loss": 0.0182, "step": 2490 }, { "epoch": 0.36997299197158606, "grad_norm": 0.2962166368961334, "learning_rate": 4.374339263035765e-11, "loss": 0.014, "step": 2500 } ], "logging_steps": 10, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }