{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9993075751280986, "eval_steps": 50.0, "global_step": 2706, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007385865300281586, "grad_norm": 56.16927719116211, "learning_rate": 6.666666666666667e-07, "loss": 2.2986, "step": 10 }, { "epoch": 0.014771730600563172, "grad_norm": 52.235191345214844, "learning_rate": 1.3333333333333334e-06, "loss": 2.1946, "step": 20 }, { "epoch": 0.02215759590084476, "grad_norm": 14.341564178466797, "learning_rate": 2.0000000000000003e-06, "loss": 1.9498, "step": 30 }, { "epoch": 0.029543461201126345, "grad_norm": 9.743363380432129, "learning_rate": 2.666666666666667e-06, "loss": 1.7599, "step": 40 }, { "epoch": 0.03692932650140793, "grad_norm": 10.694592475891113, "learning_rate": 3.3333333333333333e-06, "loss": 1.77, "step": 50 }, { "epoch": 0.04431519180168952, "grad_norm": 6.704069137573242, "learning_rate": 4.000000000000001e-06, "loss": 1.5609, "step": 60 }, { "epoch": 0.0517010571019711, "grad_norm": 5.9378342628479, "learning_rate": 4.666666666666667e-06, "loss": 1.5134, "step": 70 }, { "epoch": 0.05908692240225269, "grad_norm": 5.821998119354248, "learning_rate": 5.333333333333334e-06, "loss": 1.4795, "step": 80 }, { "epoch": 0.06647278770253427, "grad_norm": 6.466773986816406, "learning_rate": 6e-06, "loss": 1.4666, "step": 90 }, { "epoch": 0.07385865300281585, "grad_norm": 5.7971625328063965, "learning_rate": 6.666666666666667e-06, "loss": 1.4187, "step": 100 }, { "epoch": 0.08124451830309745, "grad_norm": 19.75885581970215, "learning_rate": 7.333333333333333e-06, "loss": 1.4012, "step": 110 }, { "epoch": 0.08863038360337903, "grad_norm": 6.692321300506592, "learning_rate": 8.000000000000001e-06, "loss": 1.3932, "step": 120 }, { "epoch": 0.09601624890366062, "grad_norm": 8.816634178161621, "learning_rate": 8.666666666666668e-06, "loss": 1.3924, "step": 130 }, { "epoch": 0.1034021142039422, "grad_norm": 6.486945152282715, "learning_rate": 9.333333333333334e-06, "loss": 1.3117, "step": 140 }, { "epoch": 0.1107879795042238, "grad_norm": 8.362743377685547, "learning_rate": 1e-05, "loss": 1.2642, "step": 150 }, { "epoch": 0.11817384480450538, "grad_norm": 7.534619331359863, "learning_rate": 1.0666666666666667e-05, "loss": 1.2891, "step": 160 }, { "epoch": 0.12555971010478698, "grad_norm": 7.239850997924805, "learning_rate": 1.1333333333333334e-05, "loss": 1.2664, "step": 170 }, { "epoch": 0.13294557540506854, "grad_norm": 6.650047779083252, "learning_rate": 1.2e-05, "loss": 1.2494, "step": 180 }, { "epoch": 0.14033144070535014, "grad_norm": 5.859479904174805, "learning_rate": 1.2666666666666667e-05, "loss": 1.2844, "step": 190 }, { "epoch": 0.1477173060056317, "grad_norm": 7.5547027587890625, "learning_rate": 1.3333333333333333e-05, "loss": 1.2898, "step": 200 }, { "epoch": 0.1551031713059133, "grad_norm": 8.316688537597656, "learning_rate": 1.4e-05, "loss": 1.1774, "step": 210 }, { "epoch": 0.1624890366061949, "grad_norm": 7.763572692871094, "learning_rate": 1.4666666666666666e-05, "loss": 1.2881, "step": 220 }, { "epoch": 0.16987490190647647, "grad_norm": 7.132694244384766, "learning_rate": 1.5333333333333334e-05, "loss": 1.2359, "step": 230 }, { "epoch": 0.17726076720675807, "grad_norm": 6.167331218719482, "learning_rate": 1.6000000000000003e-05, "loss": 1.1939, "step": 240 }, { "epoch": 0.18464663250703967, "grad_norm": 7.399999141693115, "learning_rate": 1.6666666666666667e-05, "loss": 1.2213, "step": 250 }, { "epoch": 0.19203249780732123, "grad_norm": 5.161776065826416, "learning_rate": 1.7333333333333336e-05, "loss": 1.1747, "step": 260 }, { "epoch": 0.19941836310760283, "grad_norm": 9.162799835205078, "learning_rate": 1.8e-05, "loss": 1.1751, "step": 270 }, { "epoch": 0.2068042284078844, "grad_norm": 6.043332576751709, "learning_rate": 1.866666666666667e-05, "loss": 1.181, "step": 280 }, { "epoch": 0.214190093708166, "grad_norm": 5.533137798309326, "learning_rate": 1.9333333333333333e-05, "loss": 1.1727, "step": 290 }, { "epoch": 0.2215759590084476, "grad_norm": 4.7085862159729, "learning_rate": 2e-05, "loss": 1.3127, "step": 300 }, { "epoch": 0.22896182430872916, "grad_norm": 5.2254815101623535, "learning_rate": 1.9999147543290536e-05, "loss": 1.1853, "step": 310 }, { "epoch": 0.23634768960901076, "grad_norm": 5.015223503112793, "learning_rate": 1.999659031849863e-05, "loss": 1.1846, "step": 320 }, { "epoch": 0.24373355490929235, "grad_norm": 6.505156993865967, "learning_rate": 1.9992328761608965e-05, "loss": 1.1572, "step": 330 }, { "epoch": 0.25111942020957395, "grad_norm": 4.331061840057373, "learning_rate": 1.99863635991801e-05, "loss": 1.0744, "step": 340 }, { "epoch": 0.2585052855098555, "grad_norm": 6.760496616363525, "learning_rate": 1.997869584822058e-05, "loss": 1.1019, "step": 350 }, { "epoch": 0.2658911508101371, "grad_norm": 6.3948235511779785, "learning_rate": 1.9969326816015556e-05, "loss": 1.1073, "step": 360 }, { "epoch": 0.2732770161104187, "grad_norm": 5.087249279022217, "learning_rate": 1.9958258099903894e-05, "loss": 1.0751, "step": 370 }, { "epoch": 0.2806628814107003, "grad_norm": 10.829612731933594, "learning_rate": 1.9945491587005867e-05, "loss": 1.083, "step": 380 }, { "epoch": 0.28804874671098185, "grad_norm": 5.7423176765441895, "learning_rate": 1.9931029453901384e-05, "loss": 1.0639, "step": 390 }, { "epoch": 0.2954346120112634, "grad_norm": 4.613246440887451, "learning_rate": 1.9914874166258927e-05, "loss": 1.0604, "step": 400 }, { "epoch": 0.30282047731154504, "grad_norm": 4.079463005065918, "learning_rate": 1.9897028478415165e-05, "loss": 1.0017, "step": 410 }, { "epoch": 0.3102063426118266, "grad_norm": 4.641962051391602, "learning_rate": 1.9877495432905363e-05, "loss": 1.0263, "step": 420 }, { "epoch": 0.3175922079121082, "grad_norm": 6.14805269241333, "learning_rate": 1.9856278359944664e-05, "loss": 1.0451, "step": 430 }, { "epoch": 0.3249780732123898, "grad_norm": 5.665846824645996, "learning_rate": 1.9833380876860305e-05, "loss": 1.0361, "step": 440 }, { "epoch": 0.3323639385126714, "grad_norm": 7.826813220977783, "learning_rate": 1.9808806887474907e-05, "loss": 0.9795, "step": 450 }, { "epoch": 0.33974980381295294, "grad_norm": 4.955426216125488, "learning_rate": 1.9782560581440894e-05, "loss": 1.0433, "step": 460 }, { "epoch": 0.34713566911323457, "grad_norm": 5.327470302581787, "learning_rate": 1.97546464335262e-05, "loss": 0.9605, "step": 470 }, { "epoch": 0.35452153441351614, "grad_norm": 4.838713645935059, "learning_rate": 1.972506920285136e-05, "loss": 0.9935, "step": 480 }, { "epoch": 0.3619073997137977, "grad_norm": 6.030056476593018, "learning_rate": 1.969383393207813e-05, "loss": 1.0043, "step": 490 }, { "epoch": 0.36929326501407933, "grad_norm": 5.917972087860107, "learning_rate": 1.9660945946549727e-05, "loss": 0.9701, "step": 500 }, { "epoch": 0.3766791303143609, "grad_norm": 4.341779708862305, "learning_rate": 1.962641085338294e-05, "loss": 0.9913, "step": 510 }, { "epoch": 0.38406499561464247, "grad_norm": 4.399661064147949, "learning_rate": 1.959023454051215e-05, "loss": 0.9196, "step": 520 }, { "epoch": 0.39145086091492404, "grad_norm": 4.028534412384033, "learning_rate": 1.9552423175685478e-05, "loss": 0.9369, "step": 530 }, { "epoch": 0.39883672621520566, "grad_norm": 4.389466285705566, "learning_rate": 1.9512983205413253e-05, "loss": 1.0191, "step": 540 }, { "epoch": 0.40622259151548723, "grad_norm": 5.277081489562988, "learning_rate": 1.9471921353868932e-05, "loss": 0.9399, "step": 550 }, { "epoch": 0.4136084568157688, "grad_norm": 4.73630428314209, "learning_rate": 1.9429244621742685e-05, "loss": 0.9588, "step": 560 }, { "epoch": 0.4209943221160504, "grad_norm": 3.3033573627471924, "learning_rate": 1.938496028504784e-05, "loss": 0.9038, "step": 570 }, { "epoch": 0.428380187416332, "grad_norm": 7.80294942855835, "learning_rate": 1.9339075893880382e-05, "loss": 0.9403, "step": 580 }, { "epoch": 0.43576605271661356, "grad_norm": 4.098162651062012, "learning_rate": 1.9291599271131737e-05, "loss": 0.9344, "step": 590 }, { "epoch": 0.4431519180168952, "grad_norm": 3.7808070182800293, "learning_rate": 1.9242538511155024e-05, "loss": 0.8939, "step": 600 }, { "epoch": 0.45053778331717675, "grad_norm": 4.160403728485107, "learning_rate": 1.9191901978385048e-05, "loss": 0.8786, "step": 610 }, { "epoch": 0.4579236486174583, "grad_norm": 3.7800965309143066, "learning_rate": 1.9139698305912227e-05, "loss": 0.8977, "step": 620 }, { "epoch": 0.46530951391773995, "grad_norm": 3.8200621604919434, "learning_rate": 1.9085936394010733e-05, "loss": 0.8793, "step": 630 }, { "epoch": 0.4726953792180215, "grad_norm": 4.453779220581055, "learning_rate": 1.903062540862107e-05, "loss": 0.8813, "step": 640 }, { "epoch": 0.4800812445183031, "grad_norm": 5.653434753417969, "learning_rate": 1.897377477978736e-05, "loss": 0.9544, "step": 650 }, { "epoch": 0.4874671098185847, "grad_norm": 4.868826389312744, "learning_rate": 1.8915394200049597e-05, "loss": 0.8858, "step": 660 }, { "epoch": 0.4948529751188663, "grad_norm": 4.187640190124512, "learning_rate": 1.8855493622791163e-05, "loss": 0.9077, "step": 670 }, { "epoch": 0.5022388404191479, "grad_norm": 4.503122806549072, "learning_rate": 1.8794083260541853e-05, "loss": 0.9278, "step": 680 }, { "epoch": 0.5096247057194294, "grad_norm": 4.902103900909424, "learning_rate": 1.8731173583236737e-05, "loss": 0.8281, "step": 690 }, { "epoch": 0.517010571019711, "grad_norm": 4.273303031921387, "learning_rate": 1.8666775316431113e-05, "loss": 0.8054, "step": 700 }, { "epoch": 0.5243964363199927, "grad_norm": 55.874359130859375, "learning_rate": 1.8600899439471902e-05, "loss": 0.8091, "step": 710 }, { "epoch": 0.5317823016202742, "grad_norm": 4.271385192871094, "learning_rate": 1.8533557183625773e-05, "loss": 0.788, "step": 720 }, { "epoch": 0.5391681669205558, "grad_norm": 5.59772253036499, "learning_rate": 1.8464760030164287e-05, "loss": 0.7942, "step": 730 }, { "epoch": 0.5465540322208374, "grad_norm": 3.724728584289551, "learning_rate": 1.8394519708406454e-05, "loss": 0.8234, "step": 740 }, { "epoch": 0.5539398975211189, "grad_norm": 5.2906036376953125, "learning_rate": 1.8322848193718984e-05, "loss": 0.8143, "step": 750 }, { "epoch": 0.5613257628214006, "grad_norm": 5.114410877227783, "learning_rate": 1.82497577054746e-05, "loss": 0.7946, "step": 760 }, { "epoch": 0.5687116281216821, "grad_norm": 4.730770587921143, "learning_rate": 1.8175260704968716e-05, "loss": 0.7771, "step": 770 }, { "epoch": 0.5760974934219637, "grad_norm": 3.0836727619171143, "learning_rate": 1.809936989329492e-05, "loss": 0.739, "step": 780 }, { "epoch": 0.5834833587222453, "grad_norm": 2.7664663791656494, "learning_rate": 1.802209820917952e-05, "loss": 0.731, "step": 790 }, { "epoch": 0.5908692240225268, "grad_norm": 3.5617446899414062, "learning_rate": 1.7943458826775646e-05, "loss": 0.6807, "step": 800 }, { "epoch": 0.5982550893228085, "grad_norm": 7.652963638305664, "learning_rate": 1.786346515341712e-05, "loss": 0.6883, "step": 810 }, { "epoch": 0.6056409546230901, "grad_norm": 3.5472395420074463, "learning_rate": 1.778213082733266e-05, "loss": 0.6822, "step": 820 }, { "epoch": 0.6130268199233716, "grad_norm": 4.652453899383545, "learning_rate": 1.7699469715320663e-05, "loss": 0.6508, "step": 830 }, { "epoch": 0.6204126852236532, "grad_norm": 3.976405620574951, "learning_rate": 1.7615495910385036e-05, "loss": 0.6007, "step": 840 }, { "epoch": 0.6277985505239349, "grad_norm": 3.0713090896606445, "learning_rate": 1.7530223729332464e-05, "loss": 0.6174, "step": 850 }, { "epoch": 0.6351844158242164, "grad_norm": 4.036540508270264, "learning_rate": 1.7443667710331523e-05, "loss": 0.617, "step": 860 }, { "epoch": 0.642570281124498, "grad_norm": 7.731866836547852, "learning_rate": 1.7355842610434045e-05, "loss": 0.6245, "step": 870 }, { "epoch": 0.6499561464247796, "grad_norm": 4.550940036773682, "learning_rate": 1.7266763403059162e-05, "loss": 0.593, "step": 880 }, { "epoch": 0.6573420117250611, "grad_norm": 2.5473084449768066, "learning_rate": 1.7176445275440468e-05, "loss": 0.5677, "step": 890 }, { "epoch": 0.6647278770253428, "grad_norm": 2.1716835498809814, "learning_rate": 1.7084903626036743e-05, "loss": 0.5452, "step": 900 }, { "epoch": 0.6721137423256244, "grad_norm": 4.398560523986816, "learning_rate": 1.6992154061906637e-05, "loss": 0.5599, "step": 910 }, { "epoch": 0.6794996076259059, "grad_norm": 2.8742692470550537, "learning_rate": 1.6898212396047788e-05, "loss": 0.5004, "step": 920 }, { "epoch": 0.6868854729261875, "grad_norm": 3.202517032623291, "learning_rate": 1.6803094644700878e-05, "loss": 0.5079, "step": 930 }, { "epoch": 0.6942713382264691, "grad_norm": 5.449188232421875, "learning_rate": 1.6706817024618966e-05, "loss": 0.5122, "step": 940 }, { "epoch": 0.7016572035267507, "grad_norm": 5.538541316986084, "learning_rate": 1.6609395950302693e-05, "loss": 0.5241, "step": 950 }, { "epoch": 0.7090430688270323, "grad_norm": 3.380526304244995, "learning_rate": 1.6510848031201755e-05, "loss": 0.4631, "step": 960 }, { "epoch": 0.7164289341273139, "grad_norm": 3.240527629852295, "learning_rate": 1.6411190068883114e-05, "loss": 0.5214, "step": 970 }, { "epoch": 0.7238147994275954, "grad_norm": 16.668127059936523, "learning_rate": 1.63104390541665e-05, "loss": 0.5373, "step": 980 }, { "epoch": 0.731200664727877, "grad_norm": 3.9278078079223633, "learning_rate": 1.6208612164227605e-05, "loss": 0.4789, "step": 990 }, { "epoch": 0.7385865300281587, "grad_norm": 3.5258326530456543, "learning_rate": 1.6105726759669534e-05, "loss": 0.465, "step": 1000 }, { "epoch": 0.7459723953284402, "grad_norm": 2.779311418533325, "learning_rate": 1.600180038156298e-05, "loss": 0.4501, "step": 1010 }, { "epoch": 0.7533582606287218, "grad_norm": 3.857485771179199, "learning_rate": 1.58968507484556e-05, "loss": 0.4519, "step": 1020 }, { "epoch": 0.7607441259290034, "grad_norm": 2.959052324295044, "learning_rate": 1.579089575335117e-05, "loss": 0.4357, "step": 1030 }, { "epoch": 0.7681299912292849, "grad_norm": 1.8662097454071045, "learning_rate": 1.568395346065899e-05, "loss": 0.3633, "step": 1040 }, { "epoch": 0.7755158565295666, "grad_norm": 5.543001174926758, "learning_rate": 1.5576042103114043e-05, "loss": 0.4111, "step": 1050 }, { "epoch": 0.7829017218298481, "grad_norm": 6.083206653594971, "learning_rate": 1.5467180078668485e-05, "loss": 0.3764, "step": 1060 }, { "epoch": 0.7902875871301297, "grad_norm": 2.5218305587768555, "learning_rate": 1.5357385947354945e-05, "loss": 0.374, "step": 1070 }, { "epoch": 0.7976734524304113, "grad_norm": 4.317601680755615, "learning_rate": 1.52466784281222e-05, "loss": 0.3571, "step": 1080 }, { "epoch": 0.8050593177306928, "grad_norm": 2.0782041549682617, "learning_rate": 1.5135076395643765e-05, "loss": 0.3739, "step": 1090 }, { "epoch": 0.8124451830309745, "grad_norm": 2.443953037261963, "learning_rate": 1.5022598877099913e-05, "loss": 0.3607, "step": 1100 }, { "epoch": 0.8198310483312561, "grad_norm": 2.276827573776245, "learning_rate": 1.4909265048933716e-05, "loss": 0.3607, "step": 1110 }, { "epoch": 0.8272169136315376, "grad_norm": 2.808431386947632, "learning_rate": 1.4795094233581616e-05, "loss": 0.3387, "step": 1120 }, { "epoch": 0.8346027789318192, "grad_norm": 2.5325915813446045, "learning_rate": 1.468010589617913e-05, "loss": 0.3172, "step": 1130 }, { "epoch": 0.8419886442321008, "grad_norm": 2.4943833351135254, "learning_rate": 1.4564319641242202e-05, "loss": 0.3193, "step": 1140 }, { "epoch": 0.8493745095323824, "grad_norm": 2.2182066440582275, "learning_rate": 1.4447755209324807e-05, "loss": 0.3118, "step": 1150 }, { "epoch": 0.856760374832664, "grad_norm": 1.920409083366394, "learning_rate": 1.4330432473653369e-05, "loss": 0.3246, "step": 1160 }, { "epoch": 0.8641462401329456, "grad_norm": 3.2863781452178955, "learning_rate": 1.4212371436738518e-05, "loss": 0.3065, "step": 1170 }, { "epoch": 0.8715321054332271, "grad_norm": 2.6266987323760986, "learning_rate": 1.4093592226964863e-05, "loss": 0.2813, "step": 1180 }, { "epoch": 0.8789179707335087, "grad_norm": 2.526742935180664, "learning_rate": 1.3974115095159273e-05, "loss": 0.284, "step": 1190 }, { "epoch": 0.8863038360337904, "grad_norm": 2.1190872192382812, "learning_rate": 1.3853960411138272e-05, "loss": 0.2865, "step": 1200 }, { "epoch": 0.8936897013340719, "grad_norm": 3.0260584354400635, "learning_rate": 1.373314866023517e-05, "loss": 0.3019, "step": 1210 }, { "epoch": 0.9010755666343535, "grad_norm": 4.537729740142822, "learning_rate": 1.3611700439807503e-05, "loss": 0.2946, "step": 1220 }, { "epoch": 0.9084614319346351, "grad_norm": 3.150209903717041, "learning_rate": 1.3489636455725337e-05, "loss": 0.2795, "step": 1230 }, { "epoch": 0.9158472972349166, "grad_norm": 1.6362818479537964, "learning_rate": 1.336697751884111e-05, "loss": 0.2815, "step": 1240 }, { "epoch": 0.9232331625351983, "grad_norm": 1.3282984495162964, "learning_rate": 1.3243744541441578e-05, "loss": 0.2679, "step": 1250 }, { "epoch": 0.9306190278354799, "grad_norm": 4.261312961578369, "learning_rate": 1.3119958533682417e-05, "loss": 0.2634, "step": 1260 }, { "epoch": 0.9380048931357614, "grad_norm": 2.1109001636505127, "learning_rate": 1.2995640600006196e-05, "loss": 0.2566, "step": 1270 }, { "epoch": 0.945390758436043, "grad_norm": 2.4117610454559326, "learning_rate": 1.2870811935544252e-05, "loss": 0.2502, "step": 1280 }, { "epoch": 0.9527766237363247, "grad_norm": 2.0748672485351562, "learning_rate": 1.2745493822503096e-05, "loss": 0.2422, "step": 1290 }, { "epoch": 0.9601624890366062, "grad_norm": 3.0310394763946533, "learning_rate": 1.261970762653598e-05, "loss": 0.2508, "step": 1300 }, { "epoch": 0.9675483543368878, "grad_norm": 2.0341477394104004, "learning_rate": 1.2493474793100249e-05, "loss": 0.2467, "step": 1310 }, { "epoch": 0.9749342196371694, "grad_norm": 1.4582960605621338, "learning_rate": 1.2366816843801066e-05, "loss": 0.2479, "step": 1320 }, { "epoch": 0.9823200849374509, "grad_norm": 3.3330225944519043, "learning_rate": 1.2239755372722169e-05, "loss": 0.2516, "step": 1330 }, { "epoch": 0.9897059502377326, "grad_norm": 1.4349642992019653, "learning_rate": 1.2112312042744263e-05, "loss": 0.2153, "step": 1340 }, { "epoch": 0.9970918155380141, "grad_norm": 2.073673725128174, "learning_rate": 1.1984508581851694e-05, "loss": 0.1858, "step": 1350 }, { "epoch": 1.0051701057101972, "grad_norm": 4.247702598571777, "learning_rate": 1.1856366779428008e-05, "loss": 0.2183, "step": 1360 }, { "epoch": 1.0125559710104788, "grad_norm": 4.242294788360596, "learning_rate": 1.1727908482541048e-05, "loss": 0.2059, "step": 1370 }, { "epoch": 1.0199418363107602, "grad_norm": 2.2901999950408936, "learning_rate": 1.1599155592218234e-05, "loss": 0.2207, "step": 1380 }, { "epoch": 1.0273277016110418, "grad_norm": 1.7798693180084229, "learning_rate": 1.1470130059712607e-05, "loss": 0.1898, "step": 1390 }, { "epoch": 1.0347135669113234, "grad_norm": 1.9651380777359009, "learning_rate": 1.1340853882760343e-05, "loss": 0.1958, "step": 1400 }, { "epoch": 1.042099432211605, "grad_norm": 1.8335607051849365, "learning_rate": 1.1211349101830323e-05, "loss": 0.2201, "step": 1410 }, { "epoch": 1.0494852975118867, "grad_norm": 2.270725965499878, "learning_rate": 1.1081637796366432e-05, "loss": 0.1881, "step": 1420 }, { "epoch": 1.0568711628121683, "grad_norm": 3.337350368499756, "learning_rate": 1.0951742081023196e-05, "loss": 0.2176, "step": 1430 }, { "epoch": 1.0642570281124497, "grad_norm": 3.7382607460021973, "learning_rate": 1.0821684101895429e-05, "loss": 0.2043, "step": 1440 }, { "epoch": 1.0716428934127313, "grad_norm": 1.3422726392745972, "learning_rate": 1.0691486032742522e-05, "loss": 0.1908, "step": 1450 }, { "epoch": 1.079028758713013, "grad_norm": 3.4625842571258545, "learning_rate": 1.0561170071207987e-05, "loss": 0.1747, "step": 1460 }, { "epoch": 1.0864146240132946, "grad_norm": 1.8566938638687134, "learning_rate": 1.0430758435034985e-05, "loss": 0.2003, "step": 1470 }, { "epoch": 1.0938004893135762, "grad_norm": 4.041960716247559, "learning_rate": 1.0300273358278362e-05, "loss": 0.1716, "step": 1480 }, { "epoch": 1.1011863546138578, "grad_norm": 1.5447806119918823, "learning_rate": 1.016973708751395e-05, "loss": 0.1911, "step": 1490 }, { "epoch": 1.1085722199141392, "grad_norm": 1.8091706037521362, "learning_rate": 1.003917187804572e-05, "loss": 0.1687, "step": 1500 }, { "epoch": 1.1159580852144209, "grad_norm": 1.5981247425079346, "learning_rate": 9.908599990111438e-06, "loss": 0.1706, "step": 1510 }, { "epoch": 1.1233439505147025, "grad_norm": 1.5762553215026855, "learning_rate": 9.778043685087488e-06, "loss": 0.1896, "step": 1520 }, { "epoch": 1.130729815814984, "grad_norm": 1.4694616794586182, "learning_rate": 9.64752522169351e-06, "loss": 0.1718, "step": 1530 }, { "epoch": 1.1381156811152657, "grad_norm": 1.4669324159622192, "learning_rate": 9.517066852197469e-06, "loss": 0.1481, "step": 1540 }, { "epoch": 1.1455015464155474, "grad_norm": 2.1808154582977295, "learning_rate": 9.386690818621845e-06, "loss": 0.1878, "step": 1550 }, { "epoch": 1.1528874117158288, "grad_norm": 1.0794235467910767, "learning_rate": 9.256419348951545e-06, "loss": 0.1809, "step": 1560 }, { "epoch": 1.1602732770161104, "grad_norm": 1.1634767055511475, "learning_rate": 9.126274653344249e-06, "loss": 0.1558, "step": 1570 }, { "epoch": 1.167659142316392, "grad_norm": 3.980741024017334, "learning_rate": 8.996278920343753e-06, "loss": 0.1714, "step": 1580 }, { "epoch": 1.1750450076166736, "grad_norm": 1.3018531799316406, "learning_rate": 8.866454313097011e-06, "loss": 0.1476, "step": 1590 }, { "epoch": 1.1824308729169553, "grad_norm": 1.6033530235290527, "learning_rate": 8.736822965575526e-06, "loss": 0.1702, "step": 1600 }, { "epoch": 1.1898167382172367, "grad_norm": 1.6837263107299805, "learning_rate": 8.607406978801692e-06, "loss": 0.1622, "step": 1610 }, { "epoch": 1.1972026035175183, "grad_norm": 4.44855260848999, "learning_rate": 8.478228417080749e-06, "loss": 0.2111, "step": 1620 }, { "epoch": 1.2045884688178, "grad_norm": 1.133955478668213, "learning_rate": 8.349309304239033e-06, "loss": 0.1407, "step": 1630 }, { "epoch": 1.2119743341180815, "grad_norm": 2.430974006652832, "learning_rate": 8.22067161986909e-06, "loss": 0.1502, "step": 1640 }, { "epoch": 1.2193601994183632, "grad_norm": 1.0593976974487305, "learning_rate": 8.092337295582342e-06, "loss": 0.1461, "step": 1650 }, { "epoch": 1.2267460647186448, "grad_norm": 1.5466171503067017, "learning_rate": 7.964328211269949e-06, "loss": 0.1257, "step": 1660 }, { "epoch": 1.2341319300189264, "grad_norm": 3.7850043773651123, "learning_rate": 7.83666619137247e-06, "loss": 0.1237, "step": 1670 }, { "epoch": 1.2415177953192078, "grad_norm": 2.987395763397217, "learning_rate": 7.709373001158989e-06, "loss": 0.135, "step": 1680 }, { "epoch": 1.2489036606194894, "grad_norm": 1.1026815176010132, "learning_rate": 7.582470343016315e-06, "loss": 0.1339, "step": 1690 }, { "epoch": 1.256289525919771, "grad_norm": 0.8675901293754578, "learning_rate": 7.455979852748926e-06, "loss": 0.1187, "step": 1700 }, { "epoch": 1.2636753912200527, "grad_norm": 1.0071134567260742, "learning_rate": 7.3299230958902455e-06, "loss": 0.1288, "step": 1710 }, { "epoch": 1.2710612565203343, "grad_norm": 1.257807731628418, "learning_rate": 7.2043215640259045e-06, "loss": 0.1219, "step": 1720 }, { "epoch": 1.2784471218206157, "grad_norm": 1.5844953060150146, "learning_rate": 7.079196671129613e-06, "loss": 0.1293, "step": 1730 }, { "epoch": 1.2858329871208973, "grad_norm": 1.242968201637268, "learning_rate": 6.954569749912268e-06, "loss": 0.1242, "step": 1740 }, { "epoch": 1.293218852421179, "grad_norm": 6.035883903503418, "learning_rate": 6.8304620481849e-06, "loss": 0.1324, "step": 1750 }, { "epoch": 1.3006047177214606, "grad_norm": 1.1064496040344238, "learning_rate": 6.706894725236118e-06, "loss": 0.113, "step": 1760 }, { "epoch": 1.3079905830217422, "grad_norm": 3.75222110748291, "learning_rate": 6.583888848224628e-06, "loss": 0.1402, "step": 1770 }, { "epoch": 1.3153764483220236, "grad_norm": 2.064958333969116, "learning_rate": 6.4614653885874564e-06, "loss": 0.1354, "step": 1780 }, { "epoch": 1.3227623136223052, "grad_norm": 1.2012087106704712, "learning_rate": 6.339645218464521e-06, "loss": 0.1162, "step": 1790 }, { "epoch": 1.3301481789225869, "grad_norm": 3.533600330352783, "learning_rate": 6.218449107140093e-06, "loss": 0.114, "step": 1800 }, { "epoch": 1.3375340442228685, "grad_norm": 1.0663248300552368, "learning_rate": 6.097897717501829e-06, "loss": 0.1102, "step": 1810 }, { "epoch": 1.34491990952315, "grad_norm": 2.6653411388397217, "learning_rate": 5.978011602517908e-06, "loss": 0.1115, "step": 1820 }, { "epoch": 1.3523057748234317, "grad_norm": 2.8922715187072754, "learning_rate": 5.858811201732952e-06, "loss": 0.1168, "step": 1830 }, { "epoch": 1.3596916401237134, "grad_norm": 0.7805532813072205, "learning_rate": 5.740316837783247e-06, "loss": 0.0985, "step": 1840 }, { "epoch": 1.3670775054239948, "grad_norm": 1.6969873905181885, "learning_rate": 5.622548712931907e-06, "loss": 0.115, "step": 1850 }, { "epoch": 1.3744633707242764, "grad_norm": 1.0871217250823975, "learning_rate": 5.50552690562457e-06, "loss": 0.1077, "step": 1860 }, { "epoch": 1.381849236024558, "grad_norm": 1.25892174243927, "learning_rate": 5.389271367066193e-06, "loss": 0.0974, "step": 1870 }, { "epoch": 1.3892351013248396, "grad_norm": 0.6338607668876648, "learning_rate": 5.273801917819552e-06, "loss": 0.098, "step": 1880 }, { "epoch": 1.3966209666251213, "grad_norm": 0.43911364674568176, "learning_rate": 5.159138244425996e-06, "loss": 0.0965, "step": 1890 }, { "epoch": 1.4040068319254027, "grad_norm": 0.7171842455863953, "learning_rate": 5.045299896049063e-06, "loss": 0.1043, "step": 1900 }, { "epoch": 1.4113926972256843, "grad_norm": 0.7495408058166504, "learning_rate": 4.932306281141531e-06, "loss": 0.1067, "step": 1910 }, { "epoch": 1.418778562525966, "grad_norm": 0.6386808753013611, "learning_rate": 4.82017666413643e-06, "loss": 0.095, "step": 1920 }, { "epoch": 1.4261644278262475, "grad_norm": 0.4710920751094818, "learning_rate": 4.7089301621626285e-06, "loss": 0.0946, "step": 1930 }, { "epoch": 1.4335502931265292, "grad_norm": 2.0037851333618164, "learning_rate": 4.598585741785529e-06, "loss": 0.1343, "step": 1940 }, { "epoch": 1.4409361584268106, "grad_norm": 0.731887936592102, "learning_rate": 4.489162215773437e-06, "loss": 0.1021, "step": 1950 }, { "epoch": 1.4483220237270924, "grad_norm": 1.012526035308838, "learning_rate": 4.380678239890128e-06, "loss": 0.0986, "step": 1960 }, { "epoch": 1.4557078890273738, "grad_norm": 1.7591279745101929, "learning_rate": 4.273152309714231e-06, "loss": 0.0921, "step": 1970 }, { "epoch": 1.4630937543276554, "grad_norm": 0.5881451964378357, "learning_rate": 4.166602757485865e-06, "loss": 0.0889, "step": 1980 }, { "epoch": 1.470479619627937, "grad_norm": 0.6772285103797913, "learning_rate": 4.061047748981171e-06, "loss": 0.0999, "step": 1990 }, { "epoch": 1.4778654849282187, "grad_norm": 1.0633774995803833, "learning_rate": 3.9565052804151925e-06, "loss": 0.0929, "step": 2000 }, { "epoch": 1.4852513502285003, "grad_norm": 0.5887898802757263, "learning_rate": 3.852993175373679e-06, "loss": 0.0929, "step": 2010 }, { "epoch": 1.4926372155287817, "grad_norm": 0.9685658812522888, "learning_rate": 3.7505290817743256e-06, "loss": 0.0932, "step": 2020 }, { "epoch": 1.5000230808290633, "grad_norm": 3.481058120727539, "learning_rate": 3.6491304688579376e-06, "loss": 0.1034, "step": 2030 }, { "epoch": 1.507408946129345, "grad_norm": 1.2913931608200073, "learning_rate": 3.5488146242101018e-06, "loss": 0.0914, "step": 2040 }, { "epoch": 1.5147948114296266, "grad_norm": 0.49071353673934937, "learning_rate": 3.4495986508137847e-06, "loss": 0.097, "step": 2050 }, { "epoch": 1.5221806767299082, "grad_norm": 0.7845070362091064, "learning_rate": 3.3514994641334274e-06, "loss": 0.0895, "step": 2060 }, { "epoch": 1.5295665420301896, "grad_norm": 0.7540778517723083, "learning_rate": 3.254533789231008e-06, "loss": 0.094, "step": 2070 }, { "epoch": 1.5369524073304714, "grad_norm": 0.8221713900566101, "learning_rate": 3.158718157914559e-06, "loss": 0.0857, "step": 2080 }, { "epoch": 1.5443382726307529, "grad_norm": 0.458886057138443, "learning_rate": 3.0640689059196328e-06, "loss": 0.0834, "step": 2090 }, { "epoch": 1.5517241379310345, "grad_norm": 5.687739372253418, "learning_rate": 2.9706021701242127e-06, "loss": 0.0944, "step": 2100 }, { "epoch": 1.559110003231316, "grad_norm": 0.609434962272644, "learning_rate": 2.8783338857975087e-06, "loss": 0.0926, "step": 2110 }, { "epoch": 1.5664958685315975, "grad_norm": 3.346607208251953, "learning_rate": 2.787279783883129e-06, "loss": 0.087, "step": 2120 }, { "epoch": 1.5738817338318793, "grad_norm": 2.047215700149536, "learning_rate": 2.697455388317094e-06, "loss": 0.0807, "step": 2130 }, { "epoch": 1.5812675991321608, "grad_norm": 1.0655306577682495, "learning_rate": 2.6088760133811418e-06, "loss": 0.0857, "step": 2140 }, { "epoch": 1.5886534644324424, "grad_norm": 1.1660749912261963, "learning_rate": 2.5215567610917623e-06, "loss": 0.08, "step": 2150 }, { "epoch": 1.596039329732724, "grad_norm": 0.45875102281570435, "learning_rate": 2.4355125186254547e-06, "loss": 0.0931, "step": 2160 }, { "epoch": 1.6034251950330056, "grad_norm": 1.5347977876663208, "learning_rate": 2.3507579557805803e-06, "loss": 0.083, "step": 2170 }, { "epoch": 1.6108110603332872, "grad_norm": 1.1268221139907837, "learning_rate": 2.26730752247629e-06, "loss": 0.0841, "step": 2180 }, { "epoch": 1.6181969256335687, "grad_norm": 0.4492045044898987, "learning_rate": 2.1851754462889373e-06, "loss": 0.0791, "step": 2190 }, { "epoch": 1.6255827909338505, "grad_norm": 0.9329794645309448, "learning_rate": 2.104375730026406e-06, "loss": 0.0827, "step": 2200 }, { "epoch": 1.632968656234132, "grad_norm": 0.4460253119468689, "learning_rate": 2.024922149340748e-06, "loss": 0.0812, "step": 2210 }, { "epoch": 1.6403545215344135, "grad_norm": 3.0073747634887695, "learning_rate": 1.9468282503795465e-06, "loss": 0.0836, "step": 2220 }, { "epoch": 1.6477403868346951, "grad_norm": 0.7037497758865356, "learning_rate": 1.8701073474764342e-06, "loss": 0.0757, "step": 2230 }, { "epoch": 1.6551262521349765, "grad_norm": 2.326693058013916, "learning_rate": 1.7947725208810962e-06, "loss": 0.0743, "step": 2240 }, { "epoch": 1.6625121174352584, "grad_norm": 0.2990873456001282, "learning_rate": 1.720836614529211e-06, "loss": 0.0799, "step": 2250 }, { "epoch": 1.6698979827355398, "grad_norm": 0.4213595688343048, "learning_rate": 1.648312233852666e-06, "loss": 0.0802, "step": 2260 }, { "epoch": 1.6772838480358214, "grad_norm": 0.5848265290260315, "learning_rate": 1.5772117436304446e-06, "loss": 0.0795, "step": 2270 }, { "epoch": 1.684669713336103, "grad_norm": 0.6411451697349548, "learning_rate": 1.5075472658805301e-06, "loss": 0.0739, "step": 2280 }, { "epoch": 1.6920555786363847, "grad_norm": 0.8654035925865173, "learning_rate": 1.4393306777932192e-06, "loss": 0.0796, "step": 2290 }, { "epoch": 1.6994414439366663, "grad_norm": 0.7043092250823975, "learning_rate": 1.3725736097061537e-06, "loss": 0.0811, "step": 2300 }, { "epoch": 1.7068273092369477, "grad_norm": 1.6693702936172485, "learning_rate": 1.307287443121452e-06, "loss": 0.094, "step": 2310 }, { "epoch": 1.7142131745372293, "grad_norm": 0.33761119842529297, "learning_rate": 1.2434833087652642e-06, "loss": 0.0759, "step": 2320 }, { "epoch": 1.721599039837511, "grad_norm": 0.9389520883560181, "learning_rate": 1.181172084690072e-06, "loss": 0.0727, "step": 2330 }, { "epoch": 1.7289849051377926, "grad_norm": 0.2903837263584137, "learning_rate": 1.120364394420087e-06, "loss": 0.0743, "step": 2340 }, { "epoch": 1.7363707704380742, "grad_norm": 0.325009822845459, "learning_rate": 1.0610706051400165e-06, "loss": 0.0801, "step": 2350 }, { "epoch": 1.7437566357383556, "grad_norm": 0.9325069785118103, "learning_rate": 1.0033008259275635e-06, "loss": 0.0759, "step": 2360 }, { "epoch": 1.7511425010386374, "grad_norm": 1.0802961587905884, "learning_rate": 9.470649060299041e-07, "loss": 0.0779, "step": 2370 }, { "epoch": 1.7585283663389188, "grad_norm": 0.4947347939014435, "learning_rate": 8.923724331844875e-07, "loss": 0.0786, "step": 2380 }, { "epoch": 1.7659142316392005, "grad_norm": 0.47125598788261414, "learning_rate": 8.392327319843985e-07, "loss": 0.0751, "step": 2390 }, { "epoch": 1.773300096939482, "grad_norm": 0.3219301402568817, "learning_rate": 7.876548622886038e-07, "loss": 0.0702, "step": 2400 }, { "epoch": 1.7806859622397635, "grad_norm": 0.602854311466217, "learning_rate": 7.376476176773184e-07, "loss": 0.0772, "step": 2410 }, { "epoch": 1.7880718275400453, "grad_norm": 0.48326513171195984, "learning_rate": 6.89219523952781e-07, "loss": 0.0797, "step": 2420 }, { "epoch": 1.7954576928403267, "grad_norm": 0.5595663189888, "learning_rate": 6.423788376856765e-07, "loss": 0.066, "step": 2430 }, { "epoch": 1.8028435581406084, "grad_norm": 1.7976887226104736, "learning_rate": 5.971335448074611e-07, "loss": 0.0732, "step": 2440 }, { "epoch": 1.81022942344089, "grad_norm": 1.282763957977295, "learning_rate": 5.534913592488322e-07, "loss": 0.0816, "step": 2450 }, { "epoch": 1.8176152887411716, "grad_norm": 0.9589461088180542, "learning_rate": 5.114597216245698e-07, "loss": 0.0798, "step": 2460 }, { "epoch": 1.8250011540414532, "grad_norm": 0.43628719449043274, "learning_rate": 4.7104579796497405e-07, "loss": 0.0835, "step": 2470 }, { "epoch": 1.8323870193417346, "grad_norm": 0.49431607127189636, "learning_rate": 4.3225647849411854e-07, "loss": 0.074, "step": 2480 }, { "epoch": 1.8397728846420165, "grad_norm": 0.9135465025901794, "learning_rate": 3.9509837645513306e-07, "loss": 0.0736, "step": 2490 }, { "epoch": 1.847158749942298, "grad_norm": 0.6499918103218079, "learning_rate": 3.595778269826966e-07, "loss": 0.0723, "step": 2500 }, { "epoch": 1.8545446152425795, "grad_norm": 1.299659013748169, "learning_rate": 3.257008860229527e-07, "loss": 0.0735, "step": 2510 }, { "epoch": 1.8619304805428611, "grad_norm": 0.7049327492713928, "learning_rate": 2.9347332930102503e-07, "loss": 0.0713, "step": 2520 }, { "epoch": 1.8693163458431425, "grad_norm": 0.29024580121040344, "learning_rate": 2.6290065133630637e-07, "loss": 0.0774, "step": 2530 }, { "epoch": 1.8767022111434244, "grad_norm": 0.7386340498924255, "learning_rate": 2.3398806450568577e-07, "loss": 0.0739, "step": 2540 }, { "epoch": 1.8840880764437058, "grad_norm": 0.5153611898422241, "learning_rate": 2.067404981548915e-07, "loss": 0.0702, "step": 2550 }, { "epoch": 1.8914739417439874, "grad_norm": 1.2201671600341797, "learning_rate": 1.811625977580722e-07, "loss": 0.082, "step": 2560 }, { "epoch": 1.898859807044269, "grad_norm": 0.7881399989128113, "learning_rate": 1.5725872412579058e-07, "loss": 0.0677, "step": 2570 }, { "epoch": 1.9062456723445507, "grad_norm": 0.3312283456325531, "learning_rate": 1.3503295266153903e-07, "loss": 0.0756, "step": 2580 }, { "epoch": 1.9136315376448323, "grad_norm": 0.4955926239490509, "learning_rate": 1.14489072666919e-07, "loss": 0.0692, "step": 2590 }, { "epoch": 1.9210174029451137, "grad_norm": 0.45805656909942627, "learning_rate": 9.563058669559755e-08, "loss": 0.0753, "step": 2600 }, { "epoch": 1.9284032682453955, "grad_norm": 0.5555469393730164, "learning_rate": 7.846070995615518e-08, "loss": 0.0716, "step": 2610 }, { "epoch": 1.935789133545677, "grad_norm": 0.5252045392990112, "learning_rate": 6.298236976391537e-08, "loss": 0.0772, "step": 2620 }, { "epoch": 1.9431749988459586, "grad_norm": 1.8346993923187256, "learning_rate": 4.919820504186934e-08, "loss": 0.0764, "step": 2630 }, { "epoch": 1.9505608641462402, "grad_norm": 0.4004700481891632, "learning_rate": 3.711056587075712e-08, "loss": 0.0739, "step": 2640 }, { "epoch": 1.9579467294465216, "grad_norm": 1.077645182609558, "learning_rate": 2.672151308840243e-08, "loss": 0.07, "step": 2650 }, { "epoch": 1.9653325947468034, "grad_norm": 0.6247801184654236, "learning_rate": 1.8032817938352653e-08, "loss": 0.0666, "step": 2660 }, { "epoch": 1.9727184600470848, "grad_norm": 0.4016879200935364, "learning_rate": 1.1045961767904844e-08, "loss": 0.0695, "step": 2670 }, { "epoch": 1.9801043253473665, "grad_norm": 0.5175566673278809, "learning_rate": 5.7621357755432984e-09, "loss": 0.0722, "step": 2680 }, { "epoch": 1.987490190647648, "grad_norm": 0.5656958222389221, "learning_rate": 2.1822408078508994e-09, "loss": 0.0728, "step": 2690 }, { "epoch": 1.9948760559479295, "grad_norm": 0.5182742476463318, "learning_rate": 3.068872059253103e-10, "loss": 0.0727, "step": 2700 } ], "logging_steps": 10, "max_steps": 2706, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.22919470739456e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }