| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 200, | |
| "global_step": 5070, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.014795088030773782, | |
| "grad_norm": 0.08312089741230011, | |
| "learning_rate": 4.8e-05, | |
| "loss": 25.3001, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.029590176061547565, | |
| "grad_norm": 0.16134510934352875, | |
| "learning_rate": 9.8e-05, | |
| "loss": 25.2653, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.04438526409232135, | |
| "grad_norm": 0.20426703989505768, | |
| "learning_rate": 0.000148, | |
| "loss": 25.1317, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.05918035212309513, | |
| "grad_norm": 0.12948599457740784, | |
| "learning_rate": 0.00019800000000000002, | |
| "loss": 25.1001, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.07397544015386892, | |
| "grad_norm": 0.12529096007347107, | |
| "learning_rate": 0.00019903420523138834, | |
| "loss": 25.1769, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.0887705281846427, | |
| "grad_norm": 0.1009916439652443, | |
| "learning_rate": 0.00019802816901408452, | |
| "loss": 25.173, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.10356561621541648, | |
| "grad_norm": 0.11219477653503418, | |
| "learning_rate": 0.0001970221327967807, | |
| "loss": 25.0957, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.11836070424619026, | |
| "grad_norm": 0.08366899192333221, | |
| "learning_rate": 0.00019601609657947687, | |
| "loss": 25.069, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.13315579227696406, | |
| "grad_norm": 0.05875258892774582, | |
| "learning_rate": 0.00019501006036217304, | |
| "loss": 25.0362, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.14795088030773784, | |
| "grad_norm": 0.06766419112682343, | |
| "learning_rate": 0.00019400402414486922, | |
| "loss": 25.0112, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.16274596833851163, | |
| "grad_norm": 0.07124276459217072, | |
| "learning_rate": 0.0001929979879275654, | |
| "loss": 25.0194, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.1775410563692854, | |
| "grad_norm": 0.09075228869915009, | |
| "learning_rate": 0.0001919919517102616, | |
| "loss": 25.1531, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.19233614440005917, | |
| "grad_norm": 0.06596764922142029, | |
| "learning_rate": 0.00019098591549295774, | |
| "loss": 25.0991, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.20713123243083295, | |
| "grad_norm": 0.06218697875738144, | |
| "learning_rate": 0.00018997987927565392, | |
| "loss": 25.0787, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.22192632046160674, | |
| "grad_norm": 0.07952570170164108, | |
| "learning_rate": 0.00018897384305835012, | |
| "loss": 25.0317, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.23672140849238052, | |
| "grad_norm": 0.06121606379747391, | |
| "learning_rate": 0.0001879678068410463, | |
| "loss": 25.0975, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.25151649652315433, | |
| "grad_norm": 0.06957080215215683, | |
| "learning_rate": 0.00018696177062374247, | |
| "loss": 25.1054, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.2663115845539281, | |
| "grad_norm": 0.06280346214771271, | |
| "learning_rate": 0.00018595573440643862, | |
| "loss": 25.0019, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2811066725847019, | |
| "grad_norm": 0.0642634779214859, | |
| "learning_rate": 0.00018494969818913482, | |
| "loss": 25.1581, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.2959017606154757, | |
| "grad_norm": 0.05680055171251297, | |
| "learning_rate": 0.000183943661971831, | |
| "loss": 25.0826, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.31069684864624947, | |
| "grad_norm": 0.038917265832424164, | |
| "learning_rate": 0.00018293762575452717, | |
| "loss": 25.1511, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.32549193667702325, | |
| "grad_norm": 0.047237247228622437, | |
| "learning_rate": 0.00018193158953722335, | |
| "loss": 25.022, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.34028702470779704, | |
| "grad_norm": 0.04916401579976082, | |
| "learning_rate": 0.00018092555331991953, | |
| "loss": 25.0202, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.3550821127385708, | |
| "grad_norm": 0.06772845983505249, | |
| "learning_rate": 0.0001799195171026157, | |
| "loss": 25.0699, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3698772007693446, | |
| "grad_norm": 0.048369135707616806, | |
| "learning_rate": 0.00017891348088531188, | |
| "loss": 25.0411, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.38467228880011833, | |
| "grad_norm": 0.045925214886665344, | |
| "learning_rate": 0.00017790744466800805, | |
| "loss": 25.1226, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.3994673768308921, | |
| "grad_norm": 0.0681011825799942, | |
| "learning_rate": 0.00017690140845070425, | |
| "loss": 25.0724, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.4142624648616659, | |
| "grad_norm": 0.06605461239814758, | |
| "learning_rate": 0.0001758953722334004, | |
| "loss": 25.0152, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.4290575528924397, | |
| "grad_norm": 0.05168164521455765, | |
| "learning_rate": 0.00017488933601609658, | |
| "loss": 25.1138, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.44385264092321347, | |
| "grad_norm": 0.03383536636829376, | |
| "learning_rate": 0.00017388329979879275, | |
| "loss": 25.1285, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.45864772895398725, | |
| "grad_norm": 0.03839387744665146, | |
| "learning_rate": 0.00017287726358148896, | |
| "loss": 25.0419, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.47344281698476104, | |
| "grad_norm": 0.05215064063668251, | |
| "learning_rate": 0.00017187122736418513, | |
| "loss": 25.0599, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.4882379050155348, | |
| "grad_norm": 0.05809099227190018, | |
| "learning_rate": 0.00017086519114688128, | |
| "loss": 25.0682, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.5030329930463087, | |
| "grad_norm": 0.04687444493174553, | |
| "learning_rate": 0.00016985915492957746, | |
| "loss": 25.1285, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.5178280810770824, | |
| "grad_norm": 0.043396566063165665, | |
| "learning_rate": 0.00016885311871227366, | |
| "loss": 25.0015, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.5326231691078562, | |
| "grad_norm": 0.04217955097556114, | |
| "learning_rate": 0.00016784708249496983, | |
| "loss": 25.0274, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.54741825713863, | |
| "grad_norm": 0.04644334316253662, | |
| "learning_rate": 0.000166841046277666, | |
| "loss": 25.1121, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.5622133451694038, | |
| "grad_norm": 0.06119885668158531, | |
| "learning_rate": 0.00016583501006036218, | |
| "loss": 25.1759, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.5770084332001776, | |
| "grad_norm": 0.05505023151636124, | |
| "learning_rate": 0.00016482897384305836, | |
| "loss": 25.0348, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.5918035212309514, | |
| "grad_norm": 0.036642227321863174, | |
| "learning_rate": 0.00016382293762575454, | |
| "loss": 25.1238, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.6065986092617252, | |
| "grad_norm": 0.020652858540415764, | |
| "learning_rate": 0.0001628169014084507, | |
| "loss": 25.042, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.6213936972924989, | |
| "grad_norm": 0.03491384536027908, | |
| "learning_rate": 0.00016181086519114689, | |
| "loss": 25.03, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.6361887853232727, | |
| "grad_norm": 0.04702286049723625, | |
| "learning_rate": 0.00016080482897384306, | |
| "loss": 25.055, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.6509838733540465, | |
| "grad_norm": 0.04155293107032776, | |
| "learning_rate": 0.00015979879275653924, | |
| "loss": 25.0551, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.6657789613848203, | |
| "grad_norm": 0.09428809583187103, | |
| "learning_rate": 0.0001587927565392354, | |
| "loss": 25.046, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.6805740494155941, | |
| "grad_norm": 0.0833851769566536, | |
| "learning_rate": 0.00015778672032193162, | |
| "loss": 25.1491, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.6953691374463679, | |
| "grad_norm": 0.05562705919146538, | |
| "learning_rate": 0.0001567806841046278, | |
| "loss": 25.1716, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.7101642254771416, | |
| "grad_norm": 0.03471198305487633, | |
| "learning_rate": 0.00015577464788732394, | |
| "loss": 25.0191, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.7249593135079154, | |
| "grad_norm": 0.0596119686961174, | |
| "learning_rate": 0.00015476861167002011, | |
| "loss": 25.0971, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.7397544015386892, | |
| "grad_norm": 0.04158185049891472, | |
| "learning_rate": 0.00015376257545271632, | |
| "loss": 25.0796, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.7545494895694629, | |
| "grad_norm": 0.04319076985120773, | |
| "learning_rate": 0.0001527565392354125, | |
| "loss": 25.0667, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.7693445776002367, | |
| "grad_norm": 0.03822188079357147, | |
| "learning_rate": 0.00015175050301810867, | |
| "loss": 24.9891, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.7841396656310105, | |
| "grad_norm": 0.04330127686262131, | |
| "learning_rate": 0.00015074446680080482, | |
| "loss": 25.0619, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.7989347536617842, | |
| "grad_norm": 0.041904184967279434, | |
| "learning_rate": 0.00014973843058350102, | |
| "loss": 25.0861, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.813729841692558, | |
| "grad_norm": 0.033138617873191833, | |
| "learning_rate": 0.0001487323943661972, | |
| "loss": 25.0589, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.8285249297233318, | |
| "grad_norm": 0.041870083659887314, | |
| "learning_rate": 0.00014772635814889337, | |
| "loss": 25.0909, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.8433200177541056, | |
| "grad_norm": 0.04157957062125206, | |
| "learning_rate": 0.00014672032193158955, | |
| "loss": 24.993, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.8581151057848794, | |
| "grad_norm": 0.07715047895908356, | |
| "learning_rate": 0.00014571428571428572, | |
| "loss": 25.064, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.8729101938156532, | |
| "grad_norm": 0.034074705094099045, | |
| "learning_rate": 0.0001447082494969819, | |
| "loss": 25.0995, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.8877052818464269, | |
| "grad_norm": 0.030254002660512924, | |
| "learning_rate": 0.00014370221327967807, | |
| "loss": 25.059, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.9025003698772007, | |
| "grad_norm": 0.031632065773010254, | |
| "learning_rate": 0.00014269617706237425, | |
| "loss": 25.0555, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.9172954579079745, | |
| "grad_norm": 0.041883744299411774, | |
| "learning_rate": 0.00014169014084507045, | |
| "loss": 25.077, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.9320905459387483, | |
| "grad_norm": 0.03643975779414177, | |
| "learning_rate": 0.0001406841046277666, | |
| "loss": 25.0556, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.9468856339695221, | |
| "grad_norm": 0.0535462461411953, | |
| "learning_rate": 0.00013967806841046277, | |
| "loss": 25.0945, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.9616807220002959, | |
| "grad_norm": 0.0349336676299572, | |
| "learning_rate": 0.00013867203219315898, | |
| "loss": 25.0823, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.9764758100310696, | |
| "grad_norm": 0.041822321712970734, | |
| "learning_rate": 0.00013766599597585515, | |
| "loss": 25.0856, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.9912708980618434, | |
| "grad_norm": 0.04077128693461418, | |
| "learning_rate": 0.0001366599597585513, | |
| "loss": 25.0666, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 1.0059180352123096, | |
| "grad_norm": 0.019722955301404, | |
| "learning_rate": 0.00013565392354124748, | |
| "loss": 25.0682, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.0207131232430833, | |
| "grad_norm": 0.03321698307991028, | |
| "learning_rate": 0.00013464788732394368, | |
| "loss": 25.1371, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 1.0355082112738572, | |
| "grad_norm": 0.041920218616724014, | |
| "learning_rate": 0.00013364185110663985, | |
| "loss": 24.9934, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.0503032993046308, | |
| "grad_norm": 0.03674553334712982, | |
| "learning_rate": 0.00013263581488933603, | |
| "loss": 25.1142, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 1.0650983873354047, | |
| "grad_norm": 0.03412509709596634, | |
| "learning_rate": 0.00013162977867203218, | |
| "loss": 25.0697, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.0798934753661784, | |
| "grad_norm": 0.039162006229162216, | |
| "learning_rate": 0.00013062374245472838, | |
| "loss": 25.0618, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 1.0946885633969523, | |
| "grad_norm": 0.06998290121555328, | |
| "learning_rate": 0.00012961770623742456, | |
| "loss": 25.1045, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.109483651427726, | |
| "grad_norm": 0.029092537239193916, | |
| "learning_rate": 0.00012861167002012073, | |
| "loss": 24.9204, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 1.1242787394584999, | |
| "grad_norm": 0.027615416795015335, | |
| "learning_rate": 0.0001276056338028169, | |
| "loss": 25.066, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.1390738274892735, | |
| "grad_norm": 0.029055271297693253, | |
| "learning_rate": 0.00012659959758551308, | |
| "loss": 25.0113, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 1.1538689155200474, | |
| "grad_norm": 0.03415543958544731, | |
| "learning_rate": 0.00012559356136820926, | |
| "loss": 25.025, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.1686640035508211, | |
| "grad_norm": 0.04656201973557472, | |
| "learning_rate": 0.00012458752515090543, | |
| "loss": 25.0861, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 1.1834590915815948, | |
| "grad_norm": 0.03144572675228119, | |
| "learning_rate": 0.0001235814889336016, | |
| "loss": 25.025, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.1982541796123687, | |
| "grad_norm": 0.036187801510095596, | |
| "learning_rate": 0.0001225754527162978, | |
| "loss": 25.1279, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 1.2130492676431426, | |
| "grad_norm": 0.031620871275663376, | |
| "learning_rate": 0.00012156941649899396, | |
| "loss": 24.9931, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.2278443556739163, | |
| "grad_norm": 0.034921254962682724, | |
| "learning_rate": 0.00012056338028169015, | |
| "loss": 25.0693, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 1.24263944370469, | |
| "grad_norm": 0.03131638467311859, | |
| "learning_rate": 0.00011955734406438632, | |
| "loss": 25.0967, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.2574345317354638, | |
| "grad_norm": 0.03819598630070686, | |
| "learning_rate": 0.0001185513078470825, | |
| "loss": 25.1187, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 1.2722296197662377, | |
| "grad_norm": 0.02910611405968666, | |
| "learning_rate": 0.00011754527162977869, | |
| "loss": 25.1261, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.2870247077970114, | |
| "grad_norm": 0.028234301134943962, | |
| "learning_rate": 0.00011653923541247485, | |
| "loss": 25.1246, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 1.301819795827785, | |
| "grad_norm": 0.03955462947487831, | |
| "learning_rate": 0.00011553319919517103, | |
| "loss": 25.1732, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.316614883858559, | |
| "grad_norm": 0.06937497854232788, | |
| "learning_rate": 0.00011452716297786721, | |
| "loss": 25.1451, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 1.3314099718893329, | |
| "grad_norm": 0.025304747745394707, | |
| "learning_rate": 0.00011352112676056339, | |
| "loss": 25.0555, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.3462050599201065, | |
| "grad_norm": 0.03724834695458412, | |
| "learning_rate": 0.00011251509054325957, | |
| "loss": 25.0753, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 1.3610001479508802, | |
| "grad_norm": 0.033879607915878296, | |
| "learning_rate": 0.00011150905432595573, | |
| "loss": 25.0954, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.375795235981654, | |
| "grad_norm": 0.024116437882184982, | |
| "learning_rate": 0.00011050301810865192, | |
| "loss": 25.0815, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 1.390590324012428, | |
| "grad_norm": 0.0369427315890789, | |
| "learning_rate": 0.00010949698189134809, | |
| "loss": 25.1871, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.4053854120432017, | |
| "grad_norm": 0.03194064274430275, | |
| "learning_rate": 0.00010849094567404428, | |
| "loss": 25.0658, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 1.4201805000739753, | |
| "grad_norm": 0.028080854564905167, | |
| "learning_rate": 0.00010748490945674046, | |
| "loss": 25.072, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.4349755881047492, | |
| "grad_norm": 0.028207939118146896, | |
| "learning_rate": 0.00010647887323943662, | |
| "loss": 25.0783, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 1.4497706761355231, | |
| "grad_norm": 0.028028592467308044, | |
| "learning_rate": 0.0001054728370221328, | |
| "loss": 25.0861, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.4645657641662968, | |
| "grad_norm": 0.03247331827878952, | |
| "learning_rate": 0.00010446680080482898, | |
| "loss": 25.096, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 1.4793608521970705, | |
| "grad_norm": 0.031846486032009125, | |
| "learning_rate": 0.00010346076458752516, | |
| "loss": 25.1463, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.4941559402278444, | |
| "grad_norm": 0.02977578528225422, | |
| "learning_rate": 0.00010245472837022135, | |
| "loss": 25.0884, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 1.5089510282586183, | |
| "grad_norm": 0.028014151379466057, | |
| "learning_rate": 0.0001014486921529175, | |
| "loss": 25.1056, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.523746116289392, | |
| "grad_norm": 0.03780132159590721, | |
| "learning_rate": 0.00010044265593561368, | |
| "loss": 25.0724, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 1.5385412043201656, | |
| "grad_norm": 0.0535607747733593, | |
| "learning_rate": 9.943661971830986e-05, | |
| "loss": 25.078, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.5533362923509395, | |
| "grad_norm": 0.03189392760396004, | |
| "learning_rate": 9.843058350100605e-05, | |
| "loss": 25.0673, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 1.5681313803817134, | |
| "grad_norm": 0.049738895148038864, | |
| "learning_rate": 9.742454728370221e-05, | |
| "loss": 24.9634, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.582926468412487, | |
| "grad_norm": 0.02845073491334915, | |
| "learning_rate": 9.64185110663984e-05, | |
| "loss": 25.0932, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 1.5977215564432607, | |
| "grad_norm": 0.023111697286367416, | |
| "learning_rate": 9.541247484909458e-05, | |
| "loss": 25.0889, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.6125166444740346, | |
| "grad_norm": 0.029083825647830963, | |
| "learning_rate": 9.440643863179075e-05, | |
| "loss": 25.0686, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 1.6273117325048085, | |
| "grad_norm": 0.02137155272066593, | |
| "learning_rate": 9.340040241448693e-05, | |
| "loss": 25.0946, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.6421068205355822, | |
| "grad_norm": 0.0255599282681942, | |
| "learning_rate": 9.23943661971831e-05, | |
| "loss": 24.9854, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 1.6569019085663559, | |
| "grad_norm": 0.029308602213859558, | |
| "learning_rate": 9.138832997987928e-05, | |
| "loss": 25.082, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.6716969965971298, | |
| "grad_norm": 0.03403579071164131, | |
| "learning_rate": 9.038229376257545e-05, | |
| "loss": 25.0498, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 1.6864920846279037, | |
| "grad_norm": 0.03986509144306183, | |
| "learning_rate": 8.937625754527164e-05, | |
| "loss": 25.018, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.7012871726586773, | |
| "grad_norm": 0.03191569447517395, | |
| "learning_rate": 8.83702213279678e-05, | |
| "loss": 24.9913, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 1.716082260689451, | |
| "grad_norm": 0.04105932265520096, | |
| "learning_rate": 8.736418511066399e-05, | |
| "loss": 25.0711, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.730877348720225, | |
| "grad_norm": 0.025144483894109726, | |
| "learning_rate": 8.635814889336017e-05, | |
| "loss": 25.1367, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 1.7456724367509988, | |
| "grad_norm": 0.043975986540317535, | |
| "learning_rate": 8.535211267605634e-05, | |
| "loss": 25.0362, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.7604675247817725, | |
| "grad_norm": 0.03802431747317314, | |
| "learning_rate": 8.434607645875252e-05, | |
| "loss": 25.0193, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 1.7752626128125462, | |
| "grad_norm": 0.04078783094882965, | |
| "learning_rate": 8.33400402414487e-05, | |
| "loss": 25.1258, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.79005770084332, | |
| "grad_norm": 0.03382691368460655, | |
| "learning_rate": 8.233400402414487e-05, | |
| "loss": 25.1644, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 1.804852788874094, | |
| "grad_norm": 0.038282644003629684, | |
| "learning_rate": 8.132796780684106e-05, | |
| "loss": 25.0844, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.8196478769048676, | |
| "grad_norm": 0.035348743200302124, | |
| "learning_rate": 8.032193158953722e-05, | |
| "loss": 25.0073, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 1.8344429649356413, | |
| "grad_norm": 0.02938913181424141, | |
| "learning_rate": 7.931589537223341e-05, | |
| "loss": 25.1201, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.8492380529664152, | |
| "grad_norm": 0.07416849583387375, | |
| "learning_rate": 7.830985915492957e-05, | |
| "loss": 25.0517, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 1.864033140997189, | |
| "grad_norm": 0.05911872908473015, | |
| "learning_rate": 7.730382293762576e-05, | |
| "loss": 25.085, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.8788282290279628, | |
| "grad_norm": 0.03909540921449661, | |
| "learning_rate": 7.629778672032194e-05, | |
| "loss": 25.0817, | |
| "step": 3175 | |
| }, | |
| { | |
| "epoch": 1.8936233170587364, | |
| "grad_norm": 0.03437948599457741, | |
| "learning_rate": 7.529175050301811e-05, | |
| "loss": 25.0958, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.9084184050895103, | |
| "grad_norm": 0.03295779228210449, | |
| "learning_rate": 7.428571428571429e-05, | |
| "loss": 25.075, | |
| "step": 3225 | |
| }, | |
| { | |
| "epoch": 1.9232134931202842, | |
| "grad_norm": 0.028107503429055214, | |
| "learning_rate": 7.327967806841046e-05, | |
| "loss": 25.0909, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.938008581151058, | |
| "grad_norm": 0.03533456474542618, | |
| "learning_rate": 7.227364185110664e-05, | |
| "loss": 25.0143, | |
| "step": 3275 | |
| }, | |
| { | |
| "epoch": 1.9528036691818316, | |
| "grad_norm": 0.030729498714208603, | |
| "learning_rate": 7.126760563380283e-05, | |
| "loss": 25.0024, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.9675987572126055, | |
| "grad_norm": 0.02123214863240719, | |
| "learning_rate": 7.0261569416499e-05, | |
| "loss": 25.1211, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 1.9823938452433794, | |
| "grad_norm": 0.03742063418030739, | |
| "learning_rate": 6.925553319919518e-05, | |
| "loss": 25.1436, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.997188933274153, | |
| "grad_norm": 0.03187941387295723, | |
| "learning_rate": 6.824949698189135e-05, | |
| "loss": 24.9736, | |
| "step": 3375 | |
| }, | |
| { | |
| "epoch": 2.011836070424619, | |
| "grad_norm": 0.032343629747629166, | |
| "learning_rate": 6.724346076458753e-05, | |
| "loss": 25.0978, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 2.026631158455393, | |
| "grad_norm": 0.032404378056526184, | |
| "learning_rate": 6.62374245472837e-05, | |
| "loss": 25.0795, | |
| "step": 3425 | |
| }, | |
| { | |
| "epoch": 2.0414262464861666, | |
| "grad_norm": 0.025061512365937233, | |
| "learning_rate": 6.523138832997988e-05, | |
| "loss": 25.0731, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 2.0562213345169402, | |
| "grad_norm": 0.03708725795149803, | |
| "learning_rate": 6.422535211267607e-05, | |
| "loss": 25.1702, | |
| "step": 3475 | |
| }, | |
| { | |
| "epoch": 2.0710164225477143, | |
| "grad_norm": 0.030988432466983795, | |
| "learning_rate": 6.321931589537223e-05, | |
| "loss": 25.0596, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.085811510578488, | |
| "grad_norm": 0.02645169198513031, | |
| "learning_rate": 6.221327967806842e-05, | |
| "loss": 25.078, | |
| "step": 3525 | |
| }, | |
| { | |
| "epoch": 2.1006065986092617, | |
| "grad_norm": 0.032431282103061676, | |
| "learning_rate": 6.120724346076458e-05, | |
| "loss": 25.0198, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 2.1154016866400354, | |
| "grad_norm": 0.02121451124548912, | |
| "learning_rate": 6.0201207243460764e-05, | |
| "loss": 25.103, | |
| "step": 3575 | |
| }, | |
| { | |
| "epoch": 2.1301967746708095, | |
| "grad_norm": 0.02786272205412388, | |
| "learning_rate": 5.9195171026156946e-05, | |
| "loss": 25.1066, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.144991862701583, | |
| "grad_norm": 0.031123634427785873, | |
| "learning_rate": 5.818913480885312e-05, | |
| "loss": 25.0548, | |
| "step": 3625 | |
| }, | |
| { | |
| "epoch": 2.159786950732357, | |
| "grad_norm": 0.02966328151524067, | |
| "learning_rate": 5.71830985915493e-05, | |
| "loss": 25.072, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 2.1745820387631305, | |
| "grad_norm": 0.03140266612172127, | |
| "learning_rate": 5.617706237424547e-05, | |
| "loss": 25.0909, | |
| "step": 3675 | |
| }, | |
| { | |
| "epoch": 2.1893771267939046, | |
| "grad_norm": 0.02315523661673069, | |
| "learning_rate": 5.5171026156941655e-05, | |
| "loss": 25.0799, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 2.2041722148246783, | |
| "grad_norm": 0.030268024653196335, | |
| "learning_rate": 5.416498993963784e-05, | |
| "loss": 25.031, | |
| "step": 3725 | |
| }, | |
| { | |
| "epoch": 2.218967302855452, | |
| "grad_norm": 0.028935110196471214, | |
| "learning_rate": 5.3158953722334005e-05, | |
| "loss": 25.0775, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 2.2337623908862256, | |
| "grad_norm": 0.025782838463783264, | |
| "learning_rate": 5.215291750503019e-05, | |
| "loss": 25.0107, | |
| "step": 3775 | |
| }, | |
| { | |
| "epoch": 2.2485574789169998, | |
| "grad_norm": 0.032656069844961166, | |
| "learning_rate": 5.1146881287726356e-05, | |
| "loss": 25.0786, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 2.2633525669477734, | |
| "grad_norm": 0.02676152065396309, | |
| "learning_rate": 5.014084507042254e-05, | |
| "loss": 25.1132, | |
| "step": 3825 | |
| }, | |
| { | |
| "epoch": 2.278147654978547, | |
| "grad_norm": 0.05472303926944733, | |
| "learning_rate": 4.9134808853118714e-05, | |
| "loss": 25.0132, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 2.2929427430093208, | |
| "grad_norm": 0.028085239231586456, | |
| "learning_rate": 4.812877263581489e-05, | |
| "loss": 25.0521, | |
| "step": 3875 | |
| }, | |
| { | |
| "epoch": 2.307737831040095, | |
| "grad_norm": 0.030685044825077057, | |
| "learning_rate": 4.712273641851107e-05, | |
| "loss": 25.1104, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 2.3225329190708686, | |
| "grad_norm": 0.03963826224207878, | |
| "learning_rate": 4.611670020120725e-05, | |
| "loss": 25.0079, | |
| "step": 3925 | |
| }, | |
| { | |
| "epoch": 2.3373280071016422, | |
| "grad_norm": 0.025965016335248947, | |
| "learning_rate": 4.511066398390342e-05, | |
| "loss": 25.0729, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 2.352123095132416, | |
| "grad_norm": 0.0241608377546072, | |
| "learning_rate": 4.41046277665996e-05, | |
| "loss": 25.0157, | |
| "step": 3975 | |
| }, | |
| { | |
| "epoch": 2.3669181831631896, | |
| "grad_norm": 0.028290821239352226, | |
| "learning_rate": 4.3098591549295774e-05, | |
| "loss": 25.102, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.3817132711939637, | |
| "grad_norm": 0.0340087004005909, | |
| "learning_rate": 4.2092555331991956e-05, | |
| "loss": 25.028, | |
| "step": 4025 | |
| }, | |
| { | |
| "epoch": 2.3965083592247374, | |
| "grad_norm": 0.04680619016289711, | |
| "learning_rate": 4.108651911468813e-05, | |
| "loss": 25.1001, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 2.411303447255511, | |
| "grad_norm": 0.0324757918715477, | |
| "learning_rate": 4.008048289738431e-05, | |
| "loss": 25.0479, | |
| "step": 4075 | |
| }, | |
| { | |
| "epoch": 2.426098535286285, | |
| "grad_norm": 0.0365682914853096, | |
| "learning_rate": 3.907444668008048e-05, | |
| "loss": 25.0715, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 2.440893623317059, | |
| "grad_norm": 0.030593266710639, | |
| "learning_rate": 3.806841046277666e-05, | |
| "loss": 25.11, | |
| "step": 4125 | |
| }, | |
| { | |
| "epoch": 2.4556887113478325, | |
| "grad_norm": 0.031157072633504868, | |
| "learning_rate": 3.706237424547283e-05, | |
| "loss": 25.0853, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 2.470483799378606, | |
| "grad_norm": 0.03608427569270134, | |
| "learning_rate": 3.6056338028169015e-05, | |
| "loss": 24.9746, | |
| "step": 4175 | |
| }, | |
| { | |
| "epoch": 2.48527888740938, | |
| "grad_norm": 0.025035865604877472, | |
| "learning_rate": 3.505030181086519e-05, | |
| "loss": 25.0303, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 2.500073975440154, | |
| "grad_norm": 0.031874652951955795, | |
| "learning_rate": 3.4044265593561366e-05, | |
| "loss": 25.1169, | |
| "step": 4225 | |
| }, | |
| { | |
| "epoch": 2.5148690634709276, | |
| "grad_norm": 0.02959771454334259, | |
| "learning_rate": 3.303822937625755e-05, | |
| "loss": 25.0229, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 2.5296641515017013, | |
| "grad_norm": 0.029369182884693146, | |
| "learning_rate": 3.2032193158953724e-05, | |
| "loss": 25.1294, | |
| "step": 4275 | |
| }, | |
| { | |
| "epoch": 2.5444592395324754, | |
| "grad_norm": 0.03211820125579834, | |
| "learning_rate": 3.1026156941649906e-05, | |
| "loss": 25.0991, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 2.559254327563249, | |
| "grad_norm": 0.0338427871465683, | |
| "learning_rate": 3.002012072434608e-05, | |
| "loss": 25.0448, | |
| "step": 4325 | |
| }, | |
| { | |
| "epoch": 2.574049415594023, | |
| "grad_norm": 0.03451085835695267, | |
| "learning_rate": 2.9014084507042254e-05, | |
| "loss": 25.0453, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 2.5888445036247965, | |
| "grad_norm": 0.02821294404566288, | |
| "learning_rate": 2.8008048289738433e-05, | |
| "loss": 25.0382, | |
| "step": 4375 | |
| }, | |
| { | |
| "epoch": 2.60363959165557, | |
| "grad_norm": 0.032822128385305405, | |
| "learning_rate": 2.7002012072434608e-05, | |
| "loss": 25.0809, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 2.6184346796863442, | |
| "grad_norm": 0.03818770870566368, | |
| "learning_rate": 2.5995975855130787e-05, | |
| "loss": 25.067, | |
| "step": 4425 | |
| }, | |
| { | |
| "epoch": 2.633229767717118, | |
| "grad_norm": 0.03749888017773628, | |
| "learning_rate": 2.4989939637826962e-05, | |
| "loss": 25.091, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 2.6480248557478916, | |
| "grad_norm": 0.027703339233994484, | |
| "learning_rate": 2.398390342052314e-05, | |
| "loss": 25.1411, | |
| "step": 4475 | |
| }, | |
| { | |
| "epoch": 2.6628199437786657, | |
| "grad_norm": 0.03599558025598526, | |
| "learning_rate": 2.2977867203219317e-05, | |
| "loss": 25.0994, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 2.6776150318094394, | |
| "grad_norm": 0.030554693192243576, | |
| "learning_rate": 2.1971830985915496e-05, | |
| "loss": 25.0371, | |
| "step": 4525 | |
| }, | |
| { | |
| "epoch": 2.692410119840213, | |
| "grad_norm": 0.02407955750823021, | |
| "learning_rate": 2.096579476861167e-05, | |
| "loss": 25.0952, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 2.7072052078709867, | |
| "grad_norm": 0.03127657622098923, | |
| "learning_rate": 1.9959758551307846e-05, | |
| "loss": 25.1274, | |
| "step": 4575 | |
| }, | |
| { | |
| "epoch": 2.7220002959017604, | |
| "grad_norm": 0.03968256711959839, | |
| "learning_rate": 1.8953722334004025e-05, | |
| "loss": 24.9342, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 2.7367953839325345, | |
| "grad_norm": 0.034262653440237045, | |
| "learning_rate": 1.79476861167002e-05, | |
| "loss": 25.1047, | |
| "step": 4625 | |
| }, | |
| { | |
| "epoch": 2.751590471963308, | |
| "grad_norm": 0.022854868322610855, | |
| "learning_rate": 1.6941649899396376e-05, | |
| "loss": 25.1082, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 2.766385559994082, | |
| "grad_norm": 0.03031456097960472, | |
| "learning_rate": 1.5935613682092555e-05, | |
| "loss": 25.0574, | |
| "step": 4675 | |
| }, | |
| { | |
| "epoch": 2.781180648024856, | |
| "grad_norm": 0.033804699778556824, | |
| "learning_rate": 1.4929577464788732e-05, | |
| "loss": 25.0682, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 2.7959757360556297, | |
| "grad_norm": 0.03536156564950943, | |
| "learning_rate": 1.3923541247484911e-05, | |
| "loss": 25.0752, | |
| "step": 4725 | |
| }, | |
| { | |
| "epoch": 2.8107708240864033, | |
| "grad_norm": 0.02477777935564518, | |
| "learning_rate": 1.2917505030181087e-05, | |
| "loss": 25.0747, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 2.825565912117177, | |
| "grad_norm": 0.03472886234521866, | |
| "learning_rate": 1.1911468812877265e-05, | |
| "loss": 25.058, | |
| "step": 4775 | |
| }, | |
| { | |
| "epoch": 2.8403610001479507, | |
| "grad_norm": 0.03604018688201904, | |
| "learning_rate": 1.0905432595573441e-05, | |
| "loss": 25.1504, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 2.855156088178725, | |
| "grad_norm": 0.03106354922056198, | |
| "learning_rate": 9.899396378269618e-06, | |
| "loss": 25.1244, | |
| "step": 4825 | |
| }, | |
| { | |
| "epoch": 2.8699511762094985, | |
| "grad_norm": 0.033049870282411575, | |
| "learning_rate": 8.893360160965795e-06, | |
| "loss": 25.1288, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 2.884746264240272, | |
| "grad_norm": 0.03024250827729702, | |
| "learning_rate": 7.887323943661972e-06, | |
| "loss": 25.0513, | |
| "step": 4875 | |
| }, | |
| { | |
| "epoch": 2.8995413522710463, | |
| "grad_norm": 0.032848477363586426, | |
| "learning_rate": 6.881287726358149e-06, | |
| "loss": 25.0365, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 2.91433644030182, | |
| "grad_norm": 0.03320826590061188, | |
| "learning_rate": 5.875251509054326e-06, | |
| "loss": 25.1055, | |
| "step": 4925 | |
| }, | |
| { | |
| "epoch": 2.9291315283325936, | |
| "grad_norm": 0.033013731241226196, | |
| "learning_rate": 4.869215291750504e-06, | |
| "loss": 25.1015, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 2.9439266163633673, | |
| "grad_norm": 0.026161963120102882, | |
| "learning_rate": 3.86317907444668e-06, | |
| "loss": 25.0681, | |
| "step": 4975 | |
| }, | |
| { | |
| "epoch": 2.958721704394141, | |
| "grad_norm": 0.031280238181352615, | |
| "learning_rate": 2.8571428571428573e-06, | |
| "loss": 25.0965, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.973516792424915, | |
| "grad_norm": 0.02889222465455532, | |
| "learning_rate": 1.8511066398390342e-06, | |
| "loss": 25.0899, | |
| "step": 5025 | |
| }, | |
| { | |
| "epoch": 2.9883118804556887, | |
| "grad_norm": 0.04580514132976532, | |
| "learning_rate": 8.450704225352112e-07, | |
| "loss": 25.1615, | |
| "step": 5050 | |
| } | |
| ], | |
| "logging_steps": 25, | |
| "max_steps": 5070, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.599080012584059e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |