| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 869, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0011507479861910242, |
| "grad_norm": 0.4465322789934592, |
| "learning_rate": 2.2988505747126437e-06, |
| "loss": 1.3558, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.005753739930955121, |
| "grad_norm": 0.4729686939623816, |
| "learning_rate": 1.1494252873563218e-05, |
| "loss": 1.3725, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.011507479861910242, |
| "grad_norm": 0.5386929459384674, |
| "learning_rate": 2.2988505747126437e-05, |
| "loss": 1.3751, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.01726121979286536, |
| "grad_norm": 0.1777783055450745, |
| "learning_rate": 3.4482758620689657e-05, |
| "loss": 1.3698, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.023014959723820484, |
| "grad_norm": 0.16074111506146127, |
| "learning_rate": 4.597701149425287e-05, |
| "loss": 1.308, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.028768699654775604, |
| "grad_norm": 0.13109040303886016, |
| "learning_rate": 5.747126436781609e-05, |
| "loss": 1.3108, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.03452243958573072, |
| "grad_norm": 0.1399833196718492, |
| "learning_rate": 6.896551724137931e-05, |
| "loss": 1.2826, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.04027617951668585, |
| "grad_norm": 0.1280676761539199, |
| "learning_rate": 8.045977011494253e-05, |
| "loss": 1.2773, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.04602991944764097, |
| "grad_norm": 0.12005806159718177, |
| "learning_rate": 9.195402298850575e-05, |
| "loss": 1.2377, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.05178365937859609, |
| "grad_norm": 0.10506239475224881, |
| "learning_rate": 0.00010344827586206898, |
| "loss": 1.2296, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.05753739930955121, |
| "grad_norm": 0.0819749468856537, |
| "learning_rate": 0.00011494252873563218, |
| "loss": 1.197, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.06329113924050633, |
| "grad_norm": 0.08509500503388337, |
| "learning_rate": 0.0001264367816091954, |
| "loss": 1.1709, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.06904487917146145, |
| "grad_norm": 0.07393378633841856, |
| "learning_rate": 0.00013793103448275863, |
| "loss": 1.1692, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.07479861910241657, |
| "grad_norm": 0.06625962718107734, |
| "learning_rate": 0.00014942528735632183, |
| "loss": 1.2001, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.0805523590333717, |
| "grad_norm": 0.07029239664213276, |
| "learning_rate": 0.00016091954022988506, |
| "loss": 1.1746, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.08630609896432681, |
| "grad_norm": 0.0734118145247187, |
| "learning_rate": 0.00017241379310344826, |
| "loss": 1.1477, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.09205983889528194, |
| "grad_norm": 0.07794825571468995, |
| "learning_rate": 0.0001839080459770115, |
| "loss": 1.1657, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.09781357882623705, |
| "grad_norm": 0.08512834878649493, |
| "learning_rate": 0.00019540229885057472, |
| "loss": 1.1775, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.10356731875719218, |
| "grad_norm": 0.081601122042688, |
| "learning_rate": 0.00019999273737707646, |
| "loss": 1.1584, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.1093210586881473, |
| "grad_norm": 0.09077503220224638, |
| "learning_rate": 0.00019994835850163924, |
| "loss": 1.1332, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.11507479861910241, |
| "grad_norm": 0.08933595254839481, |
| "learning_rate": 0.00019986365342513265, |
| "loss": 1.1341, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.12082853855005754, |
| "grad_norm": 0.0927003880602526, |
| "learning_rate": 0.00019973865632354516, |
| "loss": 1.1355, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.12658227848101267, |
| "grad_norm": 0.07791130311867683, |
| "learning_rate": 0.00019957341762950344, |
| "loss": 1.124, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.1323360184119678, |
| "grad_norm": 0.0790112418591026, |
| "learning_rate": 0.0001993680040119244, |
| "loss": 1.1307, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.1380897583429229, |
| "grad_norm": 0.09227634318705202, |
| "learning_rate": 0.000199122498349116, |
| "loss": 1.098, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.14384349827387802, |
| "grad_norm": 0.08290995925084253, |
| "learning_rate": 0.0001988369996953386, |
| "loss": 1.1347, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.14959723820483314, |
| "grad_norm": 0.08548470364290049, |
| "learning_rate": 0.00019851162324083932, |
| "loss": 1.1418, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.15535097813578827, |
| "grad_norm": 0.08440345287709118, |
| "learning_rate": 0.0001981465002653763, |
| "loss": 1.135, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.1611047180667434, |
| "grad_norm": 0.08444559508248843, |
| "learning_rate": 0.00019774177808525113, |
| "loss": 1.1273, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.1668584579976985, |
| "grad_norm": 0.0840175874952237, |
| "learning_rate": 0.00019729761999387103, |
| "loss": 1.1129, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.17261219792865362, |
| "grad_norm": 0.07480610551808778, |
| "learning_rate": 0.000196814205195865, |
| "loss": 1.1182, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.17836593785960875, |
| "grad_norm": 0.07608916685245905, |
| "learning_rate": 0.00019629172873477995, |
| "loss": 1.1017, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.18411967779056387, |
| "grad_norm": 0.09515413227881493, |
| "learning_rate": 0.00019573040141438624, |
| "loss": 1.1281, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.189873417721519, |
| "grad_norm": 0.07493301926746701, |
| "learning_rate": 0.00019513044971362494, |
| "loss": 1.1213, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.1956271576524741, |
| "grad_norm": 0.07980883594622108, |
| "learning_rate": 0.00019449211569523, |
| "loss": 1.1289, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.20138089758342922, |
| "grad_norm": 0.08091237279913081, |
| "learning_rate": 0.00019381565690806328, |
| "loss": 1.1222, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.20713463751438435, |
| "grad_norm": 0.07635483082524008, |
| "learning_rate": 0.00019310134628320114, |
| "loss": 1.1173, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.21288837744533948, |
| "grad_norm": 0.07974746871493653, |
| "learning_rate": 0.00019234947202381486, |
| "loss": 1.1074, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.2186421173762946, |
| "grad_norm": 0.07924995122930302, |
| "learning_rate": 0.00019156033748888917, |
| "loss": 1.1193, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.2243958573072497, |
| "grad_norm": 0.08344491509033138, |
| "learning_rate": 0.000190734261070826, |
| "loss": 1.1162, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.23014959723820483, |
| "grad_norm": 0.0809591767505205, |
| "learning_rate": 0.00018987157606698235, |
| "loss": 1.1085, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.23590333716915995, |
| "grad_norm": 0.08094087212789886, |
| "learning_rate": 0.00018897263054519498, |
| "loss": 1.0996, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.24165707710011508, |
| "grad_norm": 0.07353474154724883, |
| "learning_rate": 0.0001880377872033451, |
| "loss": 1.1301, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.2474108170310702, |
| "grad_norm": 0.0784057120949034, |
| "learning_rate": 0.00018706742322302064, |
| "loss": 1.1149, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.25316455696202533, |
| "grad_norm": 0.07563014633070095, |
| "learning_rate": 0.0001860619301173347, |
| "loss": 1.1084, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.25891829689298046, |
| "grad_norm": 0.0732650161077316, |
| "learning_rate": 0.00018502171357296144, |
| "loss": 1.0934, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.2646720368239356, |
| "grad_norm": 0.07642524798153359, |
| "learning_rate": 0.0001839471932864537, |
| "loss": 1.1279, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.27042577675489066, |
| "grad_norm": 0.07924123360978817, |
| "learning_rate": 0.0001828388027949078, |
| "loss": 1.1164, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.2761795166858458, |
| "grad_norm": 0.08291524400368479, |
| "learning_rate": 0.0001816969893010442, |
| "loss": 1.1426, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.2819332566168009, |
| "grad_norm": 0.09503646536490909, |
| "learning_rate": 0.00018052221349277442, |
| "loss": 1.1007, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.28768699654775604, |
| "grad_norm": 0.08546247272232582, |
| "learning_rate": 0.0001793149493573271, |
| "loss": 1.0996, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.29344073647871116, |
| "grad_norm": 0.07292435172406726, |
| "learning_rate": 0.00017807568399000822, |
| "loss": 1.1297, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.2991944764096663, |
| "grad_norm": 0.08793050679899453, |
| "learning_rate": 0.0001768049173976727, |
| "loss": 1.128, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.3049482163406214, |
| "grad_norm": 0.08228996895895493, |
| "learning_rate": 0.0001755031622969862, |
| "loss": 1.1067, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.31070195627157654, |
| "grad_norm": 0.08034368274055034, |
| "learning_rate": 0.00017417094390755934, |
| "loss": 1.1399, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.31645569620253167, |
| "grad_norm": 0.07355220396446509, |
| "learning_rate": 0.00017280879974003707, |
| "loss": 1.105, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.3222094361334868, |
| "grad_norm": 0.07695547066904129, |
| "learning_rate": 0.0001714172793792291, |
| "loss": 1.1282, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.32796317606444186, |
| "grad_norm": 0.07242274233224258, |
| "learning_rate": 0.0001699969442623686, |
| "loss": 1.1138, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.333716915995397, |
| "grad_norm": 0.08486742993293918, |
| "learning_rate": 0.0001685483674525891, |
| "loss": 1.1, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.3394706559263521, |
| "grad_norm": 0.0747389855159797, |
| "learning_rate": 0.0001670721334077103, |
| "loss": 1.1016, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.34522439585730724, |
| "grad_norm": 0.07533809685344381, |
| "learning_rate": 0.00016556883774442675, |
| "loss": 1.1181, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.35097813578826237, |
| "grad_norm": 0.07699490502726904, |
| "learning_rate": 0.00016403908699799425, |
| "loss": 1.0986, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.3567318757192175, |
| "grad_norm": 0.07297020155234164, |
| "learning_rate": 0.00016248349837751062, |
| "loss": 1.0951, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.3624856156501726, |
| "grad_norm": 0.08519497936629768, |
| "learning_rate": 0.0001609026995168904, |
| "loss": 1.1148, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.36823935558112775, |
| "grad_norm": 0.07458638521829272, |
| "learning_rate": 0.00015929732822163287, |
| "loss": 1.1165, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.3739930955120829, |
| "grad_norm": 0.08000996094690015, |
| "learning_rate": 0.00015766803221148673, |
| "loss": 1.1072, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.379746835443038, |
| "grad_norm": 0.08213267404173417, |
| "learning_rate": 0.00015601546885911404, |
| "loss": 1.1078, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.38550057537399307, |
| "grad_norm": 0.08374970807354172, |
| "learning_rate": 0.00015434030492486023, |
| "loss": 1.0842, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.3912543153049482, |
| "grad_norm": 0.07669296380369849, |
| "learning_rate": 0.0001526432162877356, |
| "loss": 1.1124, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.3970080552359033, |
| "grad_norm": 0.07376471868371856, |
| "learning_rate": 0.00015092488767271857, |
| "loss": 1.0882, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.40276179516685845, |
| "grad_norm": 0.08896375142853388, |
| "learning_rate": 0.00014918601237448923, |
| "loss": 1.1217, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.4085155350978136, |
| "grad_norm": 0.07333518083670185, |
| "learning_rate": 0.00014742729197770552, |
| "loss": 1.0853, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.4142692750287687, |
| "grad_norm": 0.07541021900185346, |
| "learning_rate": 0.00014564943607393459, |
| "loss": 1.1078, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.42002301495972383, |
| "grad_norm": 0.07240408031237026, |
| "learning_rate": 0.00014385316197535372, |
| "loss": 1.0963, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.42577675489067895, |
| "grad_norm": 0.07726369320922719, |
| "learning_rate": 0.00014203919442533597, |
| "loss": 1.082, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.4315304948216341, |
| "grad_norm": 0.07947926565287182, |
| "learning_rate": 0.00014020826530603776, |
| "loss": 1.0775, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.4372842347525892, |
| "grad_norm": 0.0835226760212179, |
| "learning_rate": 0.0001383611133431062, |
| "loss": 1.1005, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.4430379746835443, |
| "grad_norm": 0.07628153739619549, |
| "learning_rate": 0.00013649848380762513, |
| "loss": 1.1139, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.4487917146144994, |
| "grad_norm": 0.07650525696735216, |
| "learning_rate": 0.00013462112821542016, |
| "loss": 1.1171, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.45454545454545453, |
| "grad_norm": 0.07569647253340571, |
| "learning_rate": 0.0001327298040238446, |
| "loss": 1.0918, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.46029919447640966, |
| "grad_norm": 0.07572977420046635, |
| "learning_rate": 0.0001308252743261675, |
| "loss": 1.1018, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.4660529344073648, |
| "grad_norm": 0.084906098343464, |
| "learning_rate": 0.00012890830754368855, |
| "loss": 1.1145, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.4718066743383199, |
| "grad_norm": 0.08856218479901995, |
| "learning_rate": 0.00012697967711570242, |
| "loss": 1.1049, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.47756041426927504, |
| "grad_norm": 0.07498278458233507, |
| "learning_rate": 0.00012504016118743935, |
| "loss": 1.1061, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.48331415420023016, |
| "grad_norm": 0.07551499247429329, |
| "learning_rate": 0.00012309054229610623, |
| "loss": 1.0858, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.4890678941311853, |
| "grad_norm": 0.07573682259745337, |
| "learning_rate": 0.00012113160705515625, |
| "loss": 1.0917, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.4948216340621404, |
| "grad_norm": 0.09077558081676396, |
| "learning_rate": 0.0001191641458369136, |
| "loss": 1.118, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.5005753739930955, |
| "grad_norm": 0.07771540883960386, |
| "learning_rate": 0.00011718895245368167, |
| "loss": 1.1056, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.5063291139240507, |
| "grad_norm": 0.08583872028129896, |
| "learning_rate": 0.00011520682383746333, |
| "loss": 1.0792, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.5120828538550057, |
| "grad_norm": 0.07563432127302055, |
| "learning_rate": 0.00011321855971842243, |
| "loss": 1.1024, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.5178365937859609, |
| "grad_norm": 0.07968899236090986, |
| "learning_rate": 0.00011122496230221645, |
| "loss": 1.1037, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.523590333716916, |
| "grad_norm": 0.08595525245941885, |
| "learning_rate": 0.00010922683594633021, |
| "loss": 1.0991, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.5293440736478712, |
| "grad_norm": 0.0870194667650804, |
| "learning_rate": 0.0001072249868355415, |
| "loss": 1.1193, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.5350978135788262, |
| "grad_norm": 0.07553504230608732, |
| "learning_rate": 0.0001052202226566494, |
| "loss": 1.0876, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.5408515535097813, |
| "grad_norm": 0.08165627721996481, |
| "learning_rate": 0.00010321335227259661, |
| "loss": 1.0855, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.5466052934407365, |
| "grad_norm": 0.07548535900683774, |
| "learning_rate": 0.0001012051853961172, |
| "loss": 1.1067, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.5523590333716916, |
| "grad_norm": 0.08254614481275399, |
| "learning_rate": 9.919653226304148e-05, |
| "loss": 1.1185, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.5581127733026467, |
| "grad_norm": 0.07261119139795047, |
| "learning_rate": 9.718820330538998e-05, |
| "loss": 1.0913, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.5638665132336018, |
| "grad_norm": 0.07666759909446064, |
| "learning_rate": 9.51810088243879e-05, |
| "loss": 1.1058, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.569620253164557, |
| "grad_norm": 0.0795402304145169, |
| "learning_rate": 9.317575866353292e-05, |
| "loss": 1.1249, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.5753739930955121, |
| "grad_norm": 0.07900824948192435, |
| "learning_rate": 9.117326188184695e-05, |
| "loss": 1.1043, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.5811277330264673, |
| "grad_norm": 0.07403897844355141, |
| "learning_rate": 8.917432642744518e-05, |
| "loss": 1.0858, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.5868814729574223, |
| "grad_norm": 0.07943002355447462, |
| "learning_rate": 8.717975881155261e-05, |
| "loss": 1.0842, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.5926352128883774, |
| "grad_norm": 0.07687109951447574, |
| "learning_rate": 8.519036378310096e-05, |
| "loss": 1.1046, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.5983889528193326, |
| "grad_norm": 0.08051861745508501, |
| "learning_rate": 8.320694400403606e-05, |
| "loss": 1.1176, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.6041426927502876, |
| "grad_norm": 0.07385921126253402, |
| "learning_rate": 8.123029972546781e-05, |
| "loss": 1.1237, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.6098964326812428, |
| "grad_norm": 0.0758570951173778, |
| "learning_rate": 7.926122846479224e-05, |
| "loss": 1.102, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.6156501726121979, |
| "grad_norm": 0.07600528797089706, |
| "learning_rate": 7.730052468391725e-05, |
| "loss": 1.0911, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.6214039125431531, |
| "grad_norm": 0.07938766187013545, |
| "learning_rate": 7.534897946872042e-05, |
| "loss": 1.0867, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.6271576524741082, |
| "grad_norm": 0.07930610991929318, |
| "learning_rate": 7.340738020986961e-05, |
| "loss": 1.1205, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.6329113924050633, |
| "grad_norm": 0.08212529428864859, |
| "learning_rate": 7.147651028513383e-05, |
| "loss": 1.0931, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.6386651323360184, |
| "grad_norm": 0.07731536951924715, |
| "learning_rate": 6.955714874331387e-05, |
| "loss": 1.0976, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.6444188722669736, |
| "grad_norm": 0.07561598587821595, |
| "learning_rate": 6.765006998991888e-05, |
| "loss": 1.0842, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.6501726121979287, |
| "grad_norm": 0.07405869311005314, |
| "learning_rate": 6.575604347471695e-05, |
| "loss": 1.0637, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.6559263521288837, |
| "grad_norm": 0.07183365685581813, |
| "learning_rate": 6.387583338128471e-05, |
| "loss": 1.114, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.6616800920598389, |
| "grad_norm": 0.07244649469955952, |
| "learning_rate": 6.201019831868208e-05, |
| "loss": 1.09, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.667433831990794, |
| "grad_norm": 0.07512222335465936, |
| "learning_rate": 6.015989101537586e-05, |
| "loss": 1.1117, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.6731875719217492, |
| "grad_norm": 0.0773183295000779, |
| "learning_rate": 5.83256580155362e-05, |
| "loss": 1.0846, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.6789413118527042, |
| "grad_norm": 0.0746381622325705, |
| "learning_rate": 5.6508239377828034e-05, |
| "loss": 1.0942, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.6846950517836594, |
| "grad_norm": 0.07207675237237808, |
| "learning_rate": 5.470836837681954e-05, |
| "loss": 1.0728, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.6904487917146145, |
| "grad_norm": 0.07598696177120805, |
| "learning_rate": 5.2926771207127254e-05, |
| "loss": 1.0954, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.6962025316455697, |
| "grad_norm": 0.07327699224620024, |
| "learning_rate": 5.116416669041843e-05, |
| "loss": 1.1158, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.7019562715765247, |
| "grad_norm": 0.07840719027532041, |
| "learning_rate": 4.9421265985387476e-05, |
| "loss": 1.0738, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.7077100115074798, |
| "grad_norm": 0.0769085876990078, |
| "learning_rate": 4.7698772300824756e-05, |
| "loss": 1.0958, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.713463751438435, |
| "grad_norm": 0.07132843178750269, |
| "learning_rate": 4.599738061189244e-05, |
| "loss": 1.1058, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.7192174913693901, |
| "grad_norm": 0.07713712307040285, |
| "learning_rate": 4.4317777379722866e-05, |
| "loss": 1.1028, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.7249712313003452, |
| "grad_norm": 0.07758438004748038, |
| "learning_rate": 4.266064027445155e-05, |
| "loss": 1.0899, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.7307249712313003, |
| "grad_norm": 0.07854544913701697, |
| "learning_rate": 4.102663790179764e-05, |
| "loss": 1.0761, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.7364787111622555, |
| "grad_norm": 0.0751537421287673, |
| "learning_rate": 3.941642953330103e-05, |
| "loss": 1.0935, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.7422324510932106, |
| "grad_norm": 0.07536112057903382, |
| "learning_rate": 3.7830664840326145e-05, |
| "loss": 1.1316, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.7479861910241657, |
| "grad_norm": 0.0757154673994382, |
| "learning_rate": 3.6269983631938475e-05, |
| "loss": 1.1105, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.7537399309551208, |
| "grad_norm": 0.07517902293608703, |
| "learning_rate": 3.473501559676088e-05, |
| "loss": 1.1183, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.759493670886076, |
| "grad_norm": 0.07512890867407872, |
| "learning_rate": 3.3226380048912585e-05, |
| "loss": 1.095, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.7652474108170311, |
| "grad_norm": 0.0747389401617661, |
| "learning_rate": 3.174468567813461e-05, |
| "loss": 1.0924, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.7710011507479861, |
| "grad_norm": 0.07269553748906823, |
| "learning_rate": 3.029053030420115e-05, |
| "loss": 1.1079, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.7767548906789413, |
| "grad_norm": 0.0752874034050374, |
| "learning_rate": 2.886450063571735e-05, |
| "loss": 1.102, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.7825086306098964, |
| "grad_norm": 0.0735724332478834, |
| "learning_rate": 2.7467172033399458e-05, |
| "loss": 1.1114, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.7882623705408516, |
| "grad_norm": 0.08104817717496945, |
| "learning_rate": 2.6099108277934103e-05, |
| "loss": 1.1077, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.7940161104718066, |
| "grad_norm": 0.07305813837737488, |
| "learning_rate": 2.4760861342509233e-05, |
| "loss": 1.0906, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.7997698504027618, |
| "grad_norm": 0.07242520365927102, |
| "learning_rate": 2.345297117010954e-05, |
| "loss": 1.0893, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.8055235903337169, |
| "grad_norm": 0.07698458847076538, |
| "learning_rate": 2.2175965455665226e-05, |
| "loss": 1.0898, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.8112773302646721, |
| "grad_norm": 0.076651036057528, |
| "learning_rate": 2.0930359433142932e-05, |
| "loss": 1.0877, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.8170310701956272, |
| "grad_norm": 0.0721820626725428, |
| "learning_rate": 1.9716655667664008e-05, |
| "loss": 1.0794, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.8227848101265823, |
| "grad_norm": 0.07461008979903963, |
| "learning_rate": 1.8535343852734332e-05, |
| "loss": 1.1101, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.8285385500575374, |
| "grad_norm": 0.07528450837009106, |
| "learning_rate": 1.7386900612667633e-05, |
| "loss": 1.0949, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.8342922899884925, |
| "grad_norm": 0.07330788015551219, |
| "learning_rate": 1.6271789310281517e-05, |
| "loss": 1.0942, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.8400460299194477, |
| "grad_norm": 0.07387736383215655, |
| "learning_rate": 1.5190459859944505e-05, |
| "loss": 1.0785, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.8457997698504027, |
| "grad_norm": 0.07362643024392242, |
| "learning_rate": 1.4143348546048707e-05, |
| "loss": 1.0834, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.8515535097813579, |
| "grad_norm": 0.07308957142384619, |
| "learning_rate": 1.3130877846982204e-05, |
| "loss": 1.0833, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.857307249712313, |
| "grad_norm": 0.07884770650029872, |
| "learning_rate": 1.2153456264671337e-05, |
| "loss": 1.1047, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.8630609896432682, |
| "grad_norm": 0.07410132122132759, |
| "learning_rate": 1.1211478159762478e-05, |
| "loss": 1.1042, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.8688147295742232, |
| "grad_norm": 0.07315305451840597, |
| "learning_rate": 1.0305323592509009e-05, |
| "loss": 1.1127, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.8745684695051784, |
| "grad_norm": 0.07241715378225073, |
| "learning_rate": 9.435358169428442e-06, |
| "loss": 1.0552, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.8803222094361335, |
| "grad_norm": 0.07310674530244925, |
| "learning_rate": 8.601932895790877e-06, |
| "loss": 1.1175, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.8860759493670886, |
| "grad_norm": 0.08154925564133862, |
| "learning_rate": 7.805384033998875e-06, |
| "loss": 1.093, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.8918296892980437, |
| "grad_norm": 0.07285132059208473, |
| "learning_rate": 7.046032967915483e-06, |
| "loss": 1.0869, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.8975834292289988, |
| "grad_norm": 0.07660142337736248, |
| "learning_rate": 6.32418607319546e-06, |
| "loss": 1.0914, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.903337169159954, |
| "grad_norm": 0.07846326221484771, |
| "learning_rate": 5.640134593671598e-06, |
| "loss": 1.0879, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.9090909090909091, |
| "grad_norm": 0.0744126532665454, |
| "learning_rate": 4.994154523846695e-06, |
| "loss": 1.0948, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.9148446490218642, |
| "grad_norm": 0.07568510267678755, |
| "learning_rate": 4.386506497537757e-06, |
| "loss": 1.1054, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.9205983889528193, |
| "grad_norm": 0.07378727253077548, |
| "learning_rate": 3.817435682718096e-06, |
| "loss": 1.0787, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.9263521288837745, |
| "grad_norm": 0.0747897379397166, |
| "learning_rate": 3.287171682599255e-06, |
| "loss": 1.0911, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.9321058688147296, |
| "grad_norm": 0.07818706538185882, |
| "learning_rate": 2.7959284429929456e-06, |
| "loss": 1.0926, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.9378596087456847, |
| "grad_norm": 0.07201156955643878, |
| "learning_rate": 2.3439041659902407e-06, |
| "loss": 1.106, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.9436133486766398, |
| "grad_norm": 0.07293591627119424, |
| "learning_rate": 1.9312812299929094e-06, |
| "loss": 1.1041, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.9493670886075949, |
| "grad_norm": 0.07620976319405062, |
| "learning_rate": 1.5582261161291245e-06, |
| "loss": 1.0835, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.9551208285385501, |
| "grad_norm": 0.07403997053394712, |
| "learning_rate": 1.2248893410832685e-06, |
| "loss": 1.1048, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.9608745684695051, |
| "grad_norm": 0.07432396016348998, |
| "learning_rate": 9.314053963669245e-07, |
| "loss": 1.0915, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.9666283084004603, |
| "grad_norm": 0.07180128894783522, |
| "learning_rate": 6.778926940555152e-07, |
| "loss": 1.0981, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.9723820483314154, |
| "grad_norm": 0.07204281549304838, |
| "learning_rate": 4.644535190125421e-07, |
| "loss": 1.0962, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.9781357882623706, |
| "grad_norm": 0.07295868011398168, |
| "learning_rate": 2.9117398762069647e-07, |
| "loss": 1.1083, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.9838895281933256, |
| "grad_norm": 0.07600348765786609, |
| "learning_rate": 1.5812401303639813e-07, |
| "loss": 1.1138, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.9896432681242808, |
| "grad_norm": 0.07208207285243298, |
| "learning_rate": 6.535727698199213e-08, |
| "loss": 1.0787, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.9953970080552359, |
| "grad_norm": 0.07662851683615209, |
| "learning_rate": 1.2911208086663351e-08, |
| "loss": 1.087, |
| "step": 865 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 1.038455605506897, |
| "eval_runtime": 2.7622, |
| "eval_samples_per_second": 2.534, |
| "eval_steps_per_second": 0.724, |
| "step": 869 |
| }, |
| { |
| "epoch": 1.0, |
| "step": 869, |
| "total_flos": 1.0927921245978624e+16, |
| "train_loss": 1.118139435424629, |
| "train_runtime": 17754.8873, |
| "train_samples_per_second": 3.132, |
| "train_steps_per_second": 0.049 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 869, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.0927921245978624e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|