| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 2148, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.006985679357317499, |
| "grad_norm": 9.417793273925781, |
| "learning_rate": 6.153846153846155e-07, |
| "loss": 0.4856, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.013971358714634998, |
| "grad_norm": 7.6536865234375, |
| "learning_rate": 1.3846153846153848e-06, |
| "loss": 0.4457, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.020957038071952497, |
| "grad_norm": 6.9425950050354, |
| "learning_rate": 2.153846153846154e-06, |
| "loss": 0.4359, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.027942717429269997, |
| "grad_norm": 7.008752822875977, |
| "learning_rate": 2.9230769230769236e-06, |
| "loss": 0.4247, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0349283967865875, |
| "grad_norm": 7.943561553955078, |
| "learning_rate": 3.692307692307693e-06, |
| "loss": 0.4144, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.041914076143904994, |
| "grad_norm": 8.960087776184082, |
| "learning_rate": 4.461538461538462e-06, |
| "loss": 0.4095, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0488997555012225, |
| "grad_norm": 9.126522064208984, |
| "learning_rate": 5.230769230769232e-06, |
| "loss": 0.4195, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.055885434858539994, |
| "grad_norm": 6.81967306137085, |
| "learning_rate": 6e-06, |
| "loss": 0.409, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.06287111421585749, |
| "grad_norm": 9.193132400512695, |
| "learning_rate": 6.76923076923077e-06, |
| "loss": 0.4256, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.069856793573175, |
| "grad_norm": 7.224976062774658, |
| "learning_rate": 7.538461538461539e-06, |
| "loss": 0.41, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.07684247293049248, |
| "grad_norm": 7.260319232940674, |
| "learning_rate": 8.307692307692309e-06, |
| "loss": 0.4534, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.08382815228780999, |
| "grad_norm": 8.081445693969727, |
| "learning_rate": 9.076923076923078e-06, |
| "loss": 0.4435, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.09081383164512749, |
| "grad_norm": 8.150150299072266, |
| "learning_rate": 9.846153846153848e-06, |
| "loss": 0.4457, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.097799511002445, |
| "grad_norm": 6.927804946899414, |
| "learning_rate": 9.999909012888162e-06, |
| "loss": 0.4559, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.10478519035976248, |
| "grad_norm": 7.140408039093018, |
| "learning_rate": 9.999539383421715e-06, |
| "loss": 0.4217, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.11177086971707999, |
| "grad_norm": 7.418540000915527, |
| "learning_rate": 9.998885445909666e-06, |
| "loss": 0.4857, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.11875654907439749, |
| "grad_norm": 8.253525733947754, |
| "learning_rate": 9.997947237539373e-06, |
| "loss": 0.4863, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.12574222843171498, |
| "grad_norm": 6.550653457641602, |
| "learning_rate": 9.996724811663797e-06, |
| "loss": 0.4663, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.13272790778903248, |
| "grad_norm": 6.81130313873291, |
| "learning_rate": 9.99521823779844e-06, |
| "loss": 0.4684, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.13971358714635, |
| "grad_norm": 7.913576602935791, |
| "learning_rate": 9.99342760161741e-06, |
| "loss": 0.5049, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1466992665036675, |
| "grad_norm": 8.523401260375977, |
| "learning_rate": 9.991353004948538e-06, |
| "loss": 0.4635, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.15368494586098497, |
| "grad_norm": 6.084781169891357, |
| "learning_rate": 9.988994565767604e-06, |
| "loss": 0.4752, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.16067062521830247, |
| "grad_norm": 5.975230693817139, |
| "learning_rate": 9.986352418191608e-06, |
| "loss": 0.4722, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.16765630457561997, |
| "grad_norm": 6.941432476043701, |
| "learning_rate": 9.983426712471151e-06, |
| "loss": 0.4991, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.17464198393293748, |
| "grad_norm": 6.190479755401611, |
| "learning_rate": 9.9802176149819e-06, |
| "loss": 0.4821, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.18162766329025498, |
| "grad_norm": 6.3899006843566895, |
| "learning_rate": 9.976725308215109e-06, |
| "loss": 0.5163, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.18861334264757249, |
| "grad_norm": 5.7873430252075195, |
| "learning_rate": 9.97294999076726e-06, |
| "loss": 0.4517, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.19559902200489, |
| "grad_norm": 6.523060321807861, |
| "learning_rate": 9.968891877328752e-06, |
| "loss": 0.4632, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.20258470136220746, |
| "grad_norm": 7.010469436645508, |
| "learning_rate": 9.964551198671709e-06, |
| "loss": 0.4643, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.20957038071952497, |
| "grad_norm": 6.9780473709106445, |
| "learning_rate": 9.95992820163684e-06, |
| "loss": 0.4737, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.21655606007684247, |
| "grad_norm": 6.335629463195801, |
| "learning_rate": 9.955023149119413e-06, |
| "loss": 0.4552, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.22354173943415998, |
| "grad_norm": 5.958488464355469, |
| "learning_rate": 9.949836320054309e-06, |
| "loss": 0.4553, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.23052741879147748, |
| "grad_norm": 6.195608139038086, |
| "learning_rate": 9.944368009400145e-06, |
| "loss": 0.5117, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.23751309814879498, |
| "grad_norm": 6.441509246826172, |
| "learning_rate": 9.93861852812251e-06, |
| "loss": 0.4652, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.24449877750611246, |
| "grad_norm": 5.994947910308838, |
| "learning_rate": 9.932588203176281e-06, |
| "loss": 0.4832, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.25148445686342996, |
| "grad_norm": 6.672873020172119, |
| "learning_rate": 9.92627737748703e-06, |
| "loss": 0.4827, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.2584701362207475, |
| "grad_norm": 6.571279048919678, |
| "learning_rate": 9.919686409931523e-06, |
| "loss": 0.5003, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.26545581557806497, |
| "grad_norm": 6.049185276031494, |
| "learning_rate": 9.912815675317307e-06, |
| "loss": 0.4905, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.27244149493538244, |
| "grad_norm": 6.473181247711182, |
| "learning_rate": 9.905665564361403e-06, |
| "loss": 0.4843, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.2794271742927, |
| "grad_norm": 7.581283092498779, |
| "learning_rate": 9.898236483668086e-06, |
| "loss": 0.5097, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.28641285365001745, |
| "grad_norm": 7.333038806915283, |
| "learning_rate": 9.890528855705753e-06, |
| "loss": 0.4698, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.293398533007335, |
| "grad_norm": 6.216223239898682, |
| "learning_rate": 9.882543118782913e-06, |
| "loss": 0.5034, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.30038421236465246, |
| "grad_norm": 6.861793041229248, |
| "learning_rate": 9.874279727023253e-06, |
| "loss": 0.4756, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.30736989172196993, |
| "grad_norm": 6.37476110458374, |
| "learning_rate": 9.865739150339815e-06, |
| "loss": 0.5117, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.31435557107928747, |
| "grad_norm": 5.816511154174805, |
| "learning_rate": 9.856921874408272e-06, |
| "loss": 0.4925, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.32134125043660494, |
| "grad_norm": 6.3242692947387695, |
| "learning_rate": 9.847828400639317e-06, |
| "loss": 0.4679, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.3283269297939225, |
| "grad_norm": 6.17088508605957, |
| "learning_rate": 9.838459246150138e-06, |
| "loss": 0.4646, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.33531260915123995, |
| "grad_norm": 7.1427812576293945, |
| "learning_rate": 9.828814943735019e-06, |
| "loss": 0.4633, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.3422982885085575, |
| "grad_norm": 6.307351112365723, |
| "learning_rate": 9.818896041835036e-06, |
| "loss": 0.4586, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.34928396786587496, |
| "grad_norm": 6.033569812774658, |
| "learning_rate": 9.808703104506874e-06, |
| "loss": 0.5144, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.35626964722319243, |
| "grad_norm": 6.253153324127197, |
| "learning_rate": 9.798236711390754e-06, |
| "loss": 0.4936, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.36325532658050996, |
| "grad_norm": 6.369323253631592, |
| "learning_rate": 9.787497457677456e-06, |
| "loss": 0.5142, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.37024100593782744, |
| "grad_norm": 7.059694766998291, |
| "learning_rate": 9.776485954074492e-06, |
| "loss": 0.508, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.37722668529514497, |
| "grad_norm": 5.437386989593506, |
| "learning_rate": 9.765202826771358e-06, |
| "loss": 0.4517, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.38421236465246245, |
| "grad_norm": 6.353536605834961, |
| "learning_rate": 9.753648717403944e-06, |
| "loss": 0.5177, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.39119804400978, |
| "grad_norm": 6.7363481521606445, |
| "learning_rate": 9.741824283018022e-06, |
| "loss": 0.4768, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.39818372336709745, |
| "grad_norm": 6.804571151733398, |
| "learning_rate": 9.729730196031908e-06, |
| "loss": 0.5009, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.40516940272441493, |
| "grad_norm": 6.188082695007324, |
| "learning_rate": 9.717367144198209e-06, |
| "loss": 0.4975, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.41215508208173246, |
| "grad_norm": 6.352089881896973, |
| "learning_rate": 9.704735830564707e-06, |
| "loss": 0.458, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.41914076143904994, |
| "grad_norm": 5.823917388916016, |
| "learning_rate": 9.6918369734344e-06, |
| "loss": 0.4863, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.42612644079636747, |
| "grad_norm": 6.301001071929932, |
| "learning_rate": 9.678671306324627e-06, |
| "loss": 0.5175, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.43311212015368494, |
| "grad_norm": 6.185693740844727, |
| "learning_rate": 9.665239577925384e-06, |
| "loss": 0.4844, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.4400977995110024, |
| "grad_norm": 6.580015182495117, |
| "learning_rate": 9.65154255205672e-06, |
| "loss": 0.4907, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.44708347886831995, |
| "grad_norm": 5.962856769561768, |
| "learning_rate": 9.637581007625328e-06, |
| "loss": 0.4552, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.4540691582256374, |
| "grad_norm": 6.2723774909973145, |
| "learning_rate": 9.623355738580226e-06, |
| "loss": 0.4994, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.46105483758295496, |
| "grad_norm": 6.140941143035889, |
| "learning_rate": 9.608867553867628e-06, |
| "loss": 0.4829, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.46804051694027243, |
| "grad_norm": 6.278982639312744, |
| "learning_rate": 9.594117277384928e-06, |
| "loss": 0.5118, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.47502619629758996, |
| "grad_norm": 6.221822261810303, |
| "learning_rate": 9.579105747933858e-06, |
| "loss": 0.4903, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.48201187565490744, |
| "grad_norm": 6.153563976287842, |
| "learning_rate": 9.563833819172777e-06, |
| "loss": 0.4729, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.4889975550122249, |
| "grad_norm": 5.723773956298828, |
| "learning_rate": 9.548302359568136e-06, |
| "loss": 0.536, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.49598323436954245, |
| "grad_norm": 5.7412238121032715, |
| "learning_rate": 9.532512252345086e-06, |
| "loss": 0.4818, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.5029689137268599, |
| "grad_norm": 5.776915073394775, |
| "learning_rate": 9.516464395437251e-06, |
| "loss": 0.4818, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.5099545930841775, |
| "grad_norm": 6.4714741706848145, |
| "learning_rate": 9.50015970143567e-06, |
| "loss": 0.486, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.516940272441495, |
| "grad_norm": 5.570414066314697, |
| "learning_rate": 9.483599097536895e-06, |
| "loss": 0.4821, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.5239259517988124, |
| "grad_norm": 5.86240816116333, |
| "learning_rate": 9.466783525490271e-06, |
| "loss": 0.5014, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.5309116311561299, |
| "grad_norm": 5.690156936645508, |
| "learning_rate": 9.449713941544375e-06, |
| "loss": 0.505, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.5378973105134475, |
| "grad_norm": 5.273125648498535, |
| "learning_rate": 9.432391316392639e-06, |
| "loss": 0.4738, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.5448829898707649, |
| "grad_norm": 6.2202887535095215, |
| "learning_rate": 9.414816635118154e-06, |
| "loss": 0.5023, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.5518686692280824, |
| "grad_norm": 6.270415782928467, |
| "learning_rate": 9.396990897137647e-06, |
| "loss": 0.4772, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.5588543485854, |
| "grad_norm": 5.550614356994629, |
| "learning_rate": 9.378915116144646e-06, |
| "loss": 0.4556, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5658400279427175, |
| "grad_norm": 5.853529930114746, |
| "learning_rate": 9.360590320051844e-06, |
| "loss": 0.4942, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.5728257073000349, |
| "grad_norm": 5.692981243133545, |
| "learning_rate": 9.342017550932627e-06, |
| "loss": 0.4772, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.5798113866573524, |
| "grad_norm": 5.6134185791015625, |
| "learning_rate": 9.323197864961835e-06, |
| "loss": 0.4245, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.58679706601467, |
| "grad_norm": 6.195576190948486, |
| "learning_rate": 9.304132332355685e-06, |
| "loss": 0.5083, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.5937827453719874, |
| "grad_norm": 6.21861457824707, |
| "learning_rate": 9.28482203731092e-06, |
| "loss": 0.5366, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.6007684247293049, |
| "grad_norm": 5.109756946563721, |
| "learning_rate": 9.265268077943152e-06, |
| "loss": 0.4774, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.6077541040866224, |
| "grad_norm": 6.137323379516602, |
| "learning_rate": 9.245471566224416e-06, |
| "loss": 0.5125, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.6147397834439399, |
| "grad_norm": 5.771416187286377, |
| "learning_rate": 9.225433627919927e-06, |
| "loss": 0.5453, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.6217254628012574, |
| "grad_norm": 5.52296781539917, |
| "learning_rate": 9.20515540252408e-06, |
| "loss": 0.4663, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.6287111421585749, |
| "grad_norm": 6.380407333374023, |
| "learning_rate": 9.184638043195628e-06, |
| "loss": 0.4752, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.6356968215158925, |
| "grad_norm": 5.940698623657227, |
| "learning_rate": 9.16388271669213e-06, |
| "loss": 0.4309, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.6426825008732099, |
| "grad_norm": 6.604904651641846, |
| "learning_rate": 9.142890603303573e-06, |
| "loss": 0.5043, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.6496681802305274, |
| "grad_norm": 6.345200061798096, |
| "learning_rate": 9.121662896785285e-06, |
| "loss": 0.4991, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.656653859587845, |
| "grad_norm": 5.294460773468018, |
| "learning_rate": 9.100200804290014e-06, |
| "loss": 0.4788, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.6636395389451624, |
| "grad_norm": 5.905223846435547, |
| "learning_rate": 9.078505546299317e-06, |
| "loss": 0.4871, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.6706252183024799, |
| "grad_norm": 4.967826843261719, |
| "learning_rate": 9.056578356554124e-06, |
| "loss": 0.5174, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.6776108976597974, |
| "grad_norm": 5.444486618041992, |
| "learning_rate": 9.034420481984604e-06, |
| "loss": 0.4889, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.684596577017115, |
| "grad_norm": 5.278167724609375, |
| "learning_rate": 9.012033182639238e-06, |
| "loss": 0.4793, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.6915822563744324, |
| "grad_norm": 6.902711391448975, |
| "learning_rate": 8.989417731613169e-06, |
| "loss": 0.5259, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.6985679357317499, |
| "grad_norm": 6.086877346038818, |
| "learning_rate": 8.966575414975813e-06, |
| "loss": 0.5203, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7055536150890674, |
| "grad_norm": 6.7947564125061035, |
| "learning_rate": 8.943507531697715e-06, |
| "loss": 0.4609, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.7125392944463849, |
| "grad_norm": 6.169280052185059, |
| "learning_rate": 8.920215393576685e-06, |
| "loss": 0.4799, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.7195249738037024, |
| "grad_norm": 6.514042377471924, |
| "learning_rate": 8.8967003251632e-06, |
| "loss": 0.5, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.7265106531610199, |
| "grad_norm": 7.015414714813232, |
| "learning_rate": 8.872963663685075e-06, |
| "loss": 0.5226, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.7334963325183375, |
| "grad_norm": 4.968021869659424, |
| "learning_rate": 8.849006758971429e-06, |
| "loss": 0.4697, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.7404820118756549, |
| "grad_norm": 6.74957799911499, |
| "learning_rate": 8.824830973375917e-06, |
| "loss": 0.5045, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.7474676912329724, |
| "grad_norm": 5.4804816246032715, |
| "learning_rate": 8.800437681699264e-06, |
| "loss": 0.465, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.7544533705902899, |
| "grad_norm": 5.658529281616211, |
| "learning_rate": 8.775828271111074e-06, |
| "loss": 0.474, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.7614390499476074, |
| "grad_norm": 5.418574333190918, |
| "learning_rate": 8.751004141070958e-06, |
| "loss": 0.4904, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.7684247293049249, |
| "grad_norm": 5.922712802886963, |
| "learning_rate": 8.72596670324894e-06, |
| "loss": 0.4721, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.7754104086622424, |
| "grad_norm": 6.200154781341553, |
| "learning_rate": 8.70071738144519e-06, |
| "loss": 0.4902, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.78239608801956, |
| "grad_norm": 5.705479145050049, |
| "learning_rate": 8.67525761150905e-06, |
| "loss": 0.4726, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.7893817673768774, |
| "grad_norm": 6.879328727722168, |
| "learning_rate": 8.649588841257383e-06, |
| "loss": 0.4757, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.7963674467341949, |
| "grad_norm": 7.012612819671631, |
| "learning_rate": 8.623712530392244e-06, |
| "loss": 0.4522, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.8033531260915124, |
| "grad_norm": 6.293166160583496, |
| "learning_rate": 8.597630150417867e-06, |
| "loss": 0.4651, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.8103388054488299, |
| "grad_norm": 5.768970012664795, |
| "learning_rate": 8.571343184556985e-06, |
| "loss": 0.4277, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.8173244848061474, |
| "grad_norm": 6.452169895172119, |
| "learning_rate": 8.544853127666483e-06, |
| "loss": 0.5337, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.8243101641634649, |
| "grad_norm": 5.4347615242004395, |
| "learning_rate": 8.518161486152406e-06, |
| "loss": 0.468, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.8312958435207825, |
| "grad_norm": 5.982383728027344, |
| "learning_rate": 8.491269777884264e-06, |
| "loss": 0.4511, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.8382815228780999, |
| "grad_norm": 5.2124457359313965, |
| "learning_rate": 8.464179532108742e-06, |
| "loss": 0.4667, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.8452672022354174, |
| "grad_norm": 5.130034923553467, |
| "learning_rate": 8.436892289362728e-06, |
| "loss": 0.4551, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.8522528815927349, |
| "grad_norm": 5.86264705657959, |
| "learning_rate": 8.409409601385702e-06, |
| "loss": 0.5026, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.8592385609500524, |
| "grad_norm": 6.521675109863281, |
| "learning_rate": 8.381733031031503e-06, |
| "loss": 0.5017, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.8662242403073699, |
| "grad_norm": 5.794342994689941, |
| "learning_rate": 8.353864152179445e-06, |
| "loss": 0.4638, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.8732099196646874, |
| "grad_norm": 5.721254348754883, |
| "learning_rate": 8.325804549644823e-06, |
| "loss": 0.4652, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.8801955990220048, |
| "grad_norm": 5.914518356323242, |
| "learning_rate": 8.297555819088787e-06, |
| "loss": 0.4613, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.8871812783793224, |
| "grad_norm": 5.939746856689453, |
| "learning_rate": 8.269119566927599e-06, |
| "loss": 0.4599, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.8941669577366399, |
| "grad_norm": 5.655862808227539, |
| "learning_rate": 8.240497410241286e-06, |
| "loss": 0.4602, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.9011526370939574, |
| "grad_norm": 6.510814666748047, |
| "learning_rate": 8.211690976681678e-06, |
| "loss": 0.4582, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.9081383164512749, |
| "grad_norm": 5.543778419494629, |
| "learning_rate": 8.18270190437985e-06, |
| "loss": 0.4777, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.9151239958085924, |
| "grad_norm": 5.059428691864014, |
| "learning_rate": 8.153531841852969e-06, |
| "loss": 0.4756, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.9221096751659099, |
| "grad_norm": 5.040985584259033, |
| "learning_rate": 8.12418244791054e-06, |
| "loss": 0.4742, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.9290953545232273, |
| "grad_norm": 5.490272521972656, |
| "learning_rate": 8.094655391560086e-06, |
| "loss": 0.4614, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.9360810338805449, |
| "grad_norm": 5.865604877471924, |
| "learning_rate": 8.064952351912227e-06, |
| "loss": 0.4671, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.9430667132378624, |
| "grad_norm": 5.491858005523682, |
| "learning_rate": 8.035075018085204e-06, |
| "loss": 0.4624, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.9500523925951799, |
| "grad_norm": 5.839197158813477, |
| "learning_rate": 8.005025089108812e-06, |
| "loss": 0.4811, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.9570380719524973, |
| "grad_norm": 6.362710952758789, |
| "learning_rate": 7.974804273827791e-06, |
| "loss": 0.4829, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.9640237513098149, |
| "grad_norm": 5.933167457580566, |
| "learning_rate": 7.944414290804647e-06, |
| "loss": 0.4895, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.9710094306671324, |
| "grad_norm": 6.338855743408203, |
| "learning_rate": 7.913856868221922e-06, |
| "loss": 0.4729, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.9779951100244498, |
| "grad_norm": 5.022581577301025, |
| "learning_rate": 7.883133743783918e-06, |
| "loss": 0.4772, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.9849807893817674, |
| "grad_norm": 5.898752212524414, |
| "learning_rate": 7.852246664617878e-06, |
| "loss": 0.457, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.9919664687390849, |
| "grad_norm": 4.9067864418029785, |
| "learning_rate": 7.821197387174638e-06, |
| "loss": 0.4577, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.9989521480964024, |
| "grad_norm": 4.425784111022949, |
| "learning_rate": 7.78998767712873e-06, |
| "loss": 0.4683, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.005588543485854, |
| "grad_norm": 3.8555009365081787, |
| "learning_rate": 7.758619309277988e-06, |
| "loss": 0.2635, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.0125742228431716, |
| "grad_norm": 3.948883056640625, |
| "learning_rate": 7.72709406744262e-06, |
| "loss": 0.2383, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.019559902200489, |
| "grad_norm": 6.131858825683594, |
| "learning_rate": 7.695413744363753e-06, |
| "loss": 0.2488, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.0265455815578064, |
| "grad_norm": 6.018847942352295, |
| "learning_rate": 7.663580141601504e-06, |
| "loss": 0.2176, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.033531260915124, |
| "grad_norm": 6.508975982666016, |
| "learning_rate": 7.631595069432515e-06, |
| "loss": 0.2193, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.0405169402724415, |
| "grad_norm": 7.317805767059326, |
| "learning_rate": 7.599460346747024e-06, |
| "loss": 0.2054, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.047502619629759, |
| "grad_norm": 5.333045482635498, |
| "learning_rate": 7.567177800945413e-06, |
| "loss": 0.2133, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.0544882989870765, |
| "grad_norm": 4.751222610473633, |
| "learning_rate": 7.534749267834309e-06, |
| "loss": 0.2313, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.061473978344394, |
| "grad_norm": 4.327038288116455, |
| "learning_rate": 7.5021765915221656e-06, |
| "loss": 0.1994, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.0684596577017116, |
| "grad_norm": 5.050411224365234, |
| "learning_rate": 7.469461624314416e-06, |
| "loss": 0.2121, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.075445337059029, |
| "grad_norm": 5.225903511047363, |
| "learning_rate": 7.436606226608122e-06, |
| "loss": 0.2147, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.0824310164163464, |
| "grad_norm": 5.33516788482666, |
| "learning_rate": 7.403612266786188e-06, |
| "loss": 0.21, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.089416695773664, |
| "grad_norm": 5.235130786895752, |
| "learning_rate": 7.370481621111106e-06, |
| "loss": 0.2034, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.0964023751309815, |
| "grad_norm": 5.345615386962891, |
| "learning_rate": 7.337216173618269e-06, |
| "loss": 0.1926, |
| "step": 785 |
| }, |
| { |
| "epoch": 1.103388054488299, |
| "grad_norm": 5.1708221435546875, |
| "learning_rate": 7.303817816008814e-06, |
| "loss": 0.2309, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.1103737338456166, |
| "grad_norm": 5.851431846618652, |
| "learning_rate": 7.2702884475420645e-06, |
| "loss": 0.2179, |
| "step": 795 |
| }, |
| { |
| "epoch": 1.117359413202934, |
| "grad_norm": 5.048578262329102, |
| "learning_rate": 7.236629974927517e-06, |
| "loss": 0.2269, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.1243450925602514, |
| "grad_norm": 5.8538641929626465, |
| "learning_rate": 7.202844312216415e-06, |
| "loss": 0.2353, |
| "step": 805 |
| }, |
| { |
| "epoch": 1.131330771917569, |
| "grad_norm": 4.558470726013184, |
| "learning_rate": 7.168933380692899e-06, |
| "loss": 0.1886, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.1383164512748865, |
| "grad_norm": 5.602941513061523, |
| "learning_rate": 7.134899108764754e-06, |
| "loss": 0.2056, |
| "step": 815 |
| }, |
| { |
| "epoch": 1.145302130632204, |
| "grad_norm": 5.849012851715088, |
| "learning_rate": 7.1007434318537424e-06, |
| "loss": 0.2279, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.1522878099895215, |
| "grad_norm": 4.99567985534668, |
| "learning_rate": 7.06646829228555e-06, |
| "loss": 0.2233, |
| "step": 825 |
| }, |
| { |
| "epoch": 1.159273489346839, |
| "grad_norm": 4.784215450286865, |
| "learning_rate": 7.0320756391793256e-06, |
| "loss": 0.209, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.1662591687041566, |
| "grad_norm": 5.286474227905273, |
| "learning_rate": 6.9975674283368385e-06, |
| "loss": 0.2066, |
| "step": 835 |
| }, |
| { |
| "epoch": 1.1732448480614739, |
| "grad_norm": 4.656322956085205, |
| "learning_rate": 6.962945622131269e-06, |
| "loss": 0.2251, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.1802305274187914, |
| "grad_norm": 5.176768779754639, |
| "learning_rate": 6.9282121893956026e-06, |
| "loss": 0.2373, |
| "step": 845 |
| }, |
| { |
| "epoch": 1.187216206776109, |
| "grad_norm": 5.336601257324219, |
| "learning_rate": 6.893369105310673e-06, |
| "loss": 0.2128, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.1942018861334265, |
| "grad_norm": 5.231692314147949, |
| "learning_rate": 6.858418351292845e-06, |
| "loss": 0.2069, |
| "step": 855 |
| }, |
| { |
| "epoch": 1.201187565490744, |
| "grad_norm": 5.922867298126221, |
| "learning_rate": 6.823361914881331e-06, |
| "loss": 0.2006, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.2081732448480615, |
| "grad_norm": 5.954738140106201, |
| "learning_rate": 6.788201789625166e-06, |
| "loss": 0.2189, |
| "step": 865 |
| }, |
| { |
| "epoch": 1.215158924205379, |
| "grad_norm": 5.425834655761719, |
| "learning_rate": 6.7529399749698465e-06, |
| "loss": 0.1976, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.2221446035626964, |
| "grad_norm": 4.986149311065674, |
| "learning_rate": 6.717578476143621e-06, |
| "loss": 0.1959, |
| "step": 875 |
| }, |
| { |
| "epoch": 1.229130282920014, |
| "grad_norm": 6.26975679397583, |
| "learning_rate": 6.682119304043464e-06, |
| "loss": 0.2197, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.2361159622773314, |
| "grad_norm": 5.916511058807373, |
| "learning_rate": 6.6465644751207225e-06, |
| "loss": 0.2146, |
| "step": 885 |
| }, |
| { |
| "epoch": 1.243101641634649, |
| "grad_norm": 4.7209601402282715, |
| "learning_rate": 6.610916011266445e-06, |
| "loss": 0.2143, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.2500873209919665, |
| "grad_norm": 5.150550842285156, |
| "learning_rate": 6.575175939696401e-06, |
| "loss": 0.206, |
| "step": 895 |
| }, |
| { |
| "epoch": 1.257073000349284, |
| "grad_norm": 4.861652374267578, |
| "learning_rate": 6.539346292835804e-06, |
| "loss": 0.2198, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.2640586797066016, |
| "grad_norm": 4.239111423492432, |
| "learning_rate": 6.503429108203734e-06, |
| "loss": 0.204, |
| "step": 905 |
| }, |
| { |
| "epoch": 1.2710443590639189, |
| "grad_norm": 5.413881301879883, |
| "learning_rate": 6.467426428297262e-06, |
| "loss": 0.2263, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.2780300384212364, |
| "grad_norm": 5.223084449768066, |
| "learning_rate": 6.431340300475314e-06, |
| "loss": 0.201, |
| "step": 915 |
| }, |
| { |
| "epoch": 1.285015717778554, |
| "grad_norm": 4.555695056915283, |
| "learning_rate": 6.395172776842229e-06, |
| "loss": 0.2117, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.2920013971358715, |
| "grad_norm": 5.600462913513184, |
| "learning_rate": 6.358925914131071e-06, |
| "loss": 0.1974, |
| "step": 925 |
| }, |
| { |
| "epoch": 1.298987076493189, |
| "grad_norm": 7.300204753875732, |
| "learning_rate": 6.322601773586669e-06, |
| "loss": 0.2158, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.3059727558505065, |
| "grad_norm": 4.54530143737793, |
| "learning_rate": 6.2862024208484e-06, |
| "loss": 0.2356, |
| "step": 935 |
| }, |
| { |
| "epoch": 1.312958435207824, |
| "grad_norm": 6.054986000061035, |
| "learning_rate": 6.249729925832716e-06, |
| "loss": 0.2008, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.3199441145651414, |
| "grad_norm": 5.145763874053955, |
| "learning_rate": 6.213186362615444e-06, |
| "loss": 0.2086, |
| "step": 945 |
| }, |
| { |
| "epoch": 1.326929793922459, |
| "grad_norm": 4.843288421630859, |
| "learning_rate": 6.176573809313836e-06, |
| "loss": 0.1962, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.3339154732797764, |
| "grad_norm": 5.428574085235596, |
| "learning_rate": 6.139894347968389e-06, |
| "loss": 0.2361, |
| "step": 955 |
| }, |
| { |
| "epoch": 1.340901152637094, |
| "grad_norm": 5.883815765380859, |
| "learning_rate": 6.103150064424454e-06, |
| "loss": 0.1931, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.3478868319944115, |
| "grad_norm": 4.711780071258545, |
| "learning_rate": 6.066343048213611e-06, |
| "loss": 0.2122, |
| "step": 965 |
| }, |
| { |
| "epoch": 1.354872511351729, |
| "grad_norm": 5.004215240478516, |
| "learning_rate": 6.029475392434855e-06, |
| "loss": 0.2247, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.3618581907090466, |
| "grad_norm": 5.238073825836182, |
| "learning_rate": 5.9925491936355575e-06, |
| "loss": 0.2228, |
| "step": 975 |
| }, |
| { |
| "epoch": 1.3688438700663639, |
| "grad_norm": 5.151479721069336, |
| "learning_rate": 5.955566551692247e-06, |
| "loss": 0.2103, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.3758295494236814, |
| "grad_norm": 5.709561824798584, |
| "learning_rate": 5.918529569691202e-06, |
| "loss": 0.2069, |
| "step": 985 |
| }, |
| { |
| "epoch": 1.382815228780999, |
| "grad_norm": 5.741881370544434, |
| "learning_rate": 5.88144035380884e-06, |
| "loss": 0.2225, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.3898009081383165, |
| "grad_norm": 5.143665790557861, |
| "learning_rate": 5.8443010131919574e-06, |
| "loss": 0.2411, |
| "step": 995 |
| }, |
| { |
| "epoch": 1.396786587495634, |
| "grad_norm": 6.438896179199219, |
| "learning_rate": 5.807113659837792e-06, |
| "loss": 0.2324, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.4037722668529513, |
| "grad_norm": 4.634160041809082, |
| "learning_rate": 5.769880408473907e-06, |
| "loss": 0.2005, |
| "step": 1005 |
| }, |
| { |
| "epoch": 1.410757946210269, |
| "grad_norm": 4.902849197387695, |
| "learning_rate": 5.732603376437944e-06, |
| "loss": 0.2174, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.4177436255675864, |
| "grad_norm": 5.148767948150635, |
| "learning_rate": 5.695284683557216e-06, |
| "loss": 0.237, |
| "step": 1015 |
| }, |
| { |
| "epoch": 1.424729304924904, |
| "grad_norm": 5.740781784057617, |
| "learning_rate": 5.6579264520281515e-06, |
| "loss": 0.2197, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.4317149842822214, |
| "grad_norm": 5.8418097496032715, |
| "learning_rate": 5.620530806295621e-06, |
| "loss": 0.2185, |
| "step": 1025 |
| }, |
| { |
| "epoch": 1.438700663639539, |
| "grad_norm": 5.2934250831604, |
| "learning_rate": 5.583099872932123e-06, |
| "loss": 0.2092, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.4456863429968565, |
| "grad_norm": 4.844241619110107, |
| "learning_rate": 5.545635780516848e-06, |
| "loss": 0.2084, |
| "step": 1035 |
| }, |
| { |
| "epoch": 1.4526720223541738, |
| "grad_norm": 3.787388324737549, |
| "learning_rate": 5.508140659514644e-06, |
| "loss": 0.2007, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.4596577017114916, |
| "grad_norm": 5.787258148193359, |
| "learning_rate": 5.470616642154853e-06, |
| "loss": 0.2263, |
| "step": 1045 |
| }, |
| { |
| "epoch": 1.4666433810688089, |
| "grad_norm": 4.828206539154053, |
| "learning_rate": 5.433065862310059e-06, |
| "loss": 0.2041, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.4736290604261264, |
| "grad_norm": 5.000522613525391, |
| "learning_rate": 5.39549045537475e-06, |
| "loss": 0.2125, |
| "step": 1055 |
| }, |
| { |
| "epoch": 1.480614739783444, |
| "grad_norm": 4.484281063079834, |
| "learning_rate": 5.3578925581438765e-06, |
| "loss": 0.1907, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.4876004191407615, |
| "grad_norm": 5.4914164543151855, |
| "learning_rate": 5.32027430869134e-06, |
| "loss": 0.2468, |
| "step": 1065 |
| }, |
| { |
| "epoch": 1.494586098498079, |
| "grad_norm": 5.7337188720703125, |
| "learning_rate": 5.28263784624841e-06, |
| "loss": 0.2048, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.5015717778553963, |
| "grad_norm": 5.739035606384277, |
| "learning_rate": 5.244985311082073e-06, |
| "loss": 0.2097, |
| "step": 1075 |
| }, |
| { |
| "epoch": 1.508557457212714, |
| "grad_norm": 5.063661575317383, |
| "learning_rate": 5.207318844373315e-06, |
| "loss": 0.1988, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.5155431365700314, |
| "grad_norm": 4.986030578613281, |
| "learning_rate": 5.1696405880953715e-06, |
| "loss": 0.2037, |
| "step": 1085 |
| }, |
| { |
| "epoch": 1.522528815927349, |
| "grad_norm": 5.575538158416748, |
| "learning_rate": 5.13195268489191e-06, |
| "loss": 0.2026, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.5295144952846664, |
| "grad_norm": 6.1051530838012695, |
| "learning_rate": 5.094257277955187e-06, |
| "loss": 0.2067, |
| "step": 1095 |
| }, |
| { |
| "epoch": 1.536500174641984, |
| "grad_norm": 7.7973761558532715, |
| "learning_rate": 5.056556510904178e-06, |
| "loss": 0.2305, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.5434858539993015, |
| "grad_norm": 4.967608451843262, |
| "learning_rate": 5.0188525276626675e-06, |
| "loss": 0.1946, |
| "step": 1105 |
| }, |
| { |
| "epoch": 1.5504715333566188, |
| "grad_norm": 4.415831089019775, |
| "learning_rate": 4.981147472337333e-06, |
| "loss": 0.1934, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.5574572127139366, |
| "grad_norm": 6.092747211456299, |
| "learning_rate": 4.943443489095822e-06, |
| "loss": 0.261, |
| "step": 1115 |
| }, |
| { |
| "epoch": 1.5644428920712539, |
| "grad_norm": 5.598076820373535, |
| "learning_rate": 4.905742722044813e-06, |
| "loss": 0.1967, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.5714285714285714, |
| "grad_norm": 4.891939640045166, |
| "learning_rate": 4.868047315108091e-06, |
| "loss": 0.2134, |
| "step": 1125 |
| }, |
| { |
| "epoch": 1.578414250785889, |
| "grad_norm": 5.4631667137146, |
| "learning_rate": 4.83035941190463e-06, |
| "loss": 0.2164, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.5853999301432065, |
| "grad_norm": 4.748926162719727, |
| "learning_rate": 4.792681155626687e-06, |
| "loss": 0.2095, |
| "step": 1135 |
| }, |
| { |
| "epoch": 1.592385609500524, |
| "grad_norm": 5.863819122314453, |
| "learning_rate": 4.75501468891793e-06, |
| "loss": 0.2115, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.5993712888578413, |
| "grad_norm": 5.605953216552734, |
| "learning_rate": 4.717362153751591e-06, |
| "loss": 0.1883, |
| "step": 1145 |
| }, |
| { |
| "epoch": 1.606356968215159, |
| "grad_norm": 5.130843639373779, |
| "learning_rate": 4.679725691308662e-06, |
| "loss": 0.2082, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.6133426475724764, |
| "grad_norm": 5.240429401397705, |
| "learning_rate": 4.642107441856125e-06, |
| "loss": 0.2219, |
| "step": 1155 |
| }, |
| { |
| "epoch": 1.620328326929794, |
| "grad_norm": 4.621257305145264, |
| "learning_rate": 4.604509544625252e-06, |
| "loss": 0.1798, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.6273140062871114, |
| "grad_norm": 4.74717903137207, |
| "learning_rate": 4.566934137689943e-06, |
| "loss": 0.1898, |
| "step": 1165 |
| }, |
| { |
| "epoch": 1.634299685644429, |
| "grad_norm": 4.941923141479492, |
| "learning_rate": 4.529383357845148e-06, |
| "loss": 0.1949, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.6412853650017465, |
| "grad_norm": 5.3948516845703125, |
| "learning_rate": 4.491859340485356e-06, |
| "loss": 0.1925, |
| "step": 1175 |
| }, |
| { |
| "epoch": 1.6482710443590638, |
| "grad_norm": 5.2328667640686035, |
| "learning_rate": 4.454364219483153e-06, |
| "loss": 0.2067, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.6552567237163816, |
| "grad_norm": 5.516434669494629, |
| "learning_rate": 4.416900127067879e-06, |
| "loss": 0.2076, |
| "step": 1185 |
| }, |
| { |
| "epoch": 1.6622424030736989, |
| "grad_norm": 5.7635040283203125, |
| "learning_rate": 4.3794691937043804e-06, |
| "loss": 0.2152, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.6692280824310164, |
| "grad_norm": 4.992877006530762, |
| "learning_rate": 4.342073547971849e-06, |
| "loss": 0.2091, |
| "step": 1195 |
| }, |
| { |
| "epoch": 1.676213761788334, |
| "grad_norm": 5.6958136558532715, |
| "learning_rate": 4.304715316442785e-06, |
| "loss": 0.2059, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.6831994411456515, |
| "grad_norm": 6.601348400115967, |
| "learning_rate": 4.267396623562057e-06, |
| "loss": 0.2227, |
| "step": 1205 |
| }, |
| { |
| "epoch": 1.690185120502969, |
| "grad_norm": 4.910021781921387, |
| "learning_rate": 4.230119591526095e-06, |
| "loss": 0.2157, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.6971707998602863, |
| "grad_norm": 5.821145534515381, |
| "learning_rate": 4.19288634016221e-06, |
| "loss": 0.2197, |
| "step": 1215 |
| }, |
| { |
| "epoch": 1.704156479217604, |
| "grad_norm": 5.628259658813477, |
| "learning_rate": 4.155698986808045e-06, |
| "loss": 0.2159, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.7111421585749214, |
| "grad_norm": 5.30466890335083, |
| "learning_rate": 4.118559646191164e-06, |
| "loss": 0.1895, |
| "step": 1225 |
| }, |
| { |
| "epoch": 1.7181278379322389, |
| "grad_norm": 4.880356311798096, |
| "learning_rate": 4.081470430308799e-06, |
| "loss": 0.2022, |
| "step": 1230 |
| }, |
| { |
| "epoch": 1.7251135172895564, |
| "grad_norm": 5.982080936431885, |
| "learning_rate": 4.044433448307753e-06, |
| "loss": 0.2135, |
| "step": 1235 |
| }, |
| { |
| "epoch": 1.732099196646874, |
| "grad_norm": 5.4156341552734375, |
| "learning_rate": 4.007450806364443e-06, |
| "loss": 0.1854, |
| "step": 1240 |
| }, |
| { |
| "epoch": 1.7390848760041915, |
| "grad_norm": 4.503352165222168, |
| "learning_rate": 3.970524607565146e-06, |
| "loss": 0.1988, |
| "step": 1245 |
| }, |
| { |
| "epoch": 1.7460705553615088, |
| "grad_norm": 4.712128162384033, |
| "learning_rate": 3.93365695178639e-06, |
| "loss": 0.213, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.7530562347188265, |
| "grad_norm": 6.144698143005371, |
| "learning_rate": 3.896849935575548e-06, |
| "loss": 0.2213, |
| "step": 1255 |
| }, |
| { |
| "epoch": 1.7600419140761439, |
| "grad_norm": 5.0675482749938965, |
| "learning_rate": 3.860105652031612e-06, |
| "loss": 0.1937, |
| "step": 1260 |
| }, |
| { |
| "epoch": 1.7670275934334614, |
| "grad_norm": 5.1114020347595215, |
| "learning_rate": 3.823426190686166e-06, |
| "loss": 0.2159, |
| "step": 1265 |
| }, |
| { |
| "epoch": 1.774013272790779, |
| "grad_norm": 5.182307720184326, |
| "learning_rate": 3.7868136373845577e-06, |
| "loss": 0.2074, |
| "step": 1270 |
| }, |
| { |
| "epoch": 1.7809989521480964, |
| "grad_norm": 5.307186126708984, |
| "learning_rate": 3.7502700741672864e-06, |
| "loss": 0.2072, |
| "step": 1275 |
| }, |
| { |
| "epoch": 1.787984631505414, |
| "grad_norm": 5.354731559753418, |
| "learning_rate": 3.7137975791516025e-06, |
| "loss": 0.2029, |
| "step": 1280 |
| }, |
| { |
| "epoch": 1.7949703108627313, |
| "grad_norm": 5.206900119781494, |
| "learning_rate": 3.6773982264133324e-06, |
| "loss": 0.1993, |
| "step": 1285 |
| }, |
| { |
| "epoch": 1.801955990220049, |
| "grad_norm": 5.040802955627441, |
| "learning_rate": 3.641074085868932e-06, |
| "loss": 0.1863, |
| "step": 1290 |
| }, |
| { |
| "epoch": 1.8089416695773664, |
| "grad_norm": 4.957560062408447, |
| "learning_rate": 3.604827223157773e-06, |
| "loss": 0.2037, |
| "step": 1295 |
| }, |
| { |
| "epoch": 1.8159273489346839, |
| "grad_norm": 5.18066930770874, |
| "learning_rate": 3.5686596995246868e-06, |
| "loss": 0.1939, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.8229130282920014, |
| "grad_norm": 6.234875202178955, |
| "learning_rate": 3.5325735717027386e-06, |
| "loss": 0.2052, |
| "step": 1305 |
| }, |
| { |
| "epoch": 1.829898707649319, |
| "grad_norm": 5.431389808654785, |
| "learning_rate": 3.496570891796267e-06, |
| "loss": 0.1935, |
| "step": 1310 |
| }, |
| { |
| "epoch": 1.8368843870066365, |
| "grad_norm": 5.478362083435059, |
| "learning_rate": 3.4606537071641966e-06, |
| "loss": 0.207, |
| "step": 1315 |
| }, |
| { |
| "epoch": 1.8438700663639538, |
| "grad_norm": 5.117818832397461, |
| "learning_rate": 3.424824060303601e-06, |
| "loss": 0.2017, |
| "step": 1320 |
| }, |
| { |
| "epoch": 1.8508557457212715, |
| "grad_norm": 4.163613319396973, |
| "learning_rate": 3.389083988733556e-06, |
| "loss": 0.1818, |
| "step": 1325 |
| }, |
| { |
| "epoch": 1.8578414250785888, |
| "grad_norm": 4.954441070556641, |
| "learning_rate": 3.3534355248792787e-06, |
| "loss": 0.1909, |
| "step": 1330 |
| }, |
| { |
| "epoch": 1.8648271044359064, |
| "grad_norm": 5.696218490600586, |
| "learning_rate": 3.3178806959565378e-06, |
| "loss": 0.2004, |
| "step": 1335 |
| }, |
| { |
| "epoch": 1.871812783793224, |
| "grad_norm": 5.73183536529541, |
| "learning_rate": 3.282421523856381e-06, |
| "loss": 0.1965, |
| "step": 1340 |
| }, |
| { |
| "epoch": 1.8787984631505414, |
| "grad_norm": 5.084606647491455, |
| "learning_rate": 3.247060025030156e-06, |
| "loss": 0.2062, |
| "step": 1345 |
| }, |
| { |
| "epoch": 1.885784142507859, |
| "grad_norm": 6.37736177444458, |
| "learning_rate": 3.2117982103748358e-06, |
| "loss": 0.2164, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.8927698218651763, |
| "grad_norm": 4.8795013427734375, |
| "learning_rate": 3.1766380851186695e-06, |
| "loss": 0.2003, |
| "step": 1355 |
| }, |
| { |
| "epoch": 1.899755501222494, |
| "grad_norm": 5.402559757232666, |
| "learning_rate": 3.141581648707155e-06, |
| "loss": 0.188, |
| "step": 1360 |
| }, |
| { |
| "epoch": 1.9067411805798113, |
| "grad_norm": 5.172591209411621, |
| "learning_rate": 3.106630894689328e-06, |
| "loss": 0.1847, |
| "step": 1365 |
| }, |
| { |
| "epoch": 1.9137268599371289, |
| "grad_norm": 4.233926773071289, |
| "learning_rate": 3.0717878106043987e-06, |
| "loss": 0.187, |
| "step": 1370 |
| }, |
| { |
| "epoch": 1.9207125392944464, |
| "grad_norm": 4.548036575317383, |
| "learning_rate": 3.0370543778687315e-06, |
| "loss": 0.2091, |
| "step": 1375 |
| }, |
| { |
| "epoch": 1.927698218651764, |
| "grad_norm": 6.768104076385498, |
| "learning_rate": 3.002432571663162e-06, |
| "loss": 0.1916, |
| "step": 1380 |
| }, |
| { |
| "epoch": 1.9346838980090815, |
| "grad_norm": 4.954831600189209, |
| "learning_rate": 2.9679243608206752e-06, |
| "loss": 0.1896, |
| "step": 1385 |
| }, |
| { |
| "epoch": 1.9416695773663988, |
| "grad_norm": 5.701454162597656, |
| "learning_rate": 2.933531707714451e-06, |
| "loss": 0.2246, |
| "step": 1390 |
| }, |
| { |
| "epoch": 1.9486552567237165, |
| "grad_norm": 4.5505475997924805, |
| "learning_rate": 2.8992565681462592e-06, |
| "loss": 0.1776, |
| "step": 1395 |
| }, |
| { |
| "epoch": 1.9556409360810338, |
| "grad_norm": 3.9627044200897217, |
| "learning_rate": 2.86510089123525e-06, |
| "loss": 0.1774, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.9626266154383514, |
| "grad_norm": 5.475313186645508, |
| "learning_rate": 2.831066619307104e-06, |
| "loss": 0.2232, |
| "step": 1405 |
| }, |
| { |
| "epoch": 1.969612294795669, |
| "grad_norm": 4.611178874969482, |
| "learning_rate": 2.797155687783587e-06, |
| "loss": 0.1724, |
| "step": 1410 |
| }, |
| { |
| "epoch": 1.9765979741529862, |
| "grad_norm": 4.791353225708008, |
| "learning_rate": 2.7633700250724837e-06, |
| "loss": 0.2039, |
| "step": 1415 |
| }, |
| { |
| "epoch": 1.983583653510304, |
| "grad_norm": 4.962776184082031, |
| "learning_rate": 2.7297115524579364e-06, |
| "loss": 0.1999, |
| "step": 1420 |
| }, |
| { |
| "epoch": 1.9905693328676213, |
| "grad_norm": 5.41670560836792, |
| "learning_rate": 2.6961821839911873e-06, |
| "loss": 0.2224, |
| "step": 1425 |
| }, |
| { |
| "epoch": 1.997555012224939, |
| "grad_norm": 4.636608600616455, |
| "learning_rate": 2.662783826381734e-06, |
| "loss": 0.1939, |
| "step": 1430 |
| }, |
| { |
| "epoch": 2.0041914076143903, |
| "grad_norm": 2.7747793197631836, |
| "learning_rate": 2.6295183788888945e-06, |
| "loss": 0.1141, |
| "step": 1435 |
| }, |
| { |
| "epoch": 2.011177086971708, |
| "grad_norm": 3.0276601314544678, |
| "learning_rate": 2.5963877332138133e-06, |
| "loss": 0.0733, |
| "step": 1440 |
| }, |
| { |
| "epoch": 2.0181627663290254, |
| "grad_norm": 2.265000581741333, |
| "learning_rate": 2.563393773391879e-06, |
| "loss": 0.0695, |
| "step": 1445 |
| }, |
| { |
| "epoch": 2.025148445686343, |
| "grad_norm": 2.5616984367370605, |
| "learning_rate": 2.530538375685586e-06, |
| "loss": 0.076, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.0321341250436604, |
| "grad_norm": 3.750467300415039, |
| "learning_rate": 2.4978234084778357e-06, |
| "loss": 0.0676, |
| "step": 1455 |
| }, |
| { |
| "epoch": 2.039119804400978, |
| "grad_norm": 4.017278671264648, |
| "learning_rate": 2.4652507321656927e-06, |
| "loss": 0.0733, |
| "step": 1460 |
| }, |
| { |
| "epoch": 2.0461054837582955, |
| "grad_norm": 3.2065634727478027, |
| "learning_rate": 2.4328221990545887e-06, |
| "loss": 0.066, |
| "step": 1465 |
| }, |
| { |
| "epoch": 2.053091163115613, |
| "grad_norm": 2.6953470706939697, |
| "learning_rate": 2.4005396532529786e-06, |
| "loss": 0.0628, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.0600768424729305, |
| "grad_norm": 2.3816425800323486, |
| "learning_rate": 2.3684049305674858e-06, |
| "loss": 0.062, |
| "step": 1475 |
| }, |
| { |
| "epoch": 2.067062521830248, |
| "grad_norm": 3.783911943435669, |
| "learning_rate": 2.3364198583984977e-06, |
| "loss": 0.0767, |
| "step": 1480 |
| }, |
| { |
| "epoch": 2.0740482011875656, |
| "grad_norm": 3.010451316833496, |
| "learning_rate": 2.304586255636247e-06, |
| "loss": 0.0588, |
| "step": 1485 |
| }, |
| { |
| "epoch": 2.081033880544883, |
| "grad_norm": 4.173311710357666, |
| "learning_rate": 2.2729059325573805e-06, |
| "loss": 0.0651, |
| "step": 1490 |
| }, |
| { |
| "epoch": 2.0880195599022007, |
| "grad_norm": 3.4096760749816895, |
| "learning_rate": 2.2413806907220125e-06, |
| "loss": 0.0727, |
| "step": 1495 |
| }, |
| { |
| "epoch": 2.095005239259518, |
| "grad_norm": 2.6200010776519775, |
| "learning_rate": 2.210012322871272e-06, |
| "loss": 0.0685, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.1019909186168353, |
| "grad_norm": 3.109384059906006, |
| "learning_rate": 2.1788026128253637e-06, |
| "loss": 0.059, |
| "step": 1505 |
| }, |
| { |
| "epoch": 2.108976597974153, |
| "grad_norm": 3.6358861923217773, |
| "learning_rate": 2.1477533353821226e-06, |
| "loss": 0.0584, |
| "step": 1510 |
| }, |
| { |
| "epoch": 2.1159622773314704, |
| "grad_norm": 4.2906293869018555, |
| "learning_rate": 2.116866256216083e-06, |
| "loss": 0.0602, |
| "step": 1515 |
| }, |
| { |
| "epoch": 2.122947956688788, |
| "grad_norm": 4.5148491859436035, |
| "learning_rate": 2.086143131778079e-06, |
| "loss": 0.0732, |
| "step": 1520 |
| }, |
| { |
| "epoch": 2.1299336360461054, |
| "grad_norm": 3.4261441230773926, |
| "learning_rate": 2.055585709195356e-06, |
| "loss": 0.0672, |
| "step": 1525 |
| }, |
| { |
| "epoch": 2.136919315403423, |
| "grad_norm": 2.8189690113067627, |
| "learning_rate": 2.0251957261722116e-06, |
| "loss": 0.0554, |
| "step": 1530 |
| }, |
| { |
| "epoch": 2.1439049947607405, |
| "grad_norm": 3.4528841972351074, |
| "learning_rate": 1.9949749108911886e-06, |
| "loss": 0.0693, |
| "step": 1535 |
| }, |
| { |
| "epoch": 2.150890674118058, |
| "grad_norm": 5.297241687774658, |
| "learning_rate": 1.9649249819147976e-06, |
| "loss": 0.0623, |
| "step": 1540 |
| }, |
| { |
| "epoch": 2.1578763534753755, |
| "grad_norm": 3.1499645709991455, |
| "learning_rate": 1.9350476480877735e-06, |
| "loss": 0.0646, |
| "step": 1545 |
| }, |
| { |
| "epoch": 2.164862032832693, |
| "grad_norm": 3.9576337337493896, |
| "learning_rate": 1.9053446084399153e-06, |
| "loss": 0.0675, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.1718477121900106, |
| "grad_norm": 4.909806251525879, |
| "learning_rate": 1.8758175520894622e-06, |
| "loss": 0.0725, |
| "step": 1555 |
| }, |
| { |
| "epoch": 2.178833391547328, |
| "grad_norm": 4.15227746963501, |
| "learning_rate": 1.8464681581470328e-06, |
| "loss": 0.0632, |
| "step": 1560 |
| }, |
| { |
| "epoch": 2.1858190709046457, |
| "grad_norm": 3.7336344718933105, |
| "learning_rate": 1.8172980956201502e-06, |
| "loss": 0.0681, |
| "step": 1565 |
| }, |
| { |
| "epoch": 2.192804750261963, |
| "grad_norm": 3.011342763900757, |
| "learning_rate": 1.7883090233183238e-06, |
| "loss": 0.0611, |
| "step": 1570 |
| }, |
| { |
| "epoch": 2.1997904296192803, |
| "grad_norm": 3.7468390464782715, |
| "learning_rate": 1.7595025897587154e-06, |
| "loss": 0.0584, |
| "step": 1575 |
| }, |
| { |
| "epoch": 2.206776108976598, |
| "grad_norm": 3.4422414302825928, |
| "learning_rate": 1.7308804330724021e-06, |
| "loss": 0.0794, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.2137617883339153, |
| "grad_norm": 3.476590156555176, |
| "learning_rate": 1.7024441809112158e-06, |
| "loss": 0.06, |
| "step": 1585 |
| }, |
| { |
| "epoch": 2.220747467691233, |
| "grad_norm": 3.7604615688323975, |
| "learning_rate": 1.674195450355179e-06, |
| "loss": 0.0693, |
| "step": 1590 |
| }, |
| { |
| "epoch": 2.2277331470485504, |
| "grad_norm": 3.4398772716522217, |
| "learning_rate": 1.6461358478205552e-06, |
| "loss": 0.0582, |
| "step": 1595 |
| }, |
| { |
| "epoch": 2.234718826405868, |
| "grad_norm": 3.7523179054260254, |
| "learning_rate": 1.6182669689684982e-06, |
| "loss": 0.0708, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.2417045057631855, |
| "grad_norm": 3.6123757362365723, |
| "learning_rate": 1.5905903986142983e-06, |
| "loss": 0.0626, |
| "step": 1605 |
| }, |
| { |
| "epoch": 2.248690185120503, |
| "grad_norm": 3.515021562576294, |
| "learning_rate": 1.5631077106372728e-06, |
| "loss": 0.0626, |
| "step": 1610 |
| }, |
| { |
| "epoch": 2.2556758644778205, |
| "grad_norm": 3.9685404300689697, |
| "learning_rate": 1.5358204678912597e-06, |
| "loss": 0.0735, |
| "step": 1615 |
| }, |
| { |
| "epoch": 2.262661543835138, |
| "grad_norm": 4.189309597015381, |
| "learning_rate": 1.5087302221157386e-06, |
| "loss": 0.0706, |
| "step": 1620 |
| }, |
| { |
| "epoch": 2.2696472231924556, |
| "grad_norm": 2.535946846008301, |
| "learning_rate": 1.481838513847596e-06, |
| "loss": 0.0579, |
| "step": 1625 |
| }, |
| { |
| "epoch": 2.276632902549773, |
| "grad_norm": 3.503710985183716, |
| "learning_rate": 1.4551468723335182e-06, |
| "loss": 0.0579, |
| "step": 1630 |
| }, |
| { |
| "epoch": 2.28361858190709, |
| "grad_norm": 5.277285575866699, |
| "learning_rate": 1.4286568154430187e-06, |
| "loss": 0.0749, |
| "step": 1635 |
| }, |
| { |
| "epoch": 2.290604261264408, |
| "grad_norm": 3.7260444164276123, |
| "learning_rate": 1.4023698495821347e-06, |
| "loss": 0.0618, |
| "step": 1640 |
| }, |
| { |
| "epoch": 2.2975899406217253, |
| "grad_norm": 3.1638755798339844, |
| "learning_rate": 1.3762874696077561e-06, |
| "loss": 0.0741, |
| "step": 1645 |
| }, |
| { |
| "epoch": 2.304575619979043, |
| "grad_norm": 4.0308613777160645, |
| "learning_rate": 1.3504111587426178e-06, |
| "loss": 0.0666, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.3115612993363603, |
| "grad_norm": 3.7784061431884766, |
| "learning_rate": 1.3247423884909511e-06, |
| "loss": 0.0737, |
| "step": 1655 |
| }, |
| { |
| "epoch": 2.318546978693678, |
| "grad_norm": 3.080986976623535, |
| "learning_rate": 1.2992826185548106e-06, |
| "loss": 0.0627, |
| "step": 1660 |
| }, |
| { |
| "epoch": 2.3255326580509954, |
| "grad_norm": 3.805724859237671, |
| "learning_rate": 1.2740332967510604e-06, |
| "loss": 0.0741, |
| "step": 1665 |
| }, |
| { |
| "epoch": 2.332518337408313, |
| "grad_norm": 3.431828022003174, |
| "learning_rate": 1.248995858929043e-06, |
| "loss": 0.0584, |
| "step": 1670 |
| }, |
| { |
| "epoch": 2.3395040167656305, |
| "grad_norm": 3.6512904167175293, |
| "learning_rate": 1.2241717288889272e-06, |
| "loss": 0.0692, |
| "step": 1675 |
| }, |
| { |
| "epoch": 2.3464896961229478, |
| "grad_norm": 3.4175989627838135, |
| "learning_rate": 1.1995623183007372e-06, |
| "loss": 0.0621, |
| "step": 1680 |
| }, |
| { |
| "epoch": 2.3534753754802655, |
| "grad_norm": 3.0239272117614746, |
| "learning_rate": 1.1751690266240834e-06, |
| "loss": 0.0648, |
| "step": 1685 |
| }, |
| { |
| "epoch": 2.360461054837583, |
| "grad_norm": 3.577535629272461, |
| "learning_rate": 1.1509932410285723e-06, |
| "loss": 0.0679, |
| "step": 1690 |
| }, |
| { |
| "epoch": 2.3674467341949006, |
| "grad_norm": 3.299124002456665, |
| "learning_rate": 1.1270363363149273e-06, |
| "loss": 0.0699, |
| "step": 1695 |
| }, |
| { |
| "epoch": 2.374432413552218, |
| "grad_norm": 4.893401622772217, |
| "learning_rate": 1.1032996748368018e-06, |
| "loss": 0.0603, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.381418092909535, |
| "grad_norm": 3.660778284072876, |
| "learning_rate": 1.0797846064233154e-06, |
| "loss": 0.0814, |
| "step": 1705 |
| }, |
| { |
| "epoch": 2.388403772266853, |
| "grad_norm": 3.0096163749694824, |
| "learning_rate": 1.0564924683022865e-06, |
| "loss": 0.0611, |
| "step": 1710 |
| }, |
| { |
| "epoch": 2.3953894516241703, |
| "grad_norm": 3.9784798622131348, |
| "learning_rate": 1.0334245850241892e-06, |
| "loss": 0.0693, |
| "step": 1715 |
| }, |
| { |
| "epoch": 2.402375130981488, |
| "grad_norm": 4.206497669219971, |
| "learning_rate": 1.0105822683868333e-06, |
| "loss": 0.0695, |
| "step": 1720 |
| }, |
| { |
| "epoch": 2.4093608103388053, |
| "grad_norm": 3.21100115776062, |
| "learning_rate": 9.879668173607644e-07, |
| "loss": 0.0572, |
| "step": 1725 |
| }, |
| { |
| "epoch": 2.416346489696123, |
| "grad_norm": 4.883619785308838, |
| "learning_rate": 9.655795180153965e-07, |
| "loss": 0.0678, |
| "step": 1730 |
| }, |
| { |
| "epoch": 2.4233321690534404, |
| "grad_norm": 3.869938373565674, |
| "learning_rate": 9.434216434458749e-07, |
| "loss": 0.0602, |
| "step": 1735 |
| }, |
| { |
| "epoch": 2.430317848410758, |
| "grad_norm": 3.6375582218170166, |
| "learning_rate": 9.214944537006848e-07, |
| "loss": 0.0492, |
| "step": 1740 |
| }, |
| { |
| "epoch": 2.4373035277680755, |
| "grad_norm": 3.4665791988372803, |
| "learning_rate": 8.997991957099861e-07, |
| "loss": 0.0676, |
| "step": 1745 |
| }, |
| { |
| "epoch": 2.4442892071253928, |
| "grad_norm": 2.881965398788452, |
| "learning_rate": 8.78337103214717e-07, |
| "loss": 0.0615, |
| "step": 1750 |
| }, |
| { |
| "epoch": 2.4512748864827105, |
| "grad_norm": 3.227743148803711, |
| "learning_rate": 8.571093966964272e-07, |
| "loss": 0.0605, |
| "step": 1755 |
| }, |
| { |
| "epoch": 2.458260565840028, |
| "grad_norm": 3.0547025203704834, |
| "learning_rate": 8.361172833078724e-07, |
| "loss": 0.062, |
| "step": 1760 |
| }, |
| { |
| "epoch": 2.4652462451973456, |
| "grad_norm": 2.9871292114257812, |
| "learning_rate": 8.153619568043725e-07, |
| "loss": 0.0634, |
| "step": 1765 |
| }, |
| { |
| "epoch": 2.472231924554663, |
| "grad_norm": 3.806741952896118, |
| "learning_rate": 7.948445974759228e-07, |
| "loss": 0.075, |
| "step": 1770 |
| }, |
| { |
| "epoch": 2.47921760391198, |
| "grad_norm": 3.380242347717285, |
| "learning_rate": 7.745663720800739e-07, |
| "loss": 0.0564, |
| "step": 1775 |
| }, |
| { |
| "epoch": 2.486203283269298, |
| "grad_norm": 3.2576658725738525, |
| "learning_rate": 7.545284337755848e-07, |
| "loss": 0.0637, |
| "step": 1780 |
| }, |
| { |
| "epoch": 2.4931889626266153, |
| "grad_norm": 4.28541374206543, |
| "learning_rate": 7.347319220568478e-07, |
| "loss": 0.0692, |
| "step": 1785 |
| }, |
| { |
| "epoch": 2.500174641983933, |
| "grad_norm": 3.7046804428100586, |
| "learning_rate": 7.151779626890798e-07, |
| "loss": 0.0649, |
| "step": 1790 |
| }, |
| { |
| "epoch": 2.5071603213412503, |
| "grad_norm": 3.2380423545837402, |
| "learning_rate": 6.95867667644316e-07, |
| "loss": 0.0653, |
| "step": 1795 |
| }, |
| { |
| "epoch": 2.514146000698568, |
| "grad_norm": 3.8117942810058594, |
| "learning_rate": 6.76802135038167e-07, |
| "loss": 0.0686, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.5211316800558854, |
| "grad_norm": 3.508927583694458, |
| "learning_rate": 6.579824490673742e-07, |
| "loss": 0.0615, |
| "step": 1805 |
| }, |
| { |
| "epoch": 2.528117359413203, |
| "grad_norm": 3.836540699005127, |
| "learning_rate": 6.394096799481575e-07, |
| "loss": 0.0527, |
| "step": 1810 |
| }, |
| { |
| "epoch": 2.5351030387705205, |
| "grad_norm": 3.654384136199951, |
| "learning_rate": 6.210848838553541e-07, |
| "loss": 0.0615, |
| "step": 1815 |
| }, |
| { |
| "epoch": 2.5420887181278378, |
| "grad_norm": 4.480383396148682, |
| "learning_rate": 6.030091028623542e-07, |
| "loss": 0.0599, |
| "step": 1820 |
| }, |
| { |
| "epoch": 2.5490743974851555, |
| "grad_norm": 3.871952772140503, |
| "learning_rate": 5.851833648818467e-07, |
| "loss": 0.053, |
| "step": 1825 |
| }, |
| { |
| "epoch": 2.556060076842473, |
| "grad_norm": 4.014284610748291, |
| "learning_rate": 5.67608683607363e-07, |
| "loss": 0.0637, |
| "step": 1830 |
| }, |
| { |
| "epoch": 2.5630457561997906, |
| "grad_norm": 3.3244235515594482, |
| "learning_rate": 5.502860584556275e-07, |
| "loss": 0.0645, |
| "step": 1835 |
| }, |
| { |
| "epoch": 2.570031435557108, |
| "grad_norm": 3.213613271713257, |
| "learning_rate": 5.332164745097301e-07, |
| "loss": 0.0588, |
| "step": 1840 |
| }, |
| { |
| "epoch": 2.577017114914425, |
| "grad_norm": 3.7526209354400635, |
| "learning_rate": 5.164009024631067e-07, |
| "loss": 0.0593, |
| "step": 1845 |
| }, |
| { |
| "epoch": 2.584002794271743, |
| "grad_norm": 4.042670249938965, |
| "learning_rate": 4.998402985643319e-07, |
| "loss": 0.0648, |
| "step": 1850 |
| }, |
| { |
| "epoch": 2.5909884736290603, |
| "grad_norm": 3.8450632095336914, |
| "learning_rate": 4.83535604562751e-07, |
| "loss": 0.0621, |
| "step": 1855 |
| }, |
| { |
| "epoch": 2.597974152986378, |
| "grad_norm": 4.661925315856934, |
| "learning_rate": 4.6748774765491666e-07, |
| "loss": 0.0559, |
| "step": 1860 |
| }, |
| { |
| "epoch": 2.6049598323436953, |
| "grad_norm": 4.002378940582275, |
| "learning_rate": 4.516976404318657e-07, |
| "loss": 0.0658, |
| "step": 1865 |
| }, |
| { |
| "epoch": 2.611945511701013, |
| "grad_norm": 3.1322593688964844, |
| "learning_rate": 4.3616618082722384e-07, |
| "loss": 0.052, |
| "step": 1870 |
| }, |
| { |
| "epoch": 2.6189311910583304, |
| "grad_norm": 4.374446392059326, |
| "learning_rate": 4.2089425206614356e-07, |
| "loss": 0.0581, |
| "step": 1875 |
| }, |
| { |
| "epoch": 2.625916870415648, |
| "grad_norm": 4.130561828613281, |
| "learning_rate": 4.0588272261507245e-07, |
| "loss": 0.0714, |
| "step": 1880 |
| }, |
| { |
| "epoch": 2.6329025497729655, |
| "grad_norm": 3.0091750621795654, |
| "learning_rate": 3.911324461323729e-07, |
| "loss": 0.068, |
| "step": 1885 |
| }, |
| { |
| "epoch": 2.6398882291302828, |
| "grad_norm": 3.022900342941284, |
| "learning_rate": 3.766442614197746e-07, |
| "loss": 0.0613, |
| "step": 1890 |
| }, |
| { |
| "epoch": 2.6468739084876005, |
| "grad_norm": 3.2522053718566895, |
| "learning_rate": 3.6241899237467347e-07, |
| "loss": 0.0676, |
| "step": 1895 |
| }, |
| { |
| "epoch": 2.653859587844918, |
| "grad_norm": 2.8870651721954346, |
| "learning_rate": 3.4845744794327896e-07, |
| "loss": 0.0592, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.6608452672022356, |
| "grad_norm": 3.692023992538452, |
| "learning_rate": 3.347604220746176e-07, |
| "loss": 0.0634, |
| "step": 1905 |
| }, |
| { |
| "epoch": 2.667830946559553, |
| "grad_norm": 3.13959002494812, |
| "learning_rate": 3.213286936753729e-07, |
| "loss": 0.0573, |
| "step": 1910 |
| }, |
| { |
| "epoch": 2.67481662591687, |
| "grad_norm": 3.258509635925293, |
| "learning_rate": 3.081630265656016e-07, |
| "loss": 0.0771, |
| "step": 1915 |
| }, |
| { |
| "epoch": 2.681802305274188, |
| "grad_norm": 3.533224582672119, |
| "learning_rate": 2.9526416943529314e-07, |
| "loss": 0.062, |
| "step": 1920 |
| }, |
| { |
| "epoch": 2.6887879846315053, |
| "grad_norm": 3.2382559776306152, |
| "learning_rate": 2.8263285580179325e-07, |
| "loss": 0.0576, |
| "step": 1925 |
| }, |
| { |
| "epoch": 2.695773663988823, |
| "grad_norm": 3.944854497909546, |
| "learning_rate": 2.7026980396809235e-07, |
| "loss": 0.059, |
| "step": 1930 |
| }, |
| { |
| "epoch": 2.7027593433461403, |
| "grad_norm": 2.705212116241455, |
| "learning_rate": 2.581757169819793e-07, |
| "loss": 0.0505, |
| "step": 1935 |
| }, |
| { |
| "epoch": 2.709745022703458, |
| "grad_norm": 4.2093610763549805, |
| "learning_rate": 2.46351282596059e-07, |
| "loss": 0.0568, |
| "step": 1940 |
| }, |
| { |
| "epoch": 2.7167307020607754, |
| "grad_norm": 3.779445171356201, |
| "learning_rate": 2.347971732286425e-07, |
| "loss": 0.0614, |
| "step": 1945 |
| }, |
| { |
| "epoch": 2.723716381418093, |
| "grad_norm": 3.2959094047546387, |
| "learning_rate": 2.2351404592550918e-07, |
| "loss": 0.0555, |
| "step": 1950 |
| }, |
| { |
| "epoch": 2.7307020607754104, |
| "grad_norm": 3.7622032165527344, |
| "learning_rate": 2.1250254232254518e-07, |
| "loss": 0.0487, |
| "step": 1955 |
| }, |
| { |
| "epoch": 2.7376877401327278, |
| "grad_norm": 2.774178981781006, |
| "learning_rate": 2.01763288609248e-07, |
| "loss": 0.0594, |
| "step": 1960 |
| }, |
| { |
| "epoch": 2.7446734194900455, |
| "grad_norm": 3.630791664123535, |
| "learning_rate": 1.9129689549312602e-07, |
| "loss": 0.0517, |
| "step": 1965 |
| }, |
| { |
| "epoch": 2.751659098847363, |
| "grad_norm": 3.184044122695923, |
| "learning_rate": 1.8110395816496595e-07, |
| "loss": 0.05, |
| "step": 1970 |
| }, |
| { |
| "epoch": 2.7586447782046806, |
| "grad_norm": 3.227968692779541, |
| "learning_rate": 1.711850562649825e-07, |
| "loss": 0.0626, |
| "step": 1975 |
| }, |
| { |
| "epoch": 2.765630457561998, |
| "grad_norm": 4.061306953430176, |
| "learning_rate": 1.6154075384986202e-07, |
| "loss": 0.0594, |
| "step": 1980 |
| }, |
| { |
| "epoch": 2.772616136919315, |
| "grad_norm": 3.1833701133728027, |
| "learning_rate": 1.5217159936068314e-07, |
| "loss": 0.06, |
| "step": 1985 |
| }, |
| { |
| "epoch": 2.779601816276633, |
| "grad_norm": 3.1657423973083496, |
| "learning_rate": 1.4307812559172796e-07, |
| "loss": 0.0656, |
| "step": 1990 |
| }, |
| { |
| "epoch": 2.7865874956339503, |
| "grad_norm": 3.4851529598236084, |
| "learning_rate": 1.342608496601866e-07, |
| "loss": 0.0591, |
| "step": 1995 |
| }, |
| { |
| "epoch": 2.793573174991268, |
| "grad_norm": 4.401630878448486, |
| "learning_rate": 1.257202729767487e-07, |
| "loss": 0.0659, |
| "step": 2000 |
| }, |
| { |
| "epoch": 2.8005588543485853, |
| "grad_norm": 2.809574842453003, |
| "learning_rate": 1.1745688121708843e-07, |
| "loss": 0.0546, |
| "step": 2005 |
| }, |
| { |
| "epoch": 2.8075445337059026, |
| "grad_norm": 4.462152004241943, |
| "learning_rate": 1.094711442942481e-07, |
| "loss": 0.0626, |
| "step": 2010 |
| }, |
| { |
| "epoch": 2.8145302130632204, |
| "grad_norm": 3.0803418159484863, |
| "learning_rate": 1.0176351633191583e-07, |
| "loss": 0.0594, |
| "step": 2015 |
| }, |
| { |
| "epoch": 2.821515892420538, |
| "grad_norm": 3.0584282875061035, |
| "learning_rate": 9.433443563859667e-08, |
| "loss": 0.0544, |
| "step": 2020 |
| }, |
| { |
| "epoch": 2.8285015717778554, |
| "grad_norm": 3.364630937576294, |
| "learning_rate": 8.718432468269312e-08, |
| "loss": 0.053, |
| "step": 2025 |
| }, |
| { |
| "epoch": 2.8354872511351727, |
| "grad_norm": 3.150378465652466, |
| "learning_rate": 8.03135900684776e-08, |
| "loss": 0.0485, |
| "step": 2030 |
| }, |
| { |
| "epoch": 2.8424729304924905, |
| "grad_norm": 3.073679208755493, |
| "learning_rate": 7.372262251297002e-08, |
| "loss": 0.0481, |
| "step": 2035 |
| }, |
| { |
| "epoch": 2.849458609849808, |
| "grad_norm": 3.3093550205230713, |
| "learning_rate": 6.741179682371979e-08, |
| "loss": 0.048, |
| "step": 2040 |
| }, |
| { |
| "epoch": 2.8564442892071256, |
| "grad_norm": 4.013802528381348, |
| "learning_rate": 6.138147187749133e-08, |
| "loss": 0.0654, |
| "step": 2045 |
| }, |
| { |
| "epoch": 2.863429968564443, |
| "grad_norm": 4.480405807495117, |
| "learning_rate": 5.563199059985591e-08, |
| "loss": 0.0632, |
| "step": 2050 |
| }, |
| { |
| "epoch": 2.87041564792176, |
| "grad_norm": 4.415154457092285, |
| "learning_rate": 5.0163679945691155e-08, |
| "loss": 0.0652, |
| "step": 2055 |
| }, |
| { |
| "epoch": 2.877401327279078, |
| "grad_norm": 3.2409656047821045, |
| "learning_rate": 4.497685088058701e-08, |
| "loss": 0.0566, |
| "step": 2060 |
| }, |
| { |
| "epoch": 2.8843870066363952, |
| "grad_norm": 2.8262383937835693, |
| "learning_rate": 4.0071798363162105e-08, |
| "loss": 0.0535, |
| "step": 2065 |
| }, |
| { |
| "epoch": 2.891372685993713, |
| "grad_norm": 4.663197994232178, |
| "learning_rate": 3.5448801328293316e-08, |
| "loss": 0.0642, |
| "step": 2070 |
| }, |
| { |
| "epoch": 2.8983583653510303, |
| "grad_norm": 3.1666250228881836, |
| "learning_rate": 3.110812267124841e-08, |
| "loss": 0.0587, |
| "step": 2075 |
| }, |
| { |
| "epoch": 2.9053440447083476, |
| "grad_norm": 3.907501220703125, |
| "learning_rate": 2.7050009232741927e-08, |
| "loss": 0.0581, |
| "step": 2080 |
| }, |
| { |
| "epoch": 2.9123297240656654, |
| "grad_norm": 3.8790831565856934, |
| "learning_rate": 2.327469178489139e-08, |
| "loss": 0.0584, |
| "step": 2085 |
| }, |
| { |
| "epoch": 2.919315403422983, |
| "grad_norm": 4.372128486633301, |
| "learning_rate": 1.97823850181017e-08, |
| "loss": 0.0588, |
| "step": 2090 |
| }, |
| { |
| "epoch": 2.9263010827803004, |
| "grad_norm": 5.066431045532227, |
| "learning_rate": 1.657328752884879e-08, |
| "loss": 0.0658, |
| "step": 2095 |
| }, |
| { |
| "epoch": 2.9332867621376177, |
| "grad_norm": 4.35849142074585, |
| "learning_rate": 1.3647581808393096e-08, |
| "loss": 0.0621, |
| "step": 2100 |
| }, |
| { |
| "epoch": 2.9402724414949355, |
| "grad_norm": 3.5087194442749023, |
| "learning_rate": 1.1005434232396196e-08, |
| "loss": 0.0543, |
| "step": 2105 |
| }, |
| { |
| "epoch": 2.947258120852253, |
| "grad_norm": 3.806771993637085, |
| "learning_rate": 8.646995051461715e-09, |
| "loss": 0.0563, |
| "step": 2110 |
| }, |
| { |
| "epoch": 2.9542438002095706, |
| "grad_norm": 3.2938411235809326, |
| "learning_rate": 6.57239838259216e-09, |
| "loss": 0.0574, |
| "step": 2115 |
| }, |
| { |
| "epoch": 2.961229479566888, |
| "grad_norm": 2.854013681411743, |
| "learning_rate": 4.7817622015611244e-09, |
| "loss": 0.052, |
| "step": 2120 |
| }, |
| { |
| "epoch": 2.968215158924205, |
| "grad_norm": 3.962702512741089, |
| "learning_rate": 3.2751883362036606e-09, |
| "loss": 0.0491, |
| "step": 2125 |
| }, |
| { |
| "epoch": 2.975200838281523, |
| "grad_norm": 3.7732391357421875, |
| "learning_rate": 2.0527624606264674e-09, |
| "loss": 0.0548, |
| "step": 2130 |
| }, |
| { |
| "epoch": 2.9821865176388402, |
| "grad_norm": 3.932664155960083, |
| "learning_rate": 1.1145540903362285e-09, |
| "loss": 0.0618, |
| "step": 2135 |
| }, |
| { |
| "epoch": 2.989172196996158, |
| "grad_norm": 3.8951032161712646, |
| "learning_rate": 4.606165782855554e-10, |
| "loss": 0.0604, |
| "step": 2140 |
| }, |
| { |
| "epoch": 2.9961578763534753, |
| "grad_norm": 2.8378732204437256, |
| "learning_rate": 9.09871118387473e-11, |
| "loss": 0.0609, |
| "step": 2145 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 2148, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.9054767696141353e+18, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|