{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.16410929679166325,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0016410929679166325,
      "grad_norm": 0.1563968062400818,
      "learning_rate": 9.999992612842675e-06,
      "loss": 0.6605,
      "step": 5
    },
    {
      "epoch": 0.003282185935833265,
      "grad_norm": 0.15270280838012695,
      "learning_rate": 9.999970451392527e-06,
      "loss": 0.6491,
      "step": 10
    },
    {
      "epoch": 0.0049232789037498975,
      "grad_norm": 0.17153096199035645,
      "learning_rate": 9.999933515715042e-06,
      "loss": 0.6383,
      "step": 15
    },
    {
      "epoch": 0.00656437187166653,
      "grad_norm": 0.172921285033226,
      "learning_rate": 9.999881805919356e-06,
      "loss": 0.6506,
      "step": 20
    },
    {
      "epoch": 0.008205464839583163,
      "grad_norm": 0.16361959278583527,
      "learning_rate": 9.999815322158266e-06,
      "loss": 0.6067,
      "step": 25
    },
    {
      "epoch": 0.009846557807499795,
      "grad_norm": 0.12810567021369934,
      "learning_rate": 9.999734064628224e-06,
      "loss": 0.6157,
      "step": 30
    },
    {
      "epoch": 0.011487650775416428,
      "grad_norm": 0.13882791996002197,
      "learning_rate": 9.999638033569334e-06,
      "loss": 0.6163,
      "step": 35
    },
    {
      "epoch": 0.01312874374333306,
      "grad_norm": 0.08581311255693436,
      "learning_rate": 9.999527229265353e-06,
      "loss": 0.5795,
      "step": 40
    },
    {
      "epoch": 0.014769836711249693,
      "grad_norm": 0.09464729577302933,
      "learning_rate": 9.999401652043697e-06,
      "loss": 0.5908,
      "step": 45
    },
    {
      "epoch": 0.016410929679166325,
      "grad_norm": 0.08246736973524094,
      "learning_rate": 9.999261302275424e-06,
      "loss": 0.5681,
      "step": 50
    },
    {
      "epoch": 0.018052022647082958,
      "grad_norm": 0.07881084084510803,
      "learning_rate": 9.999106180375251e-06,
      "loss": 0.5566,
      "step": 55
    },
    {
      "epoch": 0.01969311561499959,
      "grad_norm": 0.07179544121026993,
      "learning_rate": 9.998936286801541e-06,
      "loss": 0.5806,
      "step": 60
    },
    {
      "epoch": 0.021334208582916223,
      "grad_norm": 0.11372455954551697,
      "learning_rate": 9.99875162205631e-06,
      "loss": 0.541,
      "step": 65
    },
    {
      "epoch": 0.022975301550832855,
      "grad_norm": 0.07621748745441437,
      "learning_rate": 9.998552186685211e-06,
      "loss": 0.5421,
      "step": 70
    },
    {
      "epoch": 0.024616394518749488,
      "grad_norm": 0.07502977550029755,
      "learning_rate": 9.998337981277552e-06,
      "loss": 0.5634,
      "step": 75
    },
    {
      "epoch": 0.02625748748666612,
      "grad_norm": 0.06710907071828842,
      "learning_rate": 9.998109006466281e-06,
      "loss": 0.5322,
      "step": 80
    },
    {
      "epoch": 0.027898580454582753,
      "grad_norm": 0.066213458776474,
      "learning_rate": 9.997865262927984e-06,
      "loss": 0.5474,
      "step": 85
    },
    {
      "epoch": 0.029539673422499385,
      "grad_norm": 0.0674639567732811,
      "learning_rate": 9.997606751382894e-06,
      "loss": 0.5525,
      "step": 90
    },
    {
      "epoch": 0.031180766390416018,
      "grad_norm": 0.07476690411567688,
      "learning_rate": 9.997333472594872e-06,
      "loss": 0.526,
      "step": 95
    },
    {
      "epoch": 0.03282185935833265,
      "grad_norm": 0.05499599874019623,
      "learning_rate": 9.997045427371423e-06,
      "loss": 0.5262,
      "step": 100
    },
    {
      "epoch": 0.03446295232624928,
      "grad_norm": 0.06396327167749405,
      "learning_rate": 9.996742616563682e-06,
      "loss": 0.5141,
      "step": 105
    },
    {
      "epoch": 0.036104045294165915,
      "grad_norm": 0.06143304333090782,
      "learning_rate": 9.99642504106641e-06,
      "loss": 0.5107,
      "step": 110
    },
    {
      "epoch": 0.03774513826208255,
      "grad_norm": 0.0630095973610878,
      "learning_rate": 9.996092701818004e-06,
      "loss": 0.5177,
      "step": 115
    },
    {
      "epoch": 0.03938623122999918,
      "grad_norm": 0.059215761721134186,
      "learning_rate": 9.995745599800476e-06,
      "loss": 0.5098,
      "step": 120
    },
    {
      "epoch": 0.04102732419791581,
      "grad_norm": 0.05862729996442795,
      "learning_rate": 9.995383736039465e-06,
      "loss": 0.5027,
      "step": 125
    },
    {
      "epoch": 0.042668417165832445,
      "grad_norm": 0.05443592369556427,
      "learning_rate": 9.995007111604232e-06,
      "loss": 0.4688,
      "step": 130
    },
    {
      "epoch": 0.04430951013374908,
      "grad_norm": 0.05443759262561798,
      "learning_rate": 9.994615727607648e-06,
      "loss": 0.4988,
      "step": 135
    },
    {
      "epoch": 0.04595060310166571,
      "grad_norm": 0.05553797259926796,
      "learning_rate": 9.994209585206201e-06,
      "loss": 0.4864,
      "step": 140
    },
    {
      "epoch": 0.04759169606958234,
      "grad_norm": 0.055965058505535126,
      "learning_rate": 9.993788685599985e-06,
      "loss": 0.4768,
      "step": 145
    },
    {
      "epoch": 0.049232789037498975,
      "grad_norm": 0.06474044173955917,
      "learning_rate": 9.993353030032701e-06,
      "loss": 0.4942,
      "step": 150
    },
    {
      "epoch": 0.05087388200541561,
      "grad_norm": 0.0566246323287487,
      "learning_rate": 9.992902619791652e-06,
      "loss": 0.4888,
      "step": 155
    },
    {
      "epoch": 0.05251497497333224,
      "grad_norm": 0.0555800199508667,
      "learning_rate": 9.992437456207738e-06,
      "loss": 0.4687,
      "step": 160
    },
    {
      "epoch": 0.05415606794124887,
      "grad_norm": 0.0563640259206295,
      "learning_rate": 9.991957540655453e-06,
      "loss": 0.4923,
      "step": 165
    },
    {
      "epoch": 0.055797160909165505,
      "grad_norm": 0.05194167420268059,
      "learning_rate": 9.991462874552882e-06,
      "loss": 0.4811,
      "step": 170
    },
    {
      "epoch": 0.05743825387708214,
      "grad_norm": 0.1205214262008667,
      "learning_rate": 9.990953459361696e-06,
      "loss": 0.4696,
      "step": 175
    },
    {
      "epoch": 0.05907934684499877,
      "grad_norm": 0.0542314276099205,
      "learning_rate": 9.990429296587148e-06,
      "loss": 0.4547,
      "step": 180
    },
    {
      "epoch": 0.0607204398129154,
      "grad_norm": 0.06125911697745323,
      "learning_rate": 9.989890387778065e-06,
      "loss": 0.4817,
      "step": 185
    },
    {
      "epoch": 0.062361532780832035,
      "grad_norm": 0.06071058660745621,
      "learning_rate": 9.98933673452685e-06,
      "loss": 0.4553,
      "step": 190
    },
    {
      "epoch": 0.06400262574874867,
      "grad_norm": 0.05769752338528633,
      "learning_rate": 9.98876833846947e-06,
      "loss": 0.4632,
      "step": 195
    },
    {
      "epoch": 0.0656437187166653,
      "grad_norm": 0.05761849135160446,
      "learning_rate": 9.988185201285461e-06,
      "loss": 0.4439,
      "step": 200
    },
    {
      "epoch": 0.06728481168458193,
      "grad_norm": 0.05703369528055191,
      "learning_rate": 9.987587324697912e-06,
      "loss": 0.4506,
      "step": 205
    },
    {
      "epoch": 0.06892590465249857,
      "grad_norm": 0.05599252134561539,
      "learning_rate": 9.986974710473467e-06,
      "loss": 0.4573,
      "step": 210
    },
    {
      "epoch": 0.0705669976204152,
      "grad_norm": 0.05374148488044739,
      "learning_rate": 9.986347360422316e-06,
      "loss": 0.4555,
      "step": 215
    },
    {
      "epoch": 0.07220809058833183,
      "grad_norm": 0.054009810090065,
      "learning_rate": 9.985705276398193e-06,
      "loss": 0.4436,
      "step": 220
    },
    {
      "epoch": 0.07384918355624846,
      "grad_norm": 0.07604236155748367,
      "learning_rate": 9.985048460298367e-06,
      "loss": 0.4583,
      "step": 225
    },
    {
      "epoch": 0.0754902765241651,
      "grad_norm": 0.052760086953639984,
      "learning_rate": 9.984376914063643e-06,
      "loss": 0.4409,
      "step": 230
    },
    {
      "epoch": 0.07713136949208173,
      "grad_norm": 0.06065182387828827,
      "learning_rate": 9.983690639678343e-06,
      "loss": 0.4637,
      "step": 235
    },
    {
      "epoch": 0.07877246245999836,
      "grad_norm": 0.06539740413427353,
      "learning_rate": 9.982989639170319e-06,
      "loss": 0.4636,
      "step": 240
    },
    {
      "epoch": 0.08041355542791499,
      "grad_norm": 0.06656944006681442,
      "learning_rate": 9.982273914610927e-06,
      "loss": 0.4487,
      "step": 245
    },
    {
      "epoch": 0.08205464839583163,
      "grad_norm": 0.05745495483279228,
      "learning_rate": 9.981543468115039e-06,
      "loss": 0.4393,
      "step": 250
    },
    {
      "epoch": 0.08369574136374826,
      "grad_norm": 0.06525252759456635,
      "learning_rate": 9.98079830184102e-06,
      "loss": 0.4713,
      "step": 255
    },
    {
      "epoch": 0.08533683433166489,
      "grad_norm": 0.0555146224796772,
      "learning_rate": 9.980038417990736e-06,
      "loss": 0.4278,
      "step": 260
    },
    {
      "epoch": 0.08697792729958152,
      "grad_norm": 0.0773826315999031,
      "learning_rate": 9.979263818809542e-06,
      "loss": 0.4527,
      "step": 265
    },
    {
      "epoch": 0.08861902026749816,
      "grad_norm": 0.06476614624261856,
      "learning_rate": 9.978474506586269e-06,
      "loss": 0.4404,
      "step": 270
    },
    {
      "epoch": 0.09026011323541479,
      "grad_norm": 0.06019666790962219,
      "learning_rate": 9.977670483653228e-06,
      "loss": 0.4414,
      "step": 275
    },
    {
      "epoch": 0.09190120620333142,
      "grad_norm": 0.06592460721731186,
      "learning_rate": 9.976851752386196e-06,
      "loss": 0.4313,
      "step": 280
    },
    {
      "epoch": 0.09354229917124805,
      "grad_norm": 0.06817147135734558,
      "learning_rate": 9.976018315204412e-06,
      "loss": 0.4357,
      "step": 285
    },
    {
      "epoch": 0.09518339213916469,
      "grad_norm": 0.06738044321537018,
      "learning_rate": 9.97517017457057e-06,
      "loss": 0.4511,
      "step": 290
    },
    {
      "epoch": 0.09682448510708132,
      "grad_norm": 0.07049284130334854,
      "learning_rate": 9.974307332990806e-06,
      "loss": 0.447,
      "step": 295
    },
    {
      "epoch": 0.09846557807499795,
      "grad_norm": 0.06609766185283661,
      "learning_rate": 9.973429793014703e-06,
      "loss": 0.4576,
      "step": 300
    },
    {
      "epoch": 0.10010667104291458,
      "grad_norm": 0.06007273495197296,
      "learning_rate": 9.972537557235267e-06,
      "loss": 0.4397,
      "step": 305
    },
    {
      "epoch": 0.10174776401083122,
      "grad_norm": 0.0659220889210701,
      "learning_rate": 9.971630628288935e-06,
      "loss": 0.4538,
      "step": 310
    },
    {
      "epoch": 0.10338885697874785,
      "grad_norm": 0.08680638670921326,
      "learning_rate": 9.970709008855557e-06,
      "loss": 0.442,
      "step": 315
    },
    {
      "epoch": 0.10502994994666448,
      "grad_norm": 0.06552395224571228,
      "learning_rate": 9.969772701658393e-06,
      "loss": 0.4195,
      "step": 320
    },
    {
      "epoch": 0.10667104291458111,
      "grad_norm": 0.06147119030356407,
      "learning_rate": 9.968821709464101e-06,
      "loss": 0.4419,
      "step": 325
    },
    {
      "epoch": 0.10831213588249775,
      "grad_norm": 0.06841282546520233,
      "learning_rate": 9.967856035082732e-06,
      "loss": 0.4313,
      "step": 330
    },
    {
      "epoch": 0.10995322885041438,
      "grad_norm": 0.07704174518585205,
      "learning_rate": 9.966875681367724e-06,
      "loss": 0.4373,
      "step": 335
    },
    {
      "epoch": 0.11159432181833101,
      "grad_norm": 0.06586287170648575,
      "learning_rate": 9.965880651215885e-06,
      "loss": 0.4475,
      "step": 340
    },
    {
      "epoch": 0.11323541478624764,
      "grad_norm": 0.13131344318389893,
      "learning_rate": 9.964870947567396e-06,
      "loss": 0.42,
      "step": 345
    },
    {
      "epoch": 0.11487650775416428,
      "grad_norm": 0.06351525336503983,
      "learning_rate": 9.963846573405791e-06,
      "loss": 0.4247,
      "step": 350
    },
    {
      "epoch": 0.11651760072208091,
      "grad_norm": 0.06860467791557312,
      "learning_rate": 9.962807531757955e-06,
      "loss": 0.432,
      "step": 355
    },
    {
      "epoch": 0.11815869368999754,
      "grad_norm": 0.06938762962818146,
      "learning_rate": 9.961753825694112e-06,
      "loss": 0.4324,
      "step": 360
    },
    {
      "epoch": 0.11979978665791417,
      "grad_norm": 0.06990928202867508,
      "learning_rate": 9.960685458327824e-06,
      "loss": 0.4136,
      "step": 365
    },
    {
      "epoch": 0.1214408796258308,
      "grad_norm": 0.06884902715682983,
      "learning_rate": 9.959602432815964e-06,
      "loss": 0.4301,
      "step": 370
    },
    {
      "epoch": 0.12308197259374744,
      "grad_norm": 0.06610533595085144,
      "learning_rate": 9.958504752358729e-06,
      "loss": 0.418,
      "step": 375
    },
    {
      "epoch": 0.12472306556166407,
      "grad_norm": 0.07162132859230042,
      "learning_rate": 9.957392420199612e-06,
      "loss": 0.4335,
      "step": 380
    },
    {
      "epoch": 0.1263641585295807,
      "grad_norm": 0.07637803256511688,
      "learning_rate": 9.956265439625401e-06,
      "loss": 0.4508,
      "step": 385
    },
    {
      "epoch": 0.12800525149749734,
      "grad_norm": 0.06776853650808334,
      "learning_rate": 9.955123813966172e-06,
      "loss": 0.4235,
      "step": 390
    },
    {
      "epoch": 0.12964634446541395,
      "grad_norm": 0.06781169027090073,
      "learning_rate": 9.953967546595272e-06,
      "loss": 0.4296,
      "step": 395
    },
    {
      "epoch": 0.1312874374333306,
      "grad_norm": 0.06995800137519836,
      "learning_rate": 9.952796640929309e-06,
      "loss": 0.4328,
      "step": 400
    },
    {
      "epoch": 0.13292853040124722,
      "grad_norm": 0.06544926762580872,
      "learning_rate": 9.951611100428151e-06,
      "loss": 0.4235,
      "step": 405
    },
    {
      "epoch": 0.13456962336916387,
      "grad_norm": 0.07403396815061569,
      "learning_rate": 9.95041092859491e-06,
      "loss": 0.4362,
      "step": 410
    },
    {
      "epoch": 0.13621071633708048,
      "grad_norm": 0.06964828819036484,
      "learning_rate": 9.949196128975925e-06,
      "loss": 0.4134,
      "step": 415
    },
    {
      "epoch": 0.13785180930499713,
      "grad_norm": 0.07269076257944107,
      "learning_rate": 9.947966705160765e-06,
      "loss": 0.4288,
      "step": 420
    },
    {
      "epoch": 0.13949290227291375,
      "grad_norm": 0.0716971680521965,
      "learning_rate": 9.946722660782209e-06,
      "loss": 0.4113,
      "step": 425
    },
    {
      "epoch": 0.1411339952408304,
      "grad_norm": 0.06757480651140213,
      "learning_rate": 9.945463999516236e-06,
      "loss": 0.4311,
      "step": 430
    },
    {
      "epoch": 0.14277508820874701,
      "grad_norm": 0.07381222397089005,
      "learning_rate": 9.944190725082019e-06,
      "loss": 0.4313,
      "step": 435
    },
    {
      "epoch": 0.14441618117666366,
      "grad_norm": 0.07273319363594055,
      "learning_rate": 9.94290284124191e-06,
      "loss": 0.4195,
      "step": 440
    },
    {
      "epoch": 0.14605727414458028,
      "grad_norm": 0.07356058806180954,
      "learning_rate": 9.941600351801426e-06,
      "loss": 0.425,
      "step": 445
    },
    {
      "epoch": 0.14769836711249693,
      "grad_norm": 0.07552187144756317,
      "learning_rate": 9.940283260609248e-06,
      "loss": 0.4295,
      "step": 450
    },
    {
      "epoch": 0.14933946008041354,
      "grad_norm": 0.0747319757938385,
      "learning_rate": 9.938951571557198e-06,
      "loss": 0.4426,
      "step": 455
    },
    {
      "epoch": 0.1509805530483302,
      "grad_norm": 0.06968298554420471,
      "learning_rate": 9.937605288580237e-06,
      "loss": 0.4244,
      "step": 460
    },
    {
      "epoch": 0.1526216460162468,
      "grad_norm": 0.07281242311000824,
      "learning_rate": 9.936244415656443e-06,
      "loss": 0.4263,
      "step": 465
    },
    {
      "epoch": 0.15426273898416346,
      "grad_norm": 0.07637212425470352,
      "learning_rate": 9.934868956807012e-06,
      "loss": 0.4217,
      "step": 470
    },
    {
      "epoch": 0.15590383195208007,
      "grad_norm": 0.07728656381368637,
      "learning_rate": 9.933478916096235e-06,
      "loss": 0.4214,
      "step": 475
    },
    {
      "epoch": 0.15754492491999672,
      "grad_norm": 0.09080182015895844,
      "learning_rate": 9.932074297631494e-06,
      "loss": 0.4143,
      "step": 480
    },
    {
      "epoch": 0.15918601788791334,
      "grad_norm": 0.07812851667404175,
      "learning_rate": 9.930655105563241e-06,
      "loss": 0.4307,
      "step": 485
    },
    {
      "epoch": 0.16082711085582999,
      "grad_norm": 0.0779787003993988,
      "learning_rate": 9.929221344084994e-06,
      "loss": 0.433,
      "step": 490
    },
    {
      "epoch": 0.1624682038237466,
      "grad_norm": 0.0747016966342926,
      "learning_rate": 9.927773017433325e-06,
      "loss": 0.4026,
      "step": 495
    },
    {
      "epoch": 0.16410929679166325,
      "grad_norm": 0.07794748246669769,
      "learning_rate": 9.926310129887836e-06,
      "loss": 0.4331,
      "step": 500
    }
  ],
  "logging_steps": 5,
  "max_steps": 9138,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.57326968815616e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}