| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 50.0, | |
| "global_step": 717, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0013953082759222116, | |
| "grad_norm": 2.108574390411377, | |
| "learning_rate": 2.7777777777777774e-08, | |
| "loss": 0.5175914764404297, | |
| "step": 1, | |
| "token_acc": 0.9343839541547277 | |
| }, | |
| { | |
| "epoch": 0.006976541379611058, | |
| "grad_norm": 2.2113378047943115, | |
| "learning_rate": 1.3888888888888888e-07, | |
| "loss": 0.5278360247612, | |
| "step": 5, | |
| "token_acc": 0.9315846730327572 | |
| }, | |
| { | |
| "epoch": 0.013953082759222116, | |
| "grad_norm": 2.659011125564575, | |
| "learning_rate": 2.7777777777777776e-07, | |
| "loss": 0.5554334640502929, | |
| "step": 10, | |
| "token_acc": 0.9270202547504698 | |
| }, | |
| { | |
| "epoch": 0.020929624138833175, | |
| "grad_norm": 2.251737117767334, | |
| "learning_rate": 4.1666666666666667e-07, | |
| "loss": 0.5544517517089844, | |
| "step": 15, | |
| "token_acc": 0.9273873055524015 | |
| }, | |
| { | |
| "epoch": 0.027906165518444232, | |
| "grad_norm": 1.9254176616668701, | |
| "learning_rate": 5.555555555555555e-07, | |
| "loss": 0.5853276729583741, | |
| "step": 20, | |
| "token_acc": 0.920275952157407 | |
| }, | |
| { | |
| "epoch": 0.03488270689805529, | |
| "grad_norm": 2.059199333190918, | |
| "learning_rate": 6.944444444444444e-07, | |
| "loss": 0.5623232841491699, | |
| "step": 25, | |
| "token_acc": 0.9284975165562914 | |
| }, | |
| { | |
| "epoch": 0.04185924827766635, | |
| "grad_norm": 2.425384998321533, | |
| "learning_rate": 8.333333333333333e-07, | |
| "loss": 0.5716644763946533, | |
| "step": 30, | |
| "token_acc": 0.9264636757716613 | |
| }, | |
| { | |
| "epoch": 0.048835789657277404, | |
| "grad_norm": 1.9226746559143066, | |
| "learning_rate": 9.722222222222222e-07, | |
| "loss": 0.5484982490539551, | |
| "step": 35, | |
| "token_acc": 0.9298897411313519 | |
| }, | |
| { | |
| "epoch": 0.055812331036888464, | |
| "grad_norm": 2.045583724975586, | |
| "learning_rate": 9.999148757713664e-07, | |
| "loss": 0.5991110324859619, | |
| "step": 40, | |
| "token_acc": 0.9234525698937859 | |
| }, | |
| { | |
| "epoch": 0.06278887241649952, | |
| "grad_norm": 1.9219040870666504, | |
| "learning_rate": 9.995691082675907e-07, | |
| "loss": 0.5559669494628906, | |
| "step": 45, | |
| "token_acc": 0.9266224757206721 | |
| }, | |
| { | |
| "epoch": 0.06976541379611058, | |
| "grad_norm": 1.9935494661331177, | |
| "learning_rate": 9.98957561037365e-07, | |
| "loss": 0.5474924564361572, | |
| "step": 50, | |
| "token_acc": 0.92885522721629 | |
| }, | |
| { | |
| "epoch": 0.07674195517572163, | |
| "grad_norm": 2.460942506790161, | |
| "learning_rate": 9.980805594347849e-07, | |
| "loss": 0.5159276008605957, | |
| "step": 55, | |
| "token_acc": 0.9340729405763836 | |
| }, | |
| { | |
| "epoch": 0.0837184965553327, | |
| "grad_norm": 2.300776481628418, | |
| "learning_rate": 9.969385700404345e-07, | |
| "loss": 0.5166152000427247, | |
| "step": 60, | |
| "token_acc": 0.9320153815815421 | |
| }, | |
| { | |
| "epoch": 0.09069503793494375, | |
| "grad_norm": 2.573843240737915, | |
| "learning_rate": 9.955322004131553e-07, | |
| "loss": 0.5368542194366455, | |
| "step": 65, | |
| "token_acc": 0.9301501135545339 | |
| }, | |
| { | |
| "epoch": 0.09767157931455481, | |
| "grad_norm": 2.0781192779541016, | |
| "learning_rate": 9.93862198766815e-07, | |
| "loss": 0.5094423294067383, | |
| "step": 70, | |
| "token_acc": 0.9307623850489469 | |
| }, | |
| { | |
| "epoch": 0.10464812069416586, | |
| "grad_norm": 2.0247654914855957, | |
| "learning_rate": 9.91929453572245e-07, | |
| "loss": 0.47563705444335935, | |
| "step": 75, | |
| "token_acc": 0.9330075229257042 | |
| }, | |
| { | |
| "epoch": 0.11162466207377693, | |
| "grad_norm": 2.036161422729492, | |
| "learning_rate": 9.897349930845566e-07, | |
| "loss": 0.5021390914916992, | |
| "step": 80, | |
| "token_acc": 0.9329839883551674 | |
| }, | |
| { | |
| "epoch": 0.11860120345338798, | |
| "grad_norm": 2.1073312759399414, | |
| "learning_rate": 9.872799847960918e-07, | |
| "loss": 0.501053762435913, | |
| "step": 85, | |
| "token_acc": 0.9329673985362608 | |
| }, | |
| { | |
| "epoch": 0.12557774483299905, | |
| "grad_norm": 1.834761619567871, | |
| "learning_rate": 9.845657348152955e-07, | |
| "loss": 0.4600623607635498, | |
| "step": 90, | |
| "token_acc": 0.936784676510704 | |
| }, | |
| { | |
| "epoch": 0.1325542862126101, | |
| "grad_norm": 1.7184985876083374, | |
| "learning_rate": 9.81593687171844e-07, | |
| "loss": 0.5222196578979492, | |
| "step": 95, | |
| "token_acc": 0.9302824071593162 | |
| }, | |
| { | |
| "epoch": 0.13953082759222116, | |
| "grad_norm": 1.7423793077468872, | |
| "learning_rate": 9.783654230483934e-07, | |
| "loss": 0.4808220863342285, | |
| "step": 100, | |
| "token_acc": 0.9323681279740812 | |
| }, | |
| { | |
| "epoch": 0.14650736897183222, | |
| "grad_norm": 1.5745614767074585, | |
| "learning_rate": 9.748826599393632e-07, | |
| "loss": 0.44776349067687987, | |
| "step": 105, | |
| "token_acc": 0.9381572216222361 | |
| }, | |
| { | |
| "epoch": 0.15348391035144326, | |
| "grad_norm": 1.491922378540039, | |
| "learning_rate": 9.711472507371982e-07, | |
| "loss": 0.44771714210510255, | |
| "step": 110, | |
| "token_acc": 0.9347117653668952 | |
| }, | |
| { | |
| "epoch": 0.16046045173105433, | |
| "grad_norm": 1.5303000211715698, | |
| "learning_rate": 9.671611827465971e-07, | |
| "loss": 0.42823081016540526, | |
| "step": 115, | |
| "token_acc": 0.9424545917501274 | |
| }, | |
| { | |
| "epoch": 0.1674369931106654, | |
| "grad_norm": 1.52151620388031, | |
| "learning_rate": 9.629265766272291e-07, | |
| "loss": 0.4314168930053711, | |
| "step": 120, | |
| "token_acc": 0.9379815310638763 | |
| }, | |
| { | |
| "epoch": 0.17441353449027644, | |
| "grad_norm": 1.496336579322815, | |
| "learning_rate": 9.58445685265507e-07, | |
| "loss": 0.44430341720581057, | |
| "step": 125, | |
| "token_acc": 0.938302298442336 | |
| }, | |
| { | |
| "epoch": 0.1813900758698875, | |
| "grad_norm": 1.4200767278671265, | |
| "learning_rate": 9.537208925760093e-07, | |
| "loss": 0.4397609710693359, | |
| "step": 130, | |
| "token_acc": 0.9418849948962232 | |
| }, | |
| { | |
| "epoch": 0.18836661724949857, | |
| "grad_norm": 1.419700264930725, | |
| "learning_rate": 9.487547122331964e-07, | |
| "loss": 0.5382704734802246, | |
| "step": 135, | |
| "token_acc": 0.9312300174941184 | |
| }, | |
| { | |
| "epoch": 0.19534315862910961, | |
| "grad_norm": 1.3994510173797607, | |
| "learning_rate": 9.435497863340896e-07, | |
| "loss": 0.41959681510925295, | |
| "step": 140, | |
| "token_acc": 0.9434397845325125 | |
| }, | |
| { | |
| "epoch": 0.20231970000872068, | |
| "grad_norm": 1.2424718141555786, | |
| "learning_rate": 9.381088839926292e-07, | |
| "loss": 0.39974849224090575, | |
| "step": 145, | |
| "token_acc": 0.9442570675170172 | |
| }, | |
| { | |
| "epoch": 0.20929624138833172, | |
| "grad_norm": 1.1396198272705078, | |
| "learning_rate": 9.324348998664548e-07, | |
| "loss": 0.4095014572143555, | |
| "step": 150, | |
| "token_acc": 0.9402837198829093 | |
| }, | |
| { | |
| "epoch": 0.2162727827679428, | |
| "grad_norm": 1.1727170944213867, | |
| "learning_rate": 9.265308526168971e-07, | |
| "loss": 0.4158812999725342, | |
| "step": 155, | |
| "token_acc": 0.945169557184576 | |
| }, | |
| { | |
| "epoch": 0.22324932414755386, | |
| "grad_norm": 1.2740483283996582, | |
| "learning_rate": 9.203998833029945e-07, | |
| "loss": 0.40761551856994627, | |
| "step": 160, | |
| "token_acc": 0.941811175337187 | |
| }, | |
| { | |
| "epoch": 0.2302258655271649, | |
| "grad_norm": 1.2118650674819946, | |
| "learning_rate": 9.140452537103941e-07, | |
| "loss": 0.40488572120666505, | |
| "step": 165, | |
| "token_acc": 0.939998011533108 | |
| }, | |
| { | |
| "epoch": 0.23720240690677596, | |
| "grad_norm": 1.2679206132888794, | |
| "learning_rate": 9.074703446160232e-07, | |
| "loss": 0.38835389614105226, | |
| "step": 170, | |
| "token_acc": 0.9452747758105312 | |
| }, | |
| { | |
| "epoch": 0.24417894828638703, | |
| "grad_norm": 1.4239633083343506, | |
| "learning_rate": 9.006786539894554e-07, | |
| "loss": 0.434948205947876, | |
| "step": 175, | |
| "token_acc": 0.9388818618770554 | |
| }, | |
| { | |
| "epoch": 0.2511554896659981, | |
| "grad_norm": 1.4500908851623535, | |
| "learning_rate": 8.936737951319275e-07, | |
| "loss": 0.47136545181274414, | |
| "step": 180, | |
| "token_acc": 0.9320933879257776 | |
| }, | |
| { | |
| "epoch": 0.2581320310456091, | |
| "grad_norm": 1.1485167741775513, | |
| "learning_rate": 8.864594947539992e-07, | |
| "loss": 0.4100066661834717, | |
| "step": 185, | |
| "token_acc": 0.9440283102329696 | |
| }, | |
| { | |
| "epoch": 0.2651085724252202, | |
| "grad_norm": 1.468805193901062, | |
| "learning_rate": 8.790395909928753e-07, | |
| "loss": 0.39954936504364014, | |
| "step": 190, | |
| "token_acc": 0.9425916365513681 | |
| }, | |
| { | |
| "epoch": 0.27208511380483125, | |
| "grad_norm": 1.2177455425262451, | |
| "learning_rate": 8.714180313704489e-07, | |
| "loss": 0.3338632583618164, | |
| "step": 195, | |
| "token_acc": 0.9483336172145574 | |
| }, | |
| { | |
| "epoch": 0.2790616551844423, | |
| "grad_norm": 1.2997610569000244, | |
| "learning_rate": 8.635988706931486e-07, | |
| "loss": 0.38302700519561766, | |
| "step": 200, | |
| "token_acc": 0.9413486825782762 | |
| }, | |
| { | |
| "epoch": 0.2860381965640534, | |
| "grad_norm": 1.135811686515808, | |
| "learning_rate": 8.555862688947075e-07, | |
| "loss": 0.33866784572601316, | |
| "step": 205, | |
| "token_acc": 0.9479243990178724 | |
| }, | |
| { | |
| "epoch": 0.29301473794366445, | |
| "grad_norm": 1.3217355012893677, | |
| "learning_rate": 8.473844888230064e-07, | |
| "loss": 0.35600202083587645, | |
| "step": 210, | |
| "token_acc": 0.9461965574680733 | |
| }, | |
| { | |
| "epoch": 0.29999127932327546, | |
| "grad_norm": 1.2507730722427368, | |
| "learning_rate": 8.389978939721598e-07, | |
| "loss": 0.352951717376709, | |
| "step": 215, | |
| "token_acc": 0.9438299509473886 | |
| }, | |
| { | |
| "epoch": 0.30696782070288653, | |
| "grad_norm": 1.3820980787277222, | |
| "learning_rate": 8.304309461610601e-07, | |
| "loss": 0.3622483253479004, | |
| "step": 220, | |
| "token_acc": 0.9437283872995913 | |
| }, | |
| { | |
| "epoch": 0.3139443620824976, | |
| "grad_norm": 1.394065499305725, | |
| "learning_rate": 8.216882031596096e-07, | |
| "loss": 0.3512030363082886, | |
| "step": 225, | |
| "token_acc": 0.9435979832677711 | |
| }, | |
| { | |
| "epoch": 0.32092090346210866, | |
| "grad_norm": 1.2014678716659546, | |
| "learning_rate": 8.127743162639051e-07, | |
| "loss": 0.3101860523223877, | |
| "step": 230, | |
| "token_acc": 0.9467404378157325 | |
| }, | |
| { | |
| "epoch": 0.32789744484171973, | |
| "grad_norm": 1.3220340013504028, | |
| "learning_rate": 8.036940278216646e-07, | |
| "loss": 0.28164148330688477, | |
| "step": 235, | |
| "token_acc": 0.9506907137375288 | |
| }, | |
| { | |
| "epoch": 0.3348739862213308, | |
| "grad_norm": 1.5838576555252075, | |
| "learning_rate": 7.944521687092142e-07, | |
| "loss": 0.2631302118301392, | |
| "step": 240, | |
| "token_acc": 0.9514257294429708 | |
| }, | |
| { | |
| "epoch": 0.3418505276009418, | |
| "grad_norm": 1.5820879936218262, | |
| "learning_rate": 7.850536557613748e-07, | |
| "loss": 0.3039613962173462, | |
| "step": 245, | |
| "token_acc": 0.9476980693484858 | |
| }, | |
| { | |
| "epoch": 0.3488270689805529, | |
| "grad_norm": 1.4841257333755493, | |
| "learning_rate": 7.755034891556167e-07, | |
| "loss": 0.28357877731323244, | |
| "step": 250, | |
| "token_acc": 0.9478200246688475 | |
| }, | |
| { | |
| "epoch": 0.35580361036016395, | |
| "grad_norm": 1.2519296407699585, | |
| "learning_rate": 7.658067497518772e-07, | |
| "loss": 0.3490274429321289, | |
| "step": 255, | |
| "token_acc": 0.9376414667270258 | |
| }, | |
| { | |
| "epoch": 0.362780151739775, | |
| "grad_norm": 1.340489149093628, | |
| "learning_rate": 7.559685963894513e-07, | |
| "loss": 0.32015056610107423, | |
| "step": 260, | |
| "token_acc": 0.9416806521217933 | |
| }, | |
| { | |
| "epoch": 0.3697566931193861, | |
| "grad_norm": 1.3753681182861328, | |
| "learning_rate": 7.459942631423962e-07, | |
| "loss": 0.27373878955841063, | |
| "step": 265, | |
| "token_acc": 0.9539526646272742 | |
| }, | |
| { | |
| "epoch": 0.37673323449899715, | |
| "grad_norm": 1.1863863468170166, | |
| "learning_rate": 7.358890565349105e-07, | |
| "loss": 0.29328436851501466, | |
| "step": 270, | |
| "token_acc": 0.9487552700260992 | |
| }, | |
| { | |
| "epoch": 0.38370977587860816, | |
| "grad_norm": 1.490551233291626, | |
| "learning_rate": 7.256583527181683e-07, | |
| "loss": 0.33202688694000243, | |
| "step": 275, | |
| "token_acc": 0.9404879571346824 | |
| }, | |
| { | |
| "epoch": 0.39068631725821923, | |
| "grad_norm": 1.3143407106399536, | |
| "learning_rate": 7.153075946101097e-07, | |
| "loss": 0.278816294670105, | |
| "step": 280, | |
| "token_acc": 0.9493239404613112 | |
| }, | |
| { | |
| "epoch": 0.3976628586378303, | |
| "grad_norm": 1.0962167978286743, | |
| "learning_rate": 7.048422889997115e-07, | |
| "loss": 0.23789422512054442, | |
| "step": 285, | |
| "token_acc": 0.9503463643850215 | |
| }, | |
| { | |
| "epoch": 0.40463940001744136, | |
| "grad_norm": 0.824631929397583, | |
| "learning_rate": 6.942680036172762e-07, | |
| "loss": 0.24912948608398439, | |
| "step": 290, | |
| "token_acc": 0.9536533677324243 | |
| }, | |
| { | |
| "epoch": 0.41161594139705243, | |
| "grad_norm": 1.3604934215545654, | |
| "learning_rate": 6.835903641722999e-07, | |
| "loss": 0.3469517946243286, | |
| "step": 295, | |
| "token_acc": 0.9420104361524072 | |
| }, | |
| { | |
| "epoch": 0.41859248277666344, | |
| "grad_norm": 0.8911880850791931, | |
| "learning_rate": 6.72815051360494e-07, | |
| "loss": 0.24576101303100586, | |
| "step": 300, | |
| "token_acc": 0.953768733064204 | |
| }, | |
| { | |
| "epoch": 0.4255690241562745, | |
| "grad_norm": 1.0799229145050049, | |
| "learning_rate": 6.619477978415531e-07, | |
| "loss": 0.2791733980178833, | |
| "step": 305, | |
| "token_acc": 0.9415368904774062 | |
| }, | |
| { | |
| "epoch": 0.4325455655358856, | |
| "grad_norm": 0.872166633605957, | |
| "learning_rate": 6.509943851892766e-07, | |
| "loss": 0.3617237567901611, | |
| "step": 310, | |
| "token_acc": 0.9406946604458268 | |
| }, | |
| { | |
| "epoch": 0.43952210691549665, | |
| "grad_norm": 0.9697985649108887, | |
| "learning_rate": 6.399606408156687e-07, | |
| "loss": 0.22297954559326172, | |
| "step": 315, | |
| "token_acc": 0.954813046937152 | |
| }, | |
| { | |
| "epoch": 0.4464986482951077, | |
| "grad_norm": 0.8073396682739258, | |
| "learning_rate": 6.288524348706502e-07, | |
| "loss": 0.20998930931091309, | |
| "step": 320, | |
| "token_acc": 0.9557532836995339 | |
| }, | |
| { | |
| "epoch": 0.4534751896747188, | |
| "grad_norm": 0.9216898679733276, | |
| "learning_rate": 6.176756771190337e-07, | |
| "loss": 0.2161928176879883, | |
| "step": 325, | |
| "token_acc": 0.9520381208887968 | |
| }, | |
| { | |
| "epoch": 0.4604517310543298, | |
| "grad_norm": 0.9290446639060974, | |
| "learning_rate": 6.064363137964225e-07, | |
| "loss": 0.24029843807220458, | |
| "step": 330, | |
| "token_acc": 0.9494813278008298 | |
| }, | |
| { | |
| "epoch": 0.46742827243394086, | |
| "grad_norm": 0.9209424257278442, | |
| "learning_rate": 5.95140324445706e-07, | |
| "loss": 0.21532030105590821, | |
| "step": 335, | |
| "token_acc": 0.9515148253780337 | |
| }, | |
| { | |
| "epoch": 0.47440481381355193, | |
| "grad_norm": 0.7281814217567444, | |
| "learning_rate": 5.83793718735837e-07, | |
| "loss": 0.24763097763061523, | |
| "step": 340, | |
| "token_acc": 0.9499685006299874 | |
| }, | |
| { | |
| "epoch": 0.481381355193163, | |
| "grad_norm": 0.6570573449134827, | |
| "learning_rate": 5.724025332645793e-07, | |
| "loss": 0.19987608194351197, | |
| "step": 345, | |
| "token_acc": 0.9564738292011019 | |
| }, | |
| { | |
| "epoch": 0.48835789657277406, | |
| "grad_norm": 0.7470307946205139, | |
| "learning_rate": 5.609728283469288e-07, | |
| "loss": 0.1938636064529419, | |
| "step": 350, | |
| "token_acc": 0.9593920408400046 | |
| }, | |
| { | |
| "epoch": 0.49533443795238513, | |
| "grad_norm": 0.5753098130226135, | |
| "learning_rate": 5.495106847909182e-07, | |
| "loss": 0.2106870651245117, | |
| "step": 355, | |
| "token_acc": 0.9575553464414839 | |
| }, | |
| { | |
| "epoch": 0.5023109793319962, | |
| "grad_norm": 0.5757151246070862, | |
| "learning_rate": 5.380222006625179e-07, | |
| "loss": 0.18208713531494142, | |
| "step": 360, | |
| "token_acc": 0.963362694802052 | |
| }, | |
| { | |
| "epoch": 0.5092875207116072, | |
| "grad_norm": 0.5811958312988281, | |
| "learning_rate": 5.265134880413548e-07, | |
| "loss": 0.1780398368835449, | |
| "step": 365, | |
| "token_acc": 0.9639311886076607 | |
| }, | |
| { | |
| "epoch": 0.5162640620912182, | |
| "grad_norm": 0.5238430500030518, | |
| "learning_rate": 5.149906697689767e-07, | |
| "loss": 0.2431933879852295, | |
| "step": 370, | |
| "token_acc": 0.9474901594773364 | |
| }, | |
| { | |
| "epoch": 0.5232406034708293, | |
| "grad_norm": 0.7463929057121277, | |
| "learning_rate": 5.034598761913916e-07, | |
| "loss": 0.2559064865112305, | |
| "step": 375, | |
| "token_acc": 0.9454068781164859 | |
| }, | |
| { | |
| "epoch": 0.5302171448504404, | |
| "grad_norm": 0.5051546692848206, | |
| "learning_rate": 4.919272418976123e-07, | |
| "loss": 0.20950682163238527, | |
| "step": 380, | |
| "token_acc": 0.9548458149779736 | |
| }, | |
| { | |
| "epoch": 0.5371936862300515, | |
| "grad_norm": 0.47918471693992615, | |
| "learning_rate": 4.803989024559459e-07, | |
| "loss": 0.18409876823425292, | |
| "step": 385, | |
| "token_acc": 0.9592592592592593 | |
| }, | |
| { | |
| "epoch": 0.5441702276096625, | |
| "grad_norm": 0.48506343364715576, | |
| "learning_rate": 4.688809911497609e-07, | |
| "loss": 0.19301035404205322, | |
| "step": 390, | |
| "token_acc": 0.9593621399176955 | |
| }, | |
| { | |
| "epoch": 0.5511467689892736, | |
| "grad_norm": 0.6255201697349548, | |
| "learning_rate": 4.57379635714471e-07, | |
| "loss": 0.1948167562484741, | |
| "step": 395, | |
| "token_acc": 0.9559854371569853 | |
| }, | |
| { | |
| "epoch": 0.5581233103688846, | |
| "grad_norm": 0.6040950417518616, | |
| "learning_rate": 4.459009550774692e-07, | |
| "loss": 0.15679298639297484, | |
| "step": 400, | |
| "token_acc": 0.9631829798991504 | |
| }, | |
| { | |
| "epoch": 0.5650998517484956, | |
| "grad_norm": 0.6469866037368774, | |
| "learning_rate": 4.344510561027498e-07, | |
| "loss": 0.2119133472442627, | |
| "step": 405, | |
| "token_acc": 0.9538207806487081 | |
| }, | |
| { | |
| "epoch": 0.5720763931281068, | |
| "grad_norm": 0.6210054755210876, | |
| "learning_rate": 4.230360303419453e-07, | |
| "loss": 0.17766163349151612, | |
| "step": 410, | |
| "token_acc": 0.9606851549755302 | |
| }, | |
| { | |
| "epoch": 0.5790529345077178, | |
| "grad_norm": 0.4716809391975403, | |
| "learning_rate": 4.116619507935144e-07, | |
| "loss": 0.18397997617721557, | |
| "step": 415, | |
| "token_acc": 0.9560470014410819 | |
| }, | |
| { | |
| "epoch": 0.5860294758873289, | |
| "grad_norm": 0.5452926754951477, | |
| "learning_rate": 4.003348686717949e-07, | |
| "loss": 0.2028341293334961, | |
| "step": 420, | |
| "token_acc": 0.9551058385671086 | |
| }, | |
| { | |
| "epoch": 0.5930060172669399, | |
| "grad_norm": 0.5029204487800598, | |
| "learning_rate": 3.890608101876517e-07, | |
| "loss": 0.16716669797897338, | |
| "step": 425, | |
| "token_acc": 0.9617021276595744 | |
| }, | |
| { | |
| "epoch": 0.5999825586465509, | |
| "grad_norm": 0.5014962553977966, | |
| "learning_rate": 3.7784577334242273e-07, | |
| "loss": 0.18506402969360353, | |
| "step": 430, | |
| "token_acc": 0.9586908319676082 | |
| }, | |
| { | |
| "epoch": 0.606959100026162, | |
| "grad_norm": 0.5529438257217407, | |
| "learning_rate": 3.666957247368757e-07, | |
| "loss": 0.1777629852294922, | |
| "step": 435, | |
| "token_acc": 0.9608512874408828 | |
| }, | |
| { | |
| "epoch": 0.6139356414057731, | |
| "grad_norm": 0.4262159466743469, | |
| "learning_rate": 3.556165963968691e-07, | |
| "loss": 0.14577605724334716, | |
| "step": 440, | |
| "token_acc": 0.962014556659406 | |
| }, | |
| { | |
| "epoch": 0.6209121827853842, | |
| "grad_norm": 0.5142691135406494, | |
| "learning_rate": 3.4461428261740754e-07, | |
| "loss": 0.20166921615600586, | |
| "step": 445, | |
| "token_acc": 0.9590035201733008 | |
| }, | |
| { | |
| "epoch": 0.6278887241649952, | |
| "grad_norm": 0.5572395324707031, | |
| "learning_rate": 3.3369463682677234e-07, | |
| "loss": 0.20577445030212402, | |
| "step": 450, | |
| "token_acc": 0.9554721339878718 | |
| }, | |
| { | |
| "epoch": 0.6348652655446063, | |
| "grad_norm": 0.496382474899292, | |
| "learning_rate": 3.2286346847239123e-07, | |
| "loss": 0.14863760471343995, | |
| "step": 455, | |
| "token_acc": 0.9620308092861792 | |
| }, | |
| { | |
| "epoch": 0.6418418069242173, | |
| "grad_norm": 0.4650530517101288, | |
| "learning_rate": 3.1212653993010954e-07, | |
| "loss": 0.17070106267929078, | |
| "step": 460, | |
| "token_acc": 0.9571316789626649 | |
| }, | |
| { | |
| "epoch": 0.6488183483038283, | |
| "grad_norm": 0.4173397421836853, | |
| "learning_rate": 3.014895634385014e-07, | |
| "loss": 0.1784367799758911, | |
| "step": 465, | |
| "token_acc": 0.9574702782203701 | |
| }, | |
| { | |
| "epoch": 0.6557948896834395, | |
| "grad_norm": 0.50703364610672, | |
| "learning_rate": 2.9095819805985795e-07, | |
| "loss": 0.18249971866607667, | |
| "step": 470, | |
| "token_acc": 0.9553955949304787 | |
| }, | |
| { | |
| "epoch": 0.6627714310630505, | |
| "grad_norm": 0.4510941505432129, | |
| "learning_rate": 2.8053804666946287e-07, | |
| "loss": 0.17186166048049928, | |
| "step": 475, | |
| "token_acc": 0.9583355252775001 | |
| }, | |
| { | |
| "epoch": 0.6697479724426616, | |
| "grad_norm": 0.498515248298645, | |
| "learning_rate": 2.7023465297476424e-07, | |
| "loss": 0.23503575325012208, | |
| "step": 480, | |
| "token_acc": 0.9480993056596233 | |
| }, | |
| { | |
| "epoch": 0.6767245138222726, | |
| "grad_norm": 0.5071078538894653, | |
| "learning_rate": 2.6005349856602123e-07, | |
| "loss": 0.23459105491638182, | |
| "step": 485, | |
| "token_acc": 0.9456293181135476 | |
| }, | |
| { | |
| "epoch": 0.6837010552018836, | |
| "grad_norm": 0.5977618098258972, | |
| "learning_rate": 2.500000000000001e-07, | |
| "loss": 0.1519307851791382, | |
| "step": 490, | |
| "token_acc": 0.9616792137181096 | |
| }, | |
| { | |
| "epoch": 0.6906775965814947, | |
| "grad_norm": 0.49618807435035706, | |
| "learning_rate": 2.4007950591826913e-07, | |
| "loss": 0.21449580192565917, | |
| "step": 495, | |
| "token_acc": 0.9518640628962719 | |
| }, | |
| { | |
| "epoch": 0.6976541379611058, | |
| "grad_norm": 0.5371702313423157, | |
| "learning_rate": 2.3029729420162587e-07, | |
| "loss": 0.15500261783599853, | |
| "step": 500, | |
| "token_acc": 0.9591128732499071 | |
| }, | |
| { | |
| "epoch": 0.7046306793407169, | |
| "grad_norm": 0.5795394778251648, | |
| "learning_rate": 2.2065856916216786e-07, | |
| "loss": 0.16497514247894288, | |
| "step": 505, | |
| "token_acc": 0.9617788774580024 | |
| }, | |
| { | |
| "epoch": 0.7116072207203279, | |
| "grad_norm": 0.5824469327926636, | |
| "learning_rate": 2.1116845877450805e-07, | |
| "loss": 0.16700024604797364, | |
| "step": 510, | |
| "token_acc": 0.9572996706915478 | |
| }, | |
| { | |
| "epoch": 0.7185837620999389, | |
| "grad_norm": 0.539864182472229, | |
| "learning_rate": 2.0183201194759825e-07, | |
| "loss": 0.2224641799926758, | |
| "step": 515, | |
| "token_acc": 0.9501959166838524 | |
| }, | |
| { | |
| "epoch": 0.72556030347955, | |
| "grad_norm": 0.781697690486908, | |
| "learning_rate": 1.9265419583861952e-07, | |
| "loss": 0.1476673364639282, | |
| "step": 520, | |
| "token_acc": 0.9633160506216201 | |
| }, | |
| { | |
| "epoch": 0.732536844859161, | |
| "grad_norm": 0.4989713728427887, | |
| "learning_rate": 1.8363989321036577e-07, | |
| "loss": 0.143803870677948, | |
| "step": 525, | |
| "token_acc": 0.9604544058949954 | |
| }, | |
| { | |
| "epoch": 0.7395133862387722, | |
| "grad_norm": 0.5253103971481323, | |
| "learning_rate": 1.7479389983352656e-07, | |
| "loss": 0.17980681657791137, | |
| "step": 530, | |
| "token_acc": 0.9542640495272832 | |
| }, | |
| { | |
| "epoch": 0.7464899276183832, | |
| "grad_norm": 0.5387361645698547, | |
| "learning_rate": 1.6612092193525017e-07, | |
| "loss": 0.217242431640625, | |
| "step": 535, | |
| "token_acc": 0.950739773716275 | |
| }, | |
| { | |
| "epoch": 0.7534664689979943, | |
| "grad_norm": 0.5405040979385376, | |
| "learning_rate": 1.5762557369534708e-07, | |
| "loss": 0.19491589069366455, | |
| "step": 540, | |
| "token_acc": 0.954151055018734 | |
| }, | |
| { | |
| "epoch": 0.7604430103776053, | |
| "grad_norm": 0.43293488025665283, | |
| "learning_rate": 1.4931237479146326e-07, | |
| "loss": 0.18127689361572266, | |
| "step": 545, | |
| "token_acc": 0.9546740778170794 | |
| }, | |
| { | |
| "epoch": 0.7674195517572163, | |
| "grad_norm": 0.5920426249504089, | |
| "learning_rate": 1.4118574799453115e-07, | |
| "loss": 0.17992936372756957, | |
| "step": 550, | |
| "token_acc": 0.9548200289551195 | |
| }, | |
| { | |
| "epoch": 0.7743960931368274, | |
| "grad_norm": 0.8887305855751038, | |
| "learning_rate": 1.332500168157748e-07, | |
| "loss": 0.1434216856956482, | |
| "step": 555, | |
| "token_acc": 0.9616718027734977 | |
| }, | |
| { | |
| "epoch": 0.7813726345164385, | |
| "grad_norm": 0.45279937982559204, | |
| "learning_rate": 1.2550940320652614e-07, | |
| "loss": 0.15285730361938477, | |
| "step": 560, | |
| "token_acc": 0.9589277780520314 | |
| }, | |
| { | |
| "epoch": 0.7883491758960496, | |
| "grad_norm": 0.5594329833984375, | |
| "learning_rate": 1.179680253120699e-07, | |
| "loss": 0.14827193021774293, | |
| "step": 565, | |
| "token_acc": 0.9611136415395126 | |
| }, | |
| { | |
| "epoch": 0.7953257172756606, | |
| "grad_norm": 0.5205839276313782, | |
| "learning_rate": 1.1062989528071681e-07, | |
| "loss": 0.14820796251296997, | |
| "step": 570, | |
| "token_acc": 0.9608738340697104 | |
| }, | |
| { | |
| "epoch": 0.8023022586552716, | |
| "grad_norm": 0.7842152118682861, | |
| "learning_rate": 1.0349891712926855e-07, | |
| "loss": 0.14528849124908447, | |
| "step": 575, | |
| "token_acc": 0.9591823819769649 | |
| }, | |
| { | |
| "epoch": 0.8092788000348827, | |
| "grad_norm": 0.5881332159042358, | |
| "learning_rate": 9.65788846660116e-08, | |
| "loss": 0.12228701114654542, | |
| "step": 580, | |
| "token_acc": 0.9648055356716774 | |
| }, | |
| { | |
| "epoch": 0.8162553414144937, | |
| "grad_norm": 0.5683630704879761, | |
| "learning_rate": 8.987347947234192e-08, | |
| "loss": 0.15679004192352294, | |
| "step": 585, | |
| "token_acc": 0.9599228461208744 | |
| }, | |
| { | |
| "epoch": 0.8232318827941049, | |
| "grad_norm": 0.6288495659828186, | |
| "learning_rate": 8.33862689440985e-08, | |
| "loss": 0.16296907663345336, | |
| "step": 590, | |
| "token_acc": 0.957492548981287 | |
| }, | |
| { | |
| "epoch": 0.8302084241737159, | |
| "grad_norm": 0.46802279353141785, | |
| "learning_rate": 7.712070439364438e-08, | |
| "loss": 0.13914816379547118, | |
| "step": 595, | |
| "token_acc": 0.9615843086259211 | |
| }, | |
| { | |
| "epoch": 0.8371849655533269, | |
| "grad_norm": 0.5954472422599792, | |
| "learning_rate": 7.108011921370727e-08, | |
| "loss": 0.15333893299102783, | |
| "step": 600, | |
| "token_acc": 0.9563102463405927 | |
| }, | |
| { | |
| "epoch": 0.844161506932938, | |
| "grad_norm": 0.611599862575531, | |
| "learning_rate": 6.526772710395323e-08, | |
| "loss": 0.11822519302368165, | |
| "step": 605, | |
| "token_acc": 0.967852975495916 | |
| }, | |
| { | |
| "epoch": 0.851138048312549, | |
| "grad_norm": 0.5725059509277344, | |
| "learning_rate": 5.968662036124295e-08, | |
| "loss": 0.15996166467666625, | |
| "step": 610, | |
| "token_acc": 0.959222581157655 | |
| }, | |
| { | |
| "epoch": 0.8581145896921601, | |
| "grad_norm": 0.4814670979976654, | |
| "learning_rate": 5.433976823447262e-08, | |
| "loss": 0.13899474143981932, | |
| "step": 615, | |
| "token_acc": 0.9624644833258561 | |
| }, | |
| { | |
| "epoch": 0.8650911310717712, | |
| "grad_norm": 0.525391697883606, | |
| "learning_rate": 4.923001534488097e-08, | |
| "loss": 0.1286926746368408, | |
| "step": 620, | |
| "token_acc": 0.9623963626638139 | |
| }, | |
| { | |
| "epoch": 0.8720676724513823, | |
| "grad_norm": 0.6519795656204224, | |
| "learning_rate": 4.43600801726598e-08, | |
| "loss": 0.17959569692611693, | |
| "step": 625, | |
| "token_acc": 0.9575730509123389 | |
| }, | |
| { | |
| "epoch": 0.8790442138309933, | |
| "grad_norm": 0.5954372882843018, | |
| "learning_rate": 3.973255361067346e-08, | |
| "loss": 0.14509177207946777, | |
| "step": 630, | |
| "token_acc": 0.9609306955331591 | |
| }, | |
| { | |
| "epoch": 0.8860207552106043, | |
| "grad_norm": 0.6551011800765991, | |
| "learning_rate": 3.534989758605772e-08, | |
| "loss": 0.13519610166549684, | |
| "step": 635, | |
| "token_acc": 0.961144806671721 | |
| }, | |
| { | |
| "epoch": 0.8929972965902154, | |
| "grad_norm": 0.6382178664207458, | |
| "learning_rate": 3.121444375042992e-08, | |
| "loss": 0.14140852689743041, | |
| "step": 640, | |
| "token_acc": 0.9617294770669004 | |
| }, | |
| { | |
| "epoch": 0.8999738379698264, | |
| "grad_norm": 0.5000672340393066, | |
| "learning_rate": 2.732839223940914e-08, | |
| "loss": 0.15130863189697266, | |
| "step": 645, | |
| "token_acc": 0.9578913532626165 | |
| }, | |
| { | |
| "epoch": 0.9069503793494376, | |
| "grad_norm": 0.4966048002243042, | |
| "learning_rate": 2.3693810502103783e-08, | |
| "loss": 0.16461522579193116, | |
| "step": 650, | |
| "token_acc": 0.956586014881979 | |
| }, | |
| { | |
| "epoch": 0.9139269207290486, | |
| "grad_norm": 0.7203890085220337, | |
| "learning_rate": 2.0312632201192338e-08, | |
| "loss": 0.15151506662368774, | |
| "step": 655, | |
| "token_acc": 0.9578531445505433 | |
| }, | |
| { | |
| "epoch": 0.9209034621086596, | |
| "grad_norm": 0.5585451722145081, | |
| "learning_rate": 1.7186656184179473e-08, | |
| "loss": 0.19614295959472655, | |
| "step": 660, | |
| "token_acc": 0.9517613299030279 | |
| }, | |
| { | |
| "epoch": 0.9278800034882707, | |
| "grad_norm": 0.5863579511642456, | |
| "learning_rate": 1.431754552637754e-08, | |
| "loss": 0.13972072601318358, | |
| "step": 665, | |
| "token_acc": 0.960591916834624 | |
| }, | |
| { | |
| "epoch": 0.9348565448678817, | |
| "grad_norm": 0.540397584438324, | |
| "learning_rate": 1.1706826646119994e-08, | |
| "loss": 0.20453217029571533, | |
| "step": 670, | |
| "token_acc": 0.9501171417415072 | |
| }, | |
| { | |
| "epoch": 0.9418330862474928, | |
| "grad_norm": 0.4735467731952667, | |
| "learning_rate": 9.355888492680153e-09, | |
| "loss": 0.2564453840255737, | |
| "step": 675, | |
| "token_acc": 0.9417061863910055 | |
| }, | |
| { | |
| "epoch": 0.9488096276271039, | |
| "grad_norm": 0.41385316848754883, | |
| "learning_rate": 7.265981807324795e-09, | |
| "loss": 0.1432310461997986, | |
| "step": 680, | |
| "token_acc": 0.9586380054620738 | |
| }, | |
| { | |
| "epoch": 0.9557861690067149, | |
| "grad_norm": 0.5068058967590332, | |
| "learning_rate": 5.438218457897492e-09, | |
| "loss": 0.12814297676086425, | |
| "step": 685, | |
| "token_acc": 0.9636561355311355 | |
| }, | |
| { | |
| "epoch": 0.962762710386326, | |
| "grad_norm": 0.6264599561691284, | |
| "learning_rate": 3.873570847285012e-09, | |
| "loss": 0.2733027935028076, | |
| "step": 690, | |
| "token_acc": 0.9410119633331607 | |
| }, | |
| { | |
| "epoch": 0.969739251765937, | |
| "grad_norm": 0.5079652667045593, | |
| "learning_rate": 2.5728713960815884e-09, | |
| "loss": 0.1333064079284668, | |
| "step": 695, | |
| "token_acc": 0.963391442155309 | |
| }, | |
| { | |
| "epoch": 0.9767157931455481, | |
| "grad_norm": 0.5711411237716675, | |
| "learning_rate": 1.5368120997261147e-09, | |
| "loss": 0.21541709899902345, | |
| "step": 700, | |
| "token_acc": 0.9518470869325492 | |
| }, | |
| { | |
| "epoch": 0.9836923345251591, | |
| "grad_norm": 0.5428957939147949, | |
| "learning_rate": 7.65944160348142e-10, | |
| "loss": 0.14647810459136962, | |
| "step": 705, | |
| "token_acc": 0.9616447996782788 | |
| }, | |
| { | |
| "epoch": 0.9906688759047703, | |
| "grad_norm": 0.5820784568786621, | |
| "learning_rate": 2.6067769351867384e-10, | |
| "loss": 0.144012713432312, | |
| "step": 710, | |
| "token_acc": 0.9590297709494062 | |
| }, | |
| { | |
| "epoch": 0.9976454172843813, | |
| "grad_norm": 0.6706213355064392, | |
| "learning_rate": 2.128151006108858e-11, | |
| "loss": 0.16256020069122315, | |
| "step": 715, | |
| "token_acc": 0.9583173343572678 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 717, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.451809127798866e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |