| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 1777, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0005627462014631402, | |
| "grad_norm": 0.96278315782547, | |
| "learning_rate": 1.1235955056179775e-06, | |
| "loss": 0.8197, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0028137310073157004, | |
| "grad_norm": 1.2738590240478516, | |
| "learning_rate": 5.617977528089888e-06, | |
| "loss": 1.1947, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.005627462014631401, | |
| "grad_norm": 2.892261028289795, | |
| "learning_rate": 1.1235955056179776e-05, | |
| "loss": 1.2592, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.008441193021947102, | |
| "grad_norm": 1.4733870029449463, | |
| "learning_rate": 1.6853932584269665e-05, | |
| "loss": 1.2391, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.011254924029262802, | |
| "grad_norm": 1.0532560348510742, | |
| "learning_rate": 2.2471910112359552e-05, | |
| "loss": 1.5268, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.014068655036578503, | |
| "grad_norm": 1.5811922550201416, | |
| "learning_rate": 2.8089887640449443e-05, | |
| "loss": 1.0691, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.016882386043894203, | |
| "grad_norm": 1.4303256273269653, | |
| "learning_rate": 3.370786516853933e-05, | |
| "loss": 0.6871, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.019696117051209903, | |
| "grad_norm": 2.0926148891448975, | |
| "learning_rate": 3.9325842696629214e-05, | |
| "loss": 0.6074, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.022509848058525603, | |
| "grad_norm": 2.149613857269287, | |
| "learning_rate": 4.4943820224719104e-05, | |
| "loss": 0.4506, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.025323579065841307, | |
| "grad_norm": 1.4852650165557861, | |
| "learning_rate": 5.0561797752808995e-05, | |
| "loss": 0.6363, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.028137310073157007, | |
| "grad_norm": 0.8294332027435303, | |
| "learning_rate": 5.6179775280898885e-05, | |
| "loss": 0.3258, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.030951041080472707, | |
| "grad_norm": 0.9797491431236267, | |
| "learning_rate": 6.179775280898876e-05, | |
| "loss": 0.4315, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.03376477208778841, | |
| "grad_norm": 0.8157183527946472, | |
| "learning_rate": 6.741573033707866e-05, | |
| "loss": 0.5252, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.03657850309510411, | |
| "grad_norm": 0.988738477230072, | |
| "learning_rate": 7.303370786516854e-05, | |
| "loss": 0.3002, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.03939223410241981, | |
| "grad_norm": 1.035030484199524, | |
| "learning_rate": 7.865168539325843e-05, | |
| "loss": 0.3721, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.04220596510973551, | |
| "grad_norm": 1.2941393852233887, | |
| "learning_rate": 8.426966292134831e-05, | |
| "loss": 0.3631, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.04501969611705121, | |
| "grad_norm": 0.8939509391784668, | |
| "learning_rate": 8.988764044943821e-05, | |
| "loss": 0.3648, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.04783342712436691, | |
| "grad_norm": 1.7211397886276245, | |
| "learning_rate": 9.550561797752809e-05, | |
| "loss": 0.3593, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.050647158131682614, | |
| "grad_norm": 0.7832581996917725, | |
| "learning_rate": 0.00010112359550561799, | |
| "loss": 0.2104, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.05346088913899831, | |
| "grad_norm": 1.1701756715774536, | |
| "learning_rate": 0.00010674157303370786, | |
| "loss": 0.3957, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.056274620146314014, | |
| "grad_norm": 0.7531750798225403, | |
| "learning_rate": 0.00011235955056179777, | |
| "loss": 0.3513, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.05908835115362971, | |
| "grad_norm": 0.7287600636482239, | |
| "learning_rate": 0.00011797752808988764, | |
| "loss": 0.3221, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.061902082160945414, | |
| "grad_norm": 1.7473770380020142, | |
| "learning_rate": 0.00012359550561797752, | |
| "loss": 0.1973, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.06471581316826111, | |
| "grad_norm": 1.0901485681533813, | |
| "learning_rate": 0.00012921348314606744, | |
| "loss": 0.4353, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.06752954417557681, | |
| "grad_norm": 0.7513278722763062, | |
| "learning_rate": 0.00013483146067415732, | |
| "loss": 0.3628, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.07034327518289252, | |
| "grad_norm": 1.2991347312927246, | |
| "learning_rate": 0.0001404494382022472, | |
| "loss": 0.3113, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.07315700619020822, | |
| "grad_norm": 0.6642701029777527, | |
| "learning_rate": 0.0001460674157303371, | |
| "loss": 0.4762, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.07597073719752391, | |
| "grad_norm": 1.3134933710098267, | |
| "learning_rate": 0.00015168539325842697, | |
| "loss": 0.3526, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.07878446820483961, | |
| "grad_norm": 1.0081052780151367, | |
| "learning_rate": 0.00015730337078651685, | |
| "loss": 0.3115, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.08159819921215532, | |
| "grad_norm": 0.2705545723438263, | |
| "learning_rate": 0.00016292134831460674, | |
| "loss": 0.2489, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.08441193021947102, | |
| "grad_norm": 1.097110390663147, | |
| "learning_rate": 0.00016853932584269662, | |
| "loss": 0.4727, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.08722566122678672, | |
| "grad_norm": 0.8656176328659058, | |
| "learning_rate": 0.00017415730337078653, | |
| "loss": 0.4242, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.09003939223410241, | |
| "grad_norm": 0.8114811778068542, | |
| "learning_rate": 0.00017977528089887642, | |
| "loss": 0.3033, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.09285312324141812, | |
| "grad_norm": 1.072106957435608, | |
| "learning_rate": 0.0001853932584269663, | |
| "loss": 0.3359, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.09566685424873382, | |
| "grad_norm": 0.6422829031944275, | |
| "learning_rate": 0.00019101123595505618, | |
| "loss": 0.2874, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.09848058525604952, | |
| "grad_norm": 1.2826429605484009, | |
| "learning_rate": 0.00019662921348314607, | |
| "loss": 0.3826, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.10129431626336523, | |
| "grad_norm": 0.24743008613586426, | |
| "learning_rate": 0.00019999922797341667, | |
| "loss": 0.1672, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.10410804727068092, | |
| "grad_norm": 0.9253625273704529, | |
| "learning_rate": 0.00019999054281125283, | |
| "loss": 0.2247, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.10692177827799662, | |
| "grad_norm": 0.6312052011489868, | |
| "learning_rate": 0.0001999722082946312, | |
| "loss": 0.4353, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.10973550928531232, | |
| "grad_norm": 0.7200600504875183, | |
| "learning_rate": 0.00019994422619288159, | |
| "loss": 0.2801, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.11254924029262803, | |
| "grad_norm": 0.46093428134918213, | |
| "learning_rate": 0.00019990659920635152, | |
| "loss": 0.2552, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.11536297129994373, | |
| "grad_norm": 2.7881016731262207, | |
| "learning_rate": 0.00019985933096614578, | |
| "loss": 0.4449, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.11817670230725942, | |
| "grad_norm": 0.7893804311752319, | |
| "learning_rate": 0.00019980242603377573, | |
| "loss": 0.2687, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.12099043331457512, | |
| "grad_norm": 0.644159197807312, | |
| "learning_rate": 0.00019973588990071937, | |
| "loss": 0.2525, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.12380416432189083, | |
| "grad_norm": 0.7028160095214844, | |
| "learning_rate": 0.00019965972898789125, | |
| "loss": 0.4349, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.12661789532920653, | |
| "grad_norm": 0.9951076507568359, | |
| "learning_rate": 0.0001995739506450229, | |
| "loss": 0.3944, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.12943162633652222, | |
| "grad_norm": 0.9317317605018616, | |
| "learning_rate": 0.00019947856314995349, | |
| "loss": 0.2633, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.13224535734383794, | |
| "grad_norm": 0.426104336977005, | |
| "learning_rate": 0.00019937357570783107, | |
| "loss": 0.2667, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.13505908835115363, | |
| "grad_norm": 0.7403711676597595, | |
| "learning_rate": 0.0001992589984502243, | |
| "loss": 0.4544, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.13787281935846932, | |
| "grad_norm": 0.7271831035614014, | |
| "learning_rate": 0.0001991348424341445, | |
| "loss": 0.3209, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.14068655036578503, | |
| "grad_norm": 2.071850299835205, | |
| "learning_rate": 0.00019900111964097893, | |
| "loss": 0.4721, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.14350028137310072, | |
| "grad_norm": 0.5502871870994568, | |
| "learning_rate": 0.0001988578429753342, | |
| "loss": 0.2698, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.14631401238041644, | |
| "grad_norm": 1.0685575008392334, | |
| "learning_rate": 0.00019870502626379127, | |
| "loss": 0.3656, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.14912774338773213, | |
| "grad_norm": 0.521375834941864, | |
| "learning_rate": 0.00019854268425357105, | |
| "loss": 0.3065, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.15194147439504782, | |
| "grad_norm": 0.7491894960403442, | |
| "learning_rate": 0.0001983708326111111, | |
| "loss": 0.3863, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.15475520540236354, | |
| "grad_norm": 0.8233822584152222, | |
| "learning_rate": 0.0001981894879205539, | |
| "loss": 0.3434, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.15756893640967923, | |
| "grad_norm": 0.9702492952346802, | |
| "learning_rate": 0.0001979986676821465, | |
| "loss": 0.3103, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.16038266741699495, | |
| "grad_norm": 1.0381535291671753, | |
| "learning_rate": 0.00019779839031055157, | |
| "loss": 0.4879, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.16319639842431063, | |
| "grad_norm": 0.594953179359436, | |
| "learning_rate": 0.00019758867513307047, | |
| "loss": 0.2185, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.16601012943162632, | |
| "grad_norm": 0.9089880585670471, | |
| "learning_rate": 0.00019736954238777792, | |
| "loss": 0.4932, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.16882386043894204, | |
| "grad_norm": 0.8703072667121887, | |
| "learning_rate": 0.00019714101322156915, | |
| "loss": 0.3456, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.17163759144625773, | |
| "grad_norm": 0.9151054620742798, | |
| "learning_rate": 0.00019690310968811914, | |
| "loss": 0.3413, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.17445132245357345, | |
| "grad_norm": 0.7803131341934204, | |
| "learning_rate": 0.0001966558547457543, | |
| "loss": 0.2144, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.17726505346088914, | |
| "grad_norm": 4.146880626678467, | |
| "learning_rate": 0.00019639927225523698, | |
| "loss": 0.2838, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.18007878446820483, | |
| "grad_norm": 0.8946036696434021, | |
| "learning_rate": 0.00019613338697746285, | |
| "loss": 0.3069, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.18289251547552055, | |
| "grad_norm": 1.0504130125045776, | |
| "learning_rate": 0.00019585822457107138, | |
| "loss": 0.1849, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.18570624648283623, | |
| "grad_norm": 0.5320996642112732, | |
| "learning_rate": 0.0001955738115899698, | |
| "loss": 0.4705, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.18851997749015195, | |
| "grad_norm": 1.0972635746002197, | |
| "learning_rate": 0.00019528017548077045, | |
| "loss": 0.1279, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.19133370849746764, | |
| "grad_norm": 0.9836655855178833, | |
| "learning_rate": 0.00019497734458014216, | |
| "loss": 0.3454, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.19414743950478333, | |
| "grad_norm": 0.9435672163963318, | |
| "learning_rate": 0.00019466534811207569, | |
| "loss": 0.1713, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.19696117051209905, | |
| "grad_norm": 0.8675717115402222, | |
| "learning_rate": 0.00019434421618506358, | |
| "loss": 0.4799, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.19977490151941474, | |
| "grad_norm": 0.5335102677345276, | |
| "learning_rate": 0.00019401397978919453, | |
| "loss": 0.242, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.20258863252673046, | |
| "grad_norm": 0.7480678558349609, | |
| "learning_rate": 0.00019367467079316279, | |
| "loss": 0.3568, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.20540236353404615, | |
| "grad_norm": 0.23846450448036194, | |
| "learning_rate": 0.0001933263219411928, | |
| "loss": 0.4519, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.20821609454136183, | |
| "grad_norm": 0.24460311233997345, | |
| "learning_rate": 0.00019296896684987925, | |
| "loss": 0.3763, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.21102982554867755, | |
| "grad_norm": 0.6336620450019836, | |
| "learning_rate": 0.0001926026400049429, | |
| "loss": 0.3754, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.21384355655599324, | |
| "grad_norm": 0.4195510745048523, | |
| "learning_rate": 0.00019222737675790276, | |
| "loss": 0.2576, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.21665728756330896, | |
| "grad_norm": 1.3396929502487183, | |
| "learning_rate": 0.00019184321332266452, | |
| "loss": 0.3267, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.21947101857062465, | |
| "grad_norm": 0.49325576424598694, | |
| "learning_rate": 0.0001914501867720258, | |
| "loss": 0.2602, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.22228474957794034, | |
| "grad_norm": 0.5282377004623413, | |
| "learning_rate": 0.00019104833503409848, | |
| "loss": 0.3498, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.22509848058525606, | |
| "grad_norm": 0.6364492774009705, | |
| "learning_rate": 0.00019063769688864866, | |
| "loss": 0.2147, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.22791221159257175, | |
| "grad_norm": 0.8976377248764038, | |
| "learning_rate": 0.00019021831196335418, | |
| "loss": 0.328, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.23072594259988746, | |
| "grad_norm": 0.4209904372692108, | |
| "learning_rate": 0.0001897902207299805, | |
| "loss": 0.2822, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.23353967360720315, | |
| "grad_norm": 0.5531566739082336, | |
| "learning_rate": 0.0001893534645004751, | |
| "loss": 0.4366, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.23635340461451884, | |
| "grad_norm": 0.592050313949585, | |
| "learning_rate": 0.00018890808542298073, | |
| "loss": 0.381, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.23916713562183456, | |
| "grad_norm": 0.8051882982254028, | |
| "learning_rate": 0.00018845412647776794, | |
| "loss": 0.3602, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.24198086662915025, | |
| "grad_norm": 0.7593362331390381, | |
| "learning_rate": 0.0001879916314730875, | |
| "loss": 0.3809, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.24479459763646597, | |
| "grad_norm": 1.2135759592056274, | |
| "learning_rate": 0.00018752064504094272, | |
| "loss": 0.2138, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.24760832864378166, | |
| "grad_norm": 0.14413990080356598, | |
| "learning_rate": 0.00018704121263278227, | |
| "loss": 0.3506, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.2504220596510974, | |
| "grad_norm": 0.6321181058883667, | |
| "learning_rate": 0.00018655338051511413, | |
| "loss": 0.3232, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.25323579065841306, | |
| "grad_norm": 0.7276772856712341, | |
| "learning_rate": 0.00018605719576504065, | |
| "loss": 0.2345, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.25604952166572875, | |
| "grad_norm": 0.324861079454422, | |
| "learning_rate": 0.00018555270626571555, | |
| "loss": 0.1345, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.25886325267304444, | |
| "grad_norm": 0.7779459953308105, | |
| "learning_rate": 0.000185039960701723, | |
| "loss": 0.2958, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.26167698368036013, | |
| "grad_norm": 0.6974682211875916, | |
| "learning_rate": 0.0001845190085543795, | |
| "loss": 0.2257, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.2644907146876759, | |
| "grad_norm": 0.9312912821769714, | |
| "learning_rate": 0.0001839899000969587, | |
| "loss": 0.353, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.26730444569499157, | |
| "grad_norm": 0.49484914541244507, | |
| "learning_rate": 0.00018345268638984003, | |
| "loss": 0.2321, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.27011817670230726, | |
| "grad_norm": 0.24110960960388184, | |
| "learning_rate": 0.00018290741927558113, | |
| "loss": 0.2501, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.27293190770962295, | |
| "grad_norm": 0.5313132405281067, | |
| "learning_rate": 0.00018235415137391497, | |
| "loss": 0.2477, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.27574563871693863, | |
| "grad_norm": 0.6360633373260498, | |
| "learning_rate": 0.00018179293607667178, | |
| "loss": 0.2846, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.2785593697242544, | |
| "grad_norm": 0.5810567140579224, | |
| "learning_rate": 0.00018122382754262681, | |
| "loss": 0.2196, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.28137310073157007, | |
| "grad_norm": 0.7277317047119141, | |
| "learning_rate": 0.00018064688069227368, | |
| "loss": 0.2656, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.28418683173888576, | |
| "grad_norm": 0.7561081051826477, | |
| "learning_rate": 0.00018006215120252453, | |
| "loss": 0.3004, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.28700056274620145, | |
| "grad_norm": 0.8930642604827881, | |
| "learning_rate": 0.0001794696955013369, | |
| "loss": 0.445, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.28981429375351714, | |
| "grad_norm": 0.9028257727622986, | |
| "learning_rate": 0.00017886957076226838, | |
| "loss": 0.3362, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.2926280247608329, | |
| "grad_norm": 0.6070359945297241, | |
| "learning_rate": 0.0001782618348989593, | |
| "loss": 0.1993, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.2954417557681486, | |
| "grad_norm": 0.9078888893127441, | |
| "learning_rate": 0.0001776465465595437, | |
| "loss": 0.2554, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.29825548677546426, | |
| "grad_norm": 0.7235105633735657, | |
| "learning_rate": 0.0001770237651209898, | |
| "loss": 0.1807, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.30106921778277995, | |
| "grad_norm": 0.37401115894317627, | |
| "learning_rate": 0.00017639355068336987, | |
| "loss": 0.153, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.30388294879009564, | |
| "grad_norm": 0.4220016896724701, | |
| "learning_rate": 0.00017575596406406048, | |
| "loss": 0.2249, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.3066966797974114, | |
| "grad_norm": 0.6732789278030396, | |
| "learning_rate": 0.00017511106679187334, | |
| "loss": 0.1663, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.3095104108047271, | |
| "grad_norm": 0.8022940754890442, | |
| "learning_rate": 0.00017445892110111783, | |
| "loss": 0.3083, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.31232414181204277, | |
| "grad_norm": 2.0369653701782227, | |
| "learning_rate": 0.00017379958992559493, | |
| "loss": 0.3939, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.31513787281935846, | |
| "grad_norm": 1.5206650495529175, | |
| "learning_rate": 0.00017313313689252418, | |
| "loss": 0.3079, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.31795160382667415, | |
| "grad_norm": 0.343148410320282, | |
| "learning_rate": 0.00017245962631640341, | |
| "loss": 0.2408, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.3207653348339899, | |
| "grad_norm": 0.7586761116981506, | |
| "learning_rate": 0.00017177912319280217, | |
| "loss": 0.2298, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.3235790658413056, | |
| "grad_norm": 0.5147440433502197, | |
| "learning_rate": 0.00017109169319208948, | |
| "loss": 0.3067, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.32639279684862127, | |
| "grad_norm": 0.5967961549758911, | |
| "learning_rate": 0.0001703974026530966, | |
| "loss": 0.3291, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.32920652785593696, | |
| "grad_norm": 0.8705066442489624, | |
| "learning_rate": 0.00016969631857671497, | |
| "loss": 0.3395, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.33202025886325265, | |
| "grad_norm": 0.36913836002349854, | |
| "learning_rate": 0.00016898850861943058, | |
| "loss": 0.15, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.3348339898705684, | |
| "grad_norm": 1.0335655212402344, | |
| "learning_rate": 0.00016827404108679485, | |
| "loss": 0.2577, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.3376477208778841, | |
| "grad_norm": 0.44228196144104004, | |
| "learning_rate": 0.00016755298492683308, | |
| "loss": 0.3735, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3404614518851998, | |
| "grad_norm": 0.7567837238311768, | |
| "learning_rate": 0.0001668254097233907, | |
| "loss": 0.233, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.34327518289251546, | |
| "grad_norm": 0.7530750632286072, | |
| "learning_rate": 0.00016609138568941809, | |
| "loss": 0.34, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.34608891389983115, | |
| "grad_norm": 0.5381020307540894, | |
| "learning_rate": 0.0001653509836601952, | |
| "loss": 0.3743, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.3489026449071469, | |
| "grad_norm": 1.5347527265548706, | |
| "learning_rate": 0.00016460427508649546, | |
| "loss": 0.3224, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.3517163759144626, | |
| "grad_norm": 0.7328157424926758, | |
| "learning_rate": 0.0001638513320276907, | |
| "loss": 0.4606, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.3545301069217783, | |
| "grad_norm": 0.5422099232673645, | |
| "learning_rate": 0.0001630922271447972, | |
| "loss": 0.239, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.35734383792909397, | |
| "grad_norm": 0.12207705527544022, | |
| "learning_rate": 0.0001623270336934638, | |
| "loss": 0.1777, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.36015756893640966, | |
| "grad_norm": 0.7163983583450317, | |
| "learning_rate": 0.00016155582551690236, | |
| "loss": 0.308, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.3629712999437254, | |
| "grad_norm": 0.5855613350868225, | |
| "learning_rate": 0.00016077867703876182, | |
| "loss": 0.3146, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.3657850309510411, | |
| "grad_norm": 0.3779029846191406, | |
| "learning_rate": 0.0001599956632559461, | |
| "loss": 0.2512, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.3685987619583568, | |
| "grad_norm": 0.4623885452747345, | |
| "learning_rate": 0.00015920685973137673, | |
| "loss": 0.3344, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.37141249296567247, | |
| "grad_norm": 0.5071548819541931, | |
| "learning_rate": 0.00015841234258670065, | |
| "loss": 0.2301, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.37422622397298816, | |
| "grad_norm": 0.8452264070510864, | |
| "learning_rate": 0.0001576121884949446, | |
| "loss": 0.3051, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.3770399549803039, | |
| "grad_norm": 0.3451974391937256, | |
| "learning_rate": 0.00015680647467311557, | |
| "loss": 0.2358, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.3798536859876196, | |
| "grad_norm": 0.7894652485847473, | |
| "learning_rate": 0.0001559952788747495, | |
| "loss": 0.2995, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.3826674169949353, | |
| "grad_norm": 0.6503499746322632, | |
| "learning_rate": 0.00015517867938240763, | |
| "loss": 0.3838, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.385481148002251, | |
| "grad_norm": 0.8255408406257629, | |
| "learning_rate": 0.00015435675500012212, | |
| "loss": 0.3326, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.38829487900956666, | |
| "grad_norm": 0.9840317368507385, | |
| "learning_rate": 0.00015352958504579123, | |
| "loss": 0.3544, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.3911086100168824, | |
| "grad_norm": 0.5513814687728882, | |
| "learning_rate": 0.00015269724934352497, | |
| "loss": 0.3565, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.3939223410241981, | |
| "grad_norm": 0.9206532835960388, | |
| "learning_rate": 0.00015185982821594175, | |
| "loss": 0.6995, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.3967360720315138, | |
| "grad_norm": 0.6624305248260498, | |
| "learning_rate": 0.00015101740247641714, | |
| "loss": 0.236, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.3995498030388295, | |
| "grad_norm": 0.4557129442691803, | |
| "learning_rate": 0.00015017005342128517, | |
| "loss": 0.2185, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.40236353404614517, | |
| "grad_norm": 0.5107508301734924, | |
| "learning_rate": 0.000149317862821993, | |
| "loss": 0.2376, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.4051772650534609, | |
| "grad_norm": 0.6542500257492065, | |
| "learning_rate": 0.00014846091291720957, | |
| "loss": 0.3315, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.4079909960607766, | |
| "grad_norm": 1.1817783117294312, | |
| "learning_rate": 0.00014759928640488965, | |
| "loss": 0.27, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.4108047270680923, | |
| "grad_norm": 0.47976672649383545, | |
| "learning_rate": 0.00014673306643429314, | |
| "loss": 0.2458, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.413618458075408, | |
| "grad_norm": 0.9391474723815918, | |
| "learning_rate": 0.00014586233659796087, | |
| "loss": 0.424, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.41643218908272367, | |
| "grad_norm": 0.6755409240722656, | |
| "learning_rate": 0.0001449871809236478, | |
| "loss": 0.3009, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.4192459200900394, | |
| "grad_norm": 1.1786988973617554, | |
| "learning_rate": 0.000144107683866214, | |
| "loss": 0.2926, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.4220596510973551, | |
| "grad_norm": 0.4564046561717987, | |
| "learning_rate": 0.00014322393029947468, | |
| "loss": 0.3663, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.4248733821046708, | |
| "grad_norm": 0.650117814540863, | |
| "learning_rate": 0.00014233600550800962, | |
| "loss": 0.2522, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.4276871131119865, | |
| "grad_norm": 0.35542795062065125, | |
| "learning_rate": 0.0001414439951789328, | |
| "loss": 0.1902, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.4305008441193022, | |
| "grad_norm": 0.6666870713233948, | |
| "learning_rate": 0.00014054798539362356, | |
| "loss": 0.3625, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.4333145751266179, | |
| "grad_norm": 1.3364002704620361, | |
| "learning_rate": 0.00013964806261941944, | |
| "loss": 0.2144, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.4361283061339336, | |
| "grad_norm": 0.30019038915634155, | |
| "learning_rate": 0.00013874431370127188, | |
| "loss": 0.205, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.4389420371412493, | |
| "grad_norm": 0.7001076936721802, | |
| "learning_rate": 0.0001378368258533654, | |
| "loss": 0.4068, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.441755768148565, | |
| "grad_norm": 1.1424932479858398, | |
| "learning_rate": 0.0001369256866507012, | |
| "loss": 0.2298, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.4445694991558807, | |
| "grad_norm": 0.8599133491516113, | |
| "learning_rate": 0.00013601098402064607, | |
| "loss": 0.2843, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.4473832301631964, | |
| "grad_norm": 0.8263369798660278, | |
| "learning_rate": 0.00013509280623444695, | |
| "loss": 0.2451, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.4501969611705121, | |
| "grad_norm": 0.9365822672843933, | |
| "learning_rate": 0.00013417124189871272, | |
| "loss": 0.2621, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.4530106921778278, | |
| "grad_norm": 0.47065469622612, | |
| "learning_rate": 0.00013324637994686326, | |
| "loss": 0.191, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.4558244231851435, | |
| "grad_norm": 0.6455582976341248, | |
| "learning_rate": 0.00013231830963054722, | |
| "loss": 0.2073, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.4586381541924592, | |
| "grad_norm": 0.8905434012413025, | |
| "learning_rate": 0.0001313871205110291, | |
| "loss": 0.3213, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.4614518851997749, | |
| "grad_norm": 0.44368186593055725, | |
| "learning_rate": 0.0001304529024505461, | |
| "loss": 0.4487, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.4642656162070906, | |
| "grad_norm": 0.3236369788646698, | |
| "learning_rate": 0.00012951574560363636, | |
| "loss": 0.2343, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.4670793472144063, | |
| "grad_norm": 0.3316313624382019, | |
| "learning_rate": 0.00012857574040843876, | |
| "loss": 0.1704, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.469893078221722, | |
| "grad_norm": 1.180114984512329, | |
| "learning_rate": 0.0001276329775779655, | |
| "loss": 0.2944, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.4727068092290377, | |
| "grad_norm": 0.4699708819389343, | |
| "learning_rate": 0.00012668754809134773, | |
| "loss": 0.1709, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.47552054023635343, | |
| "grad_norm": 0.5087912082672119, | |
| "learning_rate": 0.00012573954318505624, | |
| "loss": 0.2753, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.4783342712436691, | |
| "grad_norm": 0.21406421065330505, | |
| "learning_rate": 0.00012478905434409662, | |
| "loss": 0.2955, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.4811480022509848, | |
| "grad_norm": 0.8056962490081787, | |
| "learning_rate": 0.0001238361732931808, | |
| "loss": 0.275, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.4839617332583005, | |
| "grad_norm": 0.7347704768180847, | |
| "learning_rate": 0.00012288099198787532, | |
| "loss": 0.2448, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.4867754642656162, | |
| "grad_norm": 0.43679895997047424, | |
| "learning_rate": 0.0001219236026057275, | |
| "loss": 0.4004, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.48958919527293193, | |
| "grad_norm": 0.4202831983566284, | |
| "learning_rate": 0.00012096409753736991, | |
| "loss": 0.1963, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.4924029262802476, | |
| "grad_norm": 0.8716102838516235, | |
| "learning_rate": 0.00012000256937760445, | |
| "loss": 0.225, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.4952166572875633, | |
| "grad_norm": 0.2482863813638687, | |
| "learning_rate": 0.00011903911091646684, | |
| "loss": 0.2338, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.498030388294879, | |
| "grad_norm": 0.6226937174797058, | |
| "learning_rate": 0.000118073815130272, | |
| "loss": 0.3606, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.5008441193021947, | |
| "grad_norm": 0.4387325942516327, | |
| "learning_rate": 0.0001171067751726416, | |
| "loss": 0.231, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.5036578503095104, | |
| "grad_norm": 0.26261425018310547, | |
| "learning_rate": 0.00011613808436551454, | |
| "loss": 0.1239, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.5064715813168261, | |
| "grad_norm": 1.2383506298065186, | |
| "learning_rate": 0.00011516783619014109, | |
| "loss": 0.2496, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.5092853123241418, | |
| "grad_norm": 1.853761911392212, | |
| "learning_rate": 0.00011419612427806172, | |
| "loss": 0.3915, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.5120990433314575, | |
| "grad_norm": 0.330138623714447, | |
| "learning_rate": 0.00011322304240207145, | |
| "loss": 0.0917, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.5149127743387732, | |
| "grad_norm": 0.41656142473220825, | |
| "learning_rate": 0.00011224868446717036, | |
| "loss": 0.1754, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.5177265053460889, | |
| "grad_norm": 0.6251401901245117, | |
| "learning_rate": 0.00011127314450150175, | |
| "loss": 0.3901, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.5205402363534046, | |
| "grad_norm": 1.254900336265564, | |
| "learning_rate": 0.00011029651664727798, | |
| "loss": 0.2828, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.5233539673607203, | |
| "grad_norm": 0.9572696089744568, | |
| "learning_rate": 0.00010931889515169555, | |
| "loss": 0.2235, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.526167698368036, | |
| "grad_norm": 0.8414142727851868, | |
| "learning_rate": 0.00010834037435784008, | |
| "loss": 0.2718, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.5289814293753518, | |
| "grad_norm": 0.4331166744232178, | |
| "learning_rate": 0.00010736104869558176, | |
| "loss": 0.2558, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.5317951603826674, | |
| "grad_norm": 0.32980430126190186, | |
| "learning_rate": 0.00010638101267246283, | |
| "loss": 0.1117, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.5346088913899831, | |
| "grad_norm": 0.7335298657417297, | |
| "learning_rate": 0.00010540036086457723, | |
| "loss": 0.4412, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.5374226223972988, | |
| "grad_norm": 0.6139857769012451, | |
| "learning_rate": 0.00010441918790744372, | |
| "loss": 0.2925, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.5402363534046145, | |
| "grad_norm": 0.3401097059249878, | |
| "learning_rate": 0.00010343758848687341, | |
| "loss": 0.2625, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.5430500844119303, | |
| "grad_norm": 0.3688424229621887, | |
| "learning_rate": 0.00010245565732983227, | |
| "loss": 0.211, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.5458638154192459, | |
| "grad_norm": 1.3460103273391724, | |
| "learning_rate": 0.00010147348919529969, | |
| "loss": 0.3091, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.5486775464265616, | |
| "grad_norm": 0.4599795937538147, | |
| "learning_rate": 0.00010049117886512404, | |
| "loss": 0.2301, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.5514912774338773, | |
| "grad_norm": 0.5787628293037415, | |
| "learning_rate": 9.950882113487598e-05, | |
| "loss": 0.258, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.554305008441193, | |
| "grad_norm": 0.8748778104782104, | |
| "learning_rate": 9.852651080470033e-05, | |
| "loss": 0.3606, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.5571187394485088, | |
| "grad_norm": 0.4328353703022003, | |
| "learning_rate": 9.754434267016775e-05, | |
| "loss": 0.2004, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.5599324704558244, | |
| "grad_norm": 0.9542059898376465, | |
| "learning_rate": 9.656241151312661e-05, | |
| "loss": 0.2206, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.5627462014631401, | |
| "grad_norm": 0.3367530405521393, | |
| "learning_rate": 9.558081209255629e-05, | |
| "loss": 0.1936, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.5655599324704558, | |
| "grad_norm": 0.3511320650577545, | |
| "learning_rate": 9.459963913542279e-05, | |
| "loss": 0.1467, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.5683736634777715, | |
| "grad_norm": 0.5722060799598694, | |
| "learning_rate": 9.361898732753716e-05, | |
| "loss": 0.3173, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.5711873944850873, | |
| "grad_norm": 0.5380959510803223, | |
| "learning_rate": 9.263895130441826e-05, | |
| "loss": 0.2697, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.5740011254924029, | |
| "grad_norm": 0.7701444625854492, | |
| "learning_rate": 9.165962564215993e-05, | |
| "loss": 0.2513, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.5768148564997186, | |
| "grad_norm": 0.44029852747917175, | |
| "learning_rate": 9.068110484830447e-05, | |
| "loss": 0.3467, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.5796285875070343, | |
| "grad_norm": 0.6167469620704651, | |
| "learning_rate": 8.970348335272203e-05, | |
| "loss": 0.3191, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.58244231851435, | |
| "grad_norm": 0.8046761751174927, | |
| "learning_rate": 8.872685549849827e-05, | |
| "loss": 0.2874, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.5852560495216658, | |
| "grad_norm": 0.6045218110084534, | |
| "learning_rate": 8.775131553282965e-05, | |
| "loss": 0.2724, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.5880697805289814, | |
| "grad_norm": 0.23991712927818298, | |
| "learning_rate": 8.67769575979286e-05, | |
| "loss": 0.0839, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.5908835115362971, | |
| "grad_norm": 0.5629101395606995, | |
| "learning_rate": 8.580387572193829e-05, | |
| "loss": 0.2608, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.5936972425436128, | |
| "grad_norm": 0.7069487571716309, | |
| "learning_rate": 8.483216380985895e-05, | |
| "loss": 0.359, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.5965109735509285, | |
| "grad_norm": 1.0714657306671143, | |
| "learning_rate": 8.386191563448548e-05, | |
| "loss": 0.2144, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.5993247045582443, | |
| "grad_norm": 0.8178947567939758, | |
| "learning_rate": 8.289322482735844e-05, | |
| "loss": 0.3105, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.6021384355655599, | |
| "grad_norm": 0.7573699951171875, | |
| "learning_rate": 8.192618486972803e-05, | |
| "loss": 0.2918, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.6049521665728756, | |
| "grad_norm": 0.3417803645133972, | |
| "learning_rate": 8.096088908353315e-05, | |
| "loss": 0.1382, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.6077658975801913, | |
| "grad_norm": 0.7476038336753845, | |
| "learning_rate": 7.999743062239557e-05, | |
| "loss": 0.4213, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.610579628587507, | |
| "grad_norm": 0.3231750726699829, | |
| "learning_rate": 7.90359024626301e-05, | |
| "loss": 0.2874, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.6133933595948228, | |
| "grad_norm": 0.5958102345466614, | |
| "learning_rate": 7.807639739427251e-05, | |
| "loss": 0.2656, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.6162070906021384, | |
| "grad_norm": 0.4588276743888855, | |
| "learning_rate": 7.711900801212466e-05, | |
| "loss": 0.1933, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.6190208216094542, | |
| "grad_norm": 0.5570498704910278, | |
| "learning_rate": 7.616382670681924e-05, | |
| "loss": 0.3897, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.6218345526167698, | |
| "grad_norm": 0.41902509331703186, | |
| "learning_rate": 7.521094565590338e-05, | |
| "loss": 0.2403, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.6246482836240855, | |
| "grad_norm": 0.9511467814445496, | |
| "learning_rate": 7.426045681494378e-05, | |
| "loss": 0.3146, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.6274620146314013, | |
| "grad_norm": 1.1212773323059082, | |
| "learning_rate": 7.33124519086523e-05, | |
| "loss": 0.2424, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.6302757456387169, | |
| "grad_norm": 0.8666883111000061, | |
| "learning_rate": 7.236702242203457e-05, | |
| "loss": 0.319, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.6330894766460327, | |
| "grad_norm": 1.2638081312179565, | |
| "learning_rate": 7.142425959156125e-05, | |
| "loss": 0.1587, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.6359032076533483, | |
| "grad_norm": 0.33488303422927856, | |
| "learning_rate": 7.04842543963637e-05, | |
| "loss": 0.2139, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.638716938660664, | |
| "grad_norm": 0.6867479681968689, | |
| "learning_rate": 6.954709754945394e-05, | |
| "loss": 0.2332, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.6415306696679798, | |
| "grad_norm": 0.4780934751033783, | |
| "learning_rate": 6.861287948897091e-05, | |
| "loss": 0.1127, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.6443444006752954, | |
| "grad_norm": 0.8691847920417786, | |
| "learning_rate": 6.768169036945277e-05, | |
| "loss": 0.3039, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.6471581316826112, | |
| "grad_norm": 0.4771972894668579, | |
| "learning_rate": 6.675362005313677e-05, | |
| "loss": 0.2787, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.6499718626899268, | |
| "grad_norm": 0.5366829037666321, | |
| "learning_rate": 6.58287581012873e-05, | |
| "loss": 0.1824, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.6527855936972425, | |
| "grad_norm": 0.28026753664016724, | |
| "learning_rate": 6.490719376555305e-05, | |
| "loss": 0.2074, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.6555993247045583, | |
| "grad_norm": 0.9920913577079773, | |
| "learning_rate": 6.398901597935393e-05, | |
| "loss": 0.3188, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.6584130557118739, | |
| "grad_norm": 0.5217199921607971, | |
| "learning_rate": 6.30743133492988e-05, | |
| "loss": 0.2846, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.6612267867191897, | |
| "grad_norm": 0.5738883018493652, | |
| "learning_rate": 6.216317414663463e-05, | |
| "loss": 0.1972, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.6640405177265053, | |
| "grad_norm": 0.3134082853794098, | |
| "learning_rate": 6.125568629872813e-05, | |
| "loss": 0.1806, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.666854248733821, | |
| "grad_norm": 0.4762999415397644, | |
| "learning_rate": 6.035193738058056e-05, | |
| "loss": 0.2386, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.6696679797411368, | |
| "grad_norm": 0.48775815963745117, | |
| "learning_rate": 5.945201460637645e-05, | |
| "loss": 0.1261, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.6724817107484524, | |
| "grad_norm": 0.5460477471351624, | |
| "learning_rate": 5.855600482106721e-05, | |
| "loss": 0.3201, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.6752954417557682, | |
| "grad_norm": 0.41563519835472107, | |
| "learning_rate": 5.766399449199037e-05, | |
| "loss": 0.2287, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.6781091727630838, | |
| "grad_norm": 0.832744300365448, | |
| "learning_rate": 5.677606970052529e-05, | |
| "loss": 0.5409, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.6809229037703995, | |
| "grad_norm": 0.8101387023925781, | |
| "learning_rate": 5.5892316133786005e-05, | |
| "loss": 0.1934, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.6837366347777153, | |
| "grad_norm": 0.9781274795532227, | |
| "learning_rate": 5.501281907635223e-05, | |
| "loss": 0.1842, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.6865503657850309, | |
| "grad_norm": 0.36751049757003784, | |
| "learning_rate": 5.413766340203914e-05, | |
| "loss": 0.2631, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.6893640967923467, | |
| "grad_norm": 0.3681579828262329, | |
| "learning_rate": 5.3266933565706865e-05, | |
| "loss": 0.2639, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.6921778277996623, | |
| "grad_norm": 0.7795785069465637, | |
| "learning_rate": 5.240071359511035e-05, | |
| "loss": 0.3817, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.694991558806978, | |
| "grad_norm": 0.6714096069335938, | |
| "learning_rate": 5.153908708279045e-05, | |
| "loss": 0.2655, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.6978052898142938, | |
| "grad_norm": 0.6018862724304199, | |
| "learning_rate": 5.0682137178007025e-05, | |
| "loss": 0.2517, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.7006190208216094, | |
| "grad_norm": 0.7170803546905518, | |
| "learning_rate": 4.9829946578714825e-05, | |
| "loss": 0.3097, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 0.7034327518289252, | |
| "grad_norm": 0.37394005060195923, | |
| "learning_rate": 4.898259752358287e-05, | |
| "loss": 0.1883, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.7062464828362408, | |
| "grad_norm": 2.812126874923706, | |
| "learning_rate": 4.814017178405829e-05, | |
| "loss": 0.1431, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 0.7090602138435566, | |
| "grad_norm": 0.5540988445281982, | |
| "learning_rate": 4.730275065647506e-05, | |
| "loss": 0.2109, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.7118739448508723, | |
| "grad_norm": 0.9915019869804382, | |
| "learning_rate": 4.6470414954208785e-05, | |
| "loss": 0.2857, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 0.7146876758581879, | |
| "grad_norm": 1.248504400253296, | |
| "learning_rate": 4.56432449998779e-05, | |
| "loss": 0.4287, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.7175014068655037, | |
| "grad_norm": 0.5127077102661133, | |
| "learning_rate": 4.482132061759239e-05, | |
| "loss": 0.204, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.7203151378728193, | |
| "grad_norm": 0.6224874258041382, | |
| "learning_rate": 4.400472112525051e-05, | |
| "loss": 0.2376, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.7231288688801351, | |
| "grad_norm": 0.3423043191432953, | |
| "learning_rate": 4.3193525326884435e-05, | |
| "loss": 0.1957, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 0.7259425998874508, | |
| "grad_norm": 0.41790780425071716, | |
| "learning_rate": 4.238781150505542e-05, | |
| "loss": 0.3171, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.7287563308947664, | |
| "grad_norm": 0.550262451171875, | |
| "learning_rate": 4.158765741329935e-05, | |
| "loss": 0.2016, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.7315700619020822, | |
| "grad_norm": 0.5064123868942261, | |
| "learning_rate": 4.079314026862331e-05, | |
| "loss": 0.2747, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.7343837929093978, | |
| "grad_norm": 0.6976874470710754, | |
| "learning_rate": 4.000433674405392e-05, | |
| "loss": 0.2478, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 0.7371975239167136, | |
| "grad_norm": 0.8951148986816406, | |
| "learning_rate": 3.9221322961238213e-05, | |
| "loss": 0.1747, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.7400112549240293, | |
| "grad_norm": 0.5888150930404663, | |
| "learning_rate": 3.8444174483097675e-05, | |
| "loss": 0.2342, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 0.7428249859313449, | |
| "grad_norm": 0.39759594202041626, | |
| "learning_rate": 3.7672966306536226e-05, | |
| "loss": 0.2402, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.7456387169386607, | |
| "grad_norm": 1.4384478330612183, | |
| "learning_rate": 3.690777285520281e-05, | |
| "loss": 0.3132, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.7484524479459763, | |
| "grad_norm": 0.4053248465061188, | |
| "learning_rate": 3.614866797230935e-05, | |
| "loss": 0.1615, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.7512661789532921, | |
| "grad_norm": 0.4696710407733917, | |
| "learning_rate": 3.5395724913504545e-05, | |
| "loss": 0.1633, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.7540799099606078, | |
| "grad_norm": 0.3567434847354889, | |
| "learning_rate": 3.464901633980484e-05, | |
| "loss": 0.2388, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.7568936409679234, | |
| "grad_norm": 0.8810656070709229, | |
| "learning_rate": 3.3908614310581924e-05, | |
| "loss": 0.3078, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.7597073719752392, | |
| "grad_norm": 0.9257289171218872, | |
| "learning_rate": 3.3174590276609355e-05, | |
| "loss": 0.4227, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.7625211029825548, | |
| "grad_norm": 0.3970353305339813, | |
| "learning_rate": 3.24470150731669e-05, | |
| "loss": 0.108, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.7653348339898706, | |
| "grad_norm": 0.4147047996520996, | |
| "learning_rate": 3.1725958913205166e-05, | |
| "loss": 0.3138, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.7681485649971863, | |
| "grad_norm": 0.29604053497314453, | |
| "learning_rate": 3.1011491380569425e-05, | |
| "loss": 0.1246, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.770962296004502, | |
| "grad_norm": 0.7796684503555298, | |
| "learning_rate": 3.0303681423285068e-05, | |
| "loss": 0.3338, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.7737760270118177, | |
| "grad_norm": 0.5329720973968506, | |
| "learning_rate": 2.9602597346903406e-05, | |
| "loss": 0.3101, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.7765897580191333, | |
| "grad_norm": 0.6327192187309265, | |
| "learning_rate": 2.8908306807910534e-05, | |
| "loss": 0.1832, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.7794034890264491, | |
| "grad_norm": 0.6063408851623535, | |
| "learning_rate": 2.822087680719783e-05, | |
| "loss": 0.2447, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.7822172200337648, | |
| "grad_norm": 0.3461267650127411, | |
| "learning_rate": 2.754037368359661e-05, | |
| "loss": 0.274, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.7850309510410804, | |
| "grad_norm": 0.598047137260437, | |
| "learning_rate": 2.6866863107475803e-05, | |
| "loss": 0.173, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.7878446820483962, | |
| "grad_norm": 0.7208033800125122, | |
| "learning_rate": 2.620041007440508e-05, | |
| "loss": 0.2908, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.7906584130557118, | |
| "grad_norm": 0.3856890797615051, | |
| "learning_rate": 2.5541078898882187e-05, | |
| "loss": 0.1546, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.7934721440630276, | |
| "grad_norm": 0.9550760388374329, | |
| "learning_rate": 2.4888933208126663e-05, | |
| "loss": 0.2096, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.7962858750703433, | |
| "grad_norm": 0.4413495361804962, | |
| "learning_rate": 2.4244035935939547e-05, | |
| "loss": 0.2607, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.799099606077659, | |
| "grad_norm": 1.8407310247421265, | |
| "learning_rate": 2.360644931663014e-05, | |
| "loss": 0.3178, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.8019133370849747, | |
| "grad_norm": 0.6831107139587402, | |
| "learning_rate": 2.2976234879010218e-05, | |
| "loss": 0.2287, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.8047270680922903, | |
| "grad_norm": 0.7173850536346436, | |
| "learning_rate": 2.2353453440456316e-05, | |
| "loss": 0.2534, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.8075407990996061, | |
| "grad_norm": 0.5183877944946289, | |
| "learning_rate": 2.173816510104073e-05, | |
| "loss": 0.2009, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.8103545301069218, | |
| "grad_norm": 0.50481116771698, | |
| "learning_rate": 2.113042923773164e-05, | |
| "loss": 0.088, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.8131682611142375, | |
| "grad_norm": 0.648539125919342, | |
| "learning_rate": 2.0530304498663143e-05, | |
| "loss": 0.1828, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.8159819921215532, | |
| "grad_norm": 0.6068935990333557, | |
| "learning_rate": 1.9937848797475488e-05, | |
| "loss": 0.2655, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.8187957231288688, | |
| "grad_norm": 0.36234456300735474, | |
| "learning_rate": 1.935311930772632e-05, | |
| "loss": 0.1955, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.8216094541361846, | |
| "grad_norm": 0.5231152176856995, | |
| "learning_rate": 1.877617245737321e-05, | |
| "loss": 0.2071, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.8244231851435003, | |
| "grad_norm": 0.21570482850074768, | |
| "learning_rate": 1.8207063923328237e-05, | |
| "loss": 0.1853, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.827236916150816, | |
| "grad_norm": 0.4301048815250397, | |
| "learning_rate": 1.764584862608507e-05, | |
| "loss": 0.2832, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.8300506471581317, | |
| "grad_norm": 0.44830775260925293, | |
| "learning_rate": 1.7092580724418882e-05, | |
| "loss": 0.2344, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.8328643781654473, | |
| "grad_norm": 0.42212042212486267, | |
| "learning_rate": 1.6547313610159986e-05, | |
| "loss": 0.2679, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.8356781091727631, | |
| "grad_norm": 0.7017850875854492, | |
| "learning_rate": 1.6010099903041332e-05, | |
| "loss": 0.2124, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.8384918401800788, | |
| "grad_norm": 0.8840892910957336, | |
| "learning_rate": 1.5480991445620542e-05, | |
| "loss": 0.2939, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.8413055711873945, | |
| "grad_norm": 0.8503584265708923, | |
| "learning_rate": 1.4960039298277029e-05, | |
| "loss": 0.2429, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.8441193021947102, | |
| "grad_norm": 1.0028765201568604, | |
| "learning_rate": 1.4447293734284474e-05, | |
| "loss": 0.3548, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.8469330332020258, | |
| "grad_norm": 0.5684967637062073, | |
| "learning_rate": 1.3942804234959373e-05, | |
| "loss": 0.2871, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.8497467642093416, | |
| "grad_norm": 0.7405120730400085, | |
| "learning_rate": 1.3446619484885903e-05, | |
| "loss": 0.1683, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.8525604952166573, | |
| "grad_norm": 0.6290687918663025, | |
| "learning_rate": 1.2958787367217751e-05, | |
| "loss": 0.2926, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.855374226223973, | |
| "grad_norm": 0.7487866878509521, | |
| "learning_rate": 1.2479354959057298e-05, | |
| "loss": 0.4543, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.8581879572312887, | |
| "grad_norm": 0.2865360379219055, | |
| "learning_rate": 1.2008368526912506e-05, | |
| "loss": 0.1499, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.8610016882386043, | |
| "grad_norm": 0.7912615537643433, | |
| "learning_rate": 1.1545873522232053e-05, | |
| "loss": 0.4047, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.8638154192459201, | |
| "grad_norm": 0.4929727017879486, | |
| "learning_rate": 1.1091914577019302e-05, | |
| "loss": 0.3078, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.8666291502532358, | |
| "grad_norm": 0.590064287185669, | |
| "learning_rate": 1.0646535499524902e-05, | |
| "loss": 0.2642, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.8694428812605515, | |
| "grad_norm": 0.7222818732261658, | |
| "learning_rate": 1.0209779270019525e-05, | |
| "loss": 0.241, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.8722566122678672, | |
| "grad_norm": 0.8655977249145508, | |
| "learning_rate": 9.781688036645842e-06, | |
| "loss": 0.2761, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.8750703432751828, | |
| "grad_norm": 0.4627645015716553, | |
| "learning_rate": 9.362303111351378e-06, | |
| "loss": 0.1023, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.8778840742824986, | |
| "grad_norm": 0.07925199717283249, | |
| "learning_rate": 8.95166496590153e-06, | |
| "loss": 0.1914, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.8806978052898143, | |
| "grad_norm": 0.3665456771850586, | |
| "learning_rate": 8.549813227974247e-06, | |
| "loss": 0.2617, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.88351153629713, | |
| "grad_norm": 0.8194103240966797, | |
| "learning_rate": 8.156786677335493e-06, | |
| "loss": 0.3987, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.8863252673044457, | |
| "grad_norm": 0.4855242669582367, | |
| "learning_rate": 7.772623242097277e-06, | |
| "loss": 0.2692, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.8891389983117614, | |
| "grad_norm": 0.531732976436615, | |
| "learning_rate": 7.397359995057118e-06, | |
| "loss": 0.2017, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.8919527293190771, | |
| "grad_norm": 0.23579372465610504, | |
| "learning_rate": 7.03103315012078e-06, | |
| "loss": 0.183, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.8947664603263928, | |
| "grad_norm": 0.38668301701545715, | |
| "learning_rate": 6.673678058807198e-06, | |
| "loss": 0.1825, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.8975801913337085, | |
| "grad_norm": 0.5998560190200806, | |
| "learning_rate": 6.325329206837216e-06, | |
| "loss": 0.3018, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.9003939223410242, | |
| "grad_norm": 0.96495521068573, | |
| "learning_rate": 5.986020210805488e-06, | |
| "loss": 0.2978, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.9032076533483399, | |
| "grad_norm": 0.6704295873641968, | |
| "learning_rate": 5.655783814936433e-06, | |
| "loss": 0.1745, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.9060213843556556, | |
| "grad_norm": 0.502069890499115, | |
| "learning_rate": 5.334651887924324e-06, | |
| "loss": 0.1923, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.9088351153629713, | |
| "grad_norm": 0.1441662758588791, | |
| "learning_rate": 5.0226554198578576e-06, | |
| "loss": 0.1279, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.911648846370287, | |
| "grad_norm": 0.6545499563217163, | |
| "learning_rate": 4.719824519229554e-06, | |
| "loss": 0.2999, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.9144625773776027, | |
| "grad_norm": 0.4436165690422058, | |
| "learning_rate": 4.426188410030196e-06, | |
| "loss": 0.2404, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.9172763083849184, | |
| "grad_norm": 0.2450067549943924, | |
| "learning_rate": 4.1417754289286184e-06, | |
| "loss": 0.3257, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.9200900393922341, | |
| "grad_norm": 0.5148952603340149, | |
| "learning_rate": 3.866613022537169e-06, | |
| "loss": 0.1532, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.9229037703995498, | |
| "grad_norm": 0.39606383442878723, | |
| "learning_rate": 3.600727744763044e-06, | |
| "loss": 0.3313, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.9257175014068655, | |
| "grad_norm": 0.5422732830047607, | |
| "learning_rate": 3.344145254245723e-06, | |
| "loss": 0.1593, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.9285312324141812, | |
| "grad_norm": 0.565556526184082, | |
| "learning_rate": 3.0968903118808622e-06, | |
| "loss": 0.3292, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.9313449634214969, | |
| "grad_norm": 0.9481168389320374, | |
| "learning_rate": 2.85898677843085e-06, | |
| "loss": 0.1793, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.9341586944288126, | |
| "grad_norm": 0.5848947167396545, | |
| "learning_rate": 2.6304576122221035e-06, | |
| "loss": 0.2746, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.9369724254361284, | |
| "grad_norm": 0.840390145778656, | |
| "learning_rate": 2.411324866929543e-06, | |
| "loss": 0.3995, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.939786156443444, | |
| "grad_norm": 0.5747278928756714, | |
| "learning_rate": 2.201609689448425e-06, | |
| "loss": 0.2925, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.9425998874507597, | |
| "grad_norm": 0.518104612827301, | |
| "learning_rate": 2.0013323178535102e-06, | |
| "loss": 0.2362, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.9454136184580754, | |
| "grad_norm": 0.5568994879722595, | |
| "learning_rate": 1.810512079446125e-06, | |
| "loss": 0.2395, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.9482273494653911, | |
| "grad_norm": 0.2408752143383026, | |
| "learning_rate": 1.6291673888889302e-06, | |
| "loss": 0.1682, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.9510410804727069, | |
| "grad_norm": 0.3361740708351135, | |
| "learning_rate": 1.4573157464289554e-06, | |
| "loss": 0.1792, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.9538548114800225, | |
| "grad_norm": 0.46246424317359924, | |
| "learning_rate": 1.2949737362087156e-06, | |
| "loss": 0.1895, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.9566685424873382, | |
| "grad_norm": 0.6043664813041687, | |
| "learning_rate": 1.1421570246658242e-06, | |
| "loss": 0.2753, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.9594822734946539, | |
| "grad_norm": 0.5759782195091248, | |
| "learning_rate": 9.988803590211037e-07, | |
| "loss": 0.2, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.9622960045019696, | |
| "grad_norm": 0.7253143787384033, | |
| "learning_rate": 8.6515756585549e-07, | |
| "loss": 0.3226, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.9651097355092854, | |
| "grad_norm": 0.4216267466545105, | |
| "learning_rate": 7.410015497756994e-07, | |
| "loss": 0.2211, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.967923466516601, | |
| "grad_norm": 0.6309015154838562, | |
| "learning_rate": 6.264242921689257e-07, | |
| "loss": 0.2258, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.9707371975239167, | |
| "grad_norm": 0.4388352930545807, | |
| "learning_rate": 5.214368500465305e-07, | |
| "loss": 0.2074, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.9735509285312324, | |
| "grad_norm": 0.36881500482559204, | |
| "learning_rate": 4.260493549771316e-07, | |
| "loss": 0.3136, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.9763646595385481, | |
| "grad_norm": 0.612010657787323, | |
| "learning_rate": 3.4027101210876155e-07, | |
| "loss": 0.29, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.9791783905458639, | |
| "grad_norm": 0.5004868507385254, | |
| "learning_rate": 2.6411009928064556e-07, | |
| "loss": 0.2251, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.9819921215531795, | |
| "grad_norm": 0.3668440878391266, | |
| "learning_rate": 1.9757396622428482e-07, | |
| "loss": 0.1813, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.9848058525604952, | |
| "grad_norm": 0.45063304901123047, | |
| "learning_rate": 1.406690338542349e-07, | |
| "loss": 0.1792, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.9876195835678109, | |
| "grad_norm": 0.5779865384101868, | |
| "learning_rate": 9.340079364847931e-08, | |
| "loss": 0.2583, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.9904333145751266, | |
| "grad_norm": 0.5747032165527344, | |
| "learning_rate": 5.5773807118442154e-08, | |
| "loss": 0.2567, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.9932470455824424, | |
| "grad_norm": 0.49229690432548523, | |
| "learning_rate": 2.7791705368818143e-08, | |
| "loss": 0.1704, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.996060776589758, | |
| "grad_norm": 0.24515922367572784, | |
| "learning_rate": 9.457188747186151e-09, | |
| "loss": 0.1709, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.9988745075970737, | |
| "grad_norm": 0.45681869983673096, | |
| "learning_rate": 7.720265833510709e-10, | |
| "loss": 0.2311, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 1777, | |
| "total_flos": 6.548002937199657e+17, | |
| "train_loss": 0.29464965595847514, | |
| "train_runtime": 5546.9812, | |
| "train_samples_per_second": 2.563, | |
| "train_steps_per_second": 0.32 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1777, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 20, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.548002937199657e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |