| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9994982438534872, | |
| "eval_steps": 100, | |
| "global_step": 2490, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004014049172102358, | |
| "grad_norm": 6.058766803882945, | |
| "learning_rate": 2.0080321285140563e-07, | |
| "loss": 1.1175, | |
| "mean_token_accuracy": 0.7435387402772904, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.008028098344204716, | |
| "grad_norm": 5.776390774267812, | |
| "learning_rate": 4.0160642570281125e-07, | |
| "loss": 1.0984, | |
| "mean_token_accuracy": 0.7462003320455551, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.012042147516307075, | |
| "grad_norm": 4.964372817070418, | |
| "learning_rate": 6.024096385542169e-07, | |
| "loss": 1.1026, | |
| "mean_token_accuracy": 0.7443804442882538, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.016056196688409432, | |
| "grad_norm": 3.8205027028573406, | |
| "learning_rate": 8.032128514056225e-07, | |
| "loss": 1.0507, | |
| "mean_token_accuracy": 0.7505107507109642, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.02007024586051179, | |
| "grad_norm": 2.2377857535427106, | |
| "learning_rate": 1.0040160642570282e-06, | |
| "loss": 0.9846, | |
| "mean_token_accuracy": 0.7566263154149055, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.02408429503261415, | |
| "grad_norm": 2.019284032849979, | |
| "learning_rate": 1.2048192771084338e-06, | |
| "loss": 0.9575, | |
| "mean_token_accuracy": 0.7592063814401626, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.02809834420471651, | |
| "grad_norm": 1.4724634441521711, | |
| "learning_rate": 1.4056224899598394e-06, | |
| "loss": 0.8998, | |
| "mean_token_accuracy": 0.7671008065342904, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.032112393376818864, | |
| "grad_norm": 1.1460238128354983, | |
| "learning_rate": 1.606425702811245e-06, | |
| "loss": 0.8622, | |
| "mean_token_accuracy": 0.774419629573822, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.03612644254892122, | |
| "grad_norm": 0.8962017383164974, | |
| "learning_rate": 1.8072289156626508e-06, | |
| "loss": 0.8274, | |
| "mean_token_accuracy": 0.7802118778228759, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.04014049172102358, | |
| "grad_norm": 0.812855071257074, | |
| "learning_rate": 2.0080321285140564e-06, | |
| "loss": 0.7938, | |
| "mean_token_accuracy": 0.7862023413181305, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.04415454089312594, | |
| "grad_norm": 0.70094482944081, | |
| "learning_rate": 2.2088353413654622e-06, | |
| "loss": 0.7875, | |
| "mean_token_accuracy": 0.7867150768637657, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.0481685900652283, | |
| "grad_norm": 0.6178275403403997, | |
| "learning_rate": 2.4096385542168676e-06, | |
| "loss": 0.7622, | |
| "mean_token_accuracy": 0.7920725375413895, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.05218263923733066, | |
| "grad_norm": 0.6043428588395752, | |
| "learning_rate": 2.6104417670682734e-06, | |
| "loss": 0.7431, | |
| "mean_token_accuracy": 0.7964029759168625, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.05619668840943302, | |
| "grad_norm": 0.575539588575506, | |
| "learning_rate": 2.811244979919679e-06, | |
| "loss": 0.7275, | |
| "mean_token_accuracy": 0.79924486130476, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.060210737581535376, | |
| "grad_norm": 0.580001753536299, | |
| "learning_rate": 3.012048192771085e-06, | |
| "loss": 0.7098, | |
| "mean_token_accuracy": 0.8032902508974076, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.06422478675363773, | |
| "grad_norm": 0.5522658470324157, | |
| "learning_rate": 3.21285140562249e-06, | |
| "loss": 0.6998, | |
| "mean_token_accuracy": 0.8053625896573067, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.0682388359257401, | |
| "grad_norm": 0.5299497087282212, | |
| "learning_rate": 3.4136546184738962e-06, | |
| "loss": 0.7, | |
| "mean_token_accuracy": 0.8049216270446777, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.07225288509784245, | |
| "grad_norm": 0.5510228635994878, | |
| "learning_rate": 3.6144578313253016e-06, | |
| "loss": 0.7066, | |
| "mean_token_accuracy": 0.803421251475811, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.07626693426994481, | |
| "grad_norm": 0.5707453842809624, | |
| "learning_rate": 3.8152610441767074e-06, | |
| "loss": 0.685, | |
| "mean_token_accuracy": 0.8084254205226898, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.08028098344204716, | |
| "grad_norm": 0.6177649426118329, | |
| "learning_rate": 4.016064257028113e-06, | |
| "loss": 0.6964, | |
| "mean_token_accuracy": 0.8053169295191764, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.08429503261414953, | |
| "grad_norm": 0.5690454870209587, | |
| "learning_rate": 4.216867469879519e-06, | |
| "loss": 0.687, | |
| "mean_token_accuracy": 0.8071819305419922, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.08830908178625188, | |
| "grad_norm": 0.6164411988135075, | |
| "learning_rate": 4.4176706827309244e-06, | |
| "loss": 0.6607, | |
| "mean_token_accuracy": 0.8136294975876808, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.09232313095835425, | |
| "grad_norm": 0.553614457840363, | |
| "learning_rate": 4.61847389558233e-06, | |
| "loss": 0.6623, | |
| "mean_token_accuracy": 0.8132717654109001, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.0963371801304566, | |
| "grad_norm": 0.5422023366195315, | |
| "learning_rate": 4.819277108433735e-06, | |
| "loss": 0.6721, | |
| "mean_token_accuracy": 0.8104784831404686, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.10035122930255895, | |
| "grad_norm": 0.5133572915810863, | |
| "learning_rate": 5.0200803212851415e-06, | |
| "loss": 0.6645, | |
| "mean_token_accuracy": 0.8119137555360794, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.10436527847466132, | |
| "grad_norm": 0.58503647555332, | |
| "learning_rate": 5.220883534136547e-06, | |
| "loss": 0.6635, | |
| "mean_token_accuracy": 0.8120792865753174, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.10837932764676367, | |
| "grad_norm": 0.5684351953572544, | |
| "learning_rate": 5.421686746987952e-06, | |
| "loss": 0.6505, | |
| "mean_token_accuracy": 0.8155466809868812, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.11239337681886603, | |
| "grad_norm": 0.5447677613420705, | |
| "learning_rate": 5.622489959839358e-06, | |
| "loss": 0.6449, | |
| "mean_token_accuracy": 0.8165045753121376, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.11640742599096839, | |
| "grad_norm": 0.6432560163866337, | |
| "learning_rate": 5.823293172690764e-06, | |
| "loss": 0.647, | |
| "mean_token_accuracy": 0.8159680441021919, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.12042147516307075, | |
| "grad_norm": 0.5878776869535187, | |
| "learning_rate": 6.02409638554217e-06, | |
| "loss": 0.6398, | |
| "mean_token_accuracy": 0.8172471389174462, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.1244355243351731, | |
| "grad_norm": 0.5528353725419236, | |
| "learning_rate": 6.224899598393575e-06, | |
| "loss": 0.6453, | |
| "mean_token_accuracy": 0.8162423759698868, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.12844957350727546, | |
| "grad_norm": 0.7274143611573796, | |
| "learning_rate": 6.42570281124498e-06, | |
| "loss": 0.6538, | |
| "mean_token_accuracy": 0.8140816584229469, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.13246362267937783, | |
| "grad_norm": 0.602479326169129, | |
| "learning_rate": 6.626506024096386e-06, | |
| "loss": 0.6388, | |
| "mean_token_accuracy": 0.8172610536217689, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.1364776718514802, | |
| "grad_norm": 0.59001493891811, | |
| "learning_rate": 6.8273092369477925e-06, | |
| "loss": 0.6386, | |
| "mean_token_accuracy": 0.8174594342708588, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.14049172102358254, | |
| "grad_norm": 0.643343148654434, | |
| "learning_rate": 7.028112449799197e-06, | |
| "loss": 0.6449, | |
| "mean_token_accuracy": 0.8159894704818725, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.1445057701956849, | |
| "grad_norm": 0.6086401788605312, | |
| "learning_rate": 7.228915662650603e-06, | |
| "loss": 0.619, | |
| "mean_token_accuracy": 0.821616081893444, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.14851981936778724, | |
| "grad_norm": 0.6081562590281072, | |
| "learning_rate": 7.429718875502009e-06, | |
| "loss": 0.6338, | |
| "mean_token_accuracy": 0.8180027529597282, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.15253386853988962, | |
| "grad_norm": 0.5631488889183262, | |
| "learning_rate": 7.630522088353415e-06, | |
| "loss": 0.6206, | |
| "mean_token_accuracy": 0.8213231518864632, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.15654791771199197, | |
| "grad_norm": 0.6623884065817287, | |
| "learning_rate": 7.83132530120482e-06, | |
| "loss": 0.6233, | |
| "mean_token_accuracy": 0.8204338252544403, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.16056196688409433, | |
| "grad_norm": 0.6674297683399895, | |
| "learning_rate": 8.032128514056226e-06, | |
| "loss": 0.6312, | |
| "mean_token_accuracy": 0.8186420142650604, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.16457601605619668, | |
| "grad_norm": 0.704186654303425, | |
| "learning_rate": 8.232931726907631e-06, | |
| "loss": 0.6266, | |
| "mean_token_accuracy": 0.8195266082882882, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.16859006522829906, | |
| "grad_norm": 0.6473103907382632, | |
| "learning_rate": 8.433734939759038e-06, | |
| "loss": 0.6287, | |
| "mean_token_accuracy": 0.819443441927433, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.1726041144004014, | |
| "grad_norm": 0.5992551573092145, | |
| "learning_rate": 8.634538152610442e-06, | |
| "loss": 0.6141, | |
| "mean_token_accuracy": 0.8228801786899567, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.17661816357250376, | |
| "grad_norm": 0.607794767167812, | |
| "learning_rate": 8.835341365461849e-06, | |
| "loss": 0.6233, | |
| "mean_token_accuracy": 0.8208266854286194, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.1806322127446061, | |
| "grad_norm": 0.6251260607888597, | |
| "learning_rate": 9.036144578313254e-06, | |
| "loss": 0.6186, | |
| "mean_token_accuracy": 0.8219277203083039, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.1846462619167085, | |
| "grad_norm": 0.7752879519685737, | |
| "learning_rate": 9.23694779116466e-06, | |
| "loss": 0.616, | |
| "mean_token_accuracy": 0.8226246342062951, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.18866031108881084, | |
| "grad_norm": 0.6885986830946352, | |
| "learning_rate": 9.437751004016065e-06, | |
| "loss": 0.6162, | |
| "mean_token_accuracy": 0.82130526304245, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.1926743602609132, | |
| "grad_norm": 0.6299815927660661, | |
| "learning_rate": 9.63855421686747e-06, | |
| "loss": 0.6173, | |
| "mean_token_accuracy": 0.8219423845410347, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.19668840943301555, | |
| "grad_norm": 0.7583480330442449, | |
| "learning_rate": 9.839357429718876e-06, | |
| "loss": 0.6148, | |
| "mean_token_accuracy": 0.8225555747747422, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.2007024586051179, | |
| "grad_norm": 0.7143206609736337, | |
| "learning_rate": 9.99999508689586e-06, | |
| "loss": 0.6154, | |
| "mean_token_accuracy": 0.8219420880079269, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.20471650777722028, | |
| "grad_norm": 0.6907038048664699, | |
| "learning_rate": 9.999823129264712e-06, | |
| "loss": 0.6108, | |
| "mean_token_accuracy": 0.8229139536619187, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.20873055694932263, | |
| "grad_norm": 0.6559142117068314, | |
| "learning_rate": 9.999405526081825e-06, | |
| "loss": 0.6189, | |
| "mean_token_accuracy": 0.8209213152527809, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.21274460612142498, | |
| "grad_norm": 0.6659512333420378, | |
| "learning_rate": 9.998742297864394e-06, | |
| "loss": 0.6035, | |
| "mean_token_accuracy": 0.8244217514991761, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.21675865529352734, | |
| "grad_norm": 0.5947661763992849, | |
| "learning_rate": 9.997833477197386e-06, | |
| "loss": 0.5997, | |
| "mean_token_accuracy": 0.8259585857391357, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.22077270446562972, | |
| "grad_norm": 0.8438008201731584, | |
| "learning_rate": 9.99667910873193e-06, | |
| "loss": 0.6078, | |
| "mean_token_accuracy": 0.8240197777748108, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.22478675363773207, | |
| "grad_norm": 0.8713006693349223, | |
| "learning_rate": 9.99527924918313e-06, | |
| "loss": 0.6016, | |
| "mean_token_accuracy": 0.8250452890992165, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.22880080280983442, | |
| "grad_norm": 0.7120303554563648, | |
| "learning_rate": 9.99363396732727e-06, | |
| "loss": 0.6052, | |
| "mean_token_accuracy": 0.8246865943074226, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.23281485198193677, | |
| "grad_norm": 0.6101967868649579, | |
| "learning_rate": 9.991743343998446e-06, | |
| "loss": 0.5928, | |
| "mean_token_accuracy": 0.8269972503185272, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.23682890115403912, | |
| "grad_norm": 0.6881046614257558, | |
| "learning_rate": 9.989607472084583e-06, | |
| "loss": 0.5997, | |
| "mean_token_accuracy": 0.8250862330198288, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.2408429503261415, | |
| "grad_norm": 0.6112394055545181, | |
| "learning_rate": 9.987226456522884e-06, | |
| "loss": 0.5915, | |
| "mean_token_accuracy": 0.8273163467645646, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.24485699949824385, | |
| "grad_norm": 0.6277850468291706, | |
| "learning_rate": 9.98460041429466e-06, | |
| "loss": 0.5903, | |
| "mean_token_accuracy": 0.8281888499855995, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.2488710486703462, | |
| "grad_norm": 0.7241042442848334, | |
| "learning_rate": 9.981729474419595e-06, | |
| "loss": 0.6041, | |
| "mean_token_accuracy": 0.8239907890558242, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.25288509784244856, | |
| "grad_norm": 0.6241491569251894, | |
| "learning_rate": 9.978613777949401e-06, | |
| "loss": 0.5964, | |
| "mean_token_accuracy": 0.8260207697749138, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.2568991470145509, | |
| "grad_norm": 0.6370033368365354, | |
| "learning_rate": 9.975253477960887e-06, | |
| "loss": 0.5917, | |
| "mean_token_accuracy": 0.8270863309502602, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.26091319618665326, | |
| "grad_norm": 0.638196228956886, | |
| "learning_rate": 9.971648739548443e-06, | |
| "loss": 0.5955, | |
| "mean_token_accuracy": 0.8271015107631683, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.26492724535875567, | |
| "grad_norm": 0.5924987938379438, | |
| "learning_rate": 9.967799739815925e-06, | |
| "loss": 0.5953, | |
| "mean_token_accuracy": 0.8263336911797523, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.268941294530858, | |
| "grad_norm": 0.6085311126983912, | |
| "learning_rate": 9.963706667867956e-06, | |
| "loss": 0.5963, | |
| "mean_token_accuracy": 0.8259957909584046, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.2729553437029604, | |
| "grad_norm": 0.739333912621636, | |
| "learning_rate": 9.95936972480063e-06, | |
| "loss": 0.5904, | |
| "mean_token_accuracy": 0.8273468598723411, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.2769693928750627, | |
| "grad_norm": 0.7146326321442452, | |
| "learning_rate": 9.954789123691643e-06, | |
| "loss": 0.5826, | |
| "mean_token_accuracy": 0.829837815463543, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.2809834420471651, | |
| "grad_norm": 0.6734576231685178, | |
| "learning_rate": 9.94996508958981e-06, | |
| "loss": 0.5887, | |
| "mean_token_accuracy": 0.828106701374054, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.28499749121926743, | |
| "grad_norm": 0.6525370797613959, | |
| "learning_rate": 9.944897859504022e-06, | |
| "loss": 0.5749, | |
| "mean_token_accuracy": 0.8314093336462974, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.2890115403913698, | |
| "grad_norm": 0.656220700779885, | |
| "learning_rate": 9.939587682391587e-06, | |
| "loss": 0.5881, | |
| "mean_token_accuracy": 0.8278362900018692, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.29302558956347213, | |
| "grad_norm": 0.8291093196925574, | |
| "learning_rate": 9.934034819146015e-06, | |
| "loss": 0.5857, | |
| "mean_token_accuracy": 0.8292678326368332, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.2970396387355745, | |
| "grad_norm": 0.75313947093166, | |
| "learning_rate": 9.928239542584186e-06, | |
| "loss": 0.581, | |
| "mean_token_accuracy": 0.8295959487557412, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.3010536879076769, | |
| "grad_norm": 0.6701377011807581, | |
| "learning_rate": 9.922202137432954e-06, | |
| "loss": 0.5774, | |
| "mean_token_accuracy": 0.8301370680332184, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.30506773707977924, | |
| "grad_norm": 0.6752117129825771, | |
| "learning_rate": 9.915922900315158e-06, | |
| "loss": 0.5862, | |
| "mean_token_accuracy": 0.828272470831871, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.3090817862518816, | |
| "grad_norm": 0.6462301731086534, | |
| "learning_rate": 9.90940213973504e-06, | |
| "loss": 0.5872, | |
| "mean_token_accuracy": 0.8279532685875892, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.31309583542398395, | |
| "grad_norm": 0.6180558660475647, | |
| "learning_rate": 9.902640176063103e-06, | |
| "loss": 0.5839, | |
| "mean_token_accuracy": 0.8296105518937111, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.3171098845960863, | |
| "grad_norm": 0.7334201323221599, | |
| "learning_rate": 9.895637341520357e-06, | |
| "loss": 0.5914, | |
| "mean_token_accuracy": 0.8273193582892417, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.32112393376818865, | |
| "grad_norm": 0.6244699500533561, | |
| "learning_rate": 9.888393980162e-06, | |
| "loss": 0.5894, | |
| "mean_token_accuracy": 0.8276168584823609, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.325137982940291, | |
| "grad_norm": 0.5890996107094697, | |
| "learning_rate": 9.880910447860527e-06, | |
| "loss": 0.5714, | |
| "mean_token_accuracy": 0.8319997638463974, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.32915203211239336, | |
| "grad_norm": 0.6067071145439632, | |
| "learning_rate": 9.873187112288224e-06, | |
| "loss": 0.5776, | |
| "mean_token_accuracy": 0.8308833613991737, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.3331660812844957, | |
| "grad_norm": 0.599604126460933, | |
| "learning_rate": 9.86522435289912e-06, | |
| "loss": 0.591, | |
| "mean_token_accuracy": 0.8270509079098701, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.3371801304565981, | |
| "grad_norm": 0.6817769908050123, | |
| "learning_rate": 9.857022560910338e-06, | |
| "loss": 0.5834, | |
| "mean_token_accuracy": 0.8284465298056602, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.34119417962870047, | |
| "grad_norm": 0.718771722804361, | |
| "learning_rate": 9.848582139282879e-06, | |
| "loss": 0.5806, | |
| "mean_token_accuracy": 0.8296876326203346, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.3452082288008028, | |
| "grad_norm": 0.751747783066705, | |
| "learning_rate": 9.839903502701815e-06, | |
| "loss": 0.5866, | |
| "mean_token_accuracy": 0.8281015455722809, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.34922227797290517, | |
| "grad_norm": 0.6843260226103427, | |
| "learning_rate": 9.830987077555925e-06, | |
| "loss": 0.5708, | |
| "mean_token_accuracy": 0.8315734773874283, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.3532363271450075, | |
| "grad_norm": 0.609990923675998, | |
| "learning_rate": 9.821833301916737e-06, | |
| "loss": 0.5744, | |
| "mean_token_accuracy": 0.8311674281954765, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.3572503763171099, | |
| "grad_norm": 0.6541236549277216, | |
| "learning_rate": 9.812442625517017e-06, | |
| "loss": 0.5759, | |
| "mean_token_accuracy": 0.8305603489279747, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.3612644254892122, | |
| "grad_norm": 0.6333659668052299, | |
| "learning_rate": 9.802815509728662e-06, | |
| "loss": 0.5663, | |
| "mean_token_accuracy": 0.8331687957048416, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.3652784746613146, | |
| "grad_norm": 0.6543354382780389, | |
| "learning_rate": 9.792952427540037e-06, | |
| "loss": 0.57, | |
| "mean_token_accuracy": 0.8322424262762069, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.369292523833417, | |
| "grad_norm": 0.5994204037638772, | |
| "learning_rate": 9.782853863532736e-06, | |
| "loss": 0.5806, | |
| "mean_token_accuracy": 0.8293602868914605, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.37330657300551934, | |
| "grad_norm": 0.6554085321235406, | |
| "learning_rate": 9.772520313857777e-06, | |
| "loss": 0.5664, | |
| "mean_token_accuracy": 0.8329186499118805, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.3773206221776217, | |
| "grad_norm": 0.6642085090735961, | |
| "learning_rate": 9.761952286211221e-06, | |
| "loss": 0.5726, | |
| "mean_token_accuracy": 0.8313711732625961, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.38133467134972404, | |
| "grad_norm": 0.6831480368586637, | |
| "learning_rate": 9.75115029980923e-06, | |
| "loss": 0.5719, | |
| "mean_token_accuracy": 0.8317721590399743, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.3853487205218264, | |
| "grad_norm": 0.5545902036926461, | |
| "learning_rate": 9.740114885362562e-06, | |
| "loss": 0.5739, | |
| "mean_token_accuracy": 0.8307827830314636, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.38936276969392875, | |
| "grad_norm": 0.6045330433361745, | |
| "learning_rate": 9.728846585050486e-06, | |
| "loss": 0.5609, | |
| "mean_token_accuracy": 0.8340664029121398, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.3933768188660311, | |
| "grad_norm": 0.5622152800616955, | |
| "learning_rate": 9.717345952494162e-06, | |
| "loss": 0.5638, | |
| "mean_token_accuracy": 0.8336855262517929, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.39739086803813345, | |
| "grad_norm": 0.6341361309278204, | |
| "learning_rate": 9.705613552729416e-06, | |
| "loss": 0.5778, | |
| "mean_token_accuracy": 0.8299095824360847, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.4014049172102358, | |
| "grad_norm": 0.6805767537720608, | |
| "learning_rate": 9.693649962179006e-06, | |
| "loss": 0.5659, | |
| "mean_token_accuracy": 0.8327050372958184, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.4054189663823382, | |
| "grad_norm": 0.5836257323922431, | |
| "learning_rate": 9.681455768624284e-06, | |
| "loss": 0.5778, | |
| "mean_token_accuracy": 0.8303996577858925, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.40943301555444056, | |
| "grad_norm": 0.6427318323333274, | |
| "learning_rate": 9.669031571176322e-06, | |
| "loss": 0.5615, | |
| "mean_token_accuracy": 0.8343263894319535, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.4134470647265429, | |
| "grad_norm": 0.742563712570686, | |
| "learning_rate": 9.656377980246483e-06, | |
| "loss": 0.562, | |
| "mean_token_accuracy": 0.8343602031469345, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.41746111389864526, | |
| "grad_norm": 0.590467419516716, | |
| "learning_rate": 9.64349561751642e-06, | |
| "loss": 0.565, | |
| "mean_token_accuracy": 0.8327525511384011, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.4214751630707476, | |
| "grad_norm": 0.6312451978096704, | |
| "learning_rate": 9.630385115907545e-06, | |
| "loss": 0.5705, | |
| "mean_token_accuracy": 0.8321760416030883, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.42548921224284997, | |
| "grad_norm": 0.598557343394431, | |
| "learning_rate": 9.617047119549925e-06, | |
| "loss": 0.5737, | |
| "mean_token_accuracy": 0.831158398091793, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.4295032614149523, | |
| "grad_norm": 0.5787052037585553, | |
| "learning_rate": 9.603482283750631e-06, | |
| "loss": 0.5722, | |
| "mean_token_accuracy": 0.8312353953719139, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.43351731058705467, | |
| "grad_norm": 0.6288500273743972, | |
| "learning_rate": 9.589691274961556e-06, | |
| "loss": 0.5721, | |
| "mean_token_accuracy": 0.8316881075501442, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.437531359759157, | |
| "grad_norm": 0.6590854367714448, | |
| "learning_rate": 9.57567477074666e-06, | |
| "loss": 0.5728, | |
| "mean_token_accuracy": 0.8310303285717964, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.44154540893125943, | |
| "grad_norm": 0.7631123718341468, | |
| "learning_rate": 9.561433459748687e-06, | |
| "loss": 0.5723, | |
| "mean_token_accuracy": 0.8313180610537529, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.4455594581033618, | |
| "grad_norm": 0.7591778684239452, | |
| "learning_rate": 9.546968041655326e-06, | |
| "loss": 0.5573, | |
| "mean_token_accuracy": 0.8353345975279808, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.44957350727546413, | |
| "grad_norm": 0.6624232933704659, | |
| "learning_rate": 9.53227922716484e-06, | |
| "loss": 0.565, | |
| "mean_token_accuracy": 0.8330870345234871, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.4535875564475665, | |
| "grad_norm": 0.5786385616805684, | |
| "learning_rate": 9.517367737951144e-06, | |
| "loss": 0.5692, | |
| "mean_token_accuracy": 0.8318209871649742, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.45760160561966884, | |
| "grad_norm": 0.690131830320271, | |
| "learning_rate": 9.502234306628354e-06, | |
| "loss": 0.5693, | |
| "mean_token_accuracy": 0.8317399948835373, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.4616156547917712, | |
| "grad_norm": 0.7134142378990983, | |
| "learning_rate": 9.48687967671479e-06, | |
| "loss": 0.5609, | |
| "mean_token_accuracy": 0.8342039838433266, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.46562970396387354, | |
| "grad_norm": 0.7400587759918655, | |
| "learning_rate": 9.471304602596441e-06, | |
| "loss": 0.5628, | |
| "mean_token_accuracy": 0.8338278353214263, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.4696437531359759, | |
| "grad_norm": 0.6806903212075592, | |
| "learning_rate": 9.455509849489915e-06, | |
| "loss": 0.5633, | |
| "mean_token_accuracy": 0.8332584217190743, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.47365780230807825, | |
| "grad_norm": 0.761594191887626, | |
| "learning_rate": 9.43949619340483e-06, | |
| "loss": 0.564, | |
| "mean_token_accuracy": 0.8335172370076179, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.47767185148018065, | |
| "grad_norm": 0.5988325912168402, | |
| "learning_rate": 9.42326442110569e-06, | |
| "loss": 0.5721, | |
| "mean_token_accuracy": 0.8312305808067322, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.481685900652283, | |
| "grad_norm": 0.6255751606480096, | |
| "learning_rate": 9.406815330073244e-06, | |
| "loss": 0.569, | |
| "mean_token_accuracy": 0.8319594085216522, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.48569994982438536, | |
| "grad_norm": 0.5996297218515124, | |
| "learning_rate": 9.390149728465285e-06, | |
| "loss": 0.553, | |
| "mean_token_accuracy": 0.8362368091940879, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.4897139989964877, | |
| "grad_norm": 0.6918796234724647, | |
| "learning_rate": 9.373268435076959e-06, | |
| "loss": 0.5575, | |
| "mean_token_accuracy": 0.835071188211441, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.49372804816859006, | |
| "grad_norm": 0.7009608255947237, | |
| "learning_rate": 9.356172279300528e-06, | |
| "loss": 0.5575, | |
| "mean_token_accuracy": 0.8341048300266266, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.4977420973406924, | |
| "grad_norm": 0.6275504392991532, | |
| "learning_rate": 9.338862101084631e-06, | |
| "loss": 0.5636, | |
| "mean_token_accuracy": 0.8333185657858848, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.5017561465127948, | |
| "grad_norm": 0.5534731372491267, | |
| "learning_rate": 9.321338750893008e-06, | |
| "loss": 0.5683, | |
| "mean_token_accuracy": 0.8324558317661286, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.5057701956848971, | |
| "grad_norm": 0.5603065647360427, | |
| "learning_rate": 9.303603089662717e-06, | |
| "loss": 0.5504, | |
| "mean_token_accuracy": 0.8368956163525582, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.5097842448569995, | |
| "grad_norm": 0.6053004197413049, | |
| "learning_rate": 9.285655988761839e-06, | |
| "loss": 0.5499, | |
| "mean_token_accuracy": 0.8363432809710503, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.5137982940291018, | |
| "grad_norm": 0.5801686134608613, | |
| "learning_rate": 9.267498329946669e-06, | |
| "loss": 0.5653, | |
| "mean_token_accuracy": 0.8325617238879204, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.5178123432012042, | |
| "grad_norm": 0.6071815287621978, | |
| "learning_rate": 9.249131005318388e-06, | |
| "loss": 0.5544, | |
| "mean_token_accuracy": 0.8360264331102372, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.5218263923733065, | |
| "grad_norm": 0.6554755596069916, | |
| "learning_rate": 9.230554917279233e-06, | |
| "loss": 0.5581, | |
| "mean_token_accuracy": 0.8350590914487839, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.5258404415454089, | |
| "grad_norm": 0.6069071364502077, | |
| "learning_rate": 9.211770978488171e-06, | |
| "loss": 0.5627, | |
| "mean_token_accuracy": 0.833693404495716, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.5298544907175113, | |
| "grad_norm": 0.688309769409275, | |
| "learning_rate": 9.192780111816048e-06, | |
| "loss": 0.5632, | |
| "mean_token_accuracy": 0.8338836416602134, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.5338685398896137, | |
| "grad_norm": 0.6276359304847686, | |
| "learning_rate": 9.173583250300253e-06, | |
| "loss": 0.5624, | |
| "mean_token_accuracy": 0.8330780416727066, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.537882589061716, | |
| "grad_norm": 0.6332615236946093, | |
| "learning_rate": 9.154181337098878e-06, | |
| "loss": 0.552, | |
| "mean_token_accuracy": 0.8359251782298088, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.5418966382338184, | |
| "grad_norm": 0.5861055869183572, | |
| "learning_rate": 9.134575325444377e-06, | |
| "loss": 0.5423, | |
| "mean_token_accuracy": 0.8382502719759941, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.5459106874059207, | |
| "grad_norm": 0.6171709061249796, | |
| "learning_rate": 9.114766178596734e-06, | |
| "loss": 0.5558, | |
| "mean_token_accuracy": 0.8346521988511085, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.5499247365780231, | |
| "grad_norm": 0.5654000660210794, | |
| "learning_rate": 9.09475486979614e-06, | |
| "loss": 0.5528, | |
| "mean_token_accuracy": 0.8365279525518418, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.5539387857501255, | |
| "grad_norm": 0.6466821559302002, | |
| "learning_rate": 9.07454238221517e-06, | |
| "loss": 0.5602, | |
| "mean_token_accuracy": 0.8333901852369309, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.5579528349222278, | |
| "grad_norm": 0.6175016838311507, | |
| "learning_rate": 9.054129708910486e-06, | |
| "loss": 0.5657, | |
| "mean_token_accuracy": 0.8327378645539284, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.5619668840943302, | |
| "grad_norm": 0.5700614104338453, | |
| "learning_rate": 9.033517852774046e-06, | |
| "loss": 0.5501, | |
| "mean_token_accuracy": 0.8366321474313736, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.5659809332664325, | |
| "grad_norm": 0.5863271468585884, | |
| "learning_rate": 9.012707826483823e-06, | |
| "loss": 0.5468, | |
| "mean_token_accuracy": 0.8372207880020142, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.5699949824385349, | |
| "grad_norm": 0.6369292028334194, | |
| "learning_rate": 8.991700652454066e-06, | |
| "loss": 0.5619, | |
| "mean_token_accuracy": 0.8337433338165283, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.5740090316106372, | |
| "grad_norm": 0.5387359333690185, | |
| "learning_rate": 8.970497362785052e-06, | |
| "loss": 0.5579, | |
| "mean_token_accuracy": 0.8344336777925492, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.5780230807827396, | |
| "grad_norm": 0.6587993755610946, | |
| "learning_rate": 8.94909899921239e-06, | |
| "loss": 0.5542, | |
| "mean_token_accuracy": 0.8355127662420273, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.5820371299548419, | |
| "grad_norm": 0.6401226301007903, | |
| "learning_rate": 8.927506613055839e-06, | |
| "loss": 0.5497, | |
| "mean_token_accuracy": 0.8362575441598892, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.5860511791269443, | |
| "grad_norm": 0.6804051477273255, | |
| "learning_rate": 8.905721265167644e-06, | |
| "loss": 0.5399, | |
| "mean_token_accuracy": 0.8386722207069397, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.5900652282990466, | |
| "grad_norm": 0.5593126661424827, | |
| "learning_rate": 8.883744025880429e-06, | |
| "loss": 0.5492, | |
| "mean_token_accuracy": 0.8365695863962174, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.594079277471149, | |
| "grad_norm": 0.7158554231921177, | |
| "learning_rate": 8.861575974954602e-06, | |
| "loss": 0.5498, | |
| "mean_token_accuracy": 0.8374354973435402, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.5980933266432514, | |
| "grad_norm": 0.7747230270377562, | |
| "learning_rate": 8.839218201525312e-06, | |
| "loss": 0.5601, | |
| "mean_token_accuracy": 0.8338992148637772, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.6021073758153538, | |
| "grad_norm": 0.6309209085971893, | |
| "learning_rate": 8.816671804048933e-06, | |
| "loss": 0.5496, | |
| "mean_token_accuracy": 0.8368017837405205, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.6061214249874561, | |
| "grad_norm": 0.6666014610071999, | |
| "learning_rate": 8.7939378902491e-06, | |
| "loss": 0.5642, | |
| "mean_token_accuracy": 0.8328350514173508, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.6101354741595585, | |
| "grad_norm": 0.6202628898930358, | |
| "learning_rate": 8.771017577062282e-06, | |
| "loss": 0.5455, | |
| "mean_token_accuracy": 0.8375746294856071, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.6141495233316608, | |
| "grad_norm": 0.6066663449696902, | |
| "learning_rate": 8.747911990582912e-06, | |
| "loss": 0.5542, | |
| "mean_token_accuracy": 0.8355247572064399, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.6181635725037632, | |
| "grad_norm": 0.6146156971775205, | |
| "learning_rate": 8.724622266008054e-06, | |
| "loss": 0.5586, | |
| "mean_token_accuracy": 0.834241335093975, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.6221776216758655, | |
| "grad_norm": 0.5872550377785694, | |
| "learning_rate": 8.701149547581631e-06, | |
| "loss": 0.5482, | |
| "mean_token_accuracy": 0.8369159802794457, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.6261916708479679, | |
| "grad_norm": 0.5902160677997114, | |
| "learning_rate": 8.67749498853821e-06, | |
| "loss": 0.5394, | |
| "mean_token_accuracy": 0.8388776108622551, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.6302057200200702, | |
| "grad_norm": 0.5672836827289786, | |
| "learning_rate": 8.65365975104635e-06, | |
| "loss": 0.5531, | |
| "mean_token_accuracy": 0.8354972064495086, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.6342197691921726, | |
| "grad_norm": 0.6235670038077569, | |
| "learning_rate": 8.629645006151483e-06, | |
| "loss": 0.5527, | |
| "mean_token_accuracy": 0.8354064971208572, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.638233818364275, | |
| "grad_norm": 0.6716148045956016, | |
| "learning_rate": 8.6054519337184e-06, | |
| "loss": 0.5672, | |
| "mean_token_accuracy": 0.8319687351584435, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.6422478675363773, | |
| "grad_norm": 0.6872493021964662, | |
| "learning_rate": 8.58108172237327e-06, | |
| "loss": 0.5423, | |
| "mean_token_accuracy": 0.8382853552699089, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.6462619167084797, | |
| "grad_norm": 0.567572405722234, | |
| "learning_rate": 8.556535569445252e-06, | |
| "loss": 0.5584, | |
| "mean_token_accuracy": 0.8342087417840958, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.650275965880582, | |
| "grad_norm": 0.6202297374631588, | |
| "learning_rate": 8.531814680907664e-06, | |
| "loss": 0.5494, | |
| "mean_token_accuracy": 0.8363103911280632, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.6542900150526844, | |
| "grad_norm": 0.7149267451711114, | |
| "learning_rate": 8.506920271318729e-06, | |
| "loss": 0.5526, | |
| "mean_token_accuracy": 0.835868252813816, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.6583040642247867, | |
| "grad_norm": 0.5912849173956192, | |
| "learning_rate": 8.481853563761906e-06, | |
| "loss": 0.5453, | |
| "mean_token_accuracy": 0.8372933536767959, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.6623181133968891, | |
| "grad_norm": 0.6637136617026432, | |
| "learning_rate": 8.456615789785804e-06, | |
| "loss": 0.5422, | |
| "mean_token_accuracy": 0.8382136434316635, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.6663321625689914, | |
| "grad_norm": 0.6995566851869169, | |
| "learning_rate": 8.43120818934367e-06, | |
| "loss": 0.555, | |
| "mean_token_accuracy": 0.8347884580492974, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.6703462117410939, | |
| "grad_norm": 0.6602263363301184, | |
| "learning_rate": 8.405632010732462e-06, | |
| "loss": 0.548, | |
| "mean_token_accuracy": 0.8366368874907494, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.6743602609131962, | |
| "grad_norm": 0.6796507475619034, | |
| "learning_rate": 8.379888510531536e-06, | |
| "loss": 0.5418, | |
| "mean_token_accuracy": 0.8378365620970726, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.6783743100852986, | |
| "grad_norm": 0.6973054982700846, | |
| "learning_rate": 8.353978953540893e-06, | |
| "loss": 0.5467, | |
| "mean_token_accuracy": 0.8372493907809258, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.6823883592574009, | |
| "grad_norm": 0.6536264569620208, | |
| "learning_rate": 8.32790461271905e-06, | |
| "loss": 0.5418, | |
| "mean_token_accuracy": 0.8384019210934639, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.6864024084295033, | |
| "grad_norm": 0.546683652398305, | |
| "learning_rate": 8.301666769120488e-06, | |
| "loss": 0.5437, | |
| "mean_token_accuracy": 0.8375723645091057, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.6904164576016056, | |
| "grad_norm": 0.7963584379304186, | |
| "learning_rate": 8.275266711832722e-06, | |
| "loss": 0.5504, | |
| "mean_token_accuracy": 0.8361119583249093, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.694430506773708, | |
| "grad_norm": 0.6129739181303979, | |
| "learning_rate": 8.24870573791296e-06, | |
| "loss": 0.5587, | |
| "mean_token_accuracy": 0.8341295495629311, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.6984445559458103, | |
| "grad_norm": 0.6035540038334497, | |
| "learning_rate": 8.221985152324385e-06, | |
| "loss": 0.5438, | |
| "mean_token_accuracy": 0.8377896025776863, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.7024586051179127, | |
| "grad_norm": 0.5464317718279572, | |
| "learning_rate": 8.195106267872035e-06, | |
| "loss": 0.5308, | |
| "mean_token_accuracy": 0.8412103086709977, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.706472654290015, | |
| "grad_norm": 0.5880182013926408, | |
| "learning_rate": 8.168070405138303e-06, | |
| "loss": 0.5411, | |
| "mean_token_accuracy": 0.8386243000626564, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.7104867034621174, | |
| "grad_norm": 0.70991046912602, | |
| "learning_rate": 8.14087889241806e-06, | |
| "loss": 0.5431, | |
| "mean_token_accuracy": 0.8383191004395485, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.7145007526342197, | |
| "grad_norm": 0.6562145842410761, | |
| "learning_rate": 8.113533065653395e-06, | |
| "loss": 0.5423, | |
| "mean_token_accuracy": 0.8379684969782829, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.7185148018063221, | |
| "grad_norm": 0.6480124383110402, | |
| "learning_rate": 8.086034268367971e-06, | |
| "loss": 0.5422, | |
| "mean_token_accuracy": 0.8380523830652237, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.7225288509784245, | |
| "grad_norm": 0.6292142248495385, | |
| "learning_rate": 8.058383851601027e-06, | |
| "loss": 0.5408, | |
| "mean_token_accuracy": 0.8389136686921119, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.7265429001505268, | |
| "grad_norm": 0.5820959494127994, | |
| "learning_rate": 8.030583173840997e-06, | |
| "loss": 0.5388, | |
| "mean_token_accuracy": 0.8387292832136154, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.7305569493226292, | |
| "grad_norm": 0.6250625002124468, | |
| "learning_rate": 8.002633600958762e-06, | |
| "loss": 0.5468, | |
| "mean_token_accuracy": 0.8370036602020263, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.7345709984947315, | |
| "grad_norm": 0.672514782103767, | |
| "learning_rate": 7.974536506140546e-06, | |
| "loss": 0.5441, | |
| "mean_token_accuracy": 0.8380858764052391, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.738585047666834, | |
| "grad_norm": 0.590876941682476, | |
| "learning_rate": 7.946293269820456e-06, | |
| "loss": 0.546, | |
| "mean_token_accuracy": 0.8371559247374535, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.7425990968389363, | |
| "grad_norm": 0.6118643146843981, | |
| "learning_rate": 7.917905279612648e-06, | |
| "loss": 0.5325, | |
| "mean_token_accuracy": 0.840439823269844, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.7466131460110387, | |
| "grad_norm": 0.7336641277164324, | |
| "learning_rate": 7.889373930243166e-06, | |
| "loss": 0.5485, | |
| "mean_token_accuracy": 0.8366749197244644, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.750627195183141, | |
| "grad_norm": 0.7152604521519405, | |
| "learning_rate": 7.860700623481404e-06, | |
| "loss": 0.5427, | |
| "mean_token_accuracy": 0.8377209782600403, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.7546412443552434, | |
| "grad_norm": 0.6776957187532802, | |
| "learning_rate": 7.831886768071249e-06, | |
| "loss": 0.5362, | |
| "mean_token_accuracy": 0.8392210811376571, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.7586552935273457, | |
| "grad_norm": 0.770986244124321, | |
| "learning_rate": 7.80293377966186e-06, | |
| "loss": 0.5458, | |
| "mean_token_accuracy": 0.8375400707125664, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.7626693426994481, | |
| "grad_norm": 0.6825578099616549, | |
| "learning_rate": 7.77384308073812e-06, | |
| "loss": 0.5453, | |
| "mean_token_accuracy": 0.8378095313906669, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.7666833918715504, | |
| "grad_norm": 5.294879176126245, | |
| "learning_rate": 7.744616100550743e-06, | |
| "loss": 0.5467, | |
| "mean_token_accuracy": 0.8369431406259537, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.7706974410436528, | |
| "grad_norm": 0.8415837092408757, | |
| "learning_rate": 7.715254275046062e-06, | |
| "loss": 0.5452, | |
| "mean_token_accuracy": 0.837572930753231, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.7747114902157551, | |
| "grad_norm": 0.5986581958942131, | |
| "learning_rate": 7.68575904679547e-06, | |
| "loss": 0.5391, | |
| "mean_token_accuracy": 0.8389428481459618, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.7787255393878575, | |
| "grad_norm": 0.5612943684118845, | |
| "learning_rate": 7.65613186492455e-06, | |
| "loss": 0.5309, | |
| "mean_token_accuracy": 0.8410877391695977, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.7827395885599598, | |
| "grad_norm": 0.5892948923721739, | |
| "learning_rate": 7.626374185041887e-06, | |
| "loss": 0.5339, | |
| "mean_token_accuracy": 0.8404828563332558, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.7867536377320622, | |
| "grad_norm": 0.614935197747847, | |
| "learning_rate": 7.596487469167531e-06, | |
| "loss": 0.5417, | |
| "mean_token_accuracy": 0.8379798352718353, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.7907676869041645, | |
| "grad_norm": 0.5572327756360332, | |
| "learning_rate": 7.566473185661187e-06, | |
| "loss": 0.5379, | |
| "mean_token_accuracy": 0.839057058095932, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.7947817360762669, | |
| "grad_norm": 0.5918529196648193, | |
| "learning_rate": 7.536332809150066e-06, | |
| "loss": 0.5387, | |
| "mean_token_accuracy": 0.8385102912783623, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.7987957852483693, | |
| "grad_norm": 0.5961572996178651, | |
| "learning_rate": 7.506067820456438e-06, | |
| "loss": 0.5458, | |
| "mean_token_accuracy": 0.8371727049350739, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.8028098344204716, | |
| "grad_norm": 0.6305016848759029, | |
| "learning_rate": 7.475679706524864e-06, | |
| "loss": 0.5357, | |
| "mean_token_accuracy": 0.8398390769958496, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.806823883592574, | |
| "grad_norm": 0.5798737021411499, | |
| "learning_rate": 7.445169960349167e-06, | |
| "loss": 0.5332, | |
| "mean_token_accuracy": 0.8406742095947266, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.8108379327646764, | |
| "grad_norm": 0.6561676520637244, | |
| "learning_rate": 7.414540080899056e-06, | |
| "loss": 0.5469, | |
| "mean_token_accuracy": 0.8368082106113434, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.8148519819367788, | |
| "grad_norm": 0.5915371150391379, | |
| "learning_rate": 7.3837915730464896e-06, | |
| "loss": 0.5371, | |
| "mean_token_accuracy": 0.8387147217988968, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.8188660311088811, | |
| "grad_norm": 0.5335073385354651, | |
| "learning_rate": 7.3529259474917455e-06, | |
| "loss": 0.5358, | |
| "mean_token_accuracy": 0.8394276514649391, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.8228800802809835, | |
| "grad_norm": 0.5386012526091869, | |
| "learning_rate": 7.321944720689191e-06, | |
| "loss": 0.5366, | |
| "mean_token_accuracy": 0.8397987276315689, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.8268941294530858, | |
| "grad_norm": 0.5223449430699659, | |
| "learning_rate": 7.290849414772779e-06, | |
| "loss": 0.5353, | |
| "mean_token_accuracy": 0.8395177885890007, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.8309081786251882, | |
| "grad_norm": 0.5706841180761181, | |
| "learning_rate": 7.2596415574812695e-06, | |
| "loss": 0.5403, | |
| "mean_token_accuracy": 0.8387762665748596, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.8349222277972905, | |
| "grad_norm": 0.6290586727279687, | |
| "learning_rate": 7.228322682083164e-06, | |
| "loss": 0.5351, | |
| "mean_token_accuracy": 0.8395211502909661, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.8389362769693929, | |
| "grad_norm": 0.5277748021387862, | |
| "learning_rate": 7.196894327301378e-06, | |
| "loss": 0.5295, | |
| "mean_token_accuracy": 0.8414138451218605, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.8429503261414952, | |
| "grad_norm": 0.5539533785866126, | |
| "learning_rate": 7.165358037237644e-06, | |
| "loss": 0.5388, | |
| "mean_token_accuracy": 0.8386986523866653, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.8469643753135976, | |
| "grad_norm": 0.5062193058912297, | |
| "learning_rate": 7.1337153612966455e-06, | |
| "loss": 0.5349, | |
| "mean_token_accuracy": 0.8398931756615639, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.8509784244856999, | |
| "grad_norm": 0.5622943196472996, | |
| "learning_rate": 7.1019678541098945e-06, | |
| "loss": 0.5378, | |
| "mean_token_accuracy": 0.8387416958808899, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.8549924736578023, | |
| "grad_norm": 0.5718749819810406, | |
| "learning_rate": 7.0701170754593516e-06, | |
| "loss": 0.54, | |
| "mean_token_accuracy": 0.8385371461510658, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.8590065228299046, | |
| "grad_norm": 0.6435557666099391, | |
| "learning_rate": 7.038164590200789e-06, | |
| "loss": 0.5282, | |
| "mean_token_accuracy": 0.8415184810757637, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.863020572002007, | |
| "grad_norm": 0.6743895521038166, | |
| "learning_rate": 7.006111968186914e-06, | |
| "loss": 0.5332, | |
| "mean_token_accuracy": 0.8404285505414009, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.8670346211741093, | |
| "grad_norm": 0.6869985118812036, | |
| "learning_rate": 6.9739607841902365e-06, | |
| "loss": 0.538, | |
| "mean_token_accuracy": 0.8389334738254547, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.8710486703462117, | |
| "grad_norm": 0.62070604390802, | |
| "learning_rate": 6.941712617825701e-06, | |
| "loss": 0.5351, | |
| "mean_token_accuracy": 0.8399768278002739, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.875062719518314, | |
| "grad_norm": 0.5747195866768225, | |
| "learning_rate": 6.909369053473079e-06, | |
| "loss": 0.5428, | |
| "mean_token_accuracy": 0.8378928422927856, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.8790767686904165, | |
| "grad_norm": 0.5877150605799464, | |
| "learning_rate": 6.876931680199121e-06, | |
| "loss": 0.5298, | |
| "mean_token_accuracy": 0.8409986332058906, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.8830908178625189, | |
| "grad_norm": 0.598163324475379, | |
| "learning_rate": 6.844402091679494e-06, | |
| "loss": 0.5359, | |
| "mean_token_accuracy": 0.8396167665719986, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.8871048670346212, | |
| "grad_norm": 0.5988514234545925, | |
| "learning_rate": 6.811781886120479e-06, | |
| "loss": 0.5416, | |
| "mean_token_accuracy": 0.8384165868163109, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.8911189162067236, | |
| "grad_norm": 0.5253771613266209, | |
| "learning_rate": 6.779072666180447e-06, | |
| "loss": 0.5381, | |
| "mean_token_accuracy": 0.8387709230184555, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.8951329653788259, | |
| "grad_norm": 0.6133443221994367, | |
| "learning_rate": 6.746276038891117e-06, | |
| "loss": 0.54, | |
| "mean_token_accuracy": 0.839115546643734, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.8991470145509283, | |
| "grad_norm": 0.5596525836914499, | |
| "learning_rate": 6.713393615578616e-06, | |
| "loss": 0.5378, | |
| "mean_token_accuracy": 0.8392528027296067, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.9031610637230306, | |
| "grad_norm": 0.5821761444332735, | |
| "learning_rate": 6.680427011784292e-06, | |
| "loss": 0.537, | |
| "mean_token_accuracy": 0.839189724624157, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.907175112895133, | |
| "grad_norm": 0.894955820302947, | |
| "learning_rate": 6.6473778471853536e-06, | |
| "loss": 0.5359, | |
| "mean_token_accuracy": 0.8388691842556, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.9111891620672353, | |
| "grad_norm": 0.5737563610744022, | |
| "learning_rate": 6.614247745515298e-06, | |
| "loss": 0.5423, | |
| "mean_token_accuracy": 0.838544836640358, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.9152032112393377, | |
| "grad_norm": 0.5692294184392518, | |
| "learning_rate": 6.58103833448412e-06, | |
| "loss": 0.5407, | |
| "mean_token_accuracy": 0.8381945803761482, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.91921726041144, | |
| "grad_norm": 0.5742117002922255, | |
| "learning_rate": 6.5477512456983595e-06, | |
| "loss": 0.5291, | |
| "mean_token_accuracy": 0.8412258476018906, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.9232313095835424, | |
| "grad_norm": 0.5517555661869946, | |
| "learning_rate": 6.514388114580924e-06, | |
| "loss": 0.5309, | |
| "mean_token_accuracy": 0.8410136729478837, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.9272453587556447, | |
| "grad_norm": 0.6314576791547585, | |
| "learning_rate": 6.480950580290751e-06, | |
| "loss": 0.5398, | |
| "mean_token_accuracy": 0.8388326406478882, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.9312594079277471, | |
| "grad_norm": 0.5349229116155992, | |
| "learning_rate": 6.44744028564226e-06, | |
| "loss": 0.5403, | |
| "mean_token_accuracy": 0.8383991166949272, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.9352734570998494, | |
| "grad_norm": 0.5457988264371361, | |
| "learning_rate": 6.413858877024659e-06, | |
| "loss": 0.5385, | |
| "mean_token_accuracy": 0.8390275910496712, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.9392875062719518, | |
| "grad_norm": 0.6345026890776209, | |
| "learning_rate": 6.380208004321037e-06, | |
| "loss": 0.5266, | |
| "mean_token_accuracy": 0.8420624524354935, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.9433015554440541, | |
| "grad_norm": 0.5897108286808963, | |
| "learning_rate": 6.34648932082732e-06, | |
| "loss": 0.5298, | |
| "mean_token_accuracy": 0.8406174406409264, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.9473156046161565, | |
| "grad_norm": 0.5868359598483316, | |
| "learning_rate": 6.312704483171029e-06, | |
| "loss": 0.53, | |
| "mean_token_accuracy": 0.8407926484942436, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.951329653788259, | |
| "grad_norm": 0.6313928647086123, | |
| "learning_rate": 6.2788551512299014e-06, | |
| "loss": 0.531, | |
| "mean_token_accuracy": 0.8403575956821442, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.9553437029603613, | |
| "grad_norm": 0.5798903955533093, | |
| "learning_rate": 6.244942988050325e-06, | |
| "loss": 0.5468, | |
| "mean_token_accuracy": 0.8370823442935944, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.9593577521324637, | |
| "grad_norm": 0.5349118049394295, | |
| "learning_rate": 6.210969659765651e-06, | |
| "loss": 0.5327, | |
| "mean_token_accuracy": 0.8406338363885879, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.963371801304566, | |
| "grad_norm": 0.4950155751846596, | |
| "learning_rate": 6.1769368355143125e-06, | |
| "loss": 0.5205, | |
| "mean_token_accuracy": 0.8438980832695961, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.9673858504766684, | |
| "grad_norm": 0.5323860455486669, | |
| "learning_rate": 6.142846187357839e-06, | |
| "loss": 0.5313, | |
| "mean_token_accuracy": 0.840233464539051, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.9713998996487707, | |
| "grad_norm": 0.4817617225243705, | |
| "learning_rate": 6.108699390198691e-06, | |
| "loss": 0.5335, | |
| "mean_token_accuracy": 0.8402068182826042, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.9754139488208731, | |
| "grad_norm": 0.4995196546081424, | |
| "learning_rate": 6.074498121697983e-06, | |
| "loss": 0.5251, | |
| "mean_token_accuracy": 0.8414423301815986, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.9794279979929754, | |
| "grad_norm": 0.5750567202600121, | |
| "learning_rate": 6.04024406219305e-06, | |
| "loss": 0.5319, | |
| "mean_token_accuracy": 0.8403345927596092, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.9834420471650778, | |
| "grad_norm": 0.554410393503299, | |
| "learning_rate": 6.0059388946148885e-06, | |
| "loss": 0.5362, | |
| "mean_token_accuracy": 0.8393355578184127, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.9874560963371801, | |
| "grad_norm": 0.5547067674499261, | |
| "learning_rate": 5.971584304405489e-06, | |
| "loss": 0.5333, | |
| "mean_token_accuracy": 0.8406757906079292, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.9914701455092825, | |
| "grad_norm": 0.541496404401069, | |
| "learning_rate": 5.937181979435007e-06, | |
| "loss": 0.5402, | |
| "mean_token_accuracy": 0.8378434419631958, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.9954841946813848, | |
| "grad_norm": 0.565488616851699, | |
| "learning_rate": 5.902733609918857e-06, | |
| "loss": 0.5421, | |
| "mean_token_accuracy": 0.8383191093802452, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.9994982438534872, | |
| "grad_norm": 0.5830669897046137, | |
| "learning_rate": 5.8682408883346535e-06, | |
| "loss": 0.5341, | |
| "mean_token_accuracy": 0.8402512192726135, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 1.0040140491721024, | |
| "grad_norm": 0.5255806082753902, | |
| "learning_rate": 5.833705509339067e-06, | |
| "loss": 0.5977, | |
| "mean_token_accuracy": 0.8489638639659416, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.0080280983442047, | |
| "grad_norm": 0.61241581013064, | |
| "learning_rate": 5.799129169684566e-06, | |
| "loss": 0.4891, | |
| "mean_token_accuracy": 0.851288877427578, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 1.012042147516307, | |
| "grad_norm": 0.6700704727742163, | |
| "learning_rate": 5.76451356813605e-06, | |
| "loss": 0.4878, | |
| "mean_token_accuracy": 0.8516981139779091, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.0160561966884094, | |
| "grad_norm": 0.5456983971902244, | |
| "learning_rate": 5.729860405387384e-06, | |
| "loss": 0.4854, | |
| "mean_token_accuracy": 0.8516273483633995, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 1.0200702458605118, | |
| "grad_norm": 0.6175710890585946, | |
| "learning_rate": 5.6951713839778565e-06, | |
| "loss": 0.4833, | |
| "mean_token_accuracy": 0.8524551376700401, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.0240842950326141, | |
| "grad_norm": 0.5253091539774541, | |
| "learning_rate": 5.660448208208513e-06, | |
| "loss": 0.4838, | |
| "mean_token_accuracy": 0.8522418588399887, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 1.0280983442047165, | |
| "grad_norm": 0.5012462654206351, | |
| "learning_rate": 5.625692584058434e-06, | |
| "loss": 0.4823, | |
| "mean_token_accuracy": 0.8529530748724937, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.0321123933768188, | |
| "grad_norm": 0.5592121551569235, | |
| "learning_rate": 5.590906219100919e-06, | |
| "loss": 0.4905, | |
| "mean_token_accuracy": 0.8506887316703796, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 1.0361264425489212, | |
| "grad_norm": 0.5153674677293302, | |
| "learning_rate": 5.556090822419589e-06, | |
| "loss": 0.4881, | |
| "mean_token_accuracy": 0.8507951095700264, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.0401404917210235, | |
| "grad_norm": 0.6088145024835736, | |
| "learning_rate": 5.521248104524415e-06, | |
| "loss": 0.4784, | |
| "mean_token_accuracy": 0.8535375446081161, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 1.0441545408931259, | |
| "grad_norm": 0.4664157690543253, | |
| "learning_rate": 5.4863797772676865e-06, | |
| "loss": 0.4781, | |
| "mean_token_accuracy": 0.8540792852640152, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.0481685900652282, | |
| "grad_norm": 0.5354927183536671, | |
| "learning_rate": 5.451487553759899e-06, | |
| "loss": 0.4858, | |
| "mean_token_accuracy": 0.8520985931158066, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 1.0521826392373306, | |
| "grad_norm": 0.5411262184931188, | |
| "learning_rate": 5.416573148285594e-06, | |
| "loss": 0.4812, | |
| "mean_token_accuracy": 0.8527259394526482, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.056196688409433, | |
| "grad_norm": 0.5828680936268568, | |
| "learning_rate": 5.3816382762191314e-06, | |
| "loss": 0.4872, | |
| "mean_token_accuracy": 0.8516100868582726, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 1.0602107375815353, | |
| "grad_norm": 0.5202254772193751, | |
| "learning_rate": 5.346684653940408e-06, | |
| "loss": 0.4779, | |
| "mean_token_accuracy": 0.8536487385630608, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.0642247867536376, | |
| "grad_norm": 0.6388141647258224, | |
| "learning_rate": 5.311713998750543e-06, | |
| "loss": 0.4881, | |
| "mean_token_accuracy": 0.85139652043581, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 1.0682388359257402, | |
| "grad_norm": 0.5465015590788906, | |
| "learning_rate": 5.276728028787489e-06, | |
| "loss": 0.4821, | |
| "mean_token_accuracy": 0.8529908075928688, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.0722528850978423, | |
| "grad_norm": 0.5594552749543369, | |
| "learning_rate": 5.24172846294163e-06, | |
| "loss": 0.4847, | |
| "mean_token_accuracy": 0.8517061173915863, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 1.076266934269945, | |
| "grad_norm": 0.6110305995724576, | |
| "learning_rate": 5.206717020771323e-06, | |
| "loss": 0.4923, | |
| "mean_token_accuracy": 0.8506526678800583, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.0802809834420473, | |
| "grad_norm": 0.5684290645772769, | |
| "learning_rate": 5.171695422418429e-06, | |
| "loss": 0.4832, | |
| "mean_token_accuracy": 0.8523881688714028, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 1.0842950326141496, | |
| "grad_norm": 0.623945297590145, | |
| "learning_rate": 5.136665388523779e-06, | |
| "loss": 0.4796, | |
| "mean_token_accuracy": 0.8534211605787277, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.088309081786252, | |
| "grad_norm": 0.784850879982968, | |
| "learning_rate": 5.101628640142655e-06, | |
| "loss": 0.4794, | |
| "mean_token_accuracy": 0.8535040110349655, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 1.0923231309583543, | |
| "grad_norm": 0.47623171391541286, | |
| "learning_rate": 5.06658689866023e-06, | |
| "loss": 0.4817, | |
| "mean_token_accuracy": 0.8525002762675286, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.0963371801304567, | |
| "grad_norm": 0.5042352606147674, | |
| "learning_rate": 5.031541885706987e-06, | |
| "loss": 0.4813, | |
| "mean_token_accuracy": 0.8532689422369003, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 1.100351229302559, | |
| "grad_norm": 0.5016426811840553, | |
| "learning_rate": 4.99649532307414e-06, | |
| "loss": 0.4865, | |
| "mean_token_accuracy": 0.8514423102140427, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.1043652784746614, | |
| "grad_norm": 0.5268578218681798, | |
| "learning_rate": 4.961448932629047e-06, | |
| "loss": 0.4842, | |
| "mean_token_accuracy": 0.8524499326944351, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 1.1083793276467637, | |
| "grad_norm": 0.5144802084081512, | |
| "learning_rate": 4.926404436230596e-06, | |
| "loss": 0.4831, | |
| "mean_token_accuracy": 0.8526805534958839, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.112393376818866, | |
| "grad_norm": 0.5062678557635162, | |
| "learning_rate": 4.891363555644623e-06, | |
| "loss": 0.4768, | |
| "mean_token_accuracy": 0.8536328047513961, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 1.1164074259909684, | |
| "grad_norm": 0.5784487909262853, | |
| "learning_rate": 4.8563280124593205e-06, | |
| "loss": 0.4837, | |
| "mean_token_accuracy": 0.8526077657938004, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.1204214751630708, | |
| "grad_norm": 0.5599564503109443, | |
| "learning_rate": 4.821299528000643e-06, | |
| "loss": 0.4764, | |
| "mean_token_accuracy": 0.8544846564531327, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 1.1244355243351731, | |
| "grad_norm": 0.5051397425603605, | |
| "learning_rate": 4.786279823247749e-06, | |
| "loss": 0.4841, | |
| "mean_token_accuracy": 0.8521718412637711, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.1284495735072755, | |
| "grad_norm": 0.5564697170540107, | |
| "learning_rate": 4.751270618748439e-06, | |
| "loss": 0.482, | |
| "mean_token_accuracy": 0.8527628138661385, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 1.1324636226793778, | |
| "grad_norm": 0.5462012358781666, | |
| "learning_rate": 4.71627363453463e-06, | |
| "loss": 0.481, | |
| "mean_token_accuracy": 0.8532051593065262, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.1364776718514802, | |
| "grad_norm": 0.47939684892732765, | |
| "learning_rate": 4.681290590037845e-06, | |
| "loss": 0.4861, | |
| "mean_token_accuracy": 0.8520785883069039, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 1.1404917210235825, | |
| "grad_norm": 0.49652735975196016, | |
| "learning_rate": 4.6463232040047355e-06, | |
| "loss": 0.4749, | |
| "mean_token_accuracy": 0.8546465337276459, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.144505770195685, | |
| "grad_norm": 0.47650301494794856, | |
| "learning_rate": 4.61137319441264e-06, | |
| "loss": 0.488, | |
| "mean_token_accuracy": 0.8511898010969162, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 1.1485198193677872, | |
| "grad_norm": 0.49365022232137656, | |
| "learning_rate": 4.57644227838518e-06, | |
| "loss": 0.4773, | |
| "mean_token_accuracy": 0.8540461644530296, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.1525338685398896, | |
| "grad_norm": 0.5174719592435709, | |
| "learning_rate": 4.541532172107891e-06, | |
| "loss": 0.4764, | |
| "mean_token_accuracy": 0.8547235488891601, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 1.156547917711992, | |
| "grad_norm": 0.6030063953889359, | |
| "learning_rate": 4.5066445907439104e-06, | |
| "loss": 0.4846, | |
| "mean_token_accuracy": 0.8522387713193893, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.1605619668840943, | |
| "grad_norm": 0.48832132466767514, | |
| "learning_rate": 4.471781248349702e-06, | |
| "loss": 0.4868, | |
| "mean_token_accuracy": 0.8517817571759224, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 1.1645760160561967, | |
| "grad_norm": 0.5530963471918582, | |
| "learning_rate": 4.436943857790859e-06, | |
| "loss": 0.4882, | |
| "mean_token_accuracy": 0.8510265350341797, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.168590065228299, | |
| "grad_norm": 0.5744529358259624, | |
| "learning_rate": 4.402134130657925e-06, | |
| "loss": 0.4803, | |
| "mean_token_accuracy": 0.8536973863840103, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 1.1726041144004014, | |
| "grad_norm": 1.0812219162154455, | |
| "learning_rate": 4.367353777182332e-06, | |
| "loss": 0.4876, | |
| "mean_token_accuracy": 0.8515413969755172, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.1766181635725037, | |
| "grad_norm": 0.505147254276408, | |
| "learning_rate": 4.332604506152341e-06, | |
| "loss": 0.4782, | |
| "mean_token_accuracy": 0.8540340691804886, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 1.180632212744606, | |
| "grad_norm": 0.5103240298779389, | |
| "learning_rate": 4.297888024829126e-06, | |
| "loss": 0.4785, | |
| "mean_token_accuracy": 0.8533610329031944, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.1846462619167084, | |
| "grad_norm": 0.5611328032960321, | |
| "learning_rate": 4.263206038862858e-06, | |
| "loss": 0.4885, | |
| "mean_token_accuracy": 0.8507640421390533, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 1.1886603110888108, | |
| "grad_norm": 0.5136330037040837, | |
| "learning_rate": 4.22856025220893e-06, | |
| "loss": 0.4896, | |
| "mean_token_accuracy": 0.8505268082022667, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.1926743602609131, | |
| "grad_norm": 0.4853833862909341, | |
| "learning_rate": 4.193952367044232e-06, | |
| "loss": 0.4815, | |
| "mean_token_accuracy": 0.8532075121998787, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 1.1966884094330155, | |
| "grad_norm": 0.5147177386874451, | |
| "learning_rate": 4.159384083683518e-06, | |
| "loss": 0.4779, | |
| "mean_token_accuracy": 0.8535997048020363, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.2007024586051178, | |
| "grad_norm": 0.5497574369895587, | |
| "learning_rate": 4.124857100495877e-06, | |
| "loss": 0.4856, | |
| "mean_token_accuracy": 0.8514174923300744, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 1.2047165077772202, | |
| "grad_norm": 0.6226254194200767, | |
| "learning_rate": 4.090373113821281e-06, | |
| "loss": 0.4817, | |
| "mean_token_accuracy": 0.8529708757996559, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.2087305569493227, | |
| "grad_norm": 0.5488518920357796, | |
| "learning_rate": 4.055933817887247e-06, | |
| "loss": 0.4842, | |
| "mean_token_accuracy": 0.8528445586562157, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 1.2127446061214249, | |
| "grad_norm": 0.5221887780865238, | |
| "learning_rate": 4.021540904725603e-06, | |
| "loss": 0.4826, | |
| "mean_token_accuracy": 0.8525567546486854, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.2167586552935274, | |
| "grad_norm": 0.5594564612440402, | |
| "learning_rate": 3.987196064089346e-06, | |
| "loss": 0.4769, | |
| "mean_token_accuracy": 0.8545236945152282, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 1.2207727044656298, | |
| "grad_norm": 0.5255224366679334, | |
| "learning_rate": 3.952900983369632e-06, | |
| "loss": 0.487, | |
| "mean_token_accuracy": 0.8506386041641235, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.2247867536377322, | |
| "grad_norm": 0.5726647439659638, | |
| "learning_rate": 3.91865734751287e-06, | |
| "loss": 0.4806, | |
| "mean_token_accuracy": 0.8534023508429527, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 1.2288008028098345, | |
| "grad_norm": 0.5624813503686409, | |
| "learning_rate": 3.88446683893794e-06, | |
| "loss": 0.482, | |
| "mean_token_accuracy": 0.8528756931424141, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.2328148519819369, | |
| "grad_norm": 0.5195532566310461, | |
| "learning_rate": 3.850331137453529e-06, | |
| "loss": 0.4819, | |
| "mean_token_accuracy": 0.8532358273863793, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 1.2368289011540392, | |
| "grad_norm": 0.5472089753849174, | |
| "learning_rate": 3.816251920175611e-06, | |
| "loss": 0.4934, | |
| "mean_token_accuracy": 0.8497111722826958, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.2408429503261416, | |
| "grad_norm": 0.5368326431573441, | |
| "learning_rate": 3.782230861445041e-06, | |
| "loss": 0.4739, | |
| "mean_token_accuracy": 0.8550259992480278, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 1.244856999498244, | |
| "grad_norm": 0.4915065349616282, | |
| "learning_rate": 3.7482696327452926e-06, | |
| "loss": 0.4827, | |
| "mean_token_accuracy": 0.8524277821183205, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.2488710486703463, | |
| "grad_norm": 0.48495072086168006, | |
| "learning_rate": 3.714369902620345e-06, | |
| "loss": 0.4803, | |
| "mean_token_accuracy": 0.8536085486412048, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 1.2528850978424486, | |
| "grad_norm": 1.750136693379065, | |
| "learning_rate": 3.6805333365926943e-06, | |
| "loss": 0.4874, | |
| "mean_token_accuracy": 0.8515821918845177, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.256899147014551, | |
| "grad_norm": 0.5041608097022385, | |
| "learning_rate": 3.6467615970815323e-06, | |
| "loss": 0.4837, | |
| "mean_token_accuracy": 0.8521517217159271, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 1.2609131961866533, | |
| "grad_norm": 0.5630876506506443, | |
| "learning_rate": 3.613056343321073e-06, | |
| "loss": 0.4676, | |
| "mean_token_accuracy": 0.8564149275422096, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.2649272453587557, | |
| "grad_norm": 0.5419703298529472, | |
| "learning_rate": 3.579419231279023e-06, | |
| "loss": 0.4746, | |
| "mean_token_accuracy": 0.8546606913208962, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 1.268941294530858, | |
| "grad_norm": 0.5500842339907406, | |
| "learning_rate": 3.5458519135752346e-06, | |
| "loss": 0.4786, | |
| "mean_token_accuracy": 0.8540479198098183, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.2729553437029604, | |
| "grad_norm": 0.5574331160594922, | |
| "learning_rate": 3.5123560394005004e-06, | |
| "loss": 0.4877, | |
| "mean_token_accuracy": 0.8515752986073494, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 1.2769693928750627, | |
| "grad_norm": 0.5251115350284153, | |
| "learning_rate": 3.478933254435534e-06, | |
| "loss": 0.4803, | |
| "mean_token_accuracy": 0.8529331281781196, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.280983442047165, | |
| "grad_norm": 0.5718874948859872, | |
| "learning_rate": 3.4455852007701154e-06, | |
| "loss": 0.4824, | |
| "mean_token_accuracy": 0.8529073163866997, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 1.2849974912192674, | |
| "grad_norm": 0.5236497682255199, | |
| "learning_rate": 3.4123135168224053e-06, | |
| "loss": 0.4842, | |
| "mean_token_accuracy": 0.8519692197442055, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.2890115403913698, | |
| "grad_norm": 0.4884629705033181, | |
| "learning_rate": 3.3791198372584664e-06, | |
| "loss": 0.4781, | |
| "mean_token_accuracy": 0.8534619972109795, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 1.2930255895634721, | |
| "grad_norm": 0.5303681824959834, | |
| "learning_rate": 3.3460057929119306e-06, | |
| "loss": 0.4868, | |
| "mean_token_accuracy": 0.850970309972763, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.2970396387355745, | |
| "grad_norm": 0.5123281384081424, | |
| "learning_rate": 3.3129730107038916e-06, | |
| "loss": 0.4903, | |
| "mean_token_accuracy": 0.8506717085838318, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 1.3010536879076768, | |
| "grad_norm": 0.4899717227978676, | |
| "learning_rate": 3.280023113562957e-06, | |
| "loss": 0.4859, | |
| "mean_token_accuracy": 0.8520964190363884, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.3050677370797792, | |
| "grad_norm": 0.5052149939852724, | |
| "learning_rate": 3.2471577203455263e-06, | |
| "loss": 0.4807, | |
| "mean_token_accuracy": 0.8533585995435715, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 1.3090817862518815, | |
| "grad_norm": 0.5364809069727472, | |
| "learning_rate": 3.21437844575625e-06, | |
| "loss": 0.4783, | |
| "mean_token_accuracy": 0.8539668470621109, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.313095835423984, | |
| "grad_norm": 0.49451274386147803, | |
| "learning_rate": 3.181686900268694e-06, | |
| "loss": 0.4717, | |
| "mean_token_accuracy": 0.8559211567044258, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 1.3171098845960862, | |
| "grad_norm": 0.4754946294939683, | |
| "learning_rate": 3.149084690046221e-06, | |
| "loss": 0.478, | |
| "mean_token_accuracy": 0.8539295867085457, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.3211239337681886, | |
| "grad_norm": 0.4807750565396974, | |
| "learning_rate": 3.1165734168630767e-06, | |
| "loss": 0.4703, | |
| "mean_token_accuracy": 0.8557563424110413, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 1.325137982940291, | |
| "grad_norm": 0.4783048679547648, | |
| "learning_rate": 3.084154678025692e-06, | |
| "loss": 0.4736, | |
| "mean_token_accuracy": 0.8553013518452645, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.3291520321123933, | |
| "grad_norm": 0.4870454651116592, | |
| "learning_rate": 3.051830066294207e-06, | |
| "loss": 0.4728, | |
| "mean_token_accuracy": 0.8551506981253624, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 1.3331660812844957, | |
| "grad_norm": 0.4992080659924365, | |
| "learning_rate": 3.019601169804216e-06, | |
| "loss": 0.4799, | |
| "mean_token_accuracy": 0.8541036993265152, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.337180130456598, | |
| "grad_norm": 0.4796718782317687, | |
| "learning_rate": 2.9874695719887463e-06, | |
| "loss": 0.4737, | |
| "mean_token_accuracy": 0.8544951483607293, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 1.3411941796287006, | |
| "grad_norm": 0.4747418088055164, | |
| "learning_rate": 2.955436851500454e-06, | |
| "loss": 0.4718, | |
| "mean_token_accuracy": 0.8550894737243653, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.3452082288008027, | |
| "grad_norm": 0.4740811703629381, | |
| "learning_rate": 2.9235045821340713e-06, | |
| "loss": 0.4775, | |
| "mean_token_accuracy": 0.8539848864078522, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 1.3492222779729053, | |
| "grad_norm": 0.5040859930530716, | |
| "learning_rate": 2.89167433274908e-06, | |
| "loss": 0.4844, | |
| "mean_token_accuracy": 0.8522534683346749, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.3532363271450074, | |
| "grad_norm": 0.4685574013809817, | |
| "learning_rate": 2.859947667192636e-06, | |
| "loss": 0.4675, | |
| "mean_token_accuracy": 0.8567684695124627, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 1.35725037631711, | |
| "grad_norm": 0.5414825961816714, | |
| "learning_rate": 2.8283261442227303e-06, | |
| "loss": 0.4835, | |
| "mean_token_accuracy": 0.8527524784207344, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.3612644254892121, | |
| "grad_norm": 0.4767368636637838, | |
| "learning_rate": 2.7968113174316102e-06, | |
| "loss": 0.4843, | |
| "mean_token_accuracy": 0.8519262120127677, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 1.3652784746613147, | |
| "grad_norm": 0.4762349732624997, | |
| "learning_rate": 2.765404735169454e-06, | |
| "loss": 0.4779, | |
| "mean_token_accuracy": 0.8536876887083054, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.369292523833417, | |
| "grad_norm": 0.4603866793711708, | |
| "learning_rate": 2.7341079404682887e-06, | |
| "loss": 0.483, | |
| "mean_token_accuracy": 0.8522417232394218, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 1.3733065730055194, | |
| "grad_norm": 0.48783583675642933, | |
| "learning_rate": 2.702922470966187e-06, | |
| "loss": 0.4861, | |
| "mean_token_accuracy": 0.8517315194010735, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.3773206221776217, | |
| "grad_norm": 0.457565482089079, | |
| "learning_rate": 2.671849858831721e-06, | |
| "loss": 0.4781, | |
| "mean_token_accuracy": 0.8536698654294014, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 1.381334671349724, | |
| "grad_norm": 0.47742709258212396, | |
| "learning_rate": 2.640891630688682e-06, | |
| "loss": 0.4856, | |
| "mean_token_accuracy": 0.8515735983848571, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.3853487205218264, | |
| "grad_norm": 0.4519116845237833, | |
| "learning_rate": 2.610049307541085e-06, | |
| "loss": 0.4745, | |
| "mean_token_accuracy": 0.8553285971283913, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 1.3893627696939288, | |
| "grad_norm": 0.46002460375644066, | |
| "learning_rate": 2.579324404698428e-06, | |
| "loss": 0.4792, | |
| "mean_token_accuracy": 0.8532739356160164, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.3933768188660312, | |
| "grad_norm": 0.43230273487871207, | |
| "learning_rate": 2.548718431701251e-06, | |
| "loss": 0.4624, | |
| "mean_token_accuracy": 0.8579189226031303, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 1.3973908680381335, | |
| "grad_norm": 0.4784555211080267, | |
| "learning_rate": 2.518232892246972e-06, | |
| "loss": 0.4783, | |
| "mean_token_accuracy": 0.8536991968750953, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.4014049172102359, | |
| "grad_norm": 0.47787198660534125, | |
| "learning_rate": 2.4878692841160053e-06, | |
| "loss": 0.4762, | |
| "mean_token_accuracy": 0.8540623039007187, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 1.4054189663823382, | |
| "grad_norm": 0.47979677873474513, | |
| "learning_rate": 2.4576290990981755e-06, | |
| "loss": 0.4909, | |
| "mean_token_accuracy": 0.8503037631511688, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.4094330155544406, | |
| "grad_norm": 0.5095819745293857, | |
| "learning_rate": 2.4275138229194238e-06, | |
| "loss": 0.4759, | |
| "mean_token_accuracy": 0.8540142044425011, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 1.413447064726543, | |
| "grad_norm": 0.4805811343718235, | |
| "learning_rate": 2.3975249351688207e-06, | |
| "loss": 0.4803, | |
| "mean_token_accuracy": 0.8527746021747589, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.4174611138986453, | |
| "grad_norm": 0.5969184626019208, | |
| "learning_rate": 2.3676639092258584e-06, | |
| "loss": 0.4829, | |
| "mean_token_accuracy": 0.852690675854683, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 1.4214751630707476, | |
| "grad_norm": 0.4818521910414447, | |
| "learning_rate": 2.337932212188073e-06, | |
| "loss": 0.477, | |
| "mean_token_accuracy": 0.8541664496064186, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.42548921224285, | |
| "grad_norm": 0.465679128369887, | |
| "learning_rate": 2.3083313047989626e-06, | |
| "loss": 0.4772, | |
| "mean_token_accuracy": 0.8541705653071403, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 1.4295032614149523, | |
| "grad_norm": 0.47603113675595066, | |
| "learning_rate": 2.278862641376215e-06, | |
| "loss": 0.4718, | |
| "mean_token_accuracy": 0.8553374916315079, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.4335173105870547, | |
| "grad_norm": 0.4274170132490548, | |
| "learning_rate": 2.2495276697402663e-06, | |
| "loss": 0.4772, | |
| "mean_token_accuracy": 0.8541041478514672, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 1.437531359759157, | |
| "grad_norm": 0.4666292377645937, | |
| "learning_rate": 2.2203278311431575e-06, | |
| "loss": 0.4779, | |
| "mean_token_accuracy": 0.8537328645586968, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.4415454089312594, | |
| "grad_norm": 0.4537894496554442, | |
| "learning_rate": 2.1912645601977283e-06, | |
| "loss": 0.4686, | |
| "mean_token_accuracy": 0.8557942762970925, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 1.4455594581033617, | |
| "grad_norm": 0.475653896781378, | |
| "learning_rate": 2.162339284807136e-06, | |
| "loss": 0.4882, | |
| "mean_token_accuracy": 0.8511700749397277, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.449573507275464, | |
| "grad_norm": 0.4920511476816344, | |
| "learning_rate": 2.1335534260946945e-06, | |
| "loss": 0.488, | |
| "mean_token_accuracy": 0.8513952940702438, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 1.4535875564475664, | |
| "grad_norm": 0.4814427078654733, | |
| "learning_rate": 2.104908398334069e-06, | |
| "loss": 0.4785, | |
| "mean_token_accuracy": 0.8537196487188339, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.4576016056196688, | |
| "grad_norm": 0.48060183639390225, | |
| "learning_rate": 2.0764056088797646e-06, | |
| "loss": 0.4776, | |
| "mean_token_accuracy": 0.8542910426855087, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 1.4616156547917711, | |
| "grad_norm": 0.4632278881841595, | |
| "learning_rate": 2.048046458098013e-06, | |
| "loss": 0.4806, | |
| "mean_token_accuracy": 0.8533736944198609, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.4656297039638735, | |
| "grad_norm": 0.43866859419295146, | |
| "learning_rate": 2.0198323392979453e-06, | |
| "loss": 0.4734, | |
| "mean_token_accuracy": 0.8550125047564506, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 1.4696437531359758, | |
| "grad_norm": 0.5025943669083757, | |
| "learning_rate": 1.9917646386631577e-06, | |
| "loss": 0.4796, | |
| "mean_token_accuracy": 0.8533670097589493, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.4736578023080782, | |
| "grad_norm": 0.4544304942999498, | |
| "learning_rate": 1.9638447351835875e-06, | |
| "loss": 0.4859, | |
| "mean_token_accuracy": 0.8517355710268021, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 1.4776718514801805, | |
| "grad_norm": 0.4499771859558144, | |
| "learning_rate": 1.9360740005877774e-06, | |
| "loss": 0.4785, | |
| "mean_token_accuracy": 0.8536696940660476, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.4816859006522831, | |
| "grad_norm": 0.4391506848423441, | |
| "learning_rate": 1.908453799275479e-06, | |
| "loss": 0.4751, | |
| "mean_token_accuracy": 0.8547928795218468, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 1.4856999498243852, | |
| "grad_norm": 0.4321233556378181, | |
| "learning_rate": 1.8809854882506129e-06, | |
| "loss": 0.4719, | |
| "mean_token_accuracy": 0.8550894305109977, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.4897139989964878, | |
| "grad_norm": 0.4545767091147806, | |
| "learning_rate": 1.8536704170546005e-06, | |
| "loss": 0.4792, | |
| "mean_token_accuracy": 0.8536446437239646, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 1.49372804816859, | |
| "grad_norm": 0.45768305317904345, | |
| "learning_rate": 1.8265099277000614e-06, | |
| "loss": 0.4675, | |
| "mean_token_accuracy": 0.8566059991717339, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.4977420973406925, | |
| "grad_norm": 0.583758976338618, | |
| "learning_rate": 1.7995053546048762e-06, | |
| "loss": 0.4861, | |
| "mean_token_accuracy": 0.8516813203692436, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 1.5017561465127947, | |
| "grad_norm": 0.5014344703500223, | |
| "learning_rate": 1.7726580245266334e-06, | |
| "loss": 0.482, | |
| "mean_token_accuracy": 0.852832806110382, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.5057701956848972, | |
| "grad_norm": 0.6997345848148291, | |
| "learning_rate": 1.7459692564974317e-06, | |
| "loss": 0.4854, | |
| "mean_token_accuracy": 0.852383628487587, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 1.5097842448569994, | |
| "grad_norm": 0.5154156720297645, | |
| "learning_rate": 1.719440361759086e-06, | |
| "loss": 0.4808, | |
| "mean_token_accuracy": 0.8528477355837822, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.513798294029102, | |
| "grad_norm": 0.4870849894345533, | |
| "learning_rate": 1.6930726436986977e-06, | |
| "loss": 0.4702, | |
| "mean_token_accuracy": 0.8555424034595489, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 1.517812343201204, | |
| "grad_norm": 0.4518757378910186, | |
| "learning_rate": 1.6668673977846255e-06, | |
| "loss": 0.4766, | |
| "mean_token_accuracy": 0.8546996787190437, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.5218263923733066, | |
| "grad_norm": 0.4576719732501448, | |
| "learning_rate": 1.6408259115028325e-06, | |
| "loss": 0.4676, | |
| "mean_token_accuracy": 0.8562597304582595, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 1.5258404415454088, | |
| "grad_norm": 0.4836723761371539, | |
| "learning_rate": 1.6149494642936253e-06, | |
| "loss": 0.4775, | |
| "mean_token_accuracy": 0.8537423238158226, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.5298544907175113, | |
| "grad_norm": 0.4386507907416896, | |
| "learning_rate": 1.589239327488812e-06, | |
| "loss": 0.4769, | |
| "mean_token_accuracy": 0.8538577347993851, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 1.5338685398896137, | |
| "grad_norm": 0.4688066892992193, | |
| "learning_rate": 1.5636967642492196e-06, | |
| "loss": 0.4691, | |
| "mean_token_accuracy": 0.8562686622142792, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.537882589061716, | |
| "grad_norm": 0.434007927437407, | |
| "learning_rate": 1.538323029502654e-06, | |
| "loss": 0.4806, | |
| "mean_token_accuracy": 0.8535393372178077, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 1.5418966382338184, | |
| "grad_norm": 0.43743732120688245, | |
| "learning_rate": 1.5131193698822234e-06, | |
| "loss": 0.4612, | |
| "mean_token_accuracy": 0.8581413358449936, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.5459106874059207, | |
| "grad_norm": 0.4569484082529831, | |
| "learning_rate": 1.488087023665104e-06, | |
| "loss": 0.4768, | |
| "mean_token_accuracy": 0.8539406448602677, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 1.549924736578023, | |
| "grad_norm": 0.4699068791196816, | |
| "learning_rate": 1.463227220711706e-06, | |
| "loss": 0.4735, | |
| "mean_token_accuracy": 0.8544613897800446, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.5539387857501255, | |
| "grad_norm": 0.424463650107424, | |
| "learning_rate": 1.4385411824052343e-06, | |
| "loss": 0.4781, | |
| "mean_token_accuracy": 0.8539833888411522, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 1.5579528349222278, | |
| "grad_norm": 0.46255145201724784, | |
| "learning_rate": 1.414030121591692e-06, | |
| "loss": 0.4729, | |
| "mean_token_accuracy": 0.8552562475204468, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.5619668840943302, | |
| "grad_norm": 0.46101782496953947, | |
| "learning_rate": 1.3896952425202893e-06, | |
| "loss": 0.4763, | |
| "mean_token_accuracy": 0.8538477480411529, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 1.5659809332664325, | |
| "grad_norm": 0.4710263307623761, | |
| "learning_rate": 1.3655377407842813e-06, | |
| "loss": 0.4785, | |
| "mean_token_accuracy": 0.8540721848607064, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.5699949824385349, | |
| "grad_norm": 0.45711760666640394, | |
| "learning_rate": 1.3415588032622202e-06, | |
| "loss": 0.4717, | |
| "mean_token_accuracy": 0.8554088562726975, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 1.5740090316106372, | |
| "grad_norm": 0.4456868518853785, | |
| "learning_rate": 1.3177596080596467e-06, | |
| "loss": 0.4701, | |
| "mean_token_accuracy": 0.8558782458305358, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.5780230807827396, | |
| "grad_norm": 0.4290784998523086, | |
| "learning_rate": 1.2941413244512113e-06, | |
| "loss": 0.4732, | |
| "mean_token_accuracy": 0.8551268830895424, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 1.582037129954842, | |
| "grad_norm": 0.45571698707191083, | |
| "learning_rate": 1.2707051128232217e-06, | |
| "loss": 0.478, | |
| "mean_token_accuracy": 0.8536119505763053, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.5860511791269443, | |
| "grad_norm": 0.4553754461355705, | |
| "learning_rate": 1.2474521246166392e-06, | |
| "loss": 0.4745, | |
| "mean_token_accuracy": 0.8546977415680885, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 1.5900652282990466, | |
| "grad_norm": 0.4394199878687813, | |
| "learning_rate": 1.2243835022705003e-06, | |
| "loss": 0.4761, | |
| "mean_token_accuracy": 0.8550259307026863, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.594079277471149, | |
| "grad_norm": 0.46568138230485706, | |
| "learning_rate": 1.2015003791657854e-06, | |
| "loss": 0.4672, | |
| "mean_token_accuracy": 0.8566817179322243, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 1.5980933266432515, | |
| "grad_norm": 0.43193045548782183, | |
| "learning_rate": 1.1788038795697487e-06, | |
| "loss": 0.4737, | |
| "mean_token_accuracy": 0.8545981183648109, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.6021073758153537, | |
| "grad_norm": 0.43777140121635394, | |
| "learning_rate": 1.1562951185806675e-06, | |
| "loss": 0.4758, | |
| "mean_token_accuracy": 0.8548903733491897, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 1.6061214249874562, | |
| "grad_norm": 0.5060321727440272, | |
| "learning_rate": 1.1339752020730664e-06, | |
| "loss": 0.4734, | |
| "mean_token_accuracy": 0.8551399186253548, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.6101354741595584, | |
| "grad_norm": 0.4923417367245005, | |
| "learning_rate": 1.1118452266433732e-06, | |
| "loss": 0.4777, | |
| "mean_token_accuracy": 0.8535337939858436, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 1.614149523331661, | |
| "grad_norm": 0.47803638007953236, | |
| "learning_rate": 1.0899062795560572e-06, | |
| "loss": 0.4848, | |
| "mean_token_accuracy": 0.8526277154684067, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.618163572503763, | |
| "grad_norm": 0.4643935744468831, | |
| "learning_rate": 1.068159438690199e-06, | |
| "loss": 0.4799, | |
| "mean_token_accuracy": 0.8536258295178414, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 1.6221776216758657, | |
| "grad_norm": 0.4467924882668494, | |
| "learning_rate": 1.046605772486538e-06, | |
| "loss": 0.4808, | |
| "mean_token_accuracy": 0.8534541547298431, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.6261916708479678, | |
| "grad_norm": 0.4842025238599929, | |
| "learning_rate": 1.025246339894979e-06, | |
| "loss": 0.4752, | |
| "mean_token_accuracy": 0.8539585873484612, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 1.6302057200200704, | |
| "grad_norm": 0.4867487448285858, | |
| "learning_rate": 1.0040821903225633e-06, | |
| "loss": 0.4701, | |
| "mean_token_accuracy": 0.8561724841594696, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.6342197691921725, | |
| "grad_norm": 0.4284828609172432, | |
| "learning_rate": 9.831143635819162e-07, | |
| "loss": 0.4766, | |
| "mean_token_accuracy": 0.8540926545858383, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 1.638233818364275, | |
| "grad_norm": 0.47071814520488964, | |
| "learning_rate": 9.62343889840151e-07, | |
| "loss": 0.4771, | |
| "mean_token_accuracy": 0.8538994386792182, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.6422478675363772, | |
| "grad_norm": 0.46031800716554394, | |
| "learning_rate": 9.417717895682626e-07, | |
| "loss": 0.4715, | |
| "mean_token_accuracy": 0.8556769326329231, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 1.6462619167084798, | |
| "grad_norm": 0.4823659021884061, | |
| "learning_rate": 9.213990734909884e-07, | |
| "loss": 0.4813, | |
| "mean_token_accuracy": 0.8533883213996887, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.650275965880582, | |
| "grad_norm": 0.4373578109896797, | |
| "learning_rate": 9.012267425371513e-07, | |
| "loss": 0.4809, | |
| "mean_token_accuracy": 0.8532153591513634, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 1.6542900150526845, | |
| "grad_norm": 0.4576132273763026, | |
| "learning_rate": 8.812557877904848e-07, | |
| "loss": 0.4754, | |
| "mean_token_accuracy": 0.8547022685408592, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.6583040642247866, | |
| "grad_norm": 0.43234177821278336, | |
| "learning_rate": 8.614871904409372e-07, | |
| "loss": 0.4623, | |
| "mean_token_accuracy": 0.8578175500035286, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 1.6623181133968892, | |
| "grad_norm": 0.46525027057902296, | |
| "learning_rate": 8.419219217364654e-07, | |
| "loss": 0.4757, | |
| "mean_token_accuracy": 0.8548225834965706, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.6663321625689913, | |
| "grad_norm": 0.4499223883161168, | |
| "learning_rate": 8.225609429353187e-07, | |
| "loss": 0.4753, | |
| "mean_token_accuracy": 0.8543162420392036, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 1.6703462117410939, | |
| "grad_norm": 0.44239112700740263, | |
| "learning_rate": 8.034052052588076e-07, | |
| "loss": 0.4764, | |
| "mean_token_accuracy": 0.8538721084594727, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.6743602609131962, | |
| "grad_norm": 0.4326153640002424, | |
| "learning_rate": 7.844556498445788e-07, | |
| "loss": 0.4774, | |
| "mean_token_accuracy": 0.8538077279925347, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 1.6783743100852986, | |
| "grad_norm": 0.4601280688859696, | |
| "learning_rate": 7.657132077003599e-07, | |
| "loss": 0.4705, | |
| "mean_token_accuracy": 0.8558229163289071, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.682388359257401, | |
| "grad_norm": 0.44335461441524826, | |
| "learning_rate": 7.471787996582358e-07, | |
| "loss": 0.4709, | |
| "mean_token_accuracy": 0.8559122830629349, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 1.6864024084295033, | |
| "grad_norm": 0.46533113083334554, | |
| "learning_rate": 7.288533363293959e-07, | |
| "loss": 0.4717, | |
| "mean_token_accuracy": 0.855779829621315, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.6904164576016056, | |
| "grad_norm": 0.4345554569585679, | |
| "learning_rate": 7.107377180593994e-07, | |
| "loss": 0.4761, | |
| "mean_token_accuracy": 0.8539123505353927, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 1.694430506773708, | |
| "grad_norm": 0.4537776341024369, | |
| "learning_rate": 6.928328348839392e-07, | |
| "loss": 0.473, | |
| "mean_token_accuracy": 0.8550264790654183, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.6984445559458103, | |
| "grad_norm": 0.4608039104937001, | |
| "learning_rate": 6.751395664851135e-07, | |
| "loss": 0.4772, | |
| "mean_token_accuracy": 0.8542622447013855, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 1.7024586051179127, | |
| "grad_norm": 0.45531945033095894, | |
| "learning_rate": 6.5765878214821e-07, | |
| "loss": 0.477, | |
| "mean_token_accuracy": 0.8539689004421234, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.706472654290015, | |
| "grad_norm": 0.4488368557626286, | |
| "learning_rate": 6.403913407189921e-07, | |
| "loss": 0.4706, | |
| "mean_token_accuracy": 0.8554460853338242, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 1.7104867034621174, | |
| "grad_norm": 0.4511207312852134, | |
| "learning_rate": 6.233380905615049e-07, | |
| "loss": 0.4737, | |
| "mean_token_accuracy": 0.8550473853945733, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.7145007526342197, | |
| "grad_norm": 0.4831341748488721, | |
| "learning_rate": 6.064998695163948e-07, | |
| "loss": 0.4733, | |
| "mean_token_accuracy": 0.8551791489124299, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 1.718514801806322, | |
| "grad_norm": 0.4492097445567558, | |
| "learning_rate": 5.898775048597449e-07, | |
| "loss": 0.4778, | |
| "mean_token_accuracy": 0.8540239587426186, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.7225288509784245, | |
| "grad_norm": 0.43196711826065015, | |
| "learning_rate": 5.734718132624351e-07, | |
| "loss": 0.4701, | |
| "mean_token_accuracy": 0.8554452732205391, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 1.7265429001505268, | |
| "grad_norm": 0.4481914858588619, | |
| "learning_rate": 5.57283600750006e-07, | |
| "loss": 0.4749, | |
| "mean_token_accuracy": 0.854470057785511, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.7305569493226292, | |
| "grad_norm": 0.442688331520453, | |
| "learning_rate": 5.41313662663075e-07, | |
| "loss": 0.4813, | |
| "mean_token_accuracy": 0.8537053450942039, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 1.7345709984947315, | |
| "grad_norm": 0.45437110406179004, | |
| "learning_rate": 5.255627836182453e-07, | |
| "loss": 0.4742, | |
| "mean_token_accuracy": 0.8547719925642013, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.738585047666834, | |
| "grad_norm": 0.4525783315312203, | |
| "learning_rate": 5.100317374695673e-07, | |
| "loss": 0.4712, | |
| "mean_token_accuracy": 0.854958064854145, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 1.7425990968389362, | |
| "grad_norm": 0.45989644907653204, | |
| "learning_rate": 4.947212872705131e-07, | |
| "loss": 0.4844, | |
| "mean_token_accuracy": 0.8522865787148476, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.7466131460110388, | |
| "grad_norm": 0.44179078654846465, | |
| "learning_rate": 4.796321852364877e-07, | |
| "loss": 0.4726, | |
| "mean_token_accuracy": 0.8552617311477662, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 1.750627195183141, | |
| "grad_norm": 0.44502137847599055, | |
| "learning_rate": 4.6476517270787667e-07, | |
| "loss": 0.4755, | |
| "mean_token_accuracy": 0.8547576576471329, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.7546412443552435, | |
| "grad_norm": 0.4293509834801368, | |
| "learning_rate": 4.5012098011361583e-07, | |
| "loss": 0.475, | |
| "mean_token_accuracy": 0.8545479908585548, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 1.7586552935273456, | |
| "grad_norm": 0.4476896792512822, | |
| "learning_rate": 4.357003269353105e-07, | |
| "loss": 0.4752, | |
| "mean_token_accuracy": 0.854350033402443, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.7626693426994482, | |
| "grad_norm": 0.43152267828647745, | |
| "learning_rate": 4.215039216718847e-07, | |
| "loss": 0.4794, | |
| "mean_token_accuracy": 0.8537008062005043, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 1.7666833918715503, | |
| "grad_norm": 0.47430915815921837, | |
| "learning_rate": 4.075324618047705e-07, | |
| "loss": 0.4758, | |
| "mean_token_accuracy": 0.8541364178061486, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.770697441043653, | |
| "grad_norm": 0.4287695621483679, | |
| "learning_rate": 3.937866337636459e-07, | |
| "loss": 0.4656, | |
| "mean_token_accuracy": 0.8566897332668304, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 1.774711490215755, | |
| "grad_norm": 0.43111942298665074, | |
| "learning_rate": 3.802671128927016e-07, | |
| "loss": 0.476, | |
| "mean_token_accuracy": 0.854565116763115, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.7787255393878576, | |
| "grad_norm": 0.436167995973845, | |
| "learning_rate": 3.6697456341746706e-07, | |
| "loss": 0.4729, | |
| "mean_token_accuracy": 0.8552428483963013, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 1.7827395885599597, | |
| "grad_norm": 0.4415001621236335, | |
| "learning_rate": 3.539096384121743e-07, | |
| "loss": 0.4673, | |
| "mean_token_accuracy": 0.8565417811274528, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.7867536377320623, | |
| "grad_norm": 0.4415379193192118, | |
| "learning_rate": 3.4107297976767097e-07, | |
| "loss": 0.4696, | |
| "mean_token_accuracy": 0.855958080291748, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 1.7907676869041644, | |
| "grad_norm": 0.4451028631056662, | |
| "learning_rate": 3.2846521815988853e-07, | |
| "loss": 0.4806, | |
| "mean_token_accuracy": 0.8532564789056778, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.794781736076267, | |
| "grad_norm": 0.42848978049456043, | |
| "learning_rate": 3.160869730188465e-07, | |
| "loss": 0.4753, | |
| "mean_token_accuracy": 0.8544053509831429, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 1.7987957852483691, | |
| "grad_norm": 0.4358873015501897, | |
| "learning_rate": 3.0393885249823174e-07, | |
| "loss": 0.4708, | |
| "mean_token_accuracy": 0.8561656758189201, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.8028098344204717, | |
| "grad_norm": 0.4235541886416018, | |
| "learning_rate": 2.92021453445509e-07, | |
| "loss": 0.4813, | |
| "mean_token_accuracy": 0.8529417350888252, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 1.8068238835925738, | |
| "grad_norm": 0.4835046099418169, | |
| "learning_rate": 2.8033536137260565e-07, | |
| "loss": 0.4783, | |
| "mean_token_accuracy": 0.8537642017006875, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.8108379327646764, | |
| "grad_norm": 0.42897346841833367, | |
| "learning_rate": 2.688811504271371e-07, | |
| "loss": 0.4729, | |
| "mean_token_accuracy": 0.8553988888859749, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 1.8148519819367788, | |
| "grad_norm": 0.4301559523428082, | |
| "learning_rate": 2.576593833642033e-07, | |
| "loss": 0.4761, | |
| "mean_token_accuracy": 0.8536762282252311, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.8188660311088811, | |
| "grad_norm": 0.4369762786857926, | |
| "learning_rate": 2.466706115187406e-07, | |
| "loss": 0.4755, | |
| "mean_token_accuracy": 0.854819355905056, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 1.8228800802809835, | |
| "grad_norm": 0.44612227390075465, | |
| "learning_rate": 2.3591537477843208e-07, | |
| "loss": 0.4755, | |
| "mean_token_accuracy": 0.854425060749054, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 1.8268941294530858, | |
| "grad_norm": 0.44247042993963465, | |
| "learning_rate": 2.253942015571814e-07, | |
| "loss": 0.474, | |
| "mean_token_accuracy": 0.8552328020334243, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 1.8309081786251882, | |
| "grad_norm": 0.4578633638360693, | |
| "learning_rate": 2.1510760876915505e-07, | |
| "loss": 0.4774, | |
| "mean_token_accuracy": 0.8542596101760864, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.8349222277972905, | |
| "grad_norm": 0.43669293127724457, | |
| "learning_rate": 2.0505610180338198e-07, | |
| "loss": 0.4717, | |
| "mean_token_accuracy": 0.855582419037819, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 1.8389362769693929, | |
| "grad_norm": 0.4239247527956778, | |
| "learning_rate": 1.952401744989274e-07, | |
| "loss": 0.4654, | |
| "mean_token_accuracy": 0.85718754529953, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 1.8429503261414952, | |
| "grad_norm": 0.46610653318906003, | |
| "learning_rate": 1.856603091206255e-07, | |
| "loss": 0.4788, | |
| "mean_token_accuracy": 0.8535729303956032, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 1.8469643753135976, | |
| "grad_norm": 0.42340902296366656, | |
| "learning_rate": 1.7631697633539058e-07, | |
| "loss": 0.4685, | |
| "mean_token_accuracy": 0.8563544407486916, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.8509784244857, | |
| "grad_norm": 0.4524402197663993, | |
| "learning_rate": 1.672106351890862e-07, | |
| "loss": 0.4762, | |
| "mean_token_accuracy": 0.8543719783425331, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 1.8549924736578023, | |
| "grad_norm": 0.44808822341167676, | |
| "learning_rate": 1.583417330839798e-07, | |
| "loss": 0.4779, | |
| "mean_token_accuracy": 0.8537597358226776, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 1.8590065228299046, | |
| "grad_norm": 0.4274978896560569, | |
| "learning_rate": 1.4971070575675538e-07, | |
| "loss": 0.4825, | |
| "mean_token_accuracy": 0.8530182659626007, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 1.863020572002007, | |
| "grad_norm": 0.41753329575556447, | |
| "learning_rate": 1.413179772571055e-07, | |
| "loss": 0.4769, | |
| "mean_token_accuracy": 0.8544663786888123, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 1.8670346211741093, | |
| "grad_norm": 0.46270689148121547, | |
| "learning_rate": 1.3316395992690302e-07, | |
| "loss": 0.4831, | |
| "mean_token_accuracy": 0.8527223974466324, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 1.8710486703462117, | |
| "grad_norm": 0.4699440162702181, | |
| "learning_rate": 1.252490543799345e-07, | |
| "loss": 0.4664, | |
| "mean_token_accuracy": 0.8564791366457939, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 1.875062719518314, | |
| "grad_norm": 0.4222369586797418, | |
| "learning_rate": 1.175736494822266e-07, | |
| "loss": 0.4704, | |
| "mean_token_accuracy": 0.855928809940815, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 1.8790767686904166, | |
| "grad_norm": 0.4301011412110541, | |
| "learning_rate": 1.1013812233293008e-07, | |
| "loss": 0.4666, | |
| "mean_token_accuracy": 0.8567720711231231, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.8830908178625188, | |
| "grad_norm": 0.4326138437671182, | |
| "learning_rate": 1.0294283824580309e-07, | |
| "loss": 0.4832, | |
| "mean_token_accuracy": 0.8523061379790307, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 1.8871048670346213, | |
| "grad_norm": 0.4497012617910045, | |
| "learning_rate": 9.5988150731256e-08, | |
| "loss": 0.4745, | |
| "mean_token_accuracy": 0.8544217735528946, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.8911189162067235, | |
| "grad_norm": 0.44164898887491794, | |
| "learning_rate": 8.927440147898703e-08, | |
| "loss": 0.4631, | |
| "mean_token_accuracy": 0.8575935378670693, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 1.895132965378826, | |
| "grad_norm": 0.4230848433661066, | |
| "learning_rate": 8.280192034119116e-08, | |
| "loss": 0.4736, | |
| "mean_token_accuracy": 0.8550165235996247, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 1.8991470145509282, | |
| "grad_norm": 0.4257386932133046, | |
| "learning_rate": 7.657102531635762e-08, | |
| "loss": 0.4757, | |
| "mean_token_accuracy": 0.8543663218617439, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 1.9031610637230307, | |
| "grad_norm": 0.4407975270333358, | |
| "learning_rate": 7.058202253364511e-08, | |
| "loss": 0.4794, | |
| "mean_token_accuracy": 0.8536389827728271, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 1.9071751128951329, | |
| "grad_norm": 0.42934669175844953, | |
| "learning_rate": 6.4835206237841e-08, | |
| "loss": 0.4742, | |
| "mean_token_accuracy": 0.85507842451334, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 1.9111891620672354, | |
| "grad_norm": 0.43251041025298353, | |
| "learning_rate": 5.933085877490474e-08, | |
| "loss": 0.4744, | |
| "mean_token_accuracy": 0.8547703847289085, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 1.9152032112393376, | |
| "grad_norm": 0.43605508973639817, | |
| "learning_rate": 5.406925057809653e-08, | |
| "loss": 0.4748, | |
| "mean_token_accuracy": 0.8543973922729492, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 1.9192172604114401, | |
| "grad_norm": 0.43786161881380403, | |
| "learning_rate": 4.9050640154690297e-08, | |
| "loss": 0.4705, | |
| "mean_token_accuracy": 0.8555782288312912, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 1.9232313095835423, | |
| "grad_norm": 0.4488612147870018, | |
| "learning_rate": 4.427527407327381e-08, | |
| "loss": 0.4825, | |
| "mean_token_accuracy": 0.852660457789898, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 1.9272453587556448, | |
| "grad_norm": 0.41818464639030534, | |
| "learning_rate": 3.974338695163393e-08, | |
| "loss": 0.4826, | |
| "mean_token_accuracy": 0.8533532917499542, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.931259407927747, | |
| "grad_norm": 0.43318994140619926, | |
| "learning_rate": 3.5455201445228625e-08, | |
| "loss": 0.4721, | |
| "mean_token_accuracy": 0.8555899515748024, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 1.9352734570998495, | |
| "grad_norm": 0.4350348341914019, | |
| "learning_rate": 3.141092823625014e-08, | |
| "loss": 0.4753, | |
| "mean_token_accuracy": 0.8547878980636596, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 1.9392875062719517, | |
| "grad_norm": 0.437156964872597, | |
| "learning_rate": 2.7610766023271618e-08, | |
| "loss": 0.4642, | |
| "mean_token_accuracy": 0.8575856953859329, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 1.9433015554440543, | |
| "grad_norm": 0.4439443740172379, | |
| "learning_rate": 2.405490151148715e-08, | |
| "loss": 0.4816, | |
| "mean_token_accuracy": 0.852755481004715, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 1.9473156046161564, | |
| "grad_norm": 0.4243080967093221, | |
| "learning_rate": 2.07435094035352e-08, | |
| "loss": 0.4713, | |
| "mean_token_accuracy": 0.8551632657647132, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 1.951329653788259, | |
| "grad_norm": 0.4417367451533877, | |
| "learning_rate": 1.7676752390920482e-08, | |
| "loss": 0.4693, | |
| "mean_token_accuracy": 0.8561436429619789, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 1.9553437029603613, | |
| "grad_norm": 0.5506523758872923, | |
| "learning_rate": 1.4854781146015906e-08, | |
| "loss": 0.472, | |
| "mean_token_accuracy": 0.8551851272583008, | |
| "step": 2435 | |
| }, | |
| { | |
| "epoch": 1.9593577521324637, | |
| "grad_norm": 0.4172594879961426, | |
| "learning_rate": 1.2277734314662948e-08, | |
| "loss": 0.4648, | |
| "mean_token_accuracy": 0.8577016338706016, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 1.963371801304566, | |
| "grad_norm": 0.43928189202839146, | |
| "learning_rate": 9.945738509358205e-09, | |
| "loss": 0.4751, | |
| "mean_token_accuracy": 0.8543354839086532, | |
| "step": 2445 | |
| }, | |
| { | |
| "epoch": 1.9673858504766684, | |
| "grad_norm": 0.46064111484894527, | |
| "learning_rate": 7.85890830303393e-09, | |
| "loss": 0.4799, | |
| "mean_token_accuracy": 0.8528526350855827, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.9713998996487707, | |
| "grad_norm": 0.4273528805773524, | |
| "learning_rate": 6.017346223429199e-09, | |
| "loss": 0.473, | |
| "mean_token_accuracy": 0.855083005130291, | |
| "step": 2455 | |
| }, | |
| { | |
| "epoch": 1.975413948820873, | |
| "grad_norm": 0.41903611556825693, | |
| "learning_rate": 4.421142748050056e-09, | |
| "loss": 0.4722, | |
| "mean_token_accuracy": 0.8553782507777214, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 1.9794279979929754, | |
| "grad_norm": 0.40693132138961724, | |
| "learning_rate": 3.070376299728062e-09, | |
| "loss": 0.4768, | |
| "mean_token_accuracy": 0.8542209342122078, | |
| "step": 2465 | |
| }, | |
| { | |
| "epoch": 1.9834420471650778, | |
| "grad_norm": 0.45323616894670904, | |
| "learning_rate": 1.965113242764494e-09, | |
| "loss": 0.4797, | |
| "mean_token_accuracy": 0.8531859144568443, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 1.9874560963371801, | |
| "grad_norm": 0.4334834647690343, | |
| "learning_rate": 1.105407879670728e-09, | |
| "loss": 0.4766, | |
| "mean_token_accuracy": 0.854351207613945, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 1.9914701455092825, | |
| "grad_norm": 0.4426951792767383, | |
| "learning_rate": 4.913024485003748e-10, | |
| "loss": 0.4837, | |
| "mean_token_accuracy": 0.852310574054718, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 1.9954841946813848, | |
| "grad_norm": 0.4662710035176809, | |
| "learning_rate": 1.2282712077538173e-10, | |
| "loss": 0.476, | |
| "mean_token_accuracy": 0.8544206082820892, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 1.9994982438534872, | |
| "grad_norm": 0.44864109609424646, | |
| "learning_rate": 0.0, | |
| "loss": 0.4713, | |
| "mean_token_accuracy": 0.8560152605175972, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 1.9994982438534872, | |
| "step": 2490, | |
| "total_flos": 1305393569464320.0, | |
| "train_loss": 0.5340647190928938, | |
| "train_runtime": 372903.5049, | |
| "train_samples_per_second": 1.71, | |
| "train_steps_per_second": 0.007 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2490, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1305393569464320.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |