Invalid JSON: Unexpected token 'I', ..."ad_norm": Infinity,
"... is not valid JSON
| { | |
| "best_global_step": 72000, | |
| "best_metric": 3.5336711406707764, | |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/swap_last_to_drop_5039/checkpoint-40000", | |
| "epoch": 26.814387314911976, | |
| "eval_steps": 1000, | |
| "global_step": 92000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.014573860324122653, | |
| "grad_norm": 0.6621675491333008, | |
| "learning_rate": 0.000294, | |
| "loss": 8.4935, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.029147720648245307, | |
| "grad_norm": 0.49097710847854614, | |
| "learning_rate": 0.0005939999999999999, | |
| "loss": 6.7519, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04372158097236796, | |
| "grad_norm": 0.44661810994148254, | |
| "learning_rate": 0.0005998285214348206, | |
| "loss": 6.373, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.05829544129649061, | |
| "grad_norm": 0.4845108091831207, | |
| "learning_rate": 0.0005996535433070866, | |
| "loss": 6.1463, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.07286930162061327, | |
| "grad_norm": 0.7079914212226868, | |
| "learning_rate": 0.0005994785651793525, | |
| "loss": 6.0021, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.08744316194473592, | |
| "grad_norm": 0.5005303621292114, | |
| "learning_rate": 0.0005993035870516185, | |
| "loss": 5.8895, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.10201702226885857, | |
| "grad_norm": 0.47492167353630066, | |
| "learning_rate": 0.0005991286089238845, | |
| "loss": 5.7663, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.11659088259298123, | |
| "grad_norm": 0.4829168915748596, | |
| "learning_rate": 0.0005989536307961504, | |
| "loss": 5.6455, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.1311647429171039, | |
| "grad_norm": 0.5034458041191101, | |
| "learning_rate": 0.0005987786526684164, | |
| "loss": 5.5263, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.14573860324122653, | |
| "grad_norm": 0.4625563621520996, | |
| "learning_rate": 0.0005986036745406824, | |
| "loss": 5.4249, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1603124635653492, | |
| "grad_norm": 0.4806194603443146, | |
| "learning_rate": 0.0005984286964129484, | |
| "loss": 5.3596, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.17488632388947184, | |
| "grad_norm": 0.4784989058971405, | |
| "learning_rate": 0.0005982537182852143, | |
| "loss": 5.2812, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.1894601842135945, | |
| "grad_norm": 0.4541167616844177, | |
| "learning_rate": 0.0005980787401574803, | |
| "loss": 5.2211, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.20403404453771715, | |
| "grad_norm": 0.49250224232673645, | |
| "learning_rate": 0.0005979037620297463, | |
| "loss": 5.1392, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.2186079048618398, | |
| "grad_norm": 0.439749538898468, | |
| "learning_rate": 0.0005977287839020123, | |
| "loss": 5.0797, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.23318176518596245, | |
| "grad_norm": 0.5406767725944519, | |
| "learning_rate": 0.0005975538057742782, | |
| "loss": 5.0266, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.24775562551008512, | |
| "grad_norm": 0.44189009070396423, | |
| "learning_rate": 0.0005973788276465442, | |
| "loss": 4.9857, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.2623294858342078, | |
| "grad_norm": 0.3949568271636963, | |
| "learning_rate": 0.0005972038495188102, | |
| "loss": 4.9285, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.2769033461583304, | |
| "grad_norm": 0.43237313628196716, | |
| "learning_rate": 0.000597028871391076, | |
| "loss": 4.8772, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.29147720648245307, | |
| "grad_norm": 0.5117595195770264, | |
| "learning_rate": 0.000596853893263342, | |
| "loss": 4.8306, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.29147720648245307, | |
| "eval_accuracy": 0.25472762160242746, | |
| "eval_loss": 4.7552924156188965, | |
| "eval_runtime": 53.5229, | |
| "eval_samples_per_second": 310.652, | |
| "eval_steps_per_second": 19.431, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.30605106680657573, | |
| "grad_norm": 0.43191710114479065, | |
| "learning_rate": 0.000596678915135608, | |
| "loss": 4.7805, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3206249271306984, | |
| "grad_norm": 0.427901953458786, | |
| "learning_rate": 0.0005965039370078739, | |
| "loss": 4.7407, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.335198787454821, | |
| "grad_norm": 0.6437082290649414, | |
| "learning_rate": 0.0005963289588801399, | |
| "loss": 4.7037, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.3497726477789437, | |
| "grad_norm": 0.43874093890190125, | |
| "learning_rate": 0.0005961539807524059, | |
| "loss": 4.6684, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.36434650810306635, | |
| "grad_norm": 0.4586627185344696, | |
| "learning_rate": 0.0005959790026246719, | |
| "loss": 4.6442, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.378920368427189, | |
| "grad_norm": 0.4979628324508667, | |
| "learning_rate": 0.0005958040244969378, | |
| "loss": 4.6031, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.3934942287513116, | |
| "grad_norm": 0.4893154203891754, | |
| "learning_rate": 0.0005956290463692038, | |
| "loss": 4.5684, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.4080680890754343, | |
| "grad_norm": 0.4149429202079773, | |
| "learning_rate": 0.0005954540682414698, | |
| "loss": 4.556, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.42264194939955696, | |
| "grad_norm": 0.43906036019325256, | |
| "learning_rate": 0.0005952790901137357, | |
| "loss": 4.5275, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.4372158097236796, | |
| "grad_norm": 0.47645220160484314, | |
| "learning_rate": 0.0005951041119860017, | |
| "loss": 4.5114, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.45178967004780224, | |
| "grad_norm": 0.4202033281326294, | |
| "learning_rate": 0.0005949291338582677, | |
| "loss": 4.5, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.4663635303719249, | |
| "grad_norm": 0.4314590096473694, | |
| "learning_rate": 0.0005947541557305336, | |
| "loss": 4.4597, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.4809373906960476, | |
| "grad_norm": 0.4086189568042755, | |
| "learning_rate": 0.0005945791776027996, | |
| "loss": 4.4403, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.49551125102017024, | |
| "grad_norm": 0.3905881643295288, | |
| "learning_rate": 0.0005944041994750656, | |
| "loss": 4.4181, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5100851113442929, | |
| "grad_norm": 0.40394750237464905, | |
| "learning_rate": 0.0005942292213473315, | |
| "loss": 4.4135, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5246589716684156, | |
| "grad_norm": 0.4248270094394684, | |
| "learning_rate": 0.0005940542432195975, | |
| "loss": 4.3972, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5392328319925381, | |
| "grad_norm": 0.40690281987190247, | |
| "learning_rate": 0.0005938792650918635, | |
| "loss": 4.3909, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.5538066923166608, | |
| "grad_norm": 0.3957778513431549, | |
| "learning_rate": 0.0005937042869641295, | |
| "loss": 4.3735, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.5683805526407835, | |
| "grad_norm": 0.4087545871734619, | |
| "learning_rate": 0.0005935293088363953, | |
| "loss": 4.3596, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.5829544129649061, | |
| "grad_norm": 0.3965914845466614, | |
| "learning_rate": 0.0005933543307086613, | |
| "loss": 4.3462, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5829544129649061, | |
| "eval_accuracy": 0.2984550980845175, | |
| "eval_loss": 4.287600517272949, | |
| "eval_runtime": 53.0696, | |
| "eval_samples_per_second": 313.306, | |
| "eval_steps_per_second": 19.597, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5975282732890288, | |
| "grad_norm": 0.3693578541278839, | |
| "learning_rate": 0.0005931793525809273, | |
| "loss": 4.339, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6121021336131515, | |
| "grad_norm": 0.4273644983768463, | |
| "learning_rate": 0.0005930043744531933, | |
| "loss": 4.336, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.6266759939372741, | |
| "grad_norm": 0.40753233432769775, | |
| "learning_rate": 0.0005928293963254592, | |
| "loss": 4.3252, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.6412498542613968, | |
| "grad_norm": 0.3944893181324005, | |
| "learning_rate": 0.0005926544181977252, | |
| "loss": 4.3, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.6558237145855194, | |
| "grad_norm": 0.40525901317596436, | |
| "learning_rate": 0.0005924794400699912, | |
| "loss": 4.2763, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.670397574909642, | |
| "grad_norm": 0.3596903085708618, | |
| "learning_rate": 0.0005923044619422571, | |
| "loss": 4.2768, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.6849714352337647, | |
| "grad_norm": 0.4436907172203064, | |
| "learning_rate": 0.0005921294838145231, | |
| "loss": 4.2644, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.6995452955578874, | |
| "grad_norm": 0.3655913472175598, | |
| "learning_rate": 0.0005919545056867891, | |
| "loss": 4.2584, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.71411915588201, | |
| "grad_norm": 0.3963913321495056, | |
| "learning_rate": 0.0005917795275590551, | |
| "loss": 4.2567, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.7286930162061327, | |
| "grad_norm": 0.3979296386241913, | |
| "learning_rate": 0.000591604549431321, | |
| "loss": 4.2257, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.7432668765302554, | |
| "grad_norm": 0.4119713306427002, | |
| "learning_rate": 0.000591429571303587, | |
| "loss": 4.2345, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.757840736854378, | |
| "grad_norm": 0.41694387793540955, | |
| "learning_rate": 0.000591254593175853, | |
| "loss": 4.2196, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.7724145971785007, | |
| "grad_norm": 0.35006240010261536, | |
| "learning_rate": 0.000591079615048119, | |
| "loss": 4.2018, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.7869884575026233, | |
| "grad_norm": 0.365438312292099, | |
| "learning_rate": 0.0005909046369203849, | |
| "loss": 4.2114, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8015623178267459, | |
| "grad_norm": 0.3674423396587372, | |
| "learning_rate": 0.0005907296587926509, | |
| "loss": 4.2021, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.8161361781508686, | |
| "grad_norm": 0.3676803410053253, | |
| "learning_rate": 0.0005905546806649169, | |
| "loss": 4.2025, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.8307100384749913, | |
| "grad_norm": 0.429167240858078, | |
| "learning_rate": 0.0005903797025371829, | |
| "loss": 4.1849, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.8452838987991139, | |
| "grad_norm": 0.3721235990524292, | |
| "learning_rate": 0.0005902047244094488, | |
| "loss": 4.1595, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.8598577591232366, | |
| "grad_norm": 0.3640214204788208, | |
| "learning_rate": 0.0005900297462817148, | |
| "loss": 4.1594, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.8744316194473593, | |
| "grad_norm": 0.35601043701171875, | |
| "learning_rate": 0.0005898547681539808, | |
| "loss": 4.1483, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8744316194473593, | |
| "eval_accuracy": 0.3144607061087188, | |
| "eval_loss": 4.105985164642334, | |
| "eval_runtime": 53.2638, | |
| "eval_samples_per_second": 312.163, | |
| "eval_steps_per_second": 19.525, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8890054797714819, | |
| "grad_norm": 0.35206031799316406, | |
| "learning_rate": 0.0005896797900262466, | |
| "loss": 4.1486, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.9035793400956045, | |
| "grad_norm": 0.36506882309913635, | |
| "learning_rate": 0.0005895048118985126, | |
| "loss": 4.1418, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.9181532004197271, | |
| "grad_norm": 0.3510189950466156, | |
| "learning_rate": 0.0005893298337707786, | |
| "loss": 4.147, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.9327270607438498, | |
| "grad_norm": 0.3484358787536621, | |
| "learning_rate": 0.0005891548556430446, | |
| "loss": 4.1269, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.9473009210679725, | |
| "grad_norm": 0.3678065836429596, | |
| "learning_rate": 0.0005889798775153105, | |
| "loss": 4.1302, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.9618747813920951, | |
| "grad_norm": 0.3914882242679596, | |
| "learning_rate": 0.0005888048993875765, | |
| "loss": 4.1129, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.9764486417162178, | |
| "grad_norm": 0.3623497188091278, | |
| "learning_rate": 0.0005886299212598425, | |
| "loss": 4.1136, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.9910225020403405, | |
| "grad_norm": 0.3361571431159973, | |
| "learning_rate": 0.0005884549431321084, | |
| "loss": 4.1018, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.0055380669231666, | |
| "grad_norm": 0.3415493071079254, | |
| "learning_rate": 0.0005882799650043744, | |
| "loss": 4.0828, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.0201119272472892, | |
| "grad_norm": 0.34337010979652405, | |
| "learning_rate": 0.0005881049868766404, | |
| "loss": 4.0262, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.034685787571412, | |
| "grad_norm": 0.3571913242340088, | |
| "learning_rate": 0.0005879300087489063, | |
| "loss": 4.0256, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.0492596478955345, | |
| "grad_norm": 0.36647820472717285, | |
| "learning_rate": 0.0005877550306211723, | |
| "loss": 4.0044, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.0638335082196573, | |
| "grad_norm": 0.359706312417984, | |
| "learning_rate": 0.0005875800524934383, | |
| "loss": 4.0143, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.0784073685437798, | |
| "grad_norm": 0.35660111904144287, | |
| "learning_rate": 0.0005874050743657042, | |
| "loss": 4.0265, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.0929812288679026, | |
| "grad_norm": 0.3381880521774292, | |
| "learning_rate": 0.0005872300962379702, | |
| "loss": 4.0087, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.1075550891920252, | |
| "grad_norm": 0.3747643530368805, | |
| "learning_rate": 0.0005870551181102362, | |
| "loss": 4.01, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.122128949516148, | |
| "grad_norm": 0.3946177661418915, | |
| "learning_rate": 0.0005868801399825022, | |
| "loss": 3.9809, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.1367028098402705, | |
| "grad_norm": 0.3495027720928192, | |
| "learning_rate": 0.0005867051618547681, | |
| "loss": 3.9893, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.151276670164393, | |
| "grad_norm": 0.356650173664093, | |
| "learning_rate": 0.0005865301837270341, | |
| "loss": 4.0052, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.1658505304885158, | |
| "grad_norm": 0.37137117981910706, | |
| "learning_rate": 0.0005863552055993001, | |
| "loss": 4.003, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.1658505304885158, | |
| "eval_accuracy": 0.32443093231166104, | |
| "eval_loss": 3.994969129562378, | |
| "eval_runtime": 53.0558, | |
| "eval_samples_per_second": 313.387, | |
| "eval_steps_per_second": 19.602, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.1804243908126384, | |
| "grad_norm": 0.34678372740745544, | |
| "learning_rate": 0.0005861802274715659, | |
| "loss": 4.0064, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.1949982511367612, | |
| "grad_norm": 0.33129486441612244, | |
| "learning_rate": 0.0005860052493438319, | |
| "loss": 3.9905, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.2095721114608837, | |
| "grad_norm": 0.3196367621421814, | |
| "learning_rate": 0.0005858302712160979, | |
| "loss": 4.0104, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.2241459717850065, | |
| "grad_norm": 0.3991895318031311, | |
| "learning_rate": 0.0005856552930883638, | |
| "loss": 3.9833, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.238719832109129, | |
| "grad_norm": 0.3395180106163025, | |
| "learning_rate": 0.0005854803149606298, | |
| "loss": 3.984, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.2532936924332518, | |
| "grad_norm": 0.32869836688041687, | |
| "learning_rate": 0.0005853053368328958, | |
| "loss": 3.98, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.2678675527573744, | |
| "grad_norm": 0.3420329689979553, | |
| "learning_rate": 0.0005851303587051618, | |
| "loss": 3.9745, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.282441413081497, | |
| "grad_norm": 0.34595686197280884, | |
| "learning_rate": 0.0005849553805774277, | |
| "loss": 3.9838, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.2970152734056197, | |
| "grad_norm": 0.3202042877674103, | |
| "learning_rate": 0.0005847804024496937, | |
| "loss": 3.9751, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.3115891337297423, | |
| "grad_norm": 0.33933225274086, | |
| "learning_rate": 0.0005846054243219597, | |
| "loss": 3.9762, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.326162994053865, | |
| "grad_norm": 0.3334672749042511, | |
| "learning_rate": 0.0005844304461942257, | |
| "loss": 3.9556, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.3407368543779876, | |
| "grad_norm": 0.33130043745040894, | |
| "learning_rate": 0.0005842554680664916, | |
| "loss": 3.9444, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.3553107147021102, | |
| "grad_norm": 0.33543631434440613, | |
| "learning_rate": 0.0005840804899387576, | |
| "loss": 3.9586, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.369884575026233, | |
| "grad_norm": 0.3611527979373932, | |
| "learning_rate": 0.0005839055118110236, | |
| "loss": 3.9639, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.3844584353503557, | |
| "grad_norm": 0.32615354657173157, | |
| "learning_rate": 0.0005837305336832896, | |
| "loss": 3.9483, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.3990322956744783, | |
| "grad_norm": 0.3202286958694458, | |
| "learning_rate": 0.0005835555555555555, | |
| "loss": 3.9413, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.4136061559986008, | |
| "grad_norm": 0.32702603936195374, | |
| "learning_rate": 0.0005833805774278215, | |
| "loss": 3.9485, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.4281800163227236, | |
| "grad_norm": 0.33720141649246216, | |
| "learning_rate": 0.0005832055993000875, | |
| "loss": 3.9391, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.4427538766468462, | |
| "grad_norm": 0.34604305028915405, | |
| "learning_rate": 0.0005830306211723534, | |
| "loss": 3.9409, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.457327736970969, | |
| "grad_norm": 0.33776533603668213, | |
| "learning_rate": 0.0005828556430446194, | |
| "loss": 3.9556, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.457327736970969, | |
| "eval_accuracy": 0.3308636590309987, | |
| "eval_loss": 3.919948101043701, | |
| "eval_runtime": 53.1821, | |
| "eval_samples_per_second": 312.643, | |
| "eval_steps_per_second": 19.555, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.4719015972950915, | |
| "grad_norm": 0.33289724588394165, | |
| "learning_rate": 0.0005826806649168854, | |
| "loss": 3.9443, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.486475457619214, | |
| "grad_norm": 0.35393133759498596, | |
| "learning_rate": 0.0005825056867891514, | |
| "loss": 3.927, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.5010493179433368, | |
| "grad_norm": 0.3489775061607361, | |
| "learning_rate": 0.0005823307086614172, | |
| "loss": 3.9243, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.5156231782674596, | |
| "grad_norm": 0.33480021357536316, | |
| "learning_rate": 0.0005821557305336832, | |
| "loss": 3.9314, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.5301970385915822, | |
| "grad_norm": 0.32323822379112244, | |
| "learning_rate": 0.0005819807524059492, | |
| "loss": 3.9173, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.5447708989157047, | |
| "grad_norm": 0.3644621670246124, | |
| "learning_rate": 0.0005818057742782152, | |
| "loss": 3.9221, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.5593447592398273, | |
| "grad_norm": 0.32539132237434387, | |
| "learning_rate": 0.0005816307961504811, | |
| "loss": 3.9161, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.57391861956395, | |
| "grad_norm": 0.32493916153907776, | |
| "learning_rate": 0.0005814558180227471, | |
| "loss": 3.9183, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.5884924798880729, | |
| "grad_norm": 0.3585509955883026, | |
| "learning_rate": 0.0005812808398950131, | |
| "loss": 3.9221, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.6030663402121954, | |
| "grad_norm": 0.32815247774124146, | |
| "learning_rate": 0.0005811058617672791, | |
| "loss": 3.9087, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.617640200536318, | |
| "grad_norm": 0.3581700623035431, | |
| "learning_rate": 0.000580930883639545, | |
| "loss": 3.9166, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.6322140608604407, | |
| "grad_norm": 0.3333188593387604, | |
| "learning_rate": 0.000580755905511811, | |
| "loss": 3.8997, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.6467879211845635, | |
| "grad_norm": 0.3736306130886078, | |
| "learning_rate": 0.000580580927384077, | |
| "loss": 3.908, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.661361781508686, | |
| "grad_norm": 0.3163699805736542, | |
| "learning_rate": 0.0005804059492563429, | |
| "loss": 3.9041, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.6759356418328086, | |
| "grad_norm": 0.33947551250457764, | |
| "learning_rate": 0.0005802309711286089, | |
| "loss": 3.8887, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.6905095021569312, | |
| "grad_norm": 0.3375563621520996, | |
| "learning_rate": 0.0005800559930008749, | |
| "loss": 3.8895, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.705083362481054, | |
| "grad_norm": 0.34993767738342285, | |
| "learning_rate": 0.0005798810148731408, | |
| "loss": 3.8793, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.7196572228051767, | |
| "grad_norm": 0.30305829644203186, | |
| "learning_rate": 0.0005797060367454068, | |
| "loss": 3.8957, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.7342310831292993, | |
| "grad_norm": 0.34079498052597046, | |
| "learning_rate": 0.0005795310586176728, | |
| "loss": 3.8875, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.7488049434534219, | |
| "grad_norm": 0.31197983026504517, | |
| "learning_rate": 0.0005793560804899387, | |
| "loss": 3.8834, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.7488049434534219, | |
| "eval_accuracy": 0.3360885796650039, | |
| "eval_loss": 3.8626701831817627, | |
| "eval_runtime": 53.2105, | |
| "eval_samples_per_second": 312.476, | |
| "eval_steps_per_second": 19.545, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.7633788037775446, | |
| "grad_norm": 0.35590726137161255, | |
| "learning_rate": 0.0005791811023622047, | |
| "loss": 3.8794, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.7779526641016674, | |
| "grad_norm": 0.32964015007019043, | |
| "learning_rate": 0.0005790061242344707, | |
| "loss": 3.8809, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.79252652442579, | |
| "grad_norm": 0.326513409614563, | |
| "learning_rate": 0.0005788311461067365, | |
| "loss": 3.8919, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.8071003847499125, | |
| "grad_norm": 0.3079835772514343, | |
| "learning_rate": 0.0005786561679790025, | |
| "loss": 3.8739, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.821674245074035, | |
| "grad_norm": 0.32225608825683594, | |
| "learning_rate": 0.0005784811898512685, | |
| "loss": 3.8682, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.8362481053981579, | |
| "grad_norm": 0.3490994870662689, | |
| "learning_rate": 0.0005783062117235344, | |
| "loss": 3.8679, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.8508219657222806, | |
| "grad_norm": 0.3336406648159027, | |
| "learning_rate": 0.0005781312335958004, | |
| "loss": 3.8754, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.8653958260464032, | |
| "grad_norm": 0.3177735209465027, | |
| "learning_rate": 0.0005779562554680664, | |
| "loss": 3.8614, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.8799696863705257, | |
| "grad_norm": 0.3288932740688324, | |
| "learning_rate": 0.0005777812773403324, | |
| "loss": 3.8722, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.8945435466946485, | |
| "grad_norm": 0.32330477237701416, | |
| "learning_rate": 0.0005776062992125983, | |
| "loss": 3.8629, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.909117407018771, | |
| "grad_norm": 0.329217791557312, | |
| "learning_rate": 0.0005774313210848643, | |
| "loss": 3.8653, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.9236912673428939, | |
| "grad_norm": 0.3131251633167267, | |
| "learning_rate": 0.0005772563429571303, | |
| "loss": 3.8552, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.9382651276670164, | |
| "grad_norm": 0.3294188976287842, | |
| "learning_rate": 0.0005770813648293962, | |
| "loss": 3.8606, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.952838987991139, | |
| "grad_norm": 0.3348577916622162, | |
| "learning_rate": 0.0005769063867016622, | |
| "loss": 3.8644, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.9674128483152618, | |
| "grad_norm": 0.3283770680427551, | |
| "learning_rate": 0.0005767314085739282, | |
| "loss": 3.849, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.9819867086393845, | |
| "grad_norm": 0.3316490948200226, | |
| "learning_rate": 0.0005765564304461942, | |
| "loss": 3.8496, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.996560568963507, | |
| "grad_norm": 0.30883973836898804, | |
| "learning_rate": 0.0005763814523184601, | |
| "loss": 3.8497, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.011076133846333, | |
| "grad_norm": 0.32743337750434875, | |
| "learning_rate": 0.0005762064741907261, | |
| "loss": 3.7759, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.0256499941704558, | |
| "grad_norm": 0.32366102933883667, | |
| "learning_rate": 0.0005760314960629921, | |
| "loss": 3.7522, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.0402238544945783, | |
| "grad_norm": 0.3358854353427887, | |
| "learning_rate": 0.0005758565179352581, | |
| "loss": 3.7614, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.0402238544945783, | |
| "eval_accuracy": 0.3401457111761609, | |
| "eval_loss": 3.820042371749878, | |
| "eval_runtime": 53.1073, | |
| "eval_samples_per_second": 313.083, | |
| "eval_steps_per_second": 19.583, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.0547977148187013, | |
| "grad_norm": 0.32670819759368896, | |
| "learning_rate": 0.000575681539807524, | |
| "loss": 3.7503, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.069371575142824, | |
| "grad_norm": 0.31555989384651184, | |
| "learning_rate": 0.00057550656167979, | |
| "loss": 3.7607, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.0839454354669464, | |
| "grad_norm": 0.3106895089149475, | |
| "learning_rate": 0.000575331583552056, | |
| "loss": 3.7461, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.098519295791069, | |
| "grad_norm": 0.33727002143859863, | |
| "learning_rate": 0.000575156605424322, | |
| "loss": 3.7591, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.113093156115192, | |
| "grad_norm": 0.32135823369026184, | |
| "learning_rate": 0.0005749816272965878, | |
| "loss": 3.7584, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.1276670164393146, | |
| "grad_norm": 0.3297559916973114, | |
| "learning_rate": 0.0005748066491688538, | |
| "loss": 3.7566, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.142240876763437, | |
| "grad_norm": 0.34436535835266113, | |
| "learning_rate": 0.0005746316710411198, | |
| "loss": 3.7664, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.1568147370875597, | |
| "grad_norm": 0.3044912815093994, | |
| "learning_rate": 0.0005744566929133858, | |
| "loss": 3.7561, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.171388597411682, | |
| "grad_norm": 0.3473377525806427, | |
| "learning_rate": 0.0005742817147856517, | |
| "loss": 3.7588, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.185962457735805, | |
| "grad_norm": 0.3503433167934418, | |
| "learning_rate": 0.0005741067366579177, | |
| "loss": 3.775, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.2005363180599278, | |
| "grad_norm": 0.3245205581188202, | |
| "learning_rate": 0.0005739317585301837, | |
| "loss": 3.7608, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.2151101783840503, | |
| "grad_norm": 0.313618004322052, | |
| "learning_rate": 0.0005737567804024496, | |
| "loss": 3.7689, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.229684038708173, | |
| "grad_norm": 0.325967013835907, | |
| "learning_rate": 0.0005735818022747156, | |
| "loss": 3.7553, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.244257899032296, | |
| "grad_norm": 0.31503552198410034, | |
| "learning_rate": 0.0005734068241469816, | |
| "loss": 3.7574, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.2588317593564184, | |
| "grad_norm": 0.33212393522262573, | |
| "learning_rate": 0.0005732318460192476, | |
| "loss": 3.7583, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.273405619680541, | |
| "grad_norm": 0.33172062039375305, | |
| "learning_rate": 0.0005730568678915135, | |
| "loss": 3.7637, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.2879794800046636, | |
| "grad_norm": 0.33467426896095276, | |
| "learning_rate": 0.0005728818897637795, | |
| "loss": 3.7787, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.302553340328786, | |
| "grad_norm": 0.3073655068874359, | |
| "learning_rate": 0.0005727069116360455, | |
| "loss": 3.7648, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.317127200652909, | |
| "grad_norm": 0.3209739625453949, | |
| "learning_rate": 0.0005725319335083115, | |
| "loss": 3.7622, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.3317010609770317, | |
| "grad_norm": 0.32014501094818115, | |
| "learning_rate": 0.0005723569553805774, | |
| "loss": 3.757, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.3317010609770317, | |
| "eval_accuracy": 0.34350807760042285, | |
| "eval_loss": 3.789243459701538, | |
| "eval_runtime": 53.1145, | |
| "eval_samples_per_second": 313.041, | |
| "eval_steps_per_second": 19.58, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.3462749213011542, | |
| "grad_norm": 0.32058241963386536, | |
| "learning_rate": 0.0005721819772528434, | |
| "loss": 3.7583, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.360848781625277, | |
| "grad_norm": 0.3153894543647766, | |
| "learning_rate": 0.0005720069991251094, | |
| "loss": 3.7588, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.3754226419494, | |
| "grad_norm": 0.3164069652557373, | |
| "learning_rate": 0.0005718320209973753, | |
| "loss": 3.7473, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.3899965022735223, | |
| "grad_norm": 0.301470011472702, | |
| "learning_rate": 0.0005716570428696413, | |
| "loss": 3.7519, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.404570362597645, | |
| "grad_norm": 0.31935805082321167, | |
| "learning_rate": 0.0005714820647419073, | |
| "loss": 3.7565, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.4191442229217675, | |
| "grad_norm": 0.3229046165943146, | |
| "learning_rate": 0.0005713070866141731, | |
| "loss": 3.76, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.43371808324589, | |
| "grad_norm": 0.3205743432044983, | |
| "learning_rate": 0.0005711321084864391, | |
| "loss": 3.7615, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.448291943570013, | |
| "grad_norm": 0.33511775732040405, | |
| "learning_rate": 0.0005709571303587051, | |
| "loss": 3.7459, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.4628658038941356, | |
| "grad_norm": 0.30935677886009216, | |
| "learning_rate": 0.000570782152230971, | |
| "loss": 3.7575, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 2.477439664218258, | |
| "grad_norm": 0.31650465726852417, | |
| "learning_rate": 0.000570607174103237, | |
| "loss": 3.7435, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.4920135245423807, | |
| "grad_norm": 0.32589587569236755, | |
| "learning_rate": 0.000570432195975503, | |
| "loss": 3.7407, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 2.5065873848665037, | |
| "grad_norm": 0.33884397149086, | |
| "learning_rate": 0.0005702572178477689, | |
| "loss": 3.7479, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.5211612451906262, | |
| "grad_norm": 0.3378530740737915, | |
| "learning_rate": 0.0005700822397200349, | |
| "loss": 3.7527, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 2.535735105514749, | |
| "grad_norm": 0.31704697012901306, | |
| "learning_rate": 0.0005699072615923009, | |
| "loss": 3.7519, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.5503089658388713, | |
| "grad_norm": 0.316654235124588, | |
| "learning_rate": 0.0005697322834645668, | |
| "loss": 3.7468, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 2.564882826162994, | |
| "grad_norm": 0.3165382146835327, | |
| "learning_rate": 0.0005695573053368328, | |
| "loss": 3.7497, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.5794566864871165, | |
| "grad_norm": 0.3091493248939514, | |
| "learning_rate": 0.0005693823272090988, | |
| "loss": 3.7419, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 2.5940305468112395, | |
| "grad_norm": 0.30668315291404724, | |
| "learning_rate": 0.0005692073490813648, | |
| "loss": 3.7545, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 2.608604407135362, | |
| "grad_norm": 0.3360009789466858, | |
| "learning_rate": 0.0005690323709536307, | |
| "loss": 3.7493, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 2.6231782674594846, | |
| "grad_norm": 0.30944499373435974, | |
| "learning_rate": 0.0005688573928258967, | |
| "loss": 3.7486, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.6231782674594846, | |
| "eval_accuracy": 0.3460370319324768, | |
| "eval_loss": 3.757467031478882, | |
| "eval_runtime": 53.2431, | |
| "eval_samples_per_second": 312.284, | |
| "eval_steps_per_second": 19.533, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.6377521277836076, | |
| "grad_norm": 0.3274962902069092, | |
| "learning_rate": 0.0005686824146981627, | |
| "loss": 3.7323, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 2.65232598810773, | |
| "grad_norm": 0.3120983839035034, | |
| "learning_rate": 0.0005685074365704287, | |
| "loss": 3.7383, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 2.6668998484318527, | |
| "grad_norm": 0.31694895029067993, | |
| "learning_rate": 0.0005683324584426946, | |
| "loss": 3.7308, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 2.6814737087559752, | |
| "grad_norm": 0.33465129137039185, | |
| "learning_rate": 0.0005681574803149606, | |
| "loss": 3.7403, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 2.696047569080098, | |
| "grad_norm": 0.33591049909591675, | |
| "learning_rate": 0.0005679825021872266, | |
| "loss": 3.7318, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 2.7106214294042204, | |
| "grad_norm": 0.3214552104473114, | |
| "learning_rate": 0.0005678075240594926, | |
| "loss": 3.7361, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 2.7251952897283434, | |
| "grad_norm": 0.3201562166213989, | |
| "learning_rate": 0.0005676325459317584, | |
| "loss": 3.7222, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 2.739769150052466, | |
| "grad_norm": 0.32140442728996277, | |
| "learning_rate": 0.0005674575678040244, | |
| "loss": 3.7411, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 2.7543430103765885, | |
| "grad_norm": 0.3104303181171417, | |
| "learning_rate": 0.0005672825896762904, | |
| "loss": 3.7268, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 2.7689168707007115, | |
| "grad_norm": 0.30419158935546875, | |
| "learning_rate": 0.0005671076115485563, | |
| "loss": 3.7381, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.783490731024834, | |
| "grad_norm": 0.3095656931400299, | |
| "learning_rate": 0.0005669326334208223, | |
| "loss": 3.7364, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 2.7980645913489566, | |
| "grad_norm": 0.33916303515434265, | |
| "learning_rate": 0.0005667576552930883, | |
| "loss": 3.7464, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 2.812638451673079, | |
| "grad_norm": 0.3232407569885254, | |
| "learning_rate": 0.0005665826771653543, | |
| "loss": 3.7295, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 2.8272123119972017, | |
| "grad_norm": 0.32287347316741943, | |
| "learning_rate": 0.0005664076990376202, | |
| "loss": 3.7335, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 2.8417861723213242, | |
| "grad_norm": 0.30645373463630676, | |
| "learning_rate": 0.0005662327209098862, | |
| "loss": 3.717, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 2.8563600326454472, | |
| "grad_norm": 0.3092345595359802, | |
| "learning_rate": 0.0005660577427821522, | |
| "loss": 3.7239, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 2.87093389296957, | |
| "grad_norm": 0.3197495937347412, | |
| "learning_rate": 0.0005658827646544182, | |
| "loss": 3.739, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 2.8855077532936924, | |
| "grad_norm": 0.32873114943504333, | |
| "learning_rate": 0.0005657077865266841, | |
| "loss": 3.7426, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 2.9000816136178154, | |
| "grad_norm": 0.31242650747299194, | |
| "learning_rate": 0.0005655328083989501, | |
| "loss": 3.7344, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 2.914655473941938, | |
| "grad_norm": 0.30841541290283203, | |
| "learning_rate": 0.0005653578302712161, | |
| "loss": 3.7276, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.914655473941938, | |
| "eval_accuracy": 0.34864237158409617, | |
| "eval_loss": 3.731783151626587, | |
| "eval_runtime": 53.4879, | |
| "eval_samples_per_second": 310.856, | |
| "eval_steps_per_second": 19.444, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.9292293342660605, | |
| "grad_norm": 0.2998133897781372, | |
| "learning_rate": 0.0005651828521434821, | |
| "loss": 3.7182, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 2.943803194590183, | |
| "grad_norm": 0.2998456656932831, | |
| "learning_rate": 0.000565007874015748, | |
| "loss": 3.7475, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 2.9583770549143056, | |
| "grad_norm": 0.308724969625473, | |
| "learning_rate": 0.000564832895888014, | |
| "loss": 3.7205, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 2.972950915238428, | |
| "grad_norm": 0.3019901216030121, | |
| "learning_rate": 0.00056465791776028, | |
| "loss": 3.7242, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 2.987524775562551, | |
| "grad_norm": 0.30775097012519836, | |
| "learning_rate": 0.0005644829396325459, | |
| "loss": 3.7263, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 3.0020403404453773, | |
| "grad_norm": 0.31498757004737854, | |
| "learning_rate": 0.0005643079615048119, | |
| "loss": 3.7102, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.0166142007695, | |
| "grad_norm": 0.3100306987762451, | |
| "learning_rate": 0.0005641329833770779, | |
| "loss": 3.6121, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 3.0311880610936224, | |
| "grad_norm": 0.32588666677474976, | |
| "learning_rate": 0.0005639580052493437, | |
| "loss": 3.6233, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.045761921417745, | |
| "grad_norm": 0.3099152147769928, | |
| "learning_rate": 0.0005637830271216097, | |
| "loss": 3.6062, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 3.060335781741868, | |
| "grad_norm": 0.33343327045440674, | |
| "learning_rate": 0.0005636080489938757, | |
| "loss": 3.6271, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.0749096420659905, | |
| "grad_norm": 0.3227072060108185, | |
| "learning_rate": 0.0005634330708661417, | |
| "loss": 3.6244, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 3.089483502390113, | |
| "grad_norm": 0.3116724491119385, | |
| "learning_rate": 0.0005632580927384076, | |
| "loss": 3.6408, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 3.1040573627142356, | |
| "grad_norm": 0.3207854628562927, | |
| "learning_rate": 0.0005630831146106736, | |
| "loss": 3.6246, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 3.1186312230383586, | |
| "grad_norm": 0.33564624190330505, | |
| "learning_rate": 0.0005629081364829396, | |
| "loss": 3.6297, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 3.133205083362481, | |
| "grad_norm": 0.3260132074356079, | |
| "learning_rate": 0.0005627331583552055, | |
| "loss": 3.6283, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 3.1477789436866037, | |
| "grad_norm": 0.319987416267395, | |
| "learning_rate": 0.0005625581802274715, | |
| "loss": 3.6464, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.1623528040107263, | |
| "grad_norm": 0.3306788206100464, | |
| "learning_rate": 0.0005623832020997375, | |
| "loss": 3.6449, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 3.176926664334849, | |
| "grad_norm": 0.31286585330963135, | |
| "learning_rate": 0.0005622082239720034, | |
| "loss": 3.6245, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.191500524658972, | |
| "grad_norm": 0.3328874111175537, | |
| "learning_rate": 0.0005620332458442694, | |
| "loss": 3.6358, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 3.2060743849830944, | |
| "grad_norm": 0.31454306840896606, | |
| "learning_rate": 0.0005618582677165354, | |
| "loss": 3.6398, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.2060743849830944, | |
| "eval_accuracy": 0.3506840605494305, | |
| "eval_loss": 3.7200303077697754, | |
| "eval_runtime": 53.1789, | |
| "eval_samples_per_second": 312.662, | |
| "eval_steps_per_second": 19.557, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.220648245307217, | |
| "grad_norm": 0.34588032960891724, | |
| "learning_rate": 0.0005616832895888013, | |
| "loss": 3.6533, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 3.2352221056313395, | |
| "grad_norm": 0.313573956489563, | |
| "learning_rate": 0.0005615083114610673, | |
| "loss": 3.6573, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.249795965955462, | |
| "grad_norm": 0.31849417090415955, | |
| "learning_rate": 0.0005613333333333333, | |
| "loss": 3.6237, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 3.264369826279585, | |
| "grad_norm": 0.32497328519821167, | |
| "learning_rate": 0.0005611583552055992, | |
| "loss": 3.6429, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.2789436866037076, | |
| "grad_norm": 0.3148914873600006, | |
| "learning_rate": 0.0005609833770778652, | |
| "loss": 3.6336, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 3.29351754692783, | |
| "grad_norm": 0.3106619417667389, | |
| "learning_rate": 0.0005608083989501312, | |
| "loss": 3.6433, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.3080914072519527, | |
| "grad_norm": 0.31093135476112366, | |
| "learning_rate": 0.0005606334208223972, | |
| "loss": 3.6351, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 3.3226652675760757, | |
| "grad_norm": 0.308928906917572, | |
| "learning_rate": 0.000560458442694663, | |
| "loss": 3.6378, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.3372391279001983, | |
| "grad_norm": 0.3089756965637207, | |
| "learning_rate": 0.000560283464566929, | |
| "loss": 3.6515, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 3.351812988224321, | |
| "grad_norm": 0.3176999092102051, | |
| "learning_rate": 0.000560108486439195, | |
| "loss": 3.6379, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.3663868485484434, | |
| "grad_norm": 0.3044137954711914, | |
| "learning_rate": 0.000559933508311461, | |
| "loss": 3.6399, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 3.380960708872566, | |
| "grad_norm": 0.31266817450523376, | |
| "learning_rate": 0.0005597585301837269, | |
| "loss": 3.6518, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 3.395534569196689, | |
| "grad_norm": 0.32951512932777405, | |
| "learning_rate": 0.0005595835520559929, | |
| "loss": 3.6541, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 3.4101084295208115, | |
| "grad_norm": 0.3169146478176117, | |
| "learning_rate": 0.0005594085739282589, | |
| "loss": 3.6457, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 3.424682289844934, | |
| "grad_norm": 0.31720760464668274, | |
| "learning_rate": 0.0005592335958005249, | |
| "loss": 3.6483, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 3.4392561501690566, | |
| "grad_norm": 0.3167465925216675, | |
| "learning_rate": 0.0005590586176727908, | |
| "loss": 3.6523, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 3.4538300104931796, | |
| "grad_norm": 0.308098703622818, | |
| "learning_rate": 0.0005588836395450568, | |
| "loss": 3.6401, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 3.468403870817302, | |
| "grad_norm": 0.3026895821094513, | |
| "learning_rate": 0.0005587086614173228, | |
| "loss": 3.6505, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 3.4829777311414247, | |
| "grad_norm": 0.3005221486091614, | |
| "learning_rate": 0.0005585336832895888, | |
| "loss": 3.6451, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 3.4975515914655473, | |
| "grad_norm": 0.3093535304069519, | |
| "learning_rate": 0.0005583587051618547, | |
| "loss": 3.6512, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.4975515914655473, | |
| "eval_accuracy": 0.35233946812984374, | |
| "eval_loss": 3.7018749713897705, | |
| "eval_runtime": 53.3009, | |
| "eval_samples_per_second": 311.946, | |
| "eval_steps_per_second": 19.512, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.51212545178967, | |
| "grad_norm": 0.3283046782016754, | |
| "learning_rate": 0.0005581837270341207, | |
| "loss": 3.6413, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 3.526699312113793, | |
| "grad_norm": 0.32031574845314026, | |
| "learning_rate": 0.0005580087489063867, | |
| "loss": 3.6525, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 3.5412731724379154, | |
| "grad_norm": 0.30100202560424805, | |
| "learning_rate": 0.0005578337707786526, | |
| "loss": 3.6395, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 3.555847032762038, | |
| "grad_norm": 0.32140886783599854, | |
| "learning_rate": 0.0005576587926509186, | |
| "loss": 3.6548, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 3.5704208930861605, | |
| "grad_norm": 0.31598982214927673, | |
| "learning_rate": 0.0005574838145231846, | |
| "loss": 3.6401, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 3.5849947534102835, | |
| "grad_norm": 0.33419525623321533, | |
| "learning_rate": 0.0005573088363954506, | |
| "loss": 3.6517, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 3.599568613734406, | |
| "grad_norm": 0.32463476061820984, | |
| "learning_rate": 0.0005571338582677165, | |
| "loss": 3.6566, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 3.6141424740585286, | |
| "grad_norm": 0.30612531304359436, | |
| "learning_rate": 0.0005569588801399825, | |
| "loss": 3.6432, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 3.628716334382651, | |
| "grad_norm": 0.3223789930343628, | |
| "learning_rate": 0.0005567839020122485, | |
| "loss": 3.6425, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 3.6432901947067737, | |
| "grad_norm": 0.30926087498664856, | |
| "learning_rate": 0.0005566089238845145, | |
| "loss": 3.6481, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 3.6578640550308967, | |
| "grad_norm": 0.33441799879074097, | |
| "learning_rate": 0.0005564339457567803, | |
| "loss": 3.6521, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 3.6724379153550193, | |
| "grad_norm": 0.3189132511615753, | |
| "learning_rate": 0.0005562589676290463, | |
| "loss": 3.6557, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 3.687011775679142, | |
| "grad_norm": 0.30546867847442627, | |
| "learning_rate": 0.0005560839895013123, | |
| "loss": 3.653, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 3.7015856360032644, | |
| "grad_norm": 0.3111048936843872, | |
| "learning_rate": 0.0005559090113735782, | |
| "loss": 3.6362, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 3.7161594963273874, | |
| "grad_norm": 0.3029773235321045, | |
| "learning_rate": 0.0005557340332458442, | |
| "loss": 3.6503, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 3.73073335665151, | |
| "grad_norm": 0.31810298562049866, | |
| "learning_rate": 0.0005555590551181102, | |
| "loss": 3.6596, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 3.7453072169756325, | |
| "grad_norm": 0.3288934528827667, | |
| "learning_rate": 0.0005553840769903761, | |
| "loss": 3.6478, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 3.759881077299755, | |
| "grad_norm": 0.3125859200954437, | |
| "learning_rate": 0.0005552090988626421, | |
| "loss": 3.6452, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 3.7744549376238776, | |
| "grad_norm": 0.3162294626235962, | |
| "learning_rate": 0.0005550341207349081, | |
| "loss": 3.6445, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 3.7890287979480006, | |
| "grad_norm": 0.3025747537612915, | |
| "learning_rate": 0.000554859142607174, | |
| "loss": 3.6577, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.7890287979480006, | |
| "eval_accuracy": 0.35399511110415394, | |
| "eval_loss": 3.682143449783325, | |
| "eval_runtime": 53.2805, | |
| "eval_samples_per_second": 312.065, | |
| "eval_steps_per_second": 19.519, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.803602658272123, | |
| "grad_norm": 0.3304448425769806, | |
| "learning_rate": 0.00055468416447944, | |
| "loss": 3.6377, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 3.8181765185962457, | |
| "grad_norm": 0.29563280940055847, | |
| "learning_rate": 0.000554509186351706, | |
| "loss": 3.6455, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 3.8327503789203683, | |
| "grad_norm": 0.30134159326553345, | |
| "learning_rate": 0.000554334208223972, | |
| "loss": 3.6413, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 3.8473242392444913, | |
| "grad_norm": 0.3190068304538727, | |
| "learning_rate": 0.0005541592300962379, | |
| "loss": 3.6308, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 3.861898099568614, | |
| "grad_norm": 0.30791667103767395, | |
| "learning_rate": 0.0005539842519685039, | |
| "loss": 3.6437, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 3.8764719598927364, | |
| "grad_norm": 0.3044162094593048, | |
| "learning_rate": 0.0005538092738407699, | |
| "loss": 3.6238, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 3.891045820216859, | |
| "grad_norm": 0.31529924273490906, | |
| "learning_rate": 0.0005536342957130358, | |
| "loss": 3.6585, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 3.9056196805409815, | |
| "grad_norm": 0.2981610596179962, | |
| "learning_rate": 0.0005534593175853018, | |
| "loss": 3.6456, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 3.920193540865104, | |
| "grad_norm": 0.3133598566055298, | |
| "learning_rate": 0.0005532843394575678, | |
| "loss": 3.6483, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 3.934767401189227, | |
| "grad_norm": 0.3186839520931244, | |
| "learning_rate": 0.0005531093613298337, | |
| "loss": 3.6346, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.9493412615133496, | |
| "grad_norm": 0.3375805914402008, | |
| "learning_rate": 0.0005529343832020997, | |
| "loss": 3.6408, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 3.963915121837472, | |
| "grad_norm": 0.30909714102745056, | |
| "learning_rate": 0.0005527594050743656, | |
| "loss": 3.6381, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 3.978488982161595, | |
| "grad_norm": 0.30542704463005066, | |
| "learning_rate": 0.0005525844269466316, | |
| "loss": 3.6544, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 3.9930628424857177, | |
| "grad_norm": 0.31719449162483215, | |
| "learning_rate": 0.0005524094488188975, | |
| "loss": 3.6531, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 4.007578407368544, | |
| "grad_norm": 0.32333192229270935, | |
| "learning_rate": 0.0005522344706911635, | |
| "loss": 3.5804, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 4.022152267692666, | |
| "grad_norm": 0.33345669507980347, | |
| "learning_rate": 0.0005520594925634295, | |
| "loss": 3.5267, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 4.036726128016789, | |
| "grad_norm": 0.34269770979881287, | |
| "learning_rate": 0.0005518845144356954, | |
| "loss": 3.5349, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 4.0512999883409115, | |
| "grad_norm": 0.3315916955471039, | |
| "learning_rate": 0.0005517095363079614, | |
| "loss": 3.5392, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 4.065873848665034, | |
| "grad_norm": 0.31645020842552185, | |
| "learning_rate": 0.0005515345581802274, | |
| "loss": 3.563, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 4.080447708989157, | |
| "grad_norm": 0.3305814564228058, | |
| "learning_rate": 0.0005513595800524934, | |
| "loss": 3.5496, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.080447708989157, | |
| "eval_accuracy": 0.3554473737514855, | |
| "eval_loss": 3.676452875137329, | |
| "eval_runtime": 53.3246, | |
| "eval_samples_per_second": 311.807, | |
| "eval_steps_per_second": 19.503, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.09502156931328, | |
| "grad_norm": 0.30775004625320435, | |
| "learning_rate": 0.0005511846019247593, | |
| "loss": 3.5461, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 4.109595429637403, | |
| "grad_norm": 0.32936856150627136, | |
| "learning_rate": 0.0005510096237970253, | |
| "loss": 3.539, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 4.124169289961525, | |
| "grad_norm": 0.3322300314903259, | |
| "learning_rate": 0.0005508346456692913, | |
| "loss": 3.5399, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 4.138743150285648, | |
| "grad_norm": 0.34817028045654297, | |
| "learning_rate": 0.0005506596675415573, | |
| "loss": 3.5356, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 4.15331701060977, | |
| "grad_norm": 0.31786543130874634, | |
| "learning_rate": 0.0005504846894138232, | |
| "loss": 3.5594, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 4.167890870933893, | |
| "grad_norm": 0.3049430549144745, | |
| "learning_rate": 0.0005503097112860892, | |
| "loss": 3.5576, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 4.182464731258015, | |
| "grad_norm": 0.32783564925193787, | |
| "learning_rate": 0.0005501347331583552, | |
| "loss": 3.5633, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 4.197038591582138, | |
| "grad_norm": 0.33510342240333557, | |
| "learning_rate": 0.0005499597550306212, | |
| "loss": 3.5555, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 4.2116124519062605, | |
| "grad_norm": 0.301746666431427, | |
| "learning_rate": 0.0005497847769028871, | |
| "loss": 3.5529, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 4.226186312230384, | |
| "grad_norm": 0.33126580715179443, | |
| "learning_rate": 0.0005496097987751531, | |
| "loss": 3.5658, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.2407601725545065, | |
| "grad_norm": 0.3298133313655853, | |
| "learning_rate": 0.0005494348206474191, | |
| "loss": 3.561, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 4.255334032878629, | |
| "grad_norm": 0.33059850335121155, | |
| "learning_rate": 0.0005492598425196851, | |
| "loss": 3.5577, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 4.269907893202752, | |
| "grad_norm": 0.32975366711616516, | |
| "learning_rate": 0.000549084864391951, | |
| "loss": 3.5681, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 4.284481753526874, | |
| "grad_norm": 0.33336469531059265, | |
| "learning_rate": 0.000548909886264217, | |
| "loss": 3.572, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 4.299055613850997, | |
| "grad_norm": 0.3046822249889374, | |
| "learning_rate": 0.000548734908136483, | |
| "loss": 3.5711, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 4.313629474175119, | |
| "grad_norm": 0.3244114816188812, | |
| "learning_rate": 0.0005485599300087488, | |
| "loss": 3.5777, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 4.328203334499242, | |
| "grad_norm": 0.3168890178203583, | |
| "learning_rate": 0.0005483849518810148, | |
| "loss": 3.5727, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 4.342777194823364, | |
| "grad_norm": 0.3299658000469208, | |
| "learning_rate": 0.0005482099737532808, | |
| "loss": 3.5626, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 4.357351055147488, | |
| "grad_norm": 0.29865631461143494, | |
| "learning_rate": 0.0005480349956255468, | |
| "loss": 3.5794, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 4.37192491547161, | |
| "grad_norm": 0.33295169472694397, | |
| "learning_rate": 0.0005478600174978127, | |
| "loss": 3.5622, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.37192491547161, | |
| "eval_accuracy": 0.35635163940668024, | |
| "eval_loss": 3.6663150787353516, | |
| "eval_runtime": 53.2482, | |
| "eval_samples_per_second": 312.255, | |
| "eval_steps_per_second": 19.531, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.386498775795733, | |
| "grad_norm": 0.3243556618690491, | |
| "learning_rate": 0.0005476850393700787, | |
| "loss": 3.5707, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 4.4010726361198556, | |
| "grad_norm": 0.3029650151729584, | |
| "learning_rate": 0.0005475100612423447, | |
| "loss": 3.5766, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 4.415646496443978, | |
| "grad_norm": 0.3167254626750946, | |
| "learning_rate": 0.0005473350831146106, | |
| "loss": 3.5817, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 4.430220356768101, | |
| "grad_norm": 0.32114019989967346, | |
| "learning_rate": 0.0005471601049868766, | |
| "loss": 3.5813, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 4.444794217092223, | |
| "grad_norm": 0.3149368464946747, | |
| "learning_rate": 0.0005469851268591426, | |
| "loss": 3.5623, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 4.459368077416346, | |
| "grad_norm": 0.3121349513530731, | |
| "learning_rate": 0.0005468101487314085, | |
| "loss": 3.5768, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 4.473941937740468, | |
| "grad_norm": 0.32959362864494324, | |
| "learning_rate": 0.0005466351706036745, | |
| "loss": 3.5778, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 4.488515798064592, | |
| "grad_norm": 0.349617063999176, | |
| "learning_rate": 0.0005464601924759405, | |
| "loss": 3.5847, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 4.503089658388714, | |
| "grad_norm": 0.326031357049942, | |
| "learning_rate": 0.0005462852143482064, | |
| "loss": 3.5799, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 4.517663518712837, | |
| "grad_norm": 0.33209681510925293, | |
| "learning_rate": 0.0005461102362204724, | |
| "loss": 3.572, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 4.5322373790369594, | |
| "grad_norm": 0.30135366320610046, | |
| "learning_rate": 0.0005459352580927384, | |
| "loss": 3.5844, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 4.546811239361082, | |
| "grad_norm": 0.3455754518508911, | |
| "learning_rate": 0.0005457602799650043, | |
| "loss": 3.5838, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 4.561385099685205, | |
| "grad_norm": 0.30380716919898987, | |
| "learning_rate": 0.0005455853018372703, | |
| "loss": 3.5783, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 4.575958960009327, | |
| "grad_norm": 0.33048829436302185, | |
| "learning_rate": 0.0005454103237095363, | |
| "loss": 3.5857, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 4.59053282033345, | |
| "grad_norm": 0.32799360156059265, | |
| "learning_rate": 0.0005452353455818022, | |
| "loss": 3.5731, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 4.605106680657572, | |
| "grad_norm": 0.3069393038749695, | |
| "learning_rate": 0.0005450603674540681, | |
| "loss": 3.5851, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 4.619680540981696, | |
| "grad_norm": 0.3098676800727844, | |
| "learning_rate": 0.0005448853893263341, | |
| "loss": 3.5782, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 4.634254401305818, | |
| "grad_norm": 0.3361780643463135, | |
| "learning_rate": 0.0005447104111986001, | |
| "loss": 3.5892, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 4.648828261629941, | |
| "grad_norm": 0.3276433050632477, | |
| "learning_rate": 0.000544535433070866, | |
| "loss": 3.5825, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 4.663402121954063, | |
| "grad_norm": 0.3088076412677765, | |
| "learning_rate": 0.000544360454943132, | |
| "loss": 3.5784, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.663402121954063, | |
| "eval_accuracy": 0.3573661871026036, | |
| "eval_loss": 3.6518714427948, | |
| "eval_runtime": 53.2511, | |
| "eval_samples_per_second": 312.238, | |
| "eval_steps_per_second": 19.53, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.677975982278186, | |
| "grad_norm": 0.3108406960964203, | |
| "learning_rate": 0.000544185476815398, | |
| "loss": 3.596, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 4.6925498426023085, | |
| "grad_norm": 0.31319963932037354, | |
| "learning_rate": 0.000544010498687664, | |
| "loss": 3.5762, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 4.707123702926431, | |
| "grad_norm": 0.3112225830554962, | |
| "learning_rate": 0.0005438355205599299, | |
| "loss": 3.5807, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 4.721697563250554, | |
| "grad_norm": 0.32752206921577454, | |
| "learning_rate": 0.0005436605424321959, | |
| "loss": 3.5715, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 4.736271423574676, | |
| "grad_norm": 0.3147103786468506, | |
| "learning_rate": 0.0005434855643044619, | |
| "loss": 3.579, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 4.7508452838988, | |
| "grad_norm": 0.3242517113685608, | |
| "learning_rate": 0.0005433105861767279, | |
| "loss": 3.5785, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 4.765419144222922, | |
| "grad_norm": 0.29891932010650635, | |
| "learning_rate": 0.0005431356080489938, | |
| "loss": 3.5741, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 4.779993004547045, | |
| "grad_norm": 0.3357846736907959, | |
| "learning_rate": 0.0005429606299212598, | |
| "loss": 3.5738, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 4.794566864871167, | |
| "grad_norm": 0.3278196454048157, | |
| "learning_rate": 0.0005427856517935258, | |
| "loss": 3.5824, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 4.80914072519529, | |
| "grad_norm": 0.29816699028015137, | |
| "learning_rate": 0.0005426106736657917, | |
| "loss": 3.5886, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 4.823714585519412, | |
| "grad_norm": 0.31529319286346436, | |
| "learning_rate": 0.0005424356955380577, | |
| "loss": 3.5942, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 4.838288445843535, | |
| "grad_norm": 0.31740570068359375, | |
| "learning_rate": 0.0005422607174103237, | |
| "loss": 3.5816, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 4.8528623061676575, | |
| "grad_norm": 0.31539270281791687, | |
| "learning_rate": 0.0005420857392825897, | |
| "loss": 3.5812, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 4.86743616649178, | |
| "grad_norm": 0.3055557608604431, | |
| "learning_rate": 0.0005419107611548556, | |
| "loss": 3.583, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 4.8820100268159035, | |
| "grad_norm": 0.3405109643936157, | |
| "learning_rate": 0.0005417357830271216, | |
| "loss": 3.5824, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 4.896583887140026, | |
| "grad_norm": 0.3146877586841583, | |
| "learning_rate": 0.0005415608048993876, | |
| "loss": 3.5799, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 4.911157747464149, | |
| "grad_norm": 0.32447442412376404, | |
| "learning_rate": 0.0005413858267716535, | |
| "loss": 3.5765, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 4.925731607788271, | |
| "grad_norm": 0.3202003240585327, | |
| "learning_rate": 0.0005412108486439194, | |
| "loss": 3.5972, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 4.940305468112394, | |
| "grad_norm": 0.3180946409702301, | |
| "learning_rate": 0.0005410358705161854, | |
| "loss": 3.5874, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 4.954879328436516, | |
| "grad_norm": 0.3090243637561798, | |
| "learning_rate": 0.0005408608923884514, | |
| "loss": 3.5766, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.954879328436516, | |
| "eval_accuracy": 0.35883657508000155, | |
| "eval_loss": 3.636709690093994, | |
| "eval_runtime": 53.2502, | |
| "eval_samples_per_second": 312.243, | |
| "eval_steps_per_second": 19.53, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.969453188760639, | |
| "grad_norm": 0.30562207102775574, | |
| "learning_rate": 0.0005406859142607174, | |
| "loss": 3.5694, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 4.984027049084761, | |
| "grad_norm": 0.31890225410461426, | |
| "learning_rate": 0.0005405109361329833, | |
| "loss": 3.5781, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 4.998600909408884, | |
| "grad_norm": 0.3138265013694763, | |
| "learning_rate": 0.0005403359580052493, | |
| "loss": 3.5757, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 5.01311647429171, | |
| "grad_norm": 0.3236299753189087, | |
| "learning_rate": 0.0005401609798775153, | |
| "loss": 3.4683, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 5.0276903346158335, | |
| "grad_norm": 0.3176940381526947, | |
| "learning_rate": 0.0005399860017497813, | |
| "loss": 3.474, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 5.042264194939956, | |
| "grad_norm": 0.3468603491783142, | |
| "learning_rate": 0.0005398110236220472, | |
| "loss": 3.4827, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 5.056838055264079, | |
| "grad_norm": 0.3197671175003052, | |
| "learning_rate": 0.0005396360454943132, | |
| "loss": 3.4756, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 5.071411915588201, | |
| "grad_norm": 0.3230718672275543, | |
| "learning_rate": 0.0005394610673665792, | |
| "loss": 3.484, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 5.085985775912324, | |
| "grad_norm": 0.3250696361064911, | |
| "learning_rate": 0.0005392860892388451, | |
| "loss": 3.4808, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 5.100559636236446, | |
| "grad_norm": 0.3313814103603363, | |
| "learning_rate": 0.0005391111111111111, | |
| "loss": 3.4877, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 5.115133496560569, | |
| "grad_norm": 0.3151894807815552, | |
| "learning_rate": 0.0005389361329833771, | |
| "loss": 3.4858, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 5.129707356884691, | |
| "grad_norm": 0.31763774156570435, | |
| "learning_rate": 0.000538761154855643, | |
| "loss": 3.4894, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 5.144281217208814, | |
| "grad_norm": 0.31315064430236816, | |
| "learning_rate": 0.000538586176727909, | |
| "loss": 3.4958, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 5.158855077532937, | |
| "grad_norm": 0.3251068890094757, | |
| "learning_rate": 0.000538411198600175, | |
| "loss": 3.4978, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 5.17342893785706, | |
| "grad_norm": 0.3382989466190338, | |
| "learning_rate": 0.0005382362204724409, | |
| "loss": 3.4971, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 5.1880027981811825, | |
| "grad_norm": 0.3223947286605835, | |
| "learning_rate": 0.0005380612423447069, | |
| "loss": 3.5032, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 5.202576658505305, | |
| "grad_norm": 0.31984540820121765, | |
| "learning_rate": 0.0005378862642169729, | |
| "loss": 3.5002, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 5.217150518829428, | |
| "grad_norm": 0.2947542071342468, | |
| "learning_rate": 0.0005377112860892387, | |
| "loss": 3.5097, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 5.23172437915355, | |
| "grad_norm": 0.3217833936214447, | |
| "learning_rate": 0.0005375363079615047, | |
| "loss": 3.5042, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 5.246298239477673, | |
| "grad_norm": 0.32693716883659363, | |
| "learning_rate": 0.0005373613298337707, | |
| "loss": 3.5103, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.246298239477673, | |
| "eval_accuracy": 0.3593892799500777, | |
| "eval_loss": 3.6392245292663574, | |
| "eval_runtime": 53.2938, | |
| "eval_samples_per_second": 311.987, | |
| "eval_steps_per_second": 19.514, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.260872099801795, | |
| "grad_norm": 0.3177185654640198, | |
| "learning_rate": 0.0005371863517060366, | |
| "loss": 3.5056, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 5.275445960125918, | |
| "grad_norm": 0.338419109582901, | |
| "learning_rate": 0.0005370113735783026, | |
| "loss": 3.5179, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 5.290019820450041, | |
| "grad_norm": 0.32331186532974243, | |
| "learning_rate": 0.0005368363954505686, | |
| "loss": 3.5053, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 5.304593680774164, | |
| "grad_norm": 0.32105499505996704, | |
| "learning_rate": 0.0005366614173228346, | |
| "loss": 3.5151, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 5.319167541098286, | |
| "grad_norm": 0.35447439551353455, | |
| "learning_rate": 0.0005364864391951005, | |
| "loss": 3.5032, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 5.333741401422409, | |
| "grad_norm": 0.325785756111145, | |
| "learning_rate": 0.0005363114610673665, | |
| "loss": 3.5201, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 5.3483152617465315, | |
| "grad_norm": 0.3268141746520996, | |
| "learning_rate": 0.0005361364829396325, | |
| "loss": 3.5109, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 5.362889122070654, | |
| "grad_norm": 0.31733396649360657, | |
| "learning_rate": 0.0005359615048118984, | |
| "loss": 3.5147, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 5.377462982394777, | |
| "grad_norm": 0.317416250705719, | |
| "learning_rate": 0.0005357865266841644, | |
| "loss": 3.5209, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 5.392036842718899, | |
| "grad_norm": 0.3161477744579315, | |
| "learning_rate": 0.0005356115485564304, | |
| "loss": 3.5195, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 5.406610703043022, | |
| "grad_norm": 0.3308257758617401, | |
| "learning_rate": 0.0005354365704286964, | |
| "loss": 3.5269, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 5.421184563367145, | |
| "grad_norm": 0.3476756811141968, | |
| "learning_rate": 0.0005352615923009623, | |
| "loss": 3.5241, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 5.435758423691268, | |
| "grad_norm": 0.3217465579509735, | |
| "learning_rate": 0.0005350866141732283, | |
| "loss": 3.5084, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 5.45033228401539, | |
| "grad_norm": 0.31799283623695374, | |
| "learning_rate": 0.0005349116360454943, | |
| "loss": 3.5273, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 5.464906144339513, | |
| "grad_norm": 0.3282622992992401, | |
| "learning_rate": 0.0005347366579177603, | |
| "loss": 3.5296, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 5.479480004663635, | |
| "grad_norm": 0.29494890570640564, | |
| "learning_rate": 0.0005345616797900262, | |
| "loss": 3.5258, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 5.494053864987758, | |
| "grad_norm": 0.3265093266963959, | |
| "learning_rate": 0.0005343867016622922, | |
| "loss": 3.5197, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 5.5086277253118805, | |
| "grad_norm": 0.3251166045665741, | |
| "learning_rate": 0.0005342117235345582, | |
| "loss": 3.5268, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 5.523201585636003, | |
| "grad_norm": 0.3073839545249939, | |
| "learning_rate": 0.0005340367454068242, | |
| "loss": 3.5234, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 5.537775445960126, | |
| "grad_norm": 0.3181171417236328, | |
| "learning_rate": 0.00053386176727909, | |
| "loss": 3.5274, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.537775445960126, | |
| "eval_accuracy": 0.3603505109283382, | |
| "eval_loss": 3.629598617553711, | |
| "eval_runtime": 53.4392, | |
| "eval_samples_per_second": 311.138, | |
| "eval_steps_per_second": 19.461, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.552349306284249, | |
| "grad_norm": 0.31103068590164185, | |
| "learning_rate": 0.000533686789151356, | |
| "loss": 3.5327, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 5.566923166608372, | |
| "grad_norm": 0.33358463644981384, | |
| "learning_rate": 0.000533511811023622, | |
| "loss": 3.5186, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 5.581497026932494, | |
| "grad_norm": 0.3126921057701111, | |
| "learning_rate": 0.000533336832895888, | |
| "loss": 3.5254, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 5.596070887256617, | |
| "grad_norm": 0.3349047303199768, | |
| "learning_rate": 0.0005331618547681539, | |
| "loss": 3.5188, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 5.610644747580739, | |
| "grad_norm": 0.3327292203903198, | |
| "learning_rate": 0.0005329868766404199, | |
| "loss": 3.5282, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 5.625218607904862, | |
| "grad_norm": 0.29962432384490967, | |
| "learning_rate": 0.0005328118985126859, | |
| "loss": 3.5257, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 5.639792468228984, | |
| "grad_norm": 0.31677988171577454, | |
| "learning_rate": 0.0005326369203849518, | |
| "loss": 3.5326, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 5.654366328553107, | |
| "grad_norm": 0.3229268193244934, | |
| "learning_rate": 0.0005324619422572178, | |
| "loss": 3.5268, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 5.6689401888772295, | |
| "grad_norm": 0.3216641843318939, | |
| "learning_rate": 0.0005322869641294838, | |
| "loss": 3.5276, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 5.683514049201353, | |
| "grad_norm": 0.3101942539215088, | |
| "learning_rate": 0.0005321119860017498, | |
| "loss": 3.5275, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 5.698087909525475, | |
| "grad_norm": 0.3119480013847351, | |
| "learning_rate": 0.0005319370078740157, | |
| "loss": 3.5369, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 5.712661769849598, | |
| "grad_norm": 0.32610809803009033, | |
| "learning_rate": 0.0005317620297462817, | |
| "loss": 3.5344, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 5.727235630173721, | |
| "grad_norm": 0.32529962062835693, | |
| "learning_rate": 0.0005315870516185477, | |
| "loss": 3.5331, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 5.741809490497843, | |
| "grad_norm": 0.3158641457557678, | |
| "learning_rate": 0.0005314120734908137, | |
| "loss": 3.5367, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 5.756383350821966, | |
| "grad_norm": 0.32387575507164, | |
| "learning_rate": 0.0005312370953630796, | |
| "loss": 3.533, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 5.770957211146088, | |
| "grad_norm": 0.3478480875492096, | |
| "learning_rate": 0.0005310621172353456, | |
| "loss": 3.5447, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 5.785531071470211, | |
| "grad_norm": 0.33817481994628906, | |
| "learning_rate": 0.0005308871391076116, | |
| "loss": 3.5409, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 5.800104931794333, | |
| "grad_norm": 0.31714707612991333, | |
| "learning_rate": 0.0005307121609798775, | |
| "loss": 3.5228, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 5.814678792118456, | |
| "grad_norm": 0.3329344093799591, | |
| "learning_rate": 0.0005305371828521435, | |
| "loss": 3.5459, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 5.8292526524425785, | |
| "grad_norm": 0.3378346264362335, | |
| "learning_rate": 0.0005303622047244095, | |
| "loss": 3.5278, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.8292526524425785, | |
| "eval_accuracy": 0.36104056813729396, | |
| "eval_loss": 3.616792917251587, | |
| "eval_runtime": 53.2411, | |
| "eval_samples_per_second": 312.296, | |
| "eval_steps_per_second": 19.534, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.843826512766702, | |
| "grad_norm": 0.324442982673645, | |
| "learning_rate": 0.0005301872265966753, | |
| "loss": 3.5276, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 5.8584003730908245, | |
| "grad_norm": 0.3171297311782837, | |
| "learning_rate": 0.0005300122484689413, | |
| "loss": 3.5281, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 5.872974233414947, | |
| "grad_norm": 0.35564813017845154, | |
| "learning_rate": 0.0005298372703412073, | |
| "loss": 3.5407, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 5.88754809373907, | |
| "grad_norm": 0.3331674337387085, | |
| "learning_rate": 0.0005296622922134732, | |
| "loss": 3.5253, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 5.902121954063192, | |
| "grad_norm": 0.31681913137435913, | |
| "learning_rate": 0.0005294873140857392, | |
| "loss": 3.536, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 5.916695814387315, | |
| "grad_norm": 0.3327937722206116, | |
| "learning_rate": 0.0005293123359580052, | |
| "loss": 3.5207, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 5.931269674711437, | |
| "grad_norm": 0.339324414730072, | |
| "learning_rate": 0.0005291373578302711, | |
| "loss": 3.5475, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 5.94584353503556, | |
| "grad_norm": 0.3185977637767792, | |
| "learning_rate": 0.0005289623797025371, | |
| "loss": 3.5357, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 5.960417395359682, | |
| "grad_norm": 0.3692864179611206, | |
| "learning_rate": 0.0005287874015748031, | |
| "loss": 3.5374, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 5.974991255683806, | |
| "grad_norm": 0.30453935265541077, | |
| "learning_rate": 0.000528612423447069, | |
| "loss": 3.5386, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 5.989565116007928, | |
| "grad_norm": 0.32170945405960083, | |
| "learning_rate": 0.000528437445319335, | |
| "loss": 3.5519, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 6.0040806808907545, | |
| "grad_norm": 0.3116176128387451, | |
| "learning_rate": 0.000528262467191601, | |
| "loss": 3.5049, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 6.018654541214877, | |
| "grad_norm": 0.3333321809768677, | |
| "learning_rate": 0.000528087489063867, | |
| "loss": 3.423, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 6.033228401539, | |
| "grad_norm": 0.33255186676979065, | |
| "learning_rate": 0.0005279125109361329, | |
| "loss": 3.4219, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 6.047802261863122, | |
| "grad_norm": 0.3399498164653778, | |
| "learning_rate": 0.0005277375328083989, | |
| "loss": 3.4307, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 6.062376122187245, | |
| "grad_norm": 0.3460189402103424, | |
| "learning_rate": 0.0005275625546806649, | |
| "loss": 3.4177, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 6.076949982511367, | |
| "grad_norm": 0.3405349552631378, | |
| "learning_rate": 0.0005273875765529309, | |
| "loss": 3.4299, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 6.09152384283549, | |
| "grad_norm": 0.32472336292266846, | |
| "learning_rate": 0.0005272125984251968, | |
| "loss": 3.4404, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 6.106097703159613, | |
| "grad_norm": 0.3217617869377136, | |
| "learning_rate": 0.0005270376202974628, | |
| "loss": 3.4387, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 6.120671563483736, | |
| "grad_norm": 0.36554691195487976, | |
| "learning_rate": 0.0005268626421697288, | |
| "loss": 3.4431, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.120671563483736, | |
| "eval_accuracy": 0.3611621490850769, | |
| "eval_loss": 3.6243624687194824, | |
| "eval_runtime": 53.2667, | |
| "eval_samples_per_second": 312.146, | |
| "eval_steps_per_second": 19.524, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.135245423807858, | |
| "grad_norm": 0.32540804147720337, | |
| "learning_rate": 0.0005266876640419946, | |
| "loss": 3.4545, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 6.149819284131981, | |
| "grad_norm": 0.31584632396698, | |
| "learning_rate": 0.0005265126859142606, | |
| "loss": 3.4606, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 6.1643931444561035, | |
| "grad_norm": 0.3380563259124756, | |
| "learning_rate": 0.0005263377077865266, | |
| "loss": 3.4468, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 6.178967004780226, | |
| "grad_norm": 0.3270926773548126, | |
| "learning_rate": 0.0005261627296587926, | |
| "loss": 3.4511, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 6.193540865104349, | |
| "grad_norm": 0.33158567547798157, | |
| "learning_rate": 0.0005259877515310585, | |
| "loss": 3.4604, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 6.208114725428471, | |
| "grad_norm": 0.3384026885032654, | |
| "learning_rate": 0.0005258127734033245, | |
| "loss": 3.4532, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 6.222688585752594, | |
| "grad_norm": 0.3033508360385895, | |
| "learning_rate": 0.0005256377952755905, | |
| "loss": 3.4695, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 6.237262446076717, | |
| "grad_norm": 0.3265267610549927, | |
| "learning_rate": 0.0005254628171478565, | |
| "loss": 3.4575, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 6.25183630640084, | |
| "grad_norm": 0.3173014223575592, | |
| "learning_rate": 0.0005252878390201224, | |
| "loss": 3.4492, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 6.266410166724962, | |
| "grad_norm": 0.3339494466781616, | |
| "learning_rate": 0.0005251128608923884, | |
| "loss": 3.4624, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 6.280984027049085, | |
| "grad_norm": 0.3087364435195923, | |
| "learning_rate": 0.0005249378827646544, | |
| "loss": 3.4773, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 6.295557887373207, | |
| "grad_norm": 0.3332245349884033, | |
| "learning_rate": 0.0005247629046369204, | |
| "loss": 3.4686, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 6.31013174769733, | |
| "grad_norm": 0.3170703649520874, | |
| "learning_rate": 0.0005245879265091863, | |
| "loss": 3.4716, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 6.3247056080214525, | |
| "grad_norm": 0.3388248682022095, | |
| "learning_rate": 0.0005244129483814523, | |
| "loss": 3.4911, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 6.339279468345575, | |
| "grad_norm": 0.32150861620903015, | |
| "learning_rate": 0.0005242379702537183, | |
| "loss": 3.4705, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 6.353853328669698, | |
| "grad_norm": 0.29872483015060425, | |
| "learning_rate": 0.0005240629921259843, | |
| "loss": 3.467, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 6.368427188993821, | |
| "grad_norm": 0.302783727645874, | |
| "learning_rate": 0.0005238880139982502, | |
| "loss": 3.4742, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 6.383001049317944, | |
| "grad_norm": 0.3234187960624695, | |
| "learning_rate": 0.0005237130358705162, | |
| "loss": 3.4823, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 6.397574909642066, | |
| "grad_norm": 0.3224688470363617, | |
| "learning_rate": 0.0005235380577427822, | |
| "loss": 3.4926, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 6.412148769966189, | |
| "grad_norm": 0.35340335965156555, | |
| "learning_rate": 0.0005233630796150481, | |
| "loss": 3.4695, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.412148769966189, | |
| "eval_accuracy": 0.36229804233488616, | |
| "eval_loss": 3.6155829429626465, | |
| "eval_runtime": 53.1967, | |
| "eval_samples_per_second": 312.557, | |
| "eval_steps_per_second": 19.55, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.426722630290311, | |
| "grad_norm": 0.3472863733768463, | |
| "learning_rate": 0.0005231881014873141, | |
| "loss": 3.4776, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 6.441296490614434, | |
| "grad_norm": 0.3207915127277374, | |
| "learning_rate": 0.00052301312335958, | |
| "loss": 3.4821, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 6.455870350938556, | |
| "grad_norm": 0.33775514364242554, | |
| "learning_rate": 0.0005228381452318459, | |
| "loss": 3.475, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 6.470444211262679, | |
| "grad_norm": 0.32991212606430054, | |
| "learning_rate": 0.0005226631671041119, | |
| "loss": 3.48, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 6.4850180715868015, | |
| "grad_norm": 0.3161657154560089, | |
| "learning_rate": 0.0005224881889763779, | |
| "loss": 3.4863, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 6.499591931910924, | |
| "grad_norm": 0.3155244290828705, | |
| "learning_rate": 0.0005223132108486439, | |
| "loss": 3.4772, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 6.5141657922350475, | |
| "grad_norm": 0.332363098859787, | |
| "learning_rate": 0.0005221382327209098, | |
| "loss": 3.4868, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 6.52873965255917, | |
| "grad_norm": 0.32883933186531067, | |
| "learning_rate": 0.0005219632545931758, | |
| "loss": 3.4962, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 6.543313512883293, | |
| "grad_norm": 0.30881384015083313, | |
| "learning_rate": 0.0005217882764654418, | |
| "loss": 3.4725, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 6.557887373207415, | |
| "grad_norm": 0.31679660081863403, | |
| "learning_rate": 0.0005216132983377077, | |
| "loss": 3.4787, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 6.572461233531538, | |
| "grad_norm": 0.3190597593784332, | |
| "learning_rate": 0.0005214383202099737, | |
| "loss": 3.4981, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 6.58703509385566, | |
| "grad_norm": 0.35668647289276123, | |
| "learning_rate": 0.0005212633420822397, | |
| "loss": 3.4779, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 6.601608954179783, | |
| "grad_norm": 0.33341237902641296, | |
| "learning_rate": 0.0005210883639545056, | |
| "loss": 3.4834, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 6.616182814503905, | |
| "grad_norm": 0.31415843963623047, | |
| "learning_rate": 0.0005209133858267716, | |
| "loss": 3.4832, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 6.630756674828028, | |
| "grad_norm": 0.3208731710910797, | |
| "learning_rate": 0.0005207384076990376, | |
| "loss": 3.4914, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 6.645330535152151, | |
| "grad_norm": 0.314189076423645, | |
| "learning_rate": 0.0005205634295713035, | |
| "loss": 3.4995, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 6.659904395476274, | |
| "grad_norm": 0.3122125566005707, | |
| "learning_rate": 0.0005203884514435695, | |
| "loss": 3.4772, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 6.6744782558003966, | |
| "grad_norm": 0.3228355348110199, | |
| "learning_rate": 0.0005202134733158355, | |
| "loss": 3.4793, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 6.689052116124519, | |
| "grad_norm": 0.3040376901626587, | |
| "learning_rate": 0.0005200384951881014, | |
| "loss": 3.491, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 6.703625976448642, | |
| "grad_norm": 0.3368416726589203, | |
| "learning_rate": 0.0005198635170603674, | |
| "loss": 3.489, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.703625976448642, | |
| "eval_accuracy": 0.3630768430429981, | |
| "eval_loss": 3.6057634353637695, | |
| "eval_runtime": 53.2283, | |
| "eval_samples_per_second": 312.372, | |
| "eval_steps_per_second": 19.538, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.718199836772764, | |
| "grad_norm": 0.33878281712532043, | |
| "learning_rate": 0.0005196885389326334, | |
| "loss": 3.4806, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 6.732773697096887, | |
| "grad_norm": 0.30997946858406067, | |
| "learning_rate": 0.0005195135608048994, | |
| "loss": 3.4829, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 6.747347557421009, | |
| "grad_norm": 0.34856128692626953, | |
| "learning_rate": 0.0005193385826771652, | |
| "loss": 3.4915, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 6.761921417745132, | |
| "grad_norm": 0.33696675300598145, | |
| "learning_rate": 0.0005191636045494312, | |
| "loss": 3.4904, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 6.776495278069255, | |
| "grad_norm": 0.338143527507782, | |
| "learning_rate": 0.0005189886264216972, | |
| "loss": 3.5018, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 6.791069138393378, | |
| "grad_norm": 0.31334543228149414, | |
| "learning_rate": 0.0005188136482939632, | |
| "loss": 3.5003, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 6.8056429987175004, | |
| "grad_norm": 0.33318978548049927, | |
| "learning_rate": 0.0005186386701662291, | |
| "loss": 3.5064, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 6.820216859041623, | |
| "grad_norm": 0.32178571820259094, | |
| "learning_rate": 0.0005184636920384951, | |
| "loss": 3.4924, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 6.834790719365746, | |
| "grad_norm": 0.3570283353328705, | |
| "learning_rate": 0.0005182887139107611, | |
| "loss": 3.4984, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 6.849364579689868, | |
| "grad_norm": 0.3409295380115509, | |
| "learning_rate": 0.0005181137357830271, | |
| "loss": 3.4973, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 6.863938440013991, | |
| "grad_norm": 0.3298083543777466, | |
| "learning_rate": 0.000517938757655293, | |
| "loss": 3.4946, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 6.878512300338113, | |
| "grad_norm": 0.3032434582710266, | |
| "learning_rate": 0.000517763779527559, | |
| "loss": 3.484, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 6.893086160662236, | |
| "grad_norm": 0.3227110505104065, | |
| "learning_rate": 0.000517588801399825, | |
| "loss": 3.5059, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 6.907660020986359, | |
| "grad_norm": 0.32623517513275146, | |
| "learning_rate": 0.0005174138232720909, | |
| "loss": 3.493, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 6.922233881310482, | |
| "grad_norm": 0.3259028196334839, | |
| "learning_rate": 0.0005172388451443569, | |
| "loss": 3.4902, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 6.936807741634604, | |
| "grad_norm": 0.3084574043750763, | |
| "learning_rate": 0.0005170638670166229, | |
| "loss": 3.5085, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 6.951381601958727, | |
| "grad_norm": 0.33005186915397644, | |
| "learning_rate": 0.0005168888888888889, | |
| "loss": 3.4998, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 6.9659554622828495, | |
| "grad_norm": 0.3244883418083191, | |
| "learning_rate": 0.0005167139107611548, | |
| "loss": 3.4995, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 6.980529322606972, | |
| "grad_norm": 0.3078853189945221, | |
| "learning_rate": 0.0005165389326334208, | |
| "loss": 3.4915, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 6.995103182931095, | |
| "grad_norm": 0.3137339651584625, | |
| "learning_rate": 0.0005163639545056868, | |
| "loss": 3.5016, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 6.995103182931095, | |
| "eval_accuracy": 0.3637403007415967, | |
| "eval_loss": 3.5967910289764404, | |
| "eval_runtime": 53.4115, | |
| "eval_samples_per_second": 311.3, | |
| "eval_steps_per_second": 19.471, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 7.009618747813921, | |
| "grad_norm": 0.3230603337287903, | |
| "learning_rate": 0.0005161889763779528, | |
| "loss": 3.4228, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 7.024192608138043, | |
| "grad_norm": 0.32526594400405884, | |
| "learning_rate": 0.0005160139982502187, | |
| "loss": 3.3865, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 7.038766468462167, | |
| "grad_norm": 0.3268178105354309, | |
| "learning_rate": 0.0005158390201224847, | |
| "loss": 3.383, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 7.053340328786289, | |
| "grad_norm": 0.3226619362831116, | |
| "learning_rate": 0.0005156640419947507, | |
| "loss": 3.376, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 7.067914189110412, | |
| "grad_norm": 0.3219951093196869, | |
| "learning_rate": 0.0005154890638670167, | |
| "loss": 3.4019, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 7.082488049434534, | |
| "grad_norm": 0.3288451135158539, | |
| "learning_rate": 0.0005153140857392825, | |
| "loss": 3.4092, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 7.097061909758657, | |
| "grad_norm": 0.3031003475189209, | |
| "learning_rate": 0.0005151391076115485, | |
| "loss": 3.4013, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 7.1116357700827795, | |
| "grad_norm": 0.34017258882522583, | |
| "learning_rate": 0.0005149641294838145, | |
| "loss": 3.4129, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 7.126209630406902, | |
| "grad_norm": 0.3318200409412384, | |
| "learning_rate": 0.0005147891513560804, | |
| "loss": 3.4126, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 7.140783490731025, | |
| "grad_norm": 0.3410212993621826, | |
| "learning_rate": 0.0005146141732283464, | |
| "loss": 3.4194, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 7.155357351055147, | |
| "grad_norm": 0.3241153955459595, | |
| "learning_rate": 0.0005144391951006124, | |
| "loss": 3.4164, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 7.16993121137927, | |
| "grad_norm": 0.326886922121048, | |
| "learning_rate": 0.0005142642169728783, | |
| "loss": 3.4167, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 7.184505071703393, | |
| "grad_norm": 0.33190369606018066, | |
| "learning_rate": 0.0005140892388451443, | |
| "loss": 3.4221, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 7.199078932027516, | |
| "grad_norm": 0.331858366727829, | |
| "learning_rate": 0.0005139142607174103, | |
| "loss": 3.4321, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 7.213652792351638, | |
| "grad_norm": 0.35590073466300964, | |
| "learning_rate": 0.0005137392825896762, | |
| "loss": 3.4218, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 7.228226652675761, | |
| "grad_norm": 0.3808642625808716, | |
| "learning_rate": 0.0005135643044619422, | |
| "loss": 3.4253, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 7.242800512999883, | |
| "grad_norm": 0.32019373774528503, | |
| "learning_rate": 0.0005133893263342082, | |
| "loss": 3.427, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 7.257374373324006, | |
| "grad_norm": 0.3348955512046814, | |
| "learning_rate": 0.0005132143482064742, | |
| "loss": 3.4321, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 7.2719482336481285, | |
| "grad_norm": 0.32612475752830505, | |
| "learning_rate": 0.0005130393700787401, | |
| "loss": 3.4371, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 7.286522093972251, | |
| "grad_norm": 0.336725115776062, | |
| "learning_rate": 0.0005128643919510061, | |
| "loss": 3.429, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 7.286522093972251, | |
| "eval_accuracy": 0.3636725072992705, | |
| "eval_loss": 3.6023082733154297, | |
| "eval_runtime": 53.3475, | |
| "eval_samples_per_second": 311.674, | |
| "eval_steps_per_second": 19.495, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 7.301095954296374, | |
| "grad_norm": 0.31937819719314575, | |
| "learning_rate": 0.0005126894138232721, | |
| "loss": 3.4346, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 7.315669814620497, | |
| "grad_norm": 0.3397108316421509, | |
| "learning_rate": 0.000512514435695538, | |
| "loss": 3.4382, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 7.33024367494462, | |
| "grad_norm": 0.3680172860622406, | |
| "learning_rate": 0.000512339457567804, | |
| "loss": 3.4343, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 7.344817535268742, | |
| "grad_norm": 0.33497798442840576, | |
| "learning_rate": 0.00051216447944007, | |
| "loss": 3.4407, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 7.359391395592865, | |
| "grad_norm": 0.3387533724308014, | |
| "learning_rate": 0.0005119895013123358, | |
| "loss": 3.4499, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 7.373965255916987, | |
| "grad_norm": 0.3436633050441742, | |
| "learning_rate": 0.0005118145231846018, | |
| "loss": 3.4469, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 7.38853911624111, | |
| "grad_norm": 0.33034539222717285, | |
| "learning_rate": 0.0005116395450568678, | |
| "loss": 3.4447, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 7.403112976565232, | |
| "grad_norm": 0.3452779948711395, | |
| "learning_rate": 0.0005114645669291338, | |
| "loss": 3.4325, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 7.417686836889355, | |
| "grad_norm": 0.3401397168636322, | |
| "learning_rate": 0.0005112895888013997, | |
| "loss": 3.4428, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 7.4322606972134775, | |
| "grad_norm": 0.32042157649993896, | |
| "learning_rate": 0.0005111146106736657, | |
| "loss": 3.4451, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 7.446834557537601, | |
| "grad_norm": 0.3398337960243225, | |
| "learning_rate": 0.0005109396325459317, | |
| "loss": 3.4465, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 7.4614084178617235, | |
| "grad_norm": 0.34229588508605957, | |
| "learning_rate": 0.0005107646544181976, | |
| "loss": 3.4414, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 7.475982278185846, | |
| "grad_norm": 0.3290734887123108, | |
| "learning_rate": 0.0005105896762904636, | |
| "loss": 3.4602, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 7.490556138509969, | |
| "grad_norm": 0.31352612376213074, | |
| "learning_rate": 0.0005104146981627296, | |
| "loss": 3.4439, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 7.505129998834091, | |
| "grad_norm": 0.34663698077201843, | |
| "learning_rate": 0.0005102397200349956, | |
| "loss": 3.4551, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 7.519703859158214, | |
| "grad_norm": 0.32665345072746277, | |
| "learning_rate": 0.0005100647419072615, | |
| "loss": 3.4437, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 7.534277719482336, | |
| "grad_norm": 0.33366599678993225, | |
| "learning_rate": 0.0005098897637795275, | |
| "loss": 3.4361, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 7.548851579806459, | |
| "grad_norm": 0.34481877088546753, | |
| "learning_rate": 0.0005097147856517935, | |
| "loss": 3.4411, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 7.563425440130581, | |
| "grad_norm": 0.3386363983154297, | |
| "learning_rate": 0.0005095398075240595, | |
| "loss": 3.4539, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 7.577999300454705, | |
| "grad_norm": 0.3457847535610199, | |
| "learning_rate": 0.0005093648293963254, | |
| "loss": 3.4487, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.577999300454705, | |
| "eval_accuracy": 0.36446955103439727, | |
| "eval_loss": 3.5960311889648438, | |
| "eval_runtime": 53.3428, | |
| "eval_samples_per_second": 311.701, | |
| "eval_steps_per_second": 19.497, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.592573160778827, | |
| "grad_norm": 0.33010855317115784, | |
| "learning_rate": 0.0005091898512685914, | |
| "loss": 3.4511, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 7.60714702110295, | |
| "grad_norm": 0.33151471614837646, | |
| "learning_rate": 0.0005090148731408574, | |
| "loss": 3.4468, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 7.6217208814270725, | |
| "grad_norm": 0.327346533536911, | |
| "learning_rate": 0.0005088398950131234, | |
| "loss": 3.458, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 7.636294741751195, | |
| "grad_norm": 0.3349602520465851, | |
| "learning_rate": 0.0005086649168853893, | |
| "loss": 3.4536, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 7.650868602075318, | |
| "grad_norm": 0.34411633014678955, | |
| "learning_rate": 0.0005084899387576553, | |
| "loss": 3.455, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 7.66544246239944, | |
| "grad_norm": 0.333551824092865, | |
| "learning_rate": 0.0005083149606299213, | |
| "loss": 3.4646, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 7.680016322723563, | |
| "grad_norm": 0.3610248863697052, | |
| "learning_rate": 0.0005081399825021873, | |
| "loss": 3.4596, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 7.694590183047685, | |
| "grad_norm": 0.3180255591869354, | |
| "learning_rate": 0.0005079650043744531, | |
| "loss": 3.4569, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 7.709164043371809, | |
| "grad_norm": 0.32871150970458984, | |
| "learning_rate": 0.0005077900262467191, | |
| "loss": 3.4467, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 7.723737903695931, | |
| "grad_norm": 0.32751843333244324, | |
| "learning_rate": 0.0005076150481189851, | |
| "loss": 3.4441, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 7.738311764020054, | |
| "grad_norm": 0.3273012042045593, | |
| "learning_rate": 0.000507440069991251, | |
| "loss": 3.4676, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 7.752885624344176, | |
| "grad_norm": 0.32619708776474, | |
| "learning_rate": 0.000507265091863517, | |
| "loss": 3.4606, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 7.767459484668299, | |
| "grad_norm": 0.36359405517578125, | |
| "learning_rate": 0.000507090113735783, | |
| "loss": 3.4586, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 7.7820333449924215, | |
| "grad_norm": 0.3292996883392334, | |
| "learning_rate": 0.000506915135608049, | |
| "loss": 3.465, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 7.796607205316544, | |
| "grad_norm": 0.3375926911830902, | |
| "learning_rate": 0.0005067401574803149, | |
| "loss": 3.4535, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 7.811181065640667, | |
| "grad_norm": 0.31585800647735596, | |
| "learning_rate": 0.0005065651793525809, | |
| "loss": 3.4642, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 7.825754925964789, | |
| "grad_norm": 0.3204030692577362, | |
| "learning_rate": 0.0005063902012248469, | |
| "loss": 3.4499, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 7.840328786288913, | |
| "grad_norm": 0.3427492082118988, | |
| "learning_rate": 0.0005062152230971128, | |
| "loss": 3.4637, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 7.854902646613035, | |
| "grad_norm": 0.34627190232276917, | |
| "learning_rate": 0.0005060402449693788, | |
| "loss": 3.4664, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 7.869476506937158, | |
| "grad_norm": 0.3362285792827606, | |
| "learning_rate": 0.0005058652668416448, | |
| "loss": 3.4749, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.869476506937158, | |
| "eval_accuracy": 0.36517149563514983, | |
| "eval_loss": 3.586765766143799, | |
| "eval_runtime": 53.4947, | |
| "eval_samples_per_second": 310.816, | |
| "eval_steps_per_second": 19.441, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.88405036726128, | |
| "grad_norm": 0.3395719826221466, | |
| "learning_rate": 0.0005056902887139107, | |
| "loss": 3.4635, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 7.898624227585403, | |
| "grad_norm": 0.34437862038612366, | |
| "learning_rate": 0.0005055153105861767, | |
| "loss": 3.4604, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 7.913198087909525, | |
| "grad_norm": 0.32952070236206055, | |
| "learning_rate": 0.0005053403324584427, | |
| "loss": 3.4611, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 7.927771948233648, | |
| "grad_norm": 0.3154241740703583, | |
| "learning_rate": 0.0005051653543307086, | |
| "loss": 3.4679, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 7.9423458085577705, | |
| "grad_norm": 0.3297794461250305, | |
| "learning_rate": 0.0005049903762029746, | |
| "loss": 3.4629, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 7.956919668881893, | |
| "grad_norm": 0.3234383463859558, | |
| "learning_rate": 0.0005048153980752406, | |
| "loss": 3.4671, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 7.9714935292060165, | |
| "grad_norm": 0.33262306451797485, | |
| "learning_rate": 0.0005046404199475064, | |
| "loss": 3.4733, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 7.986067389530139, | |
| "grad_norm": 0.3423873484134674, | |
| "learning_rate": 0.0005044654418197724, | |
| "loss": 3.4691, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 8.000582954412964, | |
| "grad_norm": 0.3569919466972351, | |
| "learning_rate": 0.0005042904636920384, | |
| "loss": 3.4555, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 8.015156814737088, | |
| "grad_norm": 0.3198401927947998, | |
| "learning_rate": 0.0005041154855643044, | |
| "loss": 3.3531, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 8.02973067506121, | |
| "grad_norm": 0.32315894961357117, | |
| "learning_rate": 0.0005039405074365703, | |
| "loss": 3.3566, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 8.044304535385333, | |
| "grad_norm": 0.33363547921180725, | |
| "learning_rate": 0.0005037655293088363, | |
| "loss": 3.3581, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 8.058878395709456, | |
| "grad_norm": 0.3445553779602051, | |
| "learning_rate": 0.0005035905511811023, | |
| "loss": 3.3566, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 8.073452256033578, | |
| "grad_norm": 0.3616427183151245, | |
| "learning_rate": 0.0005034155730533682, | |
| "loss": 3.377, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 8.088026116357701, | |
| "grad_norm": 0.37287381291389465, | |
| "learning_rate": 0.0005032405949256342, | |
| "loss": 3.3849, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 8.102599976681823, | |
| "grad_norm": 0.339351087808609, | |
| "learning_rate": 0.0005030656167979002, | |
| "loss": 3.3738, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 8.117173837005947, | |
| "grad_norm": 0.32417935132980347, | |
| "learning_rate": 0.0005028906386701662, | |
| "loss": 3.37, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 8.131747697330068, | |
| "grad_norm": 0.34558215737342834, | |
| "learning_rate": 0.0005027156605424321, | |
| "loss": 3.387, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 8.146321557654192, | |
| "grad_norm": 0.3561769127845764, | |
| "learning_rate": 0.0005025406824146981, | |
| "loss": 3.3842, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 8.160895417978313, | |
| "grad_norm": 0.33284032344818115, | |
| "learning_rate": 0.0005023657042869641, | |
| "loss": 3.3917, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 8.160895417978313, | |
| "eval_accuracy": 0.36516772933279834, | |
| "eval_loss": 3.596513271331787, | |
| "eval_runtime": 53.192, | |
| "eval_samples_per_second": 312.585, | |
| "eval_steps_per_second": 19.552, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 8.175469278302437, | |
| "grad_norm": 0.3167806565761566, | |
| "learning_rate": 0.0005021907261592301, | |
| "loss": 3.3807, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 8.19004313862656, | |
| "grad_norm": 0.3251250386238098, | |
| "learning_rate": 0.000502015748031496, | |
| "loss": 3.3931, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 8.204616998950682, | |
| "grad_norm": 0.33859023451805115, | |
| "learning_rate": 0.000501840769903762, | |
| "loss": 3.3845, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 8.219190859274805, | |
| "grad_norm": 0.34484195709228516, | |
| "learning_rate": 0.000501665791776028, | |
| "loss": 3.3959, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 8.233764719598927, | |
| "grad_norm": 0.3301757872104645, | |
| "learning_rate": 0.0005014908136482939, | |
| "loss": 3.3935, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 8.24833857992305, | |
| "grad_norm": 0.35394570231437683, | |
| "learning_rate": 0.0005013158355205599, | |
| "loss": 3.3925, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 8.262912440247172, | |
| "grad_norm": 0.32209792733192444, | |
| "learning_rate": 0.0005011408573928259, | |
| "loss": 3.4074, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 8.277486300571296, | |
| "grad_norm": 0.36506566405296326, | |
| "learning_rate": 0.0005009658792650919, | |
| "loss": 3.4031, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 8.292060160895417, | |
| "grad_norm": 0.3617514371871948, | |
| "learning_rate": 0.0005007909011373577, | |
| "loss": 3.405, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 8.30663402121954, | |
| "grad_norm": 0.33067309856414795, | |
| "learning_rate": 0.0005006159230096237, | |
| "loss": 3.4053, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 8.321207881543664, | |
| "grad_norm": 0.3240738809108734, | |
| "learning_rate": 0.0005004409448818897, | |
| "loss": 3.4026, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 8.335781741867786, | |
| "grad_norm": 0.31030556559562683, | |
| "learning_rate": 0.0005002659667541557, | |
| "loss": 3.4, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 8.35035560219191, | |
| "grad_norm": 0.3443307876586914, | |
| "learning_rate": 0.0005000909886264216, | |
| "loss": 3.4171, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 8.36492946251603, | |
| "grad_norm": 0.34475865960121155, | |
| "learning_rate": 0.0004999160104986876, | |
| "loss": 3.4069, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 8.379503322840154, | |
| "grad_norm": 0.33189693093299866, | |
| "learning_rate": 0.0004997410323709536, | |
| "loss": 3.4186, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 8.394077183164276, | |
| "grad_norm": 0.3285791277885437, | |
| "learning_rate": 0.0004995660542432196, | |
| "loss": 3.4207, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 8.4086510434884, | |
| "grad_norm": 0.35239142179489136, | |
| "learning_rate": 0.0004993910761154855, | |
| "loss": 3.4052, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 8.423224903812521, | |
| "grad_norm": 0.34239187836647034, | |
| "learning_rate": 0.0004992160979877515, | |
| "loss": 3.4027, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 8.437798764136645, | |
| "grad_norm": 0.34864264726638794, | |
| "learning_rate": 0.0004990411198600175, | |
| "loss": 3.4147, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 8.452372624460768, | |
| "grad_norm": 0.3308429718017578, | |
| "learning_rate": 0.0004988661417322835, | |
| "loss": 3.4029, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.452372624460768, | |
| "eval_accuracy": 0.36567417930212065, | |
| "eval_loss": 3.587951421737671, | |
| "eval_runtime": 53.2307, | |
| "eval_samples_per_second": 312.358, | |
| "eval_steps_per_second": 19.538, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.46694648478489, | |
| "grad_norm": 0.3271889388561249, | |
| "learning_rate": 0.0004986911636045494, | |
| "loss": 3.4046, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 8.481520345109013, | |
| "grad_norm": 0.34479963779449463, | |
| "learning_rate": 0.0004985161854768154, | |
| "loss": 3.4121, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 8.496094205433135, | |
| "grad_norm": 0.33550480008125305, | |
| "learning_rate": 0.0004983412073490814, | |
| "loss": 3.4261, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 8.510668065757258, | |
| "grad_norm": 0.3231935203075409, | |
| "learning_rate": 0.0004981662292213473, | |
| "loss": 3.4109, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 8.52524192608138, | |
| "grad_norm": 0.3306845426559448, | |
| "learning_rate": 0.0004979912510936133, | |
| "loss": 3.4212, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 8.539815786405503, | |
| "grad_norm": 0.3305637538433075, | |
| "learning_rate": 0.0004978162729658793, | |
| "loss": 3.4335, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 8.554389646729625, | |
| "grad_norm": 0.3217015862464905, | |
| "learning_rate": 0.0004976412948381452, | |
| "loss": 3.414, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 8.568963507053748, | |
| "grad_norm": 0.32402655482292175, | |
| "learning_rate": 0.0004974663167104112, | |
| "loss": 3.4238, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 8.583537367377872, | |
| "grad_norm": 0.3909885883331299, | |
| "learning_rate": 0.0004972913385826772, | |
| "loss": 3.4278, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 8.598111227701994, | |
| "grad_norm": 0.33252859115600586, | |
| "learning_rate": 0.000497116360454943, | |
| "loss": 3.4168, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 8.612685088026117, | |
| "grad_norm": 0.3216465413570404, | |
| "learning_rate": 0.000496941382327209, | |
| "loss": 3.4269, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 8.627258948350239, | |
| "grad_norm": 0.31453585624694824, | |
| "learning_rate": 0.000496766404199475, | |
| "loss": 3.4184, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 8.641832808674362, | |
| "grad_norm": 0.32268640398979187, | |
| "learning_rate": 0.0004965914260717409, | |
| "loss": 3.4316, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 8.656406668998484, | |
| "grad_norm": 0.33388710021972656, | |
| "learning_rate": 0.0004964164479440069, | |
| "loss": 3.4366, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 8.670980529322607, | |
| "grad_norm": 0.3239559829235077, | |
| "learning_rate": 0.0004962414698162729, | |
| "loss": 3.4147, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 8.685554389646729, | |
| "grad_norm": 0.32564789056777954, | |
| "learning_rate": 0.0004960664916885388, | |
| "loss": 3.4322, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 8.700128249970852, | |
| "grad_norm": 0.3407617211341858, | |
| "learning_rate": 0.0004958915135608048, | |
| "loss": 3.4322, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 8.714702110294976, | |
| "grad_norm": 0.31778067350387573, | |
| "learning_rate": 0.0004957165354330708, | |
| "loss": 3.4334, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 8.729275970619097, | |
| "grad_norm": 0.34262266755104065, | |
| "learning_rate": 0.0004955415573053368, | |
| "loss": 3.4418, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 8.74384983094322, | |
| "grad_norm": 0.33617204427719116, | |
| "learning_rate": 0.0004953665791776027, | |
| "loss": 3.4309, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.74384983094322, | |
| "eval_accuracy": 0.36643509007406316, | |
| "eval_loss": 3.5787062644958496, | |
| "eval_runtime": 53.2083, | |
| "eval_samples_per_second": 312.489, | |
| "eval_steps_per_second": 19.546, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.758423691267343, | |
| "grad_norm": 0.31523507833480835, | |
| "learning_rate": 0.0004951916010498687, | |
| "loss": 3.44, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 8.772997551591466, | |
| "grad_norm": 0.3123234510421753, | |
| "learning_rate": 0.0004950166229221347, | |
| "loss": 3.4403, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 8.787571411915588, | |
| "grad_norm": 0.32988399267196655, | |
| "learning_rate": 0.0004948416447944006, | |
| "loss": 3.4256, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 8.802145272239711, | |
| "grad_norm": 0.37780627608299255, | |
| "learning_rate": 0.0004946666666666666, | |
| "loss": 3.4506, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 8.816719132563833, | |
| "grad_norm": 0.3315257132053375, | |
| "learning_rate": 0.0004944916885389326, | |
| "loss": 3.4441, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 8.831292992887956, | |
| "grad_norm": 0.3197008967399597, | |
| "learning_rate": 0.0004943167104111986, | |
| "loss": 3.4346, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 8.84586685321208, | |
| "grad_norm": 0.3454113006591797, | |
| "learning_rate": 0.0004941417322834645, | |
| "loss": 3.4251, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 8.860440713536201, | |
| "grad_norm": 0.3199872672557831, | |
| "learning_rate": 0.0004939667541557305, | |
| "loss": 3.4346, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 8.875014573860325, | |
| "grad_norm": 0.3247497081756592, | |
| "learning_rate": 0.0004937917760279965, | |
| "loss": 3.4313, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 8.889588434184446, | |
| "grad_norm": 0.31507542729377747, | |
| "learning_rate": 0.0004936167979002625, | |
| "loss": 3.4333, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 8.90416229450857, | |
| "grad_norm": 0.30630066990852356, | |
| "learning_rate": 0.0004934418197725284, | |
| "loss": 3.4309, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 8.918736154832692, | |
| "grad_norm": 0.31729817390441895, | |
| "learning_rate": 0.0004932668416447943, | |
| "loss": 3.4386, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 8.933310015156815, | |
| "grad_norm": 0.3511722683906555, | |
| "learning_rate": 0.0004930918635170603, | |
| "loss": 3.4362, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 8.947883875480937, | |
| "grad_norm": 0.33442434668540955, | |
| "learning_rate": 0.0004929168853893263, | |
| "loss": 3.4393, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 8.96245773580506, | |
| "grad_norm": 0.34247684478759766, | |
| "learning_rate": 0.0004927419072615922, | |
| "loss": 3.4459, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 8.977031596129184, | |
| "grad_norm": 0.3286040723323822, | |
| "learning_rate": 0.0004925669291338582, | |
| "loss": 3.4431, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 8.991605456453305, | |
| "grad_norm": 0.321585088968277, | |
| "learning_rate": 0.0004923919510061242, | |
| "loss": 3.4367, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 9.006121021336131, | |
| "grad_norm": 0.331307590007782, | |
| "learning_rate": 0.0004922169728783901, | |
| "loss": 3.3862, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 9.020694881660255, | |
| "grad_norm": 0.37021103501319885, | |
| "learning_rate": 0.0004920419947506561, | |
| "loss": 3.3223, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 9.035268741984376, | |
| "grad_norm": 0.33175399899482727, | |
| "learning_rate": 0.0004918670166229221, | |
| "loss": 3.3335, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 9.035268741984376, | |
| "eval_accuracy": 0.36600290687923365, | |
| "eval_loss": 3.586287498474121, | |
| "eval_runtime": 53.3438, | |
| "eval_samples_per_second": 311.695, | |
| "eval_steps_per_second": 19.496, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 9.0498426023085, | |
| "grad_norm": 0.3926156163215637, | |
| "learning_rate": 0.0004916920384951881, | |
| "loss": 3.3284, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 9.064416462632622, | |
| "grad_norm": 0.3375106751918793, | |
| "learning_rate": 0.000491517060367454, | |
| "loss": 3.3468, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 9.078990322956745, | |
| "grad_norm": 0.3336063623428345, | |
| "learning_rate": 0.00049134208223972, | |
| "loss": 3.3463, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 9.093564183280867, | |
| "grad_norm": 0.3396972715854645, | |
| "learning_rate": 0.000491167104111986, | |
| "loss": 3.3374, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 9.10813804360499, | |
| "grad_norm": 0.3279450833797455, | |
| "learning_rate": 0.000490992125984252, | |
| "loss": 3.3375, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 9.122711903929114, | |
| "grad_norm": 0.36093869805336, | |
| "learning_rate": 0.0004908171478565179, | |
| "loss": 3.3489, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 9.137285764253235, | |
| "grad_norm": 0.3507830798625946, | |
| "learning_rate": 0.0004906421697287839, | |
| "loss": 3.3481, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 9.151859624577359, | |
| "grad_norm": 0.33186662197113037, | |
| "learning_rate": 0.0004904671916010499, | |
| "loss": 3.3527, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 9.16643348490148, | |
| "grad_norm": 0.34464725852012634, | |
| "learning_rate": 0.0004902922134733158, | |
| "loss": 3.3577, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 9.181007345225604, | |
| "grad_norm": 0.3341642916202545, | |
| "learning_rate": 0.0004901172353455818, | |
| "loss": 3.3541, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 9.195581205549725, | |
| "grad_norm": 0.3413945436477661, | |
| "learning_rate": 0.0004899422572178478, | |
| "loss": 3.3549, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 9.210155065873849, | |
| "grad_norm": 0.344498872756958, | |
| "learning_rate": 0.0004897672790901138, | |
| "loss": 3.3726, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 9.22472892619797, | |
| "grad_norm": 0.3580266833305359, | |
| "learning_rate": 0.0004895923009623796, | |
| "loss": 3.3629, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 9.239302786522094, | |
| "grad_norm": 0.33636781573295593, | |
| "learning_rate": 0.0004894173228346456, | |
| "loss": 3.3797, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 9.253876646846217, | |
| "grad_norm": 0.3473851978778839, | |
| "learning_rate": 0.0004892423447069116, | |
| "loss": 3.3811, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 9.26845050717034, | |
| "grad_norm": 0.3385968506336212, | |
| "learning_rate": 0.0004890673665791775, | |
| "loss": 3.3737, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 9.283024367494463, | |
| "grad_norm": 0.34464573860168457, | |
| "learning_rate": 0.0004888923884514435, | |
| "loss": 3.3843, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 9.297598227818584, | |
| "grad_norm": 0.3294459581375122, | |
| "learning_rate": 0.0004887174103237095, | |
| "loss": 3.3717, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 9.312172088142708, | |
| "grad_norm": 0.32553818821907043, | |
| "learning_rate": 0.0004885424321959754, | |
| "loss": 3.3829, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 9.32674594846683, | |
| "grad_norm": 0.35752755403518677, | |
| "learning_rate": 0.0004883674540682414, | |
| "loss": 3.3802, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 9.32674594846683, | |
| "eval_accuracy": 0.36622535411186646, | |
| "eval_loss": 3.584019899368286, | |
| "eval_runtime": 53.3181, | |
| "eval_samples_per_second": 311.845, | |
| "eval_steps_per_second": 19.506, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 9.341319808790953, | |
| "grad_norm": 0.34649282693862915, | |
| "learning_rate": 0.00048819247594050736, | |
| "loss": 3.3942, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 9.355893669115074, | |
| "grad_norm": 0.3347167670726776, | |
| "learning_rate": 0.00048801749781277336, | |
| "loss": 3.3805, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 9.370467529439198, | |
| "grad_norm": 0.32364505529403687, | |
| "learning_rate": 0.00048784251968503936, | |
| "loss": 3.3936, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 9.385041389763321, | |
| "grad_norm": 0.3570871353149414, | |
| "learning_rate": 0.0004876675415573053, | |
| "loss": 3.3723, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 9.399615250087443, | |
| "grad_norm": 0.3575366735458374, | |
| "learning_rate": 0.00048749256342957124, | |
| "loss": 3.3838, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 9.414189110411566, | |
| "grad_norm": 0.35830157995224, | |
| "learning_rate": 0.00048731758530183724, | |
| "loss": 3.3849, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 9.428762970735688, | |
| "grad_norm": 0.33467957377433777, | |
| "learning_rate": 0.0004871426071741032, | |
| "loss": 3.4113, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 9.443336831059812, | |
| "grad_norm": 0.34144988656044006, | |
| "learning_rate": 0.0004869676290463692, | |
| "loss": 3.3857, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 9.457910691383933, | |
| "grad_norm": 0.33660706877708435, | |
| "learning_rate": 0.0004867926509186351, | |
| "loss": 3.3921, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 9.472484551708057, | |
| "grad_norm": 0.3639011085033417, | |
| "learning_rate": 0.00048661767279090107, | |
| "loss": 3.399, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 9.487058412032178, | |
| "grad_norm": 0.35932692885398865, | |
| "learning_rate": 0.00048644269466316707, | |
| "loss": 3.3896, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 9.501632272356302, | |
| "grad_norm": 0.34504228830337524, | |
| "learning_rate": 0.00048626771653543306, | |
| "loss": 3.3977, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 9.516206132680423, | |
| "grad_norm": 0.357723593711853, | |
| "learning_rate": 0.00048609273840769895, | |
| "loss": 3.4023, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 9.530779993004547, | |
| "grad_norm": 0.34201788902282715, | |
| "learning_rate": 0.00048591776027996495, | |
| "loss": 3.4017, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 9.54535385332867, | |
| "grad_norm": 0.32081034779548645, | |
| "learning_rate": 0.00048574278215223095, | |
| "loss": 3.3882, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 9.559927713652792, | |
| "grad_norm": 0.350654661655426, | |
| "learning_rate": 0.0004855678040244969, | |
| "loss": 3.3947, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 9.574501573976915, | |
| "grad_norm": 0.3591080904006958, | |
| "learning_rate": 0.00048539282589676283, | |
| "loss": 3.3897, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 9.589075434301037, | |
| "grad_norm": 0.314141184091568, | |
| "learning_rate": 0.00048521784776902883, | |
| "loss": 3.4047, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 9.60364929462516, | |
| "grad_norm": 0.36194196343421936, | |
| "learning_rate": 0.00048504286964129483, | |
| "loss": 3.4103, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 9.618223154949282, | |
| "grad_norm": 0.33191657066345215, | |
| "learning_rate": 0.0004848678915135607, | |
| "loss": 3.4027, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.618223154949282, | |
| "eval_accuracy": 0.36700933348571163, | |
| "eval_loss": 3.573310375213623, | |
| "eval_runtime": 53.1509, | |
| "eval_samples_per_second": 312.826, | |
| "eval_steps_per_second": 19.567, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.632797015273406, | |
| "grad_norm": 0.32011061906814575, | |
| "learning_rate": 0.0004846929133858267, | |
| "loss": 3.3887, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 9.647370875597527, | |
| "grad_norm": 0.3385699391365051, | |
| "learning_rate": 0.0004845179352580927, | |
| "loss": 3.4071, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 9.66194473592165, | |
| "grad_norm": 0.3338579535484314, | |
| "learning_rate": 0.0004843429571303587, | |
| "loss": 3.4152, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 9.676518596245774, | |
| "grad_norm": 0.3355548679828644, | |
| "learning_rate": 0.0004841679790026246, | |
| "loss": 3.4062, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 9.691092456569896, | |
| "grad_norm": 0.33254146575927734, | |
| "learning_rate": 0.0004839930008748906, | |
| "loss": 3.4033, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 9.70566631689402, | |
| "grad_norm": 0.3543528914451599, | |
| "learning_rate": 0.0004838180227471566, | |
| "loss": 3.4011, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 9.720240177218141, | |
| "grad_norm": 0.31879448890686035, | |
| "learning_rate": 0.00048364304461942254, | |
| "loss": 3.4044, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 9.734814037542264, | |
| "grad_norm": 0.3408379852771759, | |
| "learning_rate": 0.0004834680664916885, | |
| "loss": 3.4031, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 9.749387897866386, | |
| "grad_norm": 0.3313541114330292, | |
| "learning_rate": 0.0004832930883639545, | |
| "loss": 3.4065, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 9.76396175819051, | |
| "grad_norm": 0.3503141701221466, | |
| "learning_rate": 0.0004831181102362204, | |
| "loss": 3.4135, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 9.778535618514631, | |
| "grad_norm": 0.34659284353256226, | |
| "learning_rate": 0.00048294313210848637, | |
| "loss": 3.4085, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 9.793109478838755, | |
| "grad_norm": 0.35435524582862854, | |
| "learning_rate": 0.00048276815398075237, | |
| "loss": 3.403, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 9.807683339162878, | |
| "grad_norm": 0.35643336176872253, | |
| "learning_rate": 0.0004825931758530183, | |
| "loss": 3.3999, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 9.822257199487, | |
| "grad_norm": 0.33353114128112793, | |
| "learning_rate": 0.0004824181977252843, | |
| "loss": 3.4155, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 9.836831059811123, | |
| "grad_norm": 0.30645281076431274, | |
| "learning_rate": 0.00048224321959755025, | |
| "loss": 3.4046, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 9.851404920135245, | |
| "grad_norm": 0.35525768995285034, | |
| "learning_rate": 0.0004820682414698162, | |
| "loss": 3.403, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 9.865978780459368, | |
| "grad_norm": 0.34258151054382324, | |
| "learning_rate": 0.0004818932633420822, | |
| "loss": 3.4106, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 9.88055264078349, | |
| "grad_norm": 0.34737566113471985, | |
| "learning_rate": 0.0004817182852143482, | |
| "loss": 3.4212, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 9.895126501107613, | |
| "grad_norm": 0.3499617874622345, | |
| "learning_rate": 0.0004815433070866141, | |
| "loss": 3.4168, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 9.909700361431735, | |
| "grad_norm": 0.3705946207046509, | |
| "learning_rate": 0.0004813683289588801, | |
| "loss": 3.4163, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.909700361431735, | |
| "eval_accuracy": 0.3675073092747432, | |
| "eval_loss": 3.5654964447021484, | |
| "eval_runtime": 53.6957, | |
| "eval_samples_per_second": 309.652, | |
| "eval_steps_per_second": 19.368, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.924274221755859, | |
| "grad_norm": 0.3173365294933319, | |
| "learning_rate": 0.0004811933508311461, | |
| "loss": 3.4174, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 9.938848082079982, | |
| "grad_norm": 0.3493649661540985, | |
| "learning_rate": 0.00048101837270341207, | |
| "loss": 3.4197, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 9.953421942404104, | |
| "grad_norm": 0.32634416222572327, | |
| "learning_rate": 0.00048084339457567796, | |
| "loss": 3.4278, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 9.967995802728227, | |
| "grad_norm": 0.36151570081710815, | |
| "learning_rate": 0.00048066841644794396, | |
| "loss": 3.4142, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 9.982569663052349, | |
| "grad_norm": 0.3287253677845001, | |
| "learning_rate": 0.00048049343832020996, | |
| "loss": 3.4161, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 9.997143523376472, | |
| "grad_norm": 0.33584117889404297, | |
| "learning_rate": 0.00048031846019247595, | |
| "loss": 3.419, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 10.011659088259298, | |
| "grad_norm": 0.32926878333091736, | |
| "learning_rate": 0.00048014348206474184, | |
| "loss": 3.3171, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 10.02623294858342, | |
| "grad_norm": 0.3427960276603699, | |
| "learning_rate": 0.00047996850393700784, | |
| "loss": 3.3109, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 10.040806808907544, | |
| "grad_norm": 0.3409363031387329, | |
| "learning_rate": 0.00047979352580927384, | |
| "loss": 3.2946, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 10.055380669231667, | |
| "grad_norm": 0.3273547887802124, | |
| "learning_rate": 0.00047961854768153973, | |
| "loss": 3.3174, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 10.069954529555789, | |
| "grad_norm": 0.35705727338790894, | |
| "learning_rate": 0.0004794435695538057, | |
| "loss": 3.3145, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 10.084528389879912, | |
| "grad_norm": 0.35857057571411133, | |
| "learning_rate": 0.0004792685914260717, | |
| "loss": 3.3145, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 10.099102250204034, | |
| "grad_norm": 0.3458826541900635, | |
| "learning_rate": 0.00047909361329833767, | |
| "loss": 3.3264, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 10.113676110528157, | |
| "grad_norm": 0.33831337094306946, | |
| "learning_rate": 0.0004789186351706036, | |
| "loss": 3.3239, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 10.128249970852279, | |
| "grad_norm": 0.3277437388896942, | |
| "learning_rate": 0.0004787436570428696, | |
| "loss": 3.3232, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 10.142823831176402, | |
| "grad_norm": 0.3412516415119171, | |
| "learning_rate": 0.00047856867891513555, | |
| "loss": 3.323, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 10.157397691500524, | |
| "grad_norm": 0.3519359827041626, | |
| "learning_rate": 0.00047839370078740155, | |
| "loss": 3.3357, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 10.171971551824647, | |
| "grad_norm": 0.3262736201286316, | |
| "learning_rate": 0.0004782187226596675, | |
| "loss": 3.3331, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 10.18654541214877, | |
| "grad_norm": 0.32696694135665894, | |
| "learning_rate": 0.00047804374453193344, | |
| "loss": 3.3521, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 10.201119272472893, | |
| "grad_norm": 0.34408679604530334, | |
| "learning_rate": 0.00047786876640419943, | |
| "loss": 3.339, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 10.201119272472893, | |
| "eval_accuracy": 0.3673739586321119, | |
| "eval_loss": 3.5782310962677, | |
| "eval_runtime": 53.0964, | |
| "eval_samples_per_second": 313.147, | |
| "eval_steps_per_second": 19.587, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 10.215693132797016, | |
| "grad_norm": 0.3292979598045349, | |
| "learning_rate": 0.00047769378827646543, | |
| "loss": 3.3516, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 10.230266993121138, | |
| "grad_norm": 0.3297373950481415, | |
| "learning_rate": 0.0004775188101487313, | |
| "loss": 3.3392, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 10.244840853445261, | |
| "grad_norm": 0.3371959924697876, | |
| "learning_rate": 0.0004773438320209973, | |
| "loss": 3.3412, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 10.259414713769383, | |
| "grad_norm": 0.32268255949020386, | |
| "learning_rate": 0.0004771688538932633, | |
| "loss": 3.3511, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 10.273988574093506, | |
| "grad_norm": 0.3661893606185913, | |
| "learning_rate": 0.0004769938757655293, | |
| "loss": 3.3514, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 10.288562434417628, | |
| "grad_norm": 0.33462610840797424, | |
| "learning_rate": 0.0004768188976377952, | |
| "loss": 3.3512, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 10.303136294741751, | |
| "grad_norm": 0.3414505422115326, | |
| "learning_rate": 0.0004766439195100612, | |
| "loss": 3.3589, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 10.317710155065875, | |
| "grad_norm": 0.35679128766059875, | |
| "learning_rate": 0.0004764689413823272, | |
| "loss": 3.3633, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 10.332284015389996, | |
| "grad_norm": 0.3407164216041565, | |
| "learning_rate": 0.0004762939632545931, | |
| "loss": 3.3417, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 10.34685787571412, | |
| "grad_norm": 0.32294556498527527, | |
| "learning_rate": 0.0004761189851268591, | |
| "loss": 3.3474, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 10.361431736038242, | |
| "grad_norm": 0.33205491304397583, | |
| "learning_rate": 0.0004759440069991251, | |
| "loss": 3.366, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 10.376005596362365, | |
| "grad_norm": 0.3272695541381836, | |
| "learning_rate": 0.0004757690288713911, | |
| "loss": 3.3662, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 10.390579456686487, | |
| "grad_norm": 0.364637166261673, | |
| "learning_rate": 0.00047559405074365697, | |
| "loss": 3.3581, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 10.40515331701061, | |
| "grad_norm": 0.331093430519104, | |
| "learning_rate": 0.00047541907261592297, | |
| "loss": 3.3676, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 10.419727177334732, | |
| "grad_norm": 0.3462320864200592, | |
| "learning_rate": 0.00047524409448818897, | |
| "loss": 3.3619, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 10.434301037658855, | |
| "grad_norm": 0.35206523537635803, | |
| "learning_rate": 0.0004750691163604549, | |
| "loss": 3.366, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 10.448874897982979, | |
| "grad_norm": 0.3616088926792145, | |
| "learning_rate": 0.00047489413823272085, | |
| "loss": 3.3777, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 10.4634487583071, | |
| "grad_norm": 0.33343666791915894, | |
| "learning_rate": 0.00047471916010498685, | |
| "loss": 3.3674, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 10.478022618631224, | |
| "grad_norm": 0.3323806822299957, | |
| "learning_rate": 0.0004745441819772528, | |
| "loss": 3.3683, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 10.492596478955345, | |
| "grad_norm": 0.3593173325061798, | |
| "learning_rate": 0.0004743692038495188, | |
| "loss": 3.3697, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.492596478955345, | |
| "eval_accuracy": 0.3675707479299755, | |
| "eval_loss": 3.5710971355438232, | |
| "eval_runtime": 53.2847, | |
| "eval_samples_per_second": 312.041, | |
| "eval_steps_per_second": 19.518, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.507170339279469, | |
| "grad_norm": 0.33613282442092896, | |
| "learning_rate": 0.00047419422572178474, | |
| "loss": 3.3624, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 10.52174419960359, | |
| "grad_norm": 0.322963684797287, | |
| "learning_rate": 0.0004740192475940507, | |
| "loss": 3.3662, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 10.536318059927714, | |
| "grad_norm": 0.3449420630931854, | |
| "learning_rate": 0.0004738442694663167, | |
| "loss": 3.3818, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 10.550891920251836, | |
| "grad_norm": 0.33825379610061646, | |
| "learning_rate": 0.0004736692913385827, | |
| "loss": 3.3729, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 10.565465780575959, | |
| "grad_norm": 0.34755024313926697, | |
| "learning_rate": 0.00047349431321084856, | |
| "loss": 3.3839, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 10.580039640900083, | |
| "grad_norm": 0.363221138715744, | |
| "learning_rate": 0.00047331933508311456, | |
| "loss": 3.3833, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 10.594613501224204, | |
| "grad_norm": 0.38004592061042786, | |
| "learning_rate": 0.00047314435695538056, | |
| "loss": 3.3847, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 10.609187361548328, | |
| "grad_norm": 0.33482077717781067, | |
| "learning_rate": 0.00047296937882764645, | |
| "loss": 3.3693, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 10.62376122187245, | |
| "grad_norm": 0.38688504695892334, | |
| "learning_rate": 0.00047279440069991245, | |
| "loss": 3.3994, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 10.638335082196573, | |
| "grad_norm": 0.8661547303199768, | |
| "learning_rate": 0.00047261942257217844, | |
| "loss": 3.3747, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 10.652908942520694, | |
| "grad_norm": 0.3520547151565552, | |
| "learning_rate": 0.00047244444444444444, | |
| "loss": 3.3939, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 10.667482802844818, | |
| "grad_norm": 0.3169161081314087, | |
| "learning_rate": 0.00047226946631671033, | |
| "loss": 3.3853, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 10.68205666316894, | |
| "grad_norm": 0.3585412800312042, | |
| "learning_rate": 0.00047209448818897633, | |
| "loss": 3.4039, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 10.696630523493063, | |
| "grad_norm": 0.36897537112236023, | |
| "learning_rate": 0.0004719195100612423, | |
| "loss": 3.3879, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 10.711204383817186, | |
| "grad_norm": 0.32157233357429504, | |
| "learning_rate": 0.00047174453193350827, | |
| "loss": 3.3873, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 10.725778244141308, | |
| "grad_norm": 0.3530696630477905, | |
| "learning_rate": 0.0004715695538057742, | |
| "loss": 3.3923, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 10.740352104465432, | |
| "grad_norm": 0.34365373849868774, | |
| "learning_rate": 0.0004713945756780402, | |
| "loss": 3.3944, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 10.754925964789553, | |
| "grad_norm": 0.34925419092178345, | |
| "learning_rate": 0.0004712195975503062, | |
| "loss": 3.3658, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 10.769499825113677, | |
| "grad_norm": 0.31599247455596924, | |
| "learning_rate": 0.00047104461942257215, | |
| "loss": 3.3862, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 10.784073685437798, | |
| "grad_norm": 0.34453409910202026, | |
| "learning_rate": 0.0004708696412948381, | |
| "loss": 3.3836, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 10.784073685437798, | |
| "eval_accuracy": 0.36834907785029347, | |
| "eval_loss": 3.565258502960205, | |
| "eval_runtime": 53.3013, | |
| "eval_samples_per_second": 311.944, | |
| "eval_steps_per_second": 19.512, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 10.798647545761922, | |
| "grad_norm": 0.3643403947353363, | |
| "learning_rate": 0.0004706946631671041, | |
| "loss": 3.3842, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 10.813221406086043, | |
| "grad_norm": 0.3264981210231781, | |
| "learning_rate": 0.00047051968503937004, | |
| "loss": 3.3748, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 10.827795266410167, | |
| "grad_norm": 0.3335154056549072, | |
| "learning_rate": 0.000470344706911636, | |
| "loss": 3.3895, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 10.84236912673429, | |
| "grad_norm": 0.33110693097114563, | |
| "learning_rate": 0.000470169728783902, | |
| "loss": 3.3914, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 10.856942987058412, | |
| "grad_norm": 0.33247503638267517, | |
| "learning_rate": 0.0004699947506561679, | |
| "loss": 3.3862, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 10.871516847382535, | |
| "grad_norm": 0.3364832103252411, | |
| "learning_rate": 0.0004698197725284339, | |
| "loss": 3.3992, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 10.886090707706657, | |
| "grad_norm": 0.3212383985519409, | |
| "learning_rate": 0.00046964479440069986, | |
| "loss": 3.4022, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 10.90066456803078, | |
| "grad_norm": 0.33531829714775085, | |
| "learning_rate": 0.0004694698162729658, | |
| "loss": 3.3974, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 10.915238428354902, | |
| "grad_norm": 0.3326956331729889, | |
| "learning_rate": 0.0004692948381452318, | |
| "loss": 3.3913, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 10.929812288679026, | |
| "grad_norm": 0.3450769782066345, | |
| "learning_rate": 0.0004691198600174978, | |
| "loss": 3.3981, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 10.944386149003147, | |
| "grad_norm": 0.3500250279903412, | |
| "learning_rate": 0.0004689448818897637, | |
| "loss": 3.3898, | |
| "step": 37550 | |
| }, | |
| { | |
| "epoch": 10.95896000932727, | |
| "grad_norm": 0.3329240381717682, | |
| "learning_rate": 0.0004687699037620297, | |
| "loss": 3.3965, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 10.973533869651392, | |
| "grad_norm": 0.35290616750717163, | |
| "learning_rate": 0.0004685949256342957, | |
| "loss": 3.4036, | |
| "step": 37650 | |
| }, | |
| { | |
| "epoch": 10.988107729975516, | |
| "grad_norm": 0.3301716148853302, | |
| "learning_rate": 0.0004684199475065617, | |
| "loss": 3.3869, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 11.002623294858342, | |
| "grad_norm": 0.35138195753097534, | |
| "learning_rate": 0.0004682449693788276, | |
| "loss": 3.3837, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 11.017197155182465, | |
| "grad_norm": 0.325611412525177, | |
| "learning_rate": 0.00046806999125109357, | |
| "loss": 3.2862, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 11.031771015506587, | |
| "grad_norm": 0.36736229062080383, | |
| "learning_rate": 0.00046789501312335957, | |
| "loss": 3.2805, | |
| "step": 37850 | |
| }, | |
| { | |
| "epoch": 11.04634487583071, | |
| "grad_norm": 0.3821553885936737, | |
| "learning_rate": 0.0004677200349956255, | |
| "loss": 3.296, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 11.060918736154832, | |
| "grad_norm": 0.34444135427474976, | |
| "learning_rate": 0.00046754505686789146, | |
| "loss": 3.2868, | |
| "step": 37950 | |
| }, | |
| { | |
| "epoch": 11.075492596478956, | |
| "grad_norm": 0.34698426723480225, | |
| "learning_rate": 0.00046737007874015745, | |
| "loss": 3.2929, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 11.075492596478956, | |
| "eval_accuracy": 0.3682230244184682, | |
| "eval_loss": 3.5735981464385986, | |
| "eval_runtime": 53.1976, | |
| "eval_samples_per_second": 312.552, | |
| "eval_steps_per_second": 19.55, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 11.090066456803077, | |
| "grad_norm": 0.3469058573246002, | |
| "learning_rate": 0.0004671951006124234, | |
| "loss": 3.2903, | |
| "step": 38050 | |
| }, | |
| { | |
| "epoch": 11.1046403171272, | |
| "grad_norm": 0.3458442687988281, | |
| "learning_rate": 0.00046702012248468934, | |
| "loss": 3.3083, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 11.119214177451322, | |
| "grad_norm": 0.36308300495147705, | |
| "learning_rate": 0.00046684514435695534, | |
| "loss": 3.3161, | |
| "step": 38150 | |
| }, | |
| { | |
| "epoch": 11.133788037775446, | |
| "grad_norm": 0.3724916875362396, | |
| "learning_rate": 0.00046667016622922134, | |
| "loss": 3.3028, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 11.14836189809957, | |
| "grad_norm": 0.34953588247299194, | |
| "learning_rate": 0.0004664951881014873, | |
| "loss": 3.3074, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 11.162935758423691, | |
| "grad_norm": 0.3273851275444031, | |
| "learning_rate": 0.0004663202099737532, | |
| "loss": 3.3194, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 11.177509618747814, | |
| "grad_norm": 0.3394640386104584, | |
| "learning_rate": 0.0004661452318460192, | |
| "loss": 3.3129, | |
| "step": 38350 | |
| }, | |
| { | |
| "epoch": 11.192083479071936, | |
| "grad_norm": 0.3485426902770996, | |
| "learning_rate": 0.00046597025371828516, | |
| "loss": 3.3283, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 11.20665733939606, | |
| "grad_norm": 0.34133514761924744, | |
| "learning_rate": 0.00046579527559055116, | |
| "loss": 3.3129, | |
| "step": 38450 | |
| }, | |
| { | |
| "epoch": 11.221231199720181, | |
| "grad_norm": 0.33524519205093384, | |
| "learning_rate": 0.0004656202974628171, | |
| "loss": 3.3224, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 11.235805060044305, | |
| "grad_norm": 0.3436400294303894, | |
| "learning_rate": 0.00046544531933508305, | |
| "loss": 3.3293, | |
| "step": 38550 | |
| }, | |
| { | |
| "epoch": 11.250378920368426, | |
| "grad_norm": null, | |
| "learning_rate": 0.00046527034120734905, | |
| "loss": 3.3313, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 11.26495278069255, | |
| "grad_norm": 0.3648681044578552, | |
| "learning_rate": 0.00046509536307961504, | |
| "loss": 3.32, | |
| "step": 38650 | |
| }, | |
| { | |
| "epoch": 11.279526641016673, | |
| "grad_norm": 0.3386353850364685, | |
| "learning_rate": 0.00046492038495188093, | |
| "loss": 3.3257, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 11.294100501340795, | |
| "grad_norm": 0.4027436077594757, | |
| "learning_rate": 0.00046474540682414693, | |
| "loss": 3.325, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 11.308674361664918, | |
| "grad_norm": 0.36172330379486084, | |
| "learning_rate": 0.00046457042869641293, | |
| "loss": 3.331, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 11.32324822198904, | |
| "grad_norm": 0.36298421025276184, | |
| "learning_rate": 0.0004643954505686789, | |
| "loss": 3.3357, | |
| "step": 38850 | |
| }, | |
| { | |
| "epoch": 11.337822082313163, | |
| "grad_norm": 0.36087754368782043, | |
| "learning_rate": 0.0004642204724409448, | |
| "loss": 3.3367, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 11.352395942637285, | |
| "grad_norm": 0.3267645537853241, | |
| "learning_rate": 0.0004640454943132108, | |
| "loss": 3.3455, | |
| "step": 38950 | |
| }, | |
| { | |
| "epoch": 11.366969802961409, | |
| "grad_norm": 0.3678348660469055, | |
| "learning_rate": 0.0004638705161854768, | |
| "loss": 3.3442, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 11.366969802961409, | |
| "eval_accuracy": 0.36826857313753114, | |
| "eval_loss": 3.570142984390259, | |
| "eval_runtime": 53.2477, | |
| "eval_samples_per_second": 312.257, | |
| "eval_steps_per_second": 19.531, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 11.38154366328553, | |
| "grad_norm": 0.326987624168396, | |
| "learning_rate": 0.0004636955380577427, | |
| "loss": 3.3472, | |
| "step": 39050 | |
| }, | |
| { | |
| "epoch": 11.396117523609654, | |
| "grad_norm": 0.3479270339012146, | |
| "learning_rate": 0.0004635205599300087, | |
| "loss": 3.334, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 11.410691383933777, | |
| "grad_norm": 0.3902701437473297, | |
| "learning_rate": 0.0004633455818022747, | |
| "loss": 3.3365, | |
| "step": 39150 | |
| }, | |
| { | |
| "epoch": 11.425265244257899, | |
| "grad_norm": 0.33477523922920227, | |
| "learning_rate": 0.00046317060367454064, | |
| "loss": 3.3543, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 11.439839104582022, | |
| "grad_norm": 0.3593793511390686, | |
| "learning_rate": 0.0004629956255468066, | |
| "loss": 3.3543, | |
| "step": 39250 | |
| }, | |
| { | |
| "epoch": 11.454412964906144, | |
| "grad_norm": 0.31894612312316895, | |
| "learning_rate": 0.0004628206474190726, | |
| "loss": 3.3334, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 11.468986825230267, | |
| "grad_norm": 0.34372231364250183, | |
| "learning_rate": 0.0004626456692913385, | |
| "loss": 3.3548, | |
| "step": 39350 | |
| }, | |
| { | |
| "epoch": 11.483560685554389, | |
| "grad_norm": 0.35167017579078674, | |
| "learning_rate": 0.0004624706911636045, | |
| "loss": 3.3532, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 11.498134545878512, | |
| "grad_norm": 0.34937718510627747, | |
| "learning_rate": 0.00046229571303587046, | |
| "loss": 3.3543, | |
| "step": 39450 | |
| }, | |
| { | |
| "epoch": 11.512708406202634, | |
| "grad_norm": 0.342166006565094, | |
| "learning_rate": 0.00046212073490813646, | |
| "loss": 3.3563, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 11.527282266526758, | |
| "grad_norm": 0.3739171028137207, | |
| "learning_rate": 0.0004619457567804024, | |
| "loss": 3.3604, | |
| "step": 39550 | |
| }, | |
| { | |
| "epoch": 11.541856126850881, | |
| "grad_norm": 0.35929858684539795, | |
| "learning_rate": 0.0004617707786526684, | |
| "loss": 3.3605, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 11.556429987175003, | |
| "grad_norm": 0.3883790969848633, | |
| "learning_rate": 0.00046159580052493435, | |
| "loss": 3.3472, | |
| "step": 39650 | |
| }, | |
| { | |
| "epoch": 11.571003847499126, | |
| "grad_norm": 0.3464301824569702, | |
| "learning_rate": 0.0004614208223972003, | |
| "loss": 3.3535, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 11.585577707823248, | |
| "grad_norm": 0.3334825038909912, | |
| "learning_rate": 0.0004612458442694663, | |
| "loss": 3.3482, | |
| "step": 39750 | |
| }, | |
| { | |
| "epoch": 11.600151568147371, | |
| "grad_norm": 0.3654293119907379, | |
| "learning_rate": 0.0004610708661417323, | |
| "loss": 3.3473, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 11.614725428471493, | |
| "grad_norm": 0.3614482879638672, | |
| "learning_rate": 0.0004608958880139982, | |
| "loss": 3.3627, | |
| "step": 39850 | |
| }, | |
| { | |
| "epoch": 11.629299288795616, | |
| "grad_norm": 0.3462158441543579, | |
| "learning_rate": 0.00046072090988626417, | |
| "loss": 3.3557, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 11.643873149119738, | |
| "grad_norm": 0.35264861583709717, | |
| "learning_rate": 0.00046054593175853017, | |
| "loss": 3.361, | |
| "step": 39950 | |
| }, | |
| { | |
| "epoch": 11.658447009443861, | |
| "grad_norm": 0.3280007243156433, | |
| "learning_rate": 0.00046037095363079606, | |
| "loss": 3.3643, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 11.658447009443861, | |
| "eval_accuracy": 0.36889142538890307, | |
| "eval_loss": 3.5624794960021973, | |
| "eval_runtime": 53.3577, | |
| "eval_samples_per_second": 311.614, | |
| "eval_steps_per_second": 19.491, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 11.673020869767985, | |
| "grad_norm": 0.34470710158348083, | |
| "learning_rate": 0.00046019597550306206, | |
| "loss": 3.3638, | |
| "step": 40050 | |
| }, | |
| { | |
| "epoch": 11.687594730092107, | |
| "grad_norm": 0.3455049693584442, | |
| "learning_rate": 0.00046002099737532806, | |
| "loss": 3.3741, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 11.70216859041623, | |
| "grad_norm": 0.353929340839386, | |
| "learning_rate": 0.00045984601924759405, | |
| "loss": 3.3658, | |
| "step": 40150 | |
| }, | |
| { | |
| "epoch": 11.716742450740352, | |
| "grad_norm": 0.34111636877059937, | |
| "learning_rate": 0.00045967104111985994, | |
| "loss": 3.3672, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 11.731316311064475, | |
| "grad_norm": 0.3463384509086609, | |
| "learning_rate": 0.00045949606299212594, | |
| "loss": 3.3819, | |
| "step": 40250 | |
| }, | |
| { | |
| "epoch": 11.745890171388597, | |
| "grad_norm": 0.3586461842060089, | |
| "learning_rate": 0.00045932108486439194, | |
| "loss": 3.3647, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 11.76046403171272, | |
| "grad_norm": 0.3610433042049408, | |
| "learning_rate": 0.0004591461067366579, | |
| "loss": 3.3772, | |
| "step": 40350 | |
| }, | |
| { | |
| "epoch": 11.775037892036842, | |
| "grad_norm": 0.363849937915802, | |
| "learning_rate": 0.0004589711286089238, | |
| "loss": 3.3714, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 11.789611752360965, | |
| "grad_norm": 0.3719967305660248, | |
| "learning_rate": 0.0004587961504811898, | |
| "loss": 3.3634, | |
| "step": 40450 | |
| }, | |
| { | |
| "epoch": 11.804185612685089, | |
| "grad_norm": 0.3673126995563507, | |
| "learning_rate": 0.00045862117235345577, | |
| "loss": 3.3663, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 11.81875947300921, | |
| "grad_norm": 0.3462914228439331, | |
| "learning_rate": 0.00045844619422572176, | |
| "loss": 3.3805, | |
| "step": 40550 | |
| }, | |
| { | |
| "epoch": 11.833333333333334, | |
| "grad_norm": 0.3419555425643921, | |
| "learning_rate": 0.0004582712160979877, | |
| "loss": 3.3786, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 11.847907193657456, | |
| "grad_norm": 0.35096514225006104, | |
| "learning_rate": 0.00045809623797025365, | |
| "loss": 3.3566, | |
| "step": 40650 | |
| }, | |
| { | |
| "epoch": 11.862481053981579, | |
| "grad_norm": 0.3647240698337555, | |
| "learning_rate": 0.00045792125984251965, | |
| "loss": 3.3779, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 11.8770549143057, | |
| "grad_norm": 0.35765257477760315, | |
| "learning_rate": 0.0004577462817147856, | |
| "loss": 3.3644, | |
| "step": 40750 | |
| }, | |
| { | |
| "epoch": 11.891628774629824, | |
| "grad_norm": 0.3552255630493164, | |
| "learning_rate": 0.0004575713035870516, | |
| "loss": 3.3735, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 11.906202634953946, | |
| "grad_norm": 0.3263258635997772, | |
| "learning_rate": 0.00045739632545931753, | |
| "loss": 3.3768, | |
| "step": 40850 | |
| }, | |
| { | |
| "epoch": 11.92077649527807, | |
| "grad_norm": 0.3834480345249176, | |
| "learning_rate": 0.00045722134733158353, | |
| "loss": 3.3691, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 11.935350355602193, | |
| "grad_norm": 0.3409174084663391, | |
| "learning_rate": 0.0004570463692038495, | |
| "loss": 3.3798, | |
| "step": 40950 | |
| }, | |
| { | |
| "epoch": 11.949924215926314, | |
| "grad_norm": 0.338615745306015, | |
| "learning_rate": 0.0004568713910761154, | |
| "loss": 3.3835, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 11.949924215926314, | |
| "eval_accuracy": 0.3692655839881305, | |
| "eval_loss": 3.555158853530884, | |
| "eval_runtime": 53.1961, | |
| "eval_samples_per_second": 312.56, | |
| "eval_steps_per_second": 19.55, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 11.964498076250438, | |
| "grad_norm": 0.34794795513153076, | |
| "learning_rate": 0.0004566964129483814, | |
| "loss": 3.3811, | |
| "step": 41050 | |
| }, | |
| { | |
| "epoch": 11.97907193657456, | |
| "grad_norm": 0.3606904149055481, | |
| "learning_rate": 0.0004565214348206474, | |
| "loss": 3.3831, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 11.993645796898683, | |
| "grad_norm": 0.37181276082992554, | |
| "learning_rate": 0.0004563464566929133, | |
| "loss": 3.3904, | |
| "step": 41150 | |
| }, | |
| { | |
| "epoch": 12.008161361781509, | |
| "grad_norm": 0.3550557494163513, | |
| "learning_rate": 0.0004561714785651793, | |
| "loss": 3.3194, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 12.02273522210563, | |
| "grad_norm": 0.37992751598358154, | |
| "learning_rate": 0.0004559965004374453, | |
| "loss": 3.2675, | |
| "step": 41250 | |
| }, | |
| { | |
| "epoch": 12.037309082429754, | |
| "grad_norm": 0.33115464448928833, | |
| "learning_rate": 0.0004558215223097113, | |
| "loss": 3.2753, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 12.051882942753876, | |
| "grad_norm": 0.3398646116256714, | |
| "learning_rate": 0.0004556465441819772, | |
| "loss": 3.2684, | |
| "step": 41350 | |
| }, | |
| { | |
| "epoch": 12.066456803078, | |
| "grad_norm": 0.335304319858551, | |
| "learning_rate": 0.0004554715660542432, | |
| "loss": 3.2729, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 12.081030663402123, | |
| "grad_norm": 0.3528042137622833, | |
| "learning_rate": 0.0004552965879265092, | |
| "loss": 3.286, | |
| "step": 41450 | |
| }, | |
| { | |
| "epoch": 12.095604523726244, | |
| "grad_norm": 0.3518418073654175, | |
| "learning_rate": 0.0004551216097987751, | |
| "loss": 3.2895, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 12.110178384050368, | |
| "grad_norm": 0.33296331763267517, | |
| "learning_rate": 0.00045494663167104107, | |
| "loss": 3.2851, | |
| "step": 41550 | |
| }, | |
| { | |
| "epoch": 12.12475224437449, | |
| "grad_norm": 0.3649348318576813, | |
| "learning_rate": 0.00045477165354330706, | |
| "loss": 3.2921, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 12.139326104698613, | |
| "grad_norm": 0.37086284160614014, | |
| "learning_rate": 0.000454596675415573, | |
| "loss": 3.299, | |
| "step": 41650 | |
| }, | |
| { | |
| "epoch": 12.153899965022735, | |
| "grad_norm": 0.3619695007801056, | |
| "learning_rate": 0.00045442169728783895, | |
| "loss": 3.291, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 12.168473825346858, | |
| "grad_norm": 0.3611195981502533, | |
| "learning_rate": 0.00045424671916010495, | |
| "loss": 3.2959, | |
| "step": 41750 | |
| }, | |
| { | |
| "epoch": 12.18304768567098, | |
| "grad_norm": 0.33316633105278015, | |
| "learning_rate": 0.0004540717410323709, | |
| "loss": 3.3016, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 12.197621545995103, | |
| "grad_norm": 0.3570798337459564, | |
| "learning_rate": 0.0004538967629046369, | |
| "loss": 3.3126, | |
| "step": 41850 | |
| }, | |
| { | |
| "epoch": 12.212195406319227, | |
| "grad_norm": 0.37914368510246277, | |
| "learning_rate": 0.00045372178477690283, | |
| "loss": 3.3036, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 12.226769266643348, | |
| "grad_norm": 0.3518165647983551, | |
| "learning_rate": 0.0004535468066491688, | |
| "loss": 3.3045, | |
| "step": 41950 | |
| }, | |
| { | |
| "epoch": 12.241343126967472, | |
| "grad_norm": 0.35031190514564514, | |
| "learning_rate": 0.0004533718285214348, | |
| "loss": 3.3082, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 12.241343126967472, | |
| "eval_accuracy": 0.3688768309672912, | |
| "eval_loss": 3.5698370933532715, | |
| "eval_runtime": 53.2744, | |
| "eval_samples_per_second": 312.101, | |
| "eval_steps_per_second": 19.522, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 12.255916987291593, | |
| "grad_norm": 0.35693952441215515, | |
| "learning_rate": 0.00045319685039370077, | |
| "loss": 3.3034, | |
| "step": 42050 | |
| }, | |
| { | |
| "epoch": 12.270490847615717, | |
| "grad_norm": 0.34432175755500793, | |
| "learning_rate": 0.0004530218722659667, | |
| "loss": 3.3106, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 12.285064707939839, | |
| "grad_norm": 0.3476402461528778, | |
| "learning_rate": 0.00045284689413823266, | |
| "loss": 3.3177, | |
| "step": 42150 | |
| }, | |
| { | |
| "epoch": 12.299638568263962, | |
| "grad_norm": 0.33668053150177, | |
| "learning_rate": 0.00045267191601049866, | |
| "loss": 3.3185, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 12.314212428588084, | |
| "grad_norm": 0.3379969596862793, | |
| "learning_rate": 0.00045249693788276465, | |
| "loss": 3.3094, | |
| "step": 42250 | |
| }, | |
| { | |
| "epoch": 12.328786288912207, | |
| "grad_norm": 0.3845365345478058, | |
| "learning_rate": 0.00045232195975503054, | |
| "loss": 3.3214, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 12.34336014923633, | |
| "grad_norm": 0.3697710335254669, | |
| "learning_rate": 0.00045214698162729654, | |
| "loss": 3.3116, | |
| "step": 42350 | |
| }, | |
| { | |
| "epoch": 12.357934009560452, | |
| "grad_norm": 0.33476316928863525, | |
| "learning_rate": 0.00045197200349956254, | |
| "loss": 3.3326, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 12.372507869884576, | |
| "grad_norm": 0.3501075208187103, | |
| "learning_rate": 0.00045179702537182854, | |
| "loss": 3.3258, | |
| "step": 42450 | |
| }, | |
| { | |
| "epoch": 12.387081730208697, | |
| "grad_norm": 0.3357802927494049, | |
| "learning_rate": 0.0004516220472440944, | |
| "loss": 3.315, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 12.40165559053282, | |
| "grad_norm": 0.3776220977306366, | |
| "learning_rate": 0.0004514470691163604, | |
| "loss": 3.335, | |
| "step": 42550 | |
| }, | |
| { | |
| "epoch": 12.416229450856942, | |
| "grad_norm": 0.3859766125679016, | |
| "learning_rate": 0.0004512720909886264, | |
| "loss": 3.3257, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 12.430803311181066, | |
| "grad_norm": 0.33255526423454285, | |
| "learning_rate": 0.0004510971128608923, | |
| "loss": 3.3273, | |
| "step": 42650 | |
| }, | |
| { | |
| "epoch": 12.445377171505188, | |
| "grad_norm": 0.36508461833000183, | |
| "learning_rate": 0.0004509221347331583, | |
| "loss": 3.3366, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 12.459951031829311, | |
| "grad_norm": 0.3575635254383087, | |
| "learning_rate": 0.0004507471566054243, | |
| "loss": 3.3386, | |
| "step": 42750 | |
| }, | |
| { | |
| "epoch": 12.474524892153434, | |
| "grad_norm": 0.35500335693359375, | |
| "learning_rate": 0.00045057217847769025, | |
| "loss": 3.3267, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 12.489098752477556, | |
| "grad_norm": 0.3265441656112671, | |
| "learning_rate": 0.0004503972003499562, | |
| "loss": 3.3363, | |
| "step": 42850 | |
| }, | |
| { | |
| "epoch": 12.50367261280168, | |
| "grad_norm": 0.3403628468513489, | |
| "learning_rate": 0.0004502222222222222, | |
| "loss": 3.3262, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 12.518246473125801, | |
| "grad_norm": 0.343070387840271, | |
| "learning_rate": 0.00045004724409448813, | |
| "loss": 3.337, | |
| "step": 42950 | |
| }, | |
| { | |
| "epoch": 12.532820333449925, | |
| "grad_norm": 0.34135547280311584, | |
| "learning_rate": 0.00044987226596675413, | |
| "loss": 3.3344, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 12.532820333449925, | |
| "eval_accuracy": 0.3695046264904994, | |
| "eval_loss": 3.561704158782959, | |
| "eval_runtime": 53.3031, | |
| "eval_samples_per_second": 311.933, | |
| "eval_steps_per_second": 19.511, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 12.547394193774046, | |
| "grad_norm": 0.34756624698638916, | |
| "learning_rate": 0.0004496972878390201, | |
| "loss": 3.3327, | |
| "step": 43050 | |
| }, | |
| { | |
| "epoch": 12.56196805409817, | |
| "grad_norm": 0.36216285824775696, | |
| "learning_rate": 0.000449522309711286, | |
| "loss": 3.3483, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 12.576541914422291, | |
| "grad_norm": 0.3356451690196991, | |
| "learning_rate": 0.000449347331583552, | |
| "loss": 3.334, | |
| "step": 43150 | |
| }, | |
| { | |
| "epoch": 12.591115774746415, | |
| "grad_norm": 0.3678639531135559, | |
| "learning_rate": 0.000449172353455818, | |
| "loss": 3.3387, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 12.605689635070538, | |
| "grad_norm": 0.3456084132194519, | |
| "learning_rate": 0.0004489973753280839, | |
| "loss": 3.3369, | |
| "step": 43250 | |
| }, | |
| { | |
| "epoch": 12.62026349539466, | |
| "grad_norm": 0.3634917736053467, | |
| "learning_rate": 0.0004488223972003499, | |
| "loss": 3.3413, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 12.634837355718783, | |
| "grad_norm": 0.35174956917762756, | |
| "learning_rate": 0.0004486474190726159, | |
| "loss": 3.3357, | |
| "step": 43350 | |
| }, | |
| { | |
| "epoch": 12.649411216042905, | |
| "grad_norm": 0.3444618284702301, | |
| "learning_rate": 0.0004484724409448819, | |
| "loss": 3.344, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 12.663985076367029, | |
| "grad_norm": 0.36315611004829407, | |
| "learning_rate": 0.0004482974628171478, | |
| "loss": 3.3515, | |
| "step": 43450 | |
| }, | |
| { | |
| "epoch": 12.67855893669115, | |
| "grad_norm": 0.3482201099395752, | |
| "learning_rate": 0.0004481224846894138, | |
| "loss": 3.3423, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 12.693132797015274, | |
| "grad_norm": 0.33746659755706787, | |
| "learning_rate": 0.0004479475065616798, | |
| "loss": 3.3552, | |
| "step": 43550 | |
| }, | |
| { | |
| "epoch": 12.707706657339395, | |
| "grad_norm": 0.32812613248825073, | |
| "learning_rate": 0.00044777252843394567, | |
| "loss": 3.3375, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 12.722280517663519, | |
| "grad_norm": 0.36320027709007263, | |
| "learning_rate": 0.00044759755030621167, | |
| "loss": 3.3518, | |
| "step": 43650 | |
| }, | |
| { | |
| "epoch": 12.736854377987642, | |
| "grad_norm": 0.3756552040576935, | |
| "learning_rate": 0.00044742257217847767, | |
| "loss": 3.3594, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 12.751428238311764, | |
| "grad_norm": 0.3401622474193573, | |
| "learning_rate": 0.00044724759405074366, | |
| "loss": 3.3632, | |
| "step": 43750 | |
| }, | |
| { | |
| "epoch": 12.766002098635887, | |
| "grad_norm": 0.34506717324256897, | |
| "learning_rate": 0.00044707261592300955, | |
| "loss": 3.3487, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 12.780575958960009, | |
| "grad_norm": 0.3645191490650177, | |
| "learning_rate": 0.00044689763779527555, | |
| "loss": 3.3469, | |
| "step": 43850 | |
| }, | |
| { | |
| "epoch": 12.795149819284132, | |
| "grad_norm": 0.35409530997276306, | |
| "learning_rate": 0.00044672265966754155, | |
| "loss": 3.3453, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 12.809723679608254, | |
| "grad_norm": 0.3320823907852173, | |
| "learning_rate": 0.0004465476815398075, | |
| "loss": 3.3636, | |
| "step": 43950 | |
| }, | |
| { | |
| "epoch": 12.824297539932378, | |
| "grad_norm": 0.34940165281295776, | |
| "learning_rate": 0.00044637270341207344, | |
| "loss": 3.3591, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 12.824297539932378, | |
| "eval_accuracy": 0.36982064279717625, | |
| "eval_loss": 3.5522236824035645, | |
| "eval_runtime": 53.3823, | |
| "eval_samples_per_second": 311.47, | |
| "eval_steps_per_second": 19.482, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 12.8388714002565, | |
| "grad_norm": 0.36628594994544983, | |
| "learning_rate": 0.00044619772528433943, | |
| "loss": 3.3656, | |
| "step": 44050 | |
| }, | |
| { | |
| "epoch": 12.853445260580623, | |
| "grad_norm": 0.3620845675468445, | |
| "learning_rate": 0.0004460227471566054, | |
| "loss": 3.3576, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 12.868019120904744, | |
| "grad_norm": 0.34851986169815063, | |
| "learning_rate": 0.0004458477690288714, | |
| "loss": 3.3474, | |
| "step": 44150 | |
| }, | |
| { | |
| "epoch": 12.882592981228868, | |
| "grad_norm": 0.37146246433258057, | |
| "learning_rate": 0.0004456727909011373, | |
| "loss": 3.3361, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 12.897166841552991, | |
| "grad_norm": 0.3710069954395294, | |
| "learning_rate": 0.00044549781277340326, | |
| "loss": 3.3563, | |
| "step": 44250 | |
| }, | |
| { | |
| "epoch": 12.911740701877113, | |
| "grad_norm": 0.3436692953109741, | |
| "learning_rate": 0.00044532283464566926, | |
| "loss": 3.3659, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 12.926314562201236, | |
| "grad_norm": 0.3440268635749817, | |
| "learning_rate": 0.0004451478565179352, | |
| "loss": 3.3578, | |
| "step": 44350 | |
| }, | |
| { | |
| "epoch": 12.940888422525358, | |
| "grad_norm": 0.3370705246925354, | |
| "learning_rate": 0.00044497287839020115, | |
| "loss": 3.3602, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 12.955462282849481, | |
| "grad_norm": 0.3349348306655884, | |
| "learning_rate": 0.00044479790026246714, | |
| "loss": 3.3533, | |
| "step": 44450 | |
| }, | |
| { | |
| "epoch": 12.970036143173603, | |
| "grad_norm": 0.3440280854701996, | |
| "learning_rate": 0.00044462292213473314, | |
| "loss": 3.3597, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 12.984610003497727, | |
| "grad_norm": 0.37189579010009766, | |
| "learning_rate": 0.00044444794400699903, | |
| "loss": 3.3446, | |
| "step": 44550 | |
| }, | |
| { | |
| "epoch": 12.999183863821848, | |
| "grad_norm": 0.3803972899913788, | |
| "learning_rate": 0.00044427296587926503, | |
| "loss": 3.3488, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 13.013699428704676, | |
| "grad_norm": 0.3448667824268341, | |
| "learning_rate": 0.000444097987751531, | |
| "loss": 3.2586, | |
| "step": 44650 | |
| }, | |
| { | |
| "epoch": 13.028273289028798, | |
| "grad_norm": 0.3495483696460724, | |
| "learning_rate": 0.000443923009623797, | |
| "loss": 3.2472, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 13.042847149352921, | |
| "grad_norm": 0.3531075119972229, | |
| "learning_rate": 0.0004437480314960629, | |
| "loss": 3.2499, | |
| "step": 44750 | |
| }, | |
| { | |
| "epoch": 13.057421009677043, | |
| "grad_norm": 0.33505311608314514, | |
| "learning_rate": 0.0004435730533683289, | |
| "loss": 3.2616, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 13.071994870001166, | |
| "grad_norm": 0.37032684683799744, | |
| "learning_rate": 0.0004433980752405949, | |
| "loss": 3.2688, | |
| "step": 44850 | |
| }, | |
| { | |
| "epoch": 13.086568730325288, | |
| "grad_norm": 0.3590488135814667, | |
| "learning_rate": 0.0004432230971128609, | |
| "loss": 3.264, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 13.101142590649411, | |
| "grad_norm": 0.3663865029811859, | |
| "learning_rate": 0.0004430481189851268, | |
| "loss": 3.2836, | |
| "step": 44950 | |
| }, | |
| { | |
| "epoch": 13.115716450973533, | |
| "grad_norm": 0.32889941334724426, | |
| "learning_rate": 0.0004428731408573928, | |
| "loss": 3.271, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 13.115716450973533, | |
| "eval_accuracy": 0.36953016672832023, | |
| "eval_loss": 3.5666751861572266, | |
| "eval_runtime": 53.3385, | |
| "eval_samples_per_second": 311.726, | |
| "eval_steps_per_second": 19.498, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 13.130290311297657, | |
| "grad_norm": 0.32199689745903015, | |
| "learning_rate": 0.0004426981627296588, | |
| "loss": 3.2712, | |
| "step": 45050 | |
| }, | |
| { | |
| "epoch": 13.14486417162178, | |
| "grad_norm": 0.3411182761192322, | |
| "learning_rate": 0.00044252318460192473, | |
| "loss": 3.274, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 13.159438031945902, | |
| "grad_norm": 0.3830653429031372, | |
| "learning_rate": 0.0004423482064741907, | |
| "loss": 3.2699, | |
| "step": 45150 | |
| }, | |
| { | |
| "epoch": 13.174011892270025, | |
| "grad_norm": 0.3356707990169525, | |
| "learning_rate": 0.0004421732283464567, | |
| "loss": 3.276, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 13.188585752594147, | |
| "grad_norm": 0.36504417657852173, | |
| "learning_rate": 0.0004419982502187226, | |
| "loss": 3.2904, | |
| "step": 45250 | |
| }, | |
| { | |
| "epoch": 13.20315961291827, | |
| "grad_norm": 0.35199934244155884, | |
| "learning_rate": 0.00044182327209098856, | |
| "loss": 3.2868, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 13.217733473242392, | |
| "grad_norm": 0.3373820185661316, | |
| "learning_rate": 0.00044164829396325456, | |
| "loss": 3.2815, | |
| "step": 45350 | |
| }, | |
| { | |
| "epoch": 13.232307333566515, | |
| "grad_norm": 0.3519679009914398, | |
| "learning_rate": 0.0004414733158355205, | |
| "loss": 3.294, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 13.246881193890637, | |
| "grad_norm": 0.3462173342704773, | |
| "learning_rate": 0.0004412983377077865, | |
| "loss": 3.2805, | |
| "step": 45450 | |
| }, | |
| { | |
| "epoch": 13.26145505421476, | |
| "grad_norm": 0.3485550880432129, | |
| "learning_rate": 0.00044112335958005244, | |
| "loss": 3.2979, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 13.276028914538884, | |
| "grad_norm": 0.3813554346561432, | |
| "learning_rate": 0.0004409483814523184, | |
| "loss": 3.2948, | |
| "step": 45550 | |
| }, | |
| { | |
| "epoch": 13.290602774863006, | |
| "grad_norm": 0.36145317554473877, | |
| "learning_rate": 0.0004407734033245844, | |
| "loss": 3.3003, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 13.305176635187129, | |
| "grad_norm": 0.4004374146461487, | |
| "learning_rate": 0.0004405984251968504, | |
| "loss": 3.2865, | |
| "step": 45650 | |
| }, | |
| { | |
| "epoch": 13.31975049551125, | |
| "grad_norm": 0.3874744176864624, | |
| "learning_rate": 0.0004404234470691163, | |
| "loss": 3.2975, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 13.334324355835374, | |
| "grad_norm": 0.3737644553184509, | |
| "learning_rate": 0.00044024846894138227, | |
| "loss": 3.312, | |
| "step": 45750 | |
| }, | |
| { | |
| "epoch": 13.348898216159496, | |
| "grad_norm": 0.3732014298439026, | |
| "learning_rate": 0.00044007349081364827, | |
| "loss": 3.3073, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 13.36347207648362, | |
| "grad_norm": 0.33516380190849304, | |
| "learning_rate": 0.00043989851268591427, | |
| "loss": 3.2945, | |
| "step": 45850 | |
| }, | |
| { | |
| "epoch": 13.378045936807741, | |
| "grad_norm": 0.3793085217475891, | |
| "learning_rate": 0.00043972353455818016, | |
| "loss": 3.3081, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 13.392619797131864, | |
| "grad_norm": 0.3479321599006653, | |
| "learning_rate": 0.00043954855643044615, | |
| "loss": 3.3064, | |
| "step": 45950 | |
| }, | |
| { | |
| "epoch": 13.407193657455988, | |
| "grad_norm": 0.3450932204723358, | |
| "learning_rate": 0.00043937357830271215, | |
| "loss": 3.3146, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 13.407193657455988, | |
| "eval_accuracy": 0.36981722958567026, | |
| "eval_loss": 3.5598223209381104, | |
| "eval_runtime": 53.3903, | |
| "eval_samples_per_second": 311.424, | |
| "eval_steps_per_second": 19.479, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 13.42176751778011, | |
| "grad_norm": 0.35616064071655273, | |
| "learning_rate": 0.0004391986001749781, | |
| "loss": 3.329, | |
| "step": 46050 | |
| }, | |
| { | |
| "epoch": 13.436341378104233, | |
| "grad_norm": 0.3846951127052307, | |
| "learning_rate": 0.00043902362204724404, | |
| "loss": 3.3084, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 13.450915238428355, | |
| "grad_norm": 0.335750550031662, | |
| "learning_rate": 0.00043884864391951004, | |
| "loss": 3.3181, | |
| "step": 46150 | |
| }, | |
| { | |
| "epoch": 13.465489098752478, | |
| "grad_norm": 0.3532959818840027, | |
| "learning_rate": 0.00043867366579177603, | |
| "loss": 3.3181, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 13.4800629590766, | |
| "grad_norm": 0.35488593578338623, | |
| "learning_rate": 0.0004384986876640419, | |
| "loss": 3.3174, | |
| "step": 46250 | |
| }, | |
| { | |
| "epoch": 13.494636819400723, | |
| "grad_norm": 0.3597154915332794, | |
| "learning_rate": 0.0004383237095363079, | |
| "loss": 3.3166, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 13.509210679724845, | |
| "grad_norm": 0.37795230746269226, | |
| "learning_rate": 0.0004381487314085739, | |
| "loss": 3.3249, | |
| "step": 46350 | |
| }, | |
| { | |
| "epoch": 13.523784540048968, | |
| "grad_norm": 0.3656541407108307, | |
| "learning_rate": 0.00043797375328083986, | |
| "loss": 3.3218, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 13.538358400373092, | |
| "grad_norm": 0.36606359481811523, | |
| "learning_rate": 0.0004377987751531058, | |
| "loss": 3.3175, | |
| "step": 46450 | |
| }, | |
| { | |
| "epoch": 13.552932260697213, | |
| "grad_norm": 0.352518230676651, | |
| "learning_rate": 0.0004376237970253718, | |
| "loss": 3.3238, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 13.567506121021337, | |
| "grad_norm": 0.32122334837913513, | |
| "learning_rate": 0.00043744881889763775, | |
| "loss": 3.3217, | |
| "step": 46550 | |
| }, | |
| { | |
| "epoch": 13.582079981345458, | |
| "grad_norm": 0.3561324179172516, | |
| "learning_rate": 0.00043727384076990374, | |
| "loss": 3.3216, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 13.596653841669582, | |
| "grad_norm": 0.389505535364151, | |
| "learning_rate": 0.0004370988626421697, | |
| "loss": 3.3181, | |
| "step": 46650 | |
| }, | |
| { | |
| "epoch": 13.611227701993704, | |
| "grad_norm": 0.34612900018692017, | |
| "learning_rate": 0.00043692388451443563, | |
| "loss": 3.3226, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 13.625801562317827, | |
| "grad_norm": 0.350142240524292, | |
| "learning_rate": 0.00043674890638670163, | |
| "loss": 3.3378, | |
| "step": 46750 | |
| }, | |
| { | |
| "epoch": 13.640375422641949, | |
| "grad_norm": 0.3523232638835907, | |
| "learning_rate": 0.0004365739282589676, | |
| "loss": 3.3313, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 13.654949282966072, | |
| "grad_norm": 0.36117634177207947, | |
| "learning_rate": 0.0004363989501312335, | |
| "loss": 3.3417, | |
| "step": 46850 | |
| }, | |
| { | |
| "epoch": 13.669523143290196, | |
| "grad_norm": 0.34388113021850586, | |
| "learning_rate": 0.0004362239720034995, | |
| "loss": 3.3294, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 13.684097003614317, | |
| "grad_norm": 0.3423570990562439, | |
| "learning_rate": 0.0004360489938757655, | |
| "loss": 3.3376, | |
| "step": 46950 | |
| }, | |
| { | |
| "epoch": 13.69867086393844, | |
| "grad_norm": 0.34624671936035156, | |
| "learning_rate": 0.0004358740157480315, | |
| "loss": 3.3277, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 13.69867086393844, | |
| "eval_accuracy": 0.37011971074327155, | |
| "eval_loss": 3.553891658782959, | |
| "eval_runtime": 53.1879, | |
| "eval_samples_per_second": 312.609, | |
| "eval_steps_per_second": 19.553, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 13.713244724262562, | |
| "grad_norm": 0.35453182458877563, | |
| "learning_rate": 0.0004356990376202974, | |
| "loss": 3.3401, | |
| "step": 47050 | |
| }, | |
| { | |
| "epoch": 13.727818584586686, | |
| "grad_norm": 0.3806355893611908, | |
| "learning_rate": 0.0004355240594925634, | |
| "loss": 3.3272, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 13.742392444910807, | |
| "grad_norm": 0.3244573473930359, | |
| "learning_rate": 0.0004353490813648294, | |
| "loss": 3.332, | |
| "step": 47150 | |
| }, | |
| { | |
| "epoch": 13.756966305234931, | |
| "grad_norm": 0.35886478424072266, | |
| "learning_rate": 0.0004351741032370953, | |
| "loss": 3.3388, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 13.771540165559053, | |
| "grad_norm": 0.3819633722305298, | |
| "learning_rate": 0.0004349991251093613, | |
| "loss": 3.3411, | |
| "step": 47250 | |
| }, | |
| { | |
| "epoch": 13.786114025883176, | |
| "grad_norm": 0.36600205302238464, | |
| "learning_rate": 0.0004348241469816273, | |
| "loss": 3.334, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 13.8006878862073, | |
| "grad_norm": 0.36532509326934814, | |
| "learning_rate": 0.0004346491688538932, | |
| "loss": 3.3333, | |
| "step": 47350 | |
| }, | |
| { | |
| "epoch": 13.815261746531421, | |
| "grad_norm": 0.3700556457042694, | |
| "learning_rate": 0.00043447419072615916, | |
| "loss": 3.3347, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 13.829835606855545, | |
| "grad_norm": 0.32723188400268555, | |
| "learning_rate": 0.00043429921259842516, | |
| "loss": 3.3258, | |
| "step": 47450 | |
| }, | |
| { | |
| "epoch": 13.844409467179666, | |
| "grad_norm": 0.36504605412483215, | |
| "learning_rate": 0.00043412423447069116, | |
| "loss": 3.3328, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 13.85898332750379, | |
| "grad_norm": 0.34605199098587036, | |
| "learning_rate": 0.0004339492563429571, | |
| "loss": 3.3427, | |
| "step": 47550 | |
| }, | |
| { | |
| "epoch": 13.873557187827911, | |
| "grad_norm": 0.40003854036331177, | |
| "learning_rate": 0.00043377427821522305, | |
| "loss": 3.336, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 13.888131048152035, | |
| "grad_norm": 0.33703961968421936, | |
| "learning_rate": 0.00043359930008748904, | |
| "loss": 3.3425, | |
| "step": 47650 | |
| }, | |
| { | |
| "epoch": 13.902704908476156, | |
| "grad_norm": 0.3614450693130493, | |
| "learning_rate": 0.000433424321959755, | |
| "loss": 3.3337, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 13.91727876880028, | |
| "grad_norm": 0.3742247223854065, | |
| "learning_rate": 0.000433249343832021, | |
| "loss": 3.3439, | |
| "step": 47750 | |
| }, | |
| { | |
| "epoch": 13.931852629124403, | |
| "grad_norm": 0.3677355945110321, | |
| "learning_rate": 0.00043307436570428693, | |
| "loss": 3.3504, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 13.946426489448525, | |
| "grad_norm": 0.34420591592788696, | |
| "learning_rate": 0.00043289938757655287, | |
| "loss": 3.3484, | |
| "step": 47850 | |
| }, | |
| { | |
| "epoch": 13.961000349772648, | |
| "grad_norm": 0.36050641536712646, | |
| "learning_rate": 0.00043272440944881887, | |
| "loss": 3.3386, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 13.97557421009677, | |
| "grad_norm": 0.3552097678184509, | |
| "learning_rate": 0.0004325494313210848, | |
| "loss": 3.3387, | |
| "step": 47950 | |
| }, | |
| { | |
| "epoch": 13.990148070420894, | |
| "grad_norm": 0.3638666570186615, | |
| "learning_rate": 0.00043237445319335076, | |
| "loss": 3.3397, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 13.990148070420894, | |
| "eval_accuracy": 0.3705688422986826, | |
| "eval_loss": 3.5461502075195312, | |
| "eval_runtime": 53.3477, | |
| "eval_samples_per_second": 311.672, | |
| "eval_steps_per_second": 19.495, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 14.00466363530372, | |
| "grad_norm": 0.37190598249435425, | |
| "learning_rate": 0.00043219947506561676, | |
| "loss": 3.3138, | |
| "step": 48050 | |
| }, | |
| { | |
| "epoch": 14.019237495627841, | |
| "grad_norm": 0.3626306653022766, | |
| "learning_rate": 0.00043202449693788275, | |
| "loss": 3.2264, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 14.033811355951965, | |
| "grad_norm": 0.36090075969696045, | |
| "learning_rate": 0.00043184951881014864, | |
| "loss": 3.2424, | |
| "step": 48150 | |
| }, | |
| { | |
| "epoch": 14.048385216276086, | |
| "grad_norm": 0.35671380162239075, | |
| "learning_rate": 0.00043167454068241464, | |
| "loss": 3.2551, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 14.06295907660021, | |
| "grad_norm": 0.3807269334793091, | |
| "learning_rate": 0.00043149956255468064, | |
| "loss": 3.2405, | |
| "step": 48250 | |
| }, | |
| { | |
| "epoch": 14.077532936924333, | |
| "grad_norm": 0.35869401693344116, | |
| "learning_rate": 0.00043132458442694664, | |
| "loss": 3.2448, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 14.092106797248455, | |
| "grad_norm": 0.3535849452018738, | |
| "learning_rate": 0.0004311496062992125, | |
| "loss": 3.2553, | |
| "step": 48350 | |
| }, | |
| { | |
| "epoch": 14.106680657572578, | |
| "grad_norm": 0.37184974551200867, | |
| "learning_rate": 0.0004309746281714785, | |
| "loss": 3.2573, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 14.1212545178967, | |
| "grad_norm": 0.36269503831863403, | |
| "learning_rate": 0.0004307996500437445, | |
| "loss": 3.2596, | |
| "step": 48450 | |
| }, | |
| { | |
| "epoch": 14.135828378220824, | |
| "grad_norm": 0.3423750102519989, | |
| "learning_rate": 0.00043062467191601046, | |
| "loss": 3.2674, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 14.150402238544945, | |
| "grad_norm": 0.34411948919296265, | |
| "learning_rate": 0.0004304496937882764, | |
| "loss": 3.2583, | |
| "step": 48550 | |
| }, | |
| { | |
| "epoch": 14.164976098869069, | |
| "grad_norm": 0.37814998626708984, | |
| "learning_rate": 0.0004302747156605424, | |
| "loss": 3.2656, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 14.17954995919319, | |
| "grad_norm": 0.34621769189834595, | |
| "learning_rate": 0.00043009973753280835, | |
| "loss": 3.2716, | |
| "step": 48650 | |
| }, | |
| { | |
| "epoch": 14.194123819517314, | |
| "grad_norm": 0.3595483899116516, | |
| "learning_rate": 0.00042992475940507435, | |
| "loss": 3.2708, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 14.208697679841436, | |
| "grad_norm": 0.3458899259567261, | |
| "learning_rate": 0.0004297497812773403, | |
| "loss": 3.2623, | |
| "step": 48750 | |
| }, | |
| { | |
| "epoch": 14.223271540165559, | |
| "grad_norm": 0.3649323880672455, | |
| "learning_rate": 0.0004295748031496063, | |
| "loss": 3.2657, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 14.237845400489682, | |
| "grad_norm": 0.3653738796710968, | |
| "learning_rate": 0.00042939982502187223, | |
| "loss": 3.2688, | |
| "step": 48850 | |
| }, | |
| { | |
| "epoch": 14.252419260813804, | |
| "grad_norm": 0.3623703420162201, | |
| "learning_rate": 0.0004292248468941382, | |
| "loss": 3.2699, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 14.266993121137928, | |
| "grad_norm": 0.3507397770881653, | |
| "learning_rate": 0.00042904986876640417, | |
| "loss": 3.2814, | |
| "step": 48950 | |
| }, | |
| { | |
| "epoch": 14.28156698146205, | |
| "grad_norm": 0.3713599741458893, | |
| "learning_rate": 0.0004288748906386701, | |
| "loss": 3.2894, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 14.28156698146205, | |
| "eval_accuracy": 0.3698350018248912, | |
| "eval_loss": 3.562056541442871, | |
| "eval_runtime": 53.1481, | |
| "eval_samples_per_second": 312.843, | |
| "eval_steps_per_second": 19.568, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 14.296140841786173, | |
| "grad_norm": 0.3673607110977173, | |
| "learning_rate": 0.0004286999125109361, | |
| "loss": 3.2879, | |
| "step": 49050 | |
| }, | |
| { | |
| "epoch": 14.310714702110294, | |
| "grad_norm": 0.3480517268180847, | |
| "learning_rate": 0.00042852493438320206, | |
| "loss": 3.2797, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 14.325288562434418, | |
| "grad_norm": 0.3668559491634369, | |
| "learning_rate": 0.000428349956255468, | |
| "loss": 3.288, | |
| "step": 49150 | |
| }, | |
| { | |
| "epoch": 14.33986242275854, | |
| "grad_norm": 0.35511183738708496, | |
| "learning_rate": 0.000428174978127734, | |
| "loss": 3.2788, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 14.354436283082663, | |
| "grad_norm": 0.3590772747993469, | |
| "learning_rate": 0.000428, | |
| "loss": 3.2956, | |
| "step": 49250 | |
| }, | |
| { | |
| "epoch": 14.369010143406786, | |
| "grad_norm": 0.34500300884246826, | |
| "learning_rate": 0.0004278250218722659, | |
| "loss": 3.2888, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 14.383584003730908, | |
| "grad_norm": 0.36788177490234375, | |
| "learning_rate": 0.0004276500437445319, | |
| "loss": 3.2891, | |
| "step": 49350 | |
| }, | |
| { | |
| "epoch": 14.398157864055031, | |
| "grad_norm": 0.363623708486557, | |
| "learning_rate": 0.0004274750656167979, | |
| "loss": 3.2917, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 14.412731724379153, | |
| "grad_norm": 0.35550540685653687, | |
| "learning_rate": 0.0004273000874890639, | |
| "loss": 3.2989, | |
| "step": 49450 | |
| }, | |
| { | |
| "epoch": 14.427305584703277, | |
| "grad_norm": 0.39245107769966125, | |
| "learning_rate": 0.00042712510936132977, | |
| "loss": 3.2925, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 14.441879445027398, | |
| "grad_norm": 0.38620251417160034, | |
| "learning_rate": 0.00042695013123359576, | |
| "loss": 3.2867, | |
| "step": 49550 | |
| }, | |
| { | |
| "epoch": 14.456453305351522, | |
| "grad_norm": 0.35340312123298645, | |
| "learning_rate": 0.00042677515310586176, | |
| "loss": 3.3107, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 14.471027165675643, | |
| "grad_norm": 0.35049957036972046, | |
| "learning_rate": 0.0004266001749781277, | |
| "loss": 3.3003, | |
| "step": 49650 | |
| }, | |
| { | |
| "epoch": 14.485601025999767, | |
| "grad_norm": 0.3536100387573242, | |
| "learning_rate": 0.00042642519685039365, | |
| "loss": 3.2945, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 14.50017488632389, | |
| "grad_norm": 0.3375909626483917, | |
| "learning_rate": 0.00042625021872265965, | |
| "loss": 3.2997, | |
| "step": 49750 | |
| }, | |
| { | |
| "epoch": 14.514748746648012, | |
| "grad_norm": 0.34976649284362793, | |
| "learning_rate": 0.0004260752405949256, | |
| "loss": 3.3008, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 14.529322606972135, | |
| "grad_norm": 0.3607953190803528, | |
| "learning_rate": 0.00042590026246719153, | |
| "loss": 3.3118, | |
| "step": 49850 | |
| }, | |
| { | |
| "epoch": 14.543896467296257, | |
| "grad_norm": 0.3413010239601135, | |
| "learning_rate": 0.00042572528433945753, | |
| "loss": 3.3113, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 14.55847032762038, | |
| "grad_norm": 0.34926819801330566, | |
| "learning_rate": 0.0004255503062117235, | |
| "loss": 3.3018, | |
| "step": 49950 | |
| }, | |
| { | |
| "epoch": 14.573044187944502, | |
| "grad_norm": 0.37419041991233826, | |
| "learning_rate": 0.00042537532808398947, | |
| "loss": 3.3027, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 14.573044187944502, | |
| "eval_accuracy": 0.3705401242432528, | |
| "eval_loss": 3.5543155670166016, | |
| "eval_runtime": 53.2259, | |
| "eval_samples_per_second": 312.386, | |
| "eval_steps_per_second": 19.539, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 14.587618048268626, | |
| "grad_norm": 0.3565591275691986, | |
| "learning_rate": 0.0004252003499562554, | |
| "loss": 3.3013, | |
| "step": 50050 | |
| }, | |
| { | |
| "epoch": 14.602191908592747, | |
| "grad_norm": 0.36905860900878906, | |
| "learning_rate": 0.0004250253718285214, | |
| "loss": 3.2943, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 14.61676576891687, | |
| "grad_norm": 0.3611801564693451, | |
| "learning_rate": 0.00042485039370078736, | |
| "loss": 3.3154, | |
| "step": 50150 | |
| }, | |
| { | |
| "epoch": 14.631339629240994, | |
| "grad_norm": 0.36640042066574097, | |
| "learning_rate": 0.00042467541557305335, | |
| "loss": 3.3241, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 14.645913489565116, | |
| "grad_norm": 0.34255263209342957, | |
| "learning_rate": 0.0004245004374453193, | |
| "loss": 3.3169, | |
| "step": 50250 | |
| }, | |
| { | |
| "epoch": 14.66048734988924, | |
| "grad_norm": 0.3561849296092987, | |
| "learning_rate": 0.00042432545931758524, | |
| "loss": 3.3116, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 14.67506121021336, | |
| "grad_norm": 0.44158047437667847, | |
| "learning_rate": 0.00042415048118985124, | |
| "loss": 3.3122, | |
| "step": 50350 | |
| }, | |
| { | |
| "epoch": 14.689635070537484, | |
| "grad_norm": 0.37477970123291016, | |
| "learning_rate": 0.00042397550306211724, | |
| "loss": 3.325, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 14.704208930861606, | |
| "grad_norm": 0.34694111347198486, | |
| "learning_rate": 0.0004238005249343831, | |
| "loss": 3.3128, | |
| "step": 50450 | |
| }, | |
| { | |
| "epoch": 14.71878279118573, | |
| "grad_norm": 0.3817085027694702, | |
| "learning_rate": 0.0004236255468066491, | |
| "loss": 3.3179, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 14.733356651509851, | |
| "grad_norm": 0.3483661115169525, | |
| "learning_rate": 0.0004234505686789151, | |
| "loss": 3.3184, | |
| "step": 50550 | |
| }, | |
| { | |
| "epoch": 14.747930511833975, | |
| "grad_norm": 0.3755340576171875, | |
| "learning_rate": 0.0004232755905511811, | |
| "loss": 3.322, | |
| "step": 50600 | |
| }, | |
| { | |
| "epoch": 14.762504372158098, | |
| "grad_norm": 0.3704056441783905, | |
| "learning_rate": 0.000423100612423447, | |
| "loss": 3.315, | |
| "step": 50650 | |
| }, | |
| { | |
| "epoch": 14.77707823248222, | |
| "grad_norm": 0.35658955574035645, | |
| "learning_rate": 0.000422925634295713, | |
| "loss": 3.326, | |
| "step": 50700 | |
| }, | |
| { | |
| "epoch": 14.791652092806343, | |
| "grad_norm": 0.35138851404190063, | |
| "learning_rate": 0.000422750656167979, | |
| "loss": 3.3246, | |
| "step": 50750 | |
| }, | |
| { | |
| "epoch": 14.806225953130465, | |
| "grad_norm": 0.36114072799682617, | |
| "learning_rate": 0.0004225756780402449, | |
| "loss": 3.3263, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 14.820799813454588, | |
| "grad_norm": 0.3541187047958374, | |
| "learning_rate": 0.0004224006999125109, | |
| "loss": 3.3278, | |
| "step": 50850 | |
| }, | |
| { | |
| "epoch": 14.83537367377871, | |
| "grad_norm": 0.3514479696750641, | |
| "learning_rate": 0.0004222257217847769, | |
| "loss": 3.315, | |
| "step": 50900 | |
| }, | |
| { | |
| "epoch": 14.849947534102833, | |
| "grad_norm": 0.33424443006515503, | |
| "learning_rate": 0.00042205074365704283, | |
| "loss": 3.3185, | |
| "step": 50950 | |
| }, | |
| { | |
| "epoch": 14.864521394426955, | |
| "grad_norm": 0.3738822042942047, | |
| "learning_rate": 0.0004218757655293088, | |
| "loss": 3.3197, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 14.864521394426955, | |
| "eval_accuracy": 0.37113108062158584, | |
| "eval_loss": 3.5453720092773438, | |
| "eval_runtime": 53.2808, | |
| "eval_samples_per_second": 312.064, | |
| "eval_steps_per_second": 19.519, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 14.879095254751078, | |
| "grad_norm": 0.3561355471611023, | |
| "learning_rate": 0.0004217007874015748, | |
| "loss": 3.3336, | |
| "step": 51050 | |
| }, | |
| { | |
| "epoch": 14.893669115075202, | |
| "grad_norm": 0.34328049421310425, | |
| "learning_rate": 0.0004215258092738407, | |
| "loss": 3.3355, | |
| "step": 51100 | |
| }, | |
| { | |
| "epoch": 14.908242975399324, | |
| "grad_norm": 0.3354352116584778, | |
| "learning_rate": 0.0004213508311461067, | |
| "loss": 3.3321, | |
| "step": 51150 | |
| }, | |
| { | |
| "epoch": 14.922816835723447, | |
| "grad_norm": 0.35511845350265503, | |
| "learning_rate": 0.00042117585301837266, | |
| "loss": 3.3387, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 14.937390696047569, | |
| "grad_norm": 0.34938520193099976, | |
| "learning_rate": 0.0004210008748906386, | |
| "loss": 3.3265, | |
| "step": 51250 | |
| }, | |
| { | |
| "epoch": 14.951964556371692, | |
| "grad_norm": 0.3668470084667206, | |
| "learning_rate": 0.0004208258967629046, | |
| "loss": 3.3209, | |
| "step": 51300 | |
| }, | |
| { | |
| "epoch": 14.966538416695814, | |
| "grad_norm": 0.34431192278862, | |
| "learning_rate": 0.0004206509186351706, | |
| "loss": 3.3524, | |
| "step": 51350 | |
| }, | |
| { | |
| "epoch": 14.981112277019937, | |
| "grad_norm": 0.3520943820476532, | |
| "learning_rate": 0.00042047594050743654, | |
| "loss": 3.3261, | |
| "step": 51400 | |
| }, | |
| { | |
| "epoch": 14.995686137344059, | |
| "grad_norm": 0.3720589876174927, | |
| "learning_rate": 0.0004203009623797025, | |
| "loss": 3.3256, | |
| "step": 51450 | |
| }, | |
| { | |
| "epoch": 15.010201702226885, | |
| "grad_norm": 0.3676832914352417, | |
| "learning_rate": 0.0004201259842519685, | |
| "loss": 3.244, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 15.024775562551008, | |
| "grad_norm": 0.3591252267360687, | |
| "learning_rate": 0.0004199510061242344, | |
| "loss": 3.2082, | |
| "step": 51550 | |
| }, | |
| { | |
| "epoch": 15.039349422875132, | |
| "grad_norm": 0.36459583044052124, | |
| "learning_rate": 0.00041977602799650037, | |
| "loss": 3.2133, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 15.053923283199254, | |
| "grad_norm": 0.3644467890262604, | |
| "learning_rate": 0.00041960104986876637, | |
| "loss": 3.2377, | |
| "step": 51650 | |
| }, | |
| { | |
| "epoch": 15.068497143523377, | |
| "grad_norm": 0.3457610011100769, | |
| "learning_rate": 0.00041942607174103236, | |
| "loss": 3.2313, | |
| "step": 51700 | |
| }, | |
| { | |
| "epoch": 15.083071003847499, | |
| "grad_norm": 0.39632362127304077, | |
| "learning_rate": 0.00041925109361329825, | |
| "loss": 3.2374, | |
| "step": 51750 | |
| }, | |
| { | |
| "epoch": 15.097644864171622, | |
| "grad_norm": 0.36543601751327515, | |
| "learning_rate": 0.00041907611548556425, | |
| "loss": 3.239, | |
| "step": 51800 | |
| }, | |
| { | |
| "epoch": 15.112218724495744, | |
| "grad_norm": 0.3530130088329315, | |
| "learning_rate": 0.00041890113735783025, | |
| "loss": 3.2509, | |
| "step": 51850 | |
| }, | |
| { | |
| "epoch": 15.126792584819867, | |
| "grad_norm": 0.3841630816459656, | |
| "learning_rate": 0.00041872615923009625, | |
| "loss": 3.2485, | |
| "step": 51900 | |
| }, | |
| { | |
| "epoch": 15.141366445143989, | |
| "grad_norm": 0.3543168008327484, | |
| "learning_rate": 0.00041855118110236214, | |
| "loss": 3.2474, | |
| "step": 51950 | |
| }, | |
| { | |
| "epoch": 15.155940305468112, | |
| "grad_norm": 0.3882521390914917, | |
| "learning_rate": 0.00041837620297462813, | |
| "loss": 3.2525, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 15.155940305468112, | |
| "eval_accuracy": 0.37026000550586324, | |
| "eval_loss": 3.563176155090332, | |
| "eval_runtime": 53.3952, | |
| "eval_samples_per_second": 311.395, | |
| "eval_steps_per_second": 19.477, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 15.170514165792236, | |
| "grad_norm": 0.3743119239807129, | |
| "learning_rate": 0.00041820122484689413, | |
| "loss": 3.2451, | |
| "step": 52050 | |
| }, | |
| { | |
| "epoch": 15.185088026116357, | |
| "grad_norm": 0.35586288571357727, | |
| "learning_rate": 0.0004180262467191601, | |
| "loss": 3.2701, | |
| "step": 52100 | |
| }, | |
| { | |
| "epoch": 15.19966188644048, | |
| "grad_norm": 0.3539133667945862, | |
| "learning_rate": 0.000417851268591426, | |
| "loss": 3.258, | |
| "step": 52150 | |
| }, | |
| { | |
| "epoch": 15.214235746764603, | |
| "grad_norm": 0.37858128547668457, | |
| "learning_rate": 0.000417676290463692, | |
| "loss": 3.2425, | |
| "step": 52200 | |
| }, | |
| { | |
| "epoch": 15.228809607088726, | |
| "grad_norm": 0.35201990604400635, | |
| "learning_rate": 0.00041750131233595796, | |
| "loss": 3.2686, | |
| "step": 52250 | |
| }, | |
| { | |
| "epoch": 15.243383467412848, | |
| "grad_norm": 0.36704257130622864, | |
| "learning_rate": 0.00041732633420822396, | |
| "loss": 3.2593, | |
| "step": 52300 | |
| }, | |
| { | |
| "epoch": 15.257957327736971, | |
| "grad_norm": 0.33171460032463074, | |
| "learning_rate": 0.0004171513560804899, | |
| "loss": 3.2692, | |
| "step": 52350 | |
| }, | |
| { | |
| "epoch": 15.272531188061093, | |
| "grad_norm": 0.36244454979896545, | |
| "learning_rate": 0.00041697637795275584, | |
| "loss": 3.2672, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 15.287105048385216, | |
| "grad_norm": 0.34771937131881714, | |
| "learning_rate": 0.00041680139982502184, | |
| "loss": 3.2883, | |
| "step": 52450 | |
| }, | |
| { | |
| "epoch": 15.30167890870934, | |
| "grad_norm": 0.35113075375556946, | |
| "learning_rate": 0.0004166264216972878, | |
| "loss": 3.2633, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 15.316252769033461, | |
| "grad_norm": 0.3820761442184448, | |
| "learning_rate": 0.00041645144356955373, | |
| "loss": 3.289, | |
| "step": 52550 | |
| }, | |
| { | |
| "epoch": 15.330826629357585, | |
| "grad_norm": 0.35176438093185425, | |
| "learning_rate": 0.0004162764654418197, | |
| "loss": 3.2792, | |
| "step": 52600 | |
| }, | |
| { | |
| "epoch": 15.345400489681706, | |
| "grad_norm": 0.34542304277420044, | |
| "learning_rate": 0.0004161014873140857, | |
| "loss": 3.2843, | |
| "step": 52650 | |
| }, | |
| { | |
| "epoch": 15.35997435000583, | |
| "grad_norm": 0.36796221137046814, | |
| "learning_rate": 0.00041592650918635167, | |
| "loss": 3.2766, | |
| "step": 52700 | |
| }, | |
| { | |
| "epoch": 15.374548210329952, | |
| "grad_norm": 0.364033967256546, | |
| "learning_rate": 0.0004157515310586176, | |
| "loss": 3.2701, | |
| "step": 52750 | |
| }, | |
| { | |
| "epoch": 15.389122070654075, | |
| "grad_norm": 0.34179043769836426, | |
| "learning_rate": 0.0004155765529308836, | |
| "loss": 3.2855, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 15.403695930978197, | |
| "grad_norm": 0.3775258958339691, | |
| "learning_rate": 0.0004154015748031496, | |
| "loss": 3.2618, | |
| "step": 52850 | |
| }, | |
| { | |
| "epoch": 15.41826979130232, | |
| "grad_norm": 0.3358531892299652, | |
| "learning_rate": 0.0004152265966754155, | |
| "loss": 3.2783, | |
| "step": 52900 | |
| }, | |
| { | |
| "epoch": 15.432843651626444, | |
| "grad_norm": 0.40009891986846924, | |
| "learning_rate": 0.0004150516185476815, | |
| "loss": 3.2933, | |
| "step": 52950 | |
| }, | |
| { | |
| "epoch": 15.447417511950565, | |
| "grad_norm": 0.34463268518447876, | |
| "learning_rate": 0.0004148766404199475, | |
| "loss": 3.2921, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 15.447417511950565, | |
| "eval_accuracy": 0.3706980735481169, | |
| "eval_loss": 3.5539512634277344, | |
| "eval_runtime": 53.3188, | |
| "eval_samples_per_second": 311.841, | |
| "eval_steps_per_second": 19.505, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 15.461991372274689, | |
| "grad_norm": 0.37157610058784485, | |
| "learning_rate": 0.0004147016622922135, | |
| "loss": 3.2756, | |
| "step": 53050 | |
| }, | |
| { | |
| "epoch": 15.47656523259881, | |
| "grad_norm": 0.3813033699989319, | |
| "learning_rate": 0.0004145266841644794, | |
| "loss": 3.2895, | |
| "step": 53100 | |
| }, | |
| { | |
| "epoch": 15.491139092922934, | |
| "grad_norm": 0.3535122573375702, | |
| "learning_rate": 0.0004143517060367454, | |
| "loss": 3.2833, | |
| "step": 53150 | |
| }, | |
| { | |
| "epoch": 15.505712953247055, | |
| "grad_norm": 0.38189417123794556, | |
| "learning_rate": 0.0004141767279090114, | |
| "loss": 3.2982, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 15.520286813571179, | |
| "grad_norm": 0.3614920973777771, | |
| "learning_rate": 0.0004140017497812773, | |
| "loss": 3.2903, | |
| "step": 53250 | |
| }, | |
| { | |
| "epoch": 15.5348606738953, | |
| "grad_norm": 0.35769498348236084, | |
| "learning_rate": 0.00041382677165354326, | |
| "loss": 3.2955, | |
| "step": 53300 | |
| }, | |
| { | |
| "epoch": 15.549434534219424, | |
| "grad_norm": 0.3598308265209198, | |
| "learning_rate": 0.00041365179352580926, | |
| "loss": 3.3013, | |
| "step": 53350 | |
| }, | |
| { | |
| "epoch": 15.564008394543547, | |
| "grad_norm": 0.3613493740558624, | |
| "learning_rate": 0.0004134768153980752, | |
| "loss": 3.2847, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 15.57858225486767, | |
| "grad_norm": 0.3721916675567627, | |
| "learning_rate": 0.00041330183727034114, | |
| "loss": 3.2993, | |
| "step": 53450 | |
| }, | |
| { | |
| "epoch": 15.593156115191793, | |
| "grad_norm": 0.3966820538043976, | |
| "learning_rate": 0.00041312685914260714, | |
| "loss": 3.2931, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 15.607729975515914, | |
| "grad_norm": 0.35601505637168884, | |
| "learning_rate": 0.0004129518810148731, | |
| "loss": 3.2995, | |
| "step": 53550 | |
| }, | |
| { | |
| "epoch": 15.622303835840038, | |
| "grad_norm": 0.36930301785469055, | |
| "learning_rate": 0.0004127769028871391, | |
| "loss": 3.2891, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 15.63687769616416, | |
| "grad_norm": 0.3665529191493988, | |
| "learning_rate": 0.00041260192475940503, | |
| "loss": 3.2997, | |
| "step": 53650 | |
| }, | |
| { | |
| "epoch": 15.651451556488283, | |
| "grad_norm": 0.34720444679260254, | |
| "learning_rate": 0.00041242694663167097, | |
| "loss": 3.3025, | |
| "step": 53700 | |
| }, | |
| { | |
| "epoch": 15.666025416812404, | |
| "grad_norm": 0.33357468247413635, | |
| "learning_rate": 0.00041225196850393697, | |
| "loss": 3.2858, | |
| "step": 53750 | |
| }, | |
| { | |
| "epoch": 15.680599277136528, | |
| "grad_norm": 0.36720114946365356, | |
| "learning_rate": 0.00041207699037620297, | |
| "loss": 3.2931, | |
| "step": 53800 | |
| }, | |
| { | |
| "epoch": 15.695173137460651, | |
| "grad_norm": 0.3451690673828125, | |
| "learning_rate": 0.00041190201224846886, | |
| "loss": 3.3067, | |
| "step": 53850 | |
| }, | |
| { | |
| "epoch": 15.709746997784773, | |
| "grad_norm": 0.3749679625034332, | |
| "learning_rate": 0.00041172703412073485, | |
| "loss": 3.2982, | |
| "step": 53900 | |
| }, | |
| { | |
| "epoch": 15.724320858108896, | |
| "grad_norm": 0.33904799818992615, | |
| "learning_rate": 0.00041155205599300085, | |
| "loss": 3.3089, | |
| "step": 53950 | |
| }, | |
| { | |
| "epoch": 15.738894718433018, | |
| "grad_norm": 0.354440301656723, | |
| "learning_rate": 0.00041137707786526685, | |
| "loss": 3.3025, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 15.738894718433018, | |
| "eval_accuracy": 0.3713644736704276, | |
| "eval_loss": 3.5475549697875977, | |
| "eval_runtime": 53.2281, | |
| "eval_samples_per_second": 312.372, | |
| "eval_steps_per_second": 19.539, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 15.753468578757142, | |
| "grad_norm": 0.36913421750068665, | |
| "learning_rate": 0.00041120209973753274, | |
| "loss": 3.3109, | |
| "step": 54050 | |
| }, | |
| { | |
| "epoch": 15.768042439081263, | |
| "grad_norm": 0.3467569649219513, | |
| "learning_rate": 0.00041102712160979874, | |
| "loss": 3.2863, | |
| "step": 54100 | |
| }, | |
| { | |
| "epoch": 15.782616299405387, | |
| "grad_norm": 0.3800930380821228, | |
| "learning_rate": 0.00041085214348206473, | |
| "loss": 3.3074, | |
| "step": 54150 | |
| }, | |
| { | |
| "epoch": 15.797190159729508, | |
| "grad_norm": 0.3963550329208374, | |
| "learning_rate": 0.0004106771653543306, | |
| "loss": 3.317, | |
| "step": 54200 | |
| }, | |
| { | |
| "epoch": 15.811764020053632, | |
| "grad_norm": 0.4001551568508148, | |
| "learning_rate": 0.0004105021872265966, | |
| "loss": 3.302, | |
| "step": 54250 | |
| }, | |
| { | |
| "epoch": 15.826337880377755, | |
| "grad_norm": 0.39264172315597534, | |
| "learning_rate": 0.0004103272090988626, | |
| "loss": 3.3056, | |
| "step": 54300 | |
| }, | |
| { | |
| "epoch": 15.840911740701877, | |
| "grad_norm": 0.3680339455604553, | |
| "learning_rate": 0.0004101522309711286, | |
| "loss": 3.3132, | |
| "step": 54350 | |
| }, | |
| { | |
| "epoch": 15.855485601026, | |
| "grad_norm": 0.3351523280143738, | |
| "learning_rate": 0.0004099772528433945, | |
| "loss": 3.318, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 15.870059461350122, | |
| "grad_norm": 0.3352740705013275, | |
| "learning_rate": 0.0004098022747156605, | |
| "loss": 3.3053, | |
| "step": 54450 | |
| }, | |
| { | |
| "epoch": 15.884633321674245, | |
| "grad_norm": 0.3434459865093231, | |
| "learning_rate": 0.0004096272965879265, | |
| "loss": 3.3175, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 15.899207181998367, | |
| "grad_norm": 0.34822505712509155, | |
| "learning_rate": 0.00040945231846019244, | |
| "loss": 3.3014, | |
| "step": 54550 | |
| }, | |
| { | |
| "epoch": 15.91378104232249, | |
| "grad_norm": 0.37124574184417725, | |
| "learning_rate": 0.0004092773403324584, | |
| "loss": 3.3199, | |
| "step": 54600 | |
| }, | |
| { | |
| "epoch": 15.928354902646612, | |
| "grad_norm": 0.36846932768821716, | |
| "learning_rate": 0.0004091023622047244, | |
| "loss": 3.3047, | |
| "step": 54650 | |
| }, | |
| { | |
| "epoch": 15.942928762970736, | |
| "grad_norm": 0.36084410548210144, | |
| "learning_rate": 0.00040892738407699033, | |
| "loss": 3.3154, | |
| "step": 54700 | |
| }, | |
| { | |
| "epoch": 15.95750262329486, | |
| "grad_norm": 0.37809786200523376, | |
| "learning_rate": 0.0004087524059492563, | |
| "loss": 3.3178, | |
| "step": 54750 | |
| }, | |
| { | |
| "epoch": 15.97207648361898, | |
| "grad_norm": 0.3330834209918976, | |
| "learning_rate": 0.00040857742782152227, | |
| "loss": 3.3083, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 15.986650343943104, | |
| "grad_norm": 0.35704752802848816, | |
| "learning_rate": 0.0004084024496937882, | |
| "loss": 3.3223, | |
| "step": 54850 | |
| }, | |
| { | |
| "epoch": 16.00116590882593, | |
| "grad_norm": 0.3622002601623535, | |
| "learning_rate": 0.0004082274715660542, | |
| "loss": 3.3095, | |
| "step": 54900 | |
| }, | |
| { | |
| "epoch": 16.015739769150052, | |
| "grad_norm": 0.33063775300979614, | |
| "learning_rate": 0.0004080524934383202, | |
| "loss": 3.1916, | |
| "step": 54950 | |
| }, | |
| { | |
| "epoch": 16.030313629474175, | |
| "grad_norm": 0.37001755833625793, | |
| "learning_rate": 0.0004078775153105861, | |
| "loss": 3.2054, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 16.030313629474175, | |
| "eval_accuracy": 0.37094406017044634, | |
| "eval_loss": 3.557634115219116, | |
| "eval_runtime": 53.1562, | |
| "eval_samples_per_second": 312.795, | |
| "eval_steps_per_second": 19.565, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 16.0448874897983, | |
| "grad_norm": 0.33795633912086487, | |
| "learning_rate": 0.0004077025371828521, | |
| "loss": 3.2113, | |
| "step": 55050 | |
| }, | |
| { | |
| "epoch": 16.05946135012242, | |
| "grad_norm": 0.37001416087150574, | |
| "learning_rate": 0.0004075275590551181, | |
| "loss": 3.2144, | |
| "step": 55100 | |
| }, | |
| { | |
| "epoch": 16.074035210446542, | |
| "grad_norm": 0.3721146881580353, | |
| "learning_rate": 0.000407352580927384, | |
| "loss": 3.2185, | |
| "step": 55150 | |
| }, | |
| { | |
| "epoch": 16.088609070770666, | |
| "grad_norm": 0.3787801265716553, | |
| "learning_rate": 0.00040717760279965, | |
| "loss": 3.2339, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 16.10318293109479, | |
| "grad_norm": 0.38708654046058655, | |
| "learning_rate": 0.000407002624671916, | |
| "loss": 3.2246, | |
| "step": 55250 | |
| }, | |
| { | |
| "epoch": 16.117756791418913, | |
| "grad_norm": 0.37298810482025146, | |
| "learning_rate": 0.000406827646544182, | |
| "loss": 3.2327, | |
| "step": 55300 | |
| }, | |
| { | |
| "epoch": 16.132330651743032, | |
| "grad_norm": 0.3699816167354584, | |
| "learning_rate": 0.00040665266841644786, | |
| "loss": 3.2237, | |
| "step": 55350 | |
| }, | |
| { | |
| "epoch": 16.146904512067156, | |
| "grad_norm": 0.37289005517959595, | |
| "learning_rate": 0.00040647769028871386, | |
| "loss": 3.2419, | |
| "step": 55400 | |
| }, | |
| { | |
| "epoch": 16.16147837239128, | |
| "grad_norm": 0.41655048727989197, | |
| "learning_rate": 0.00040630271216097986, | |
| "loss": 3.2349, | |
| "step": 55450 | |
| }, | |
| { | |
| "epoch": 16.176052232715403, | |
| "grad_norm": 0.36438512802124023, | |
| "learning_rate": 0.00040612773403324586, | |
| "loss": 3.2401, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 16.190626093039523, | |
| "grad_norm": 0.38352882862091064, | |
| "learning_rate": 0.00040595275590551175, | |
| "loss": 3.2357, | |
| "step": 55550 | |
| }, | |
| { | |
| "epoch": 16.205199953363646, | |
| "grad_norm": 0.33786365389823914, | |
| "learning_rate": 0.00040577777777777774, | |
| "loss": 3.2352, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 16.21977381368777, | |
| "grad_norm": 0.3919704854488373, | |
| "learning_rate": 0.00040560279965004374, | |
| "loss": 3.259, | |
| "step": 55650 | |
| }, | |
| { | |
| "epoch": 16.234347674011893, | |
| "grad_norm": 0.3714604079723358, | |
| "learning_rate": 0.0004054278215223097, | |
| "loss": 3.2443, | |
| "step": 55700 | |
| }, | |
| { | |
| "epoch": 16.248921534336016, | |
| "grad_norm": 0.3812396824359894, | |
| "learning_rate": 0.00040525284339457563, | |
| "loss": 3.2373, | |
| "step": 55750 | |
| }, | |
| { | |
| "epoch": 16.263495394660136, | |
| "grad_norm": 0.37564516067504883, | |
| "learning_rate": 0.0004050778652668416, | |
| "loss": 3.2481, | |
| "step": 55800 | |
| }, | |
| { | |
| "epoch": 16.27806925498426, | |
| "grad_norm": 0.38240739703178406, | |
| "learning_rate": 0.00040490288713910757, | |
| "loss": 3.2488, | |
| "step": 55850 | |
| }, | |
| { | |
| "epoch": 16.292643115308383, | |
| "grad_norm": 0.387246698141098, | |
| "learning_rate": 0.00040472790901137357, | |
| "loss": 3.2542, | |
| "step": 55900 | |
| }, | |
| { | |
| "epoch": 16.307216975632507, | |
| "grad_norm": 0.37336498498916626, | |
| "learning_rate": 0.0004045529308836395, | |
| "loss": 3.2679, | |
| "step": 55950 | |
| }, | |
| { | |
| "epoch": 16.321790835956627, | |
| "grad_norm": 0.3574172854423523, | |
| "learning_rate": 0.00040437795275590546, | |
| "loss": 3.2648, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 16.321790835956627, | |
| "eval_accuracy": 0.37116627200918223, | |
| "eval_loss": 3.556175947189331, | |
| "eval_runtime": 53.2307, | |
| "eval_samples_per_second": 312.358, | |
| "eval_steps_per_second": 19.538, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 16.33636469628075, | |
| "grad_norm": 0.3797742426395416, | |
| "learning_rate": 0.00040420297462817145, | |
| "loss": 3.2606, | |
| "step": 56050 | |
| }, | |
| { | |
| "epoch": 16.350938556604873, | |
| "grad_norm": 0.3520547151565552, | |
| "learning_rate": 0.0004040279965004374, | |
| "loss": 3.2682, | |
| "step": 56100 | |
| }, | |
| { | |
| "epoch": 16.365512416928997, | |
| "grad_norm": 0.3968909978866577, | |
| "learning_rate": 0.00040385301837270334, | |
| "loss": 3.2687, | |
| "step": 56150 | |
| }, | |
| { | |
| "epoch": 16.38008627725312, | |
| "grad_norm": 0.4448312819004059, | |
| "learning_rate": 0.00040367804024496934, | |
| "loss": 3.2677, | |
| "step": 56200 | |
| }, | |
| { | |
| "epoch": 16.39466013757724, | |
| "grad_norm": 0.36881738901138306, | |
| "learning_rate": 0.00040350306211723534, | |
| "loss": 3.26, | |
| "step": 56250 | |
| }, | |
| { | |
| "epoch": 16.409233997901364, | |
| "grad_norm": 0.3703789710998535, | |
| "learning_rate": 0.0004033280839895012, | |
| "loss": 3.2568, | |
| "step": 56300 | |
| }, | |
| { | |
| "epoch": 16.423807858225487, | |
| "grad_norm": 0.38579708337783813, | |
| "learning_rate": 0.0004031531058617672, | |
| "loss": 3.265, | |
| "step": 56350 | |
| }, | |
| { | |
| "epoch": 16.43838171854961, | |
| "grad_norm": 0.35912278294563293, | |
| "learning_rate": 0.0004029781277340332, | |
| "loss": 3.2769, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 16.45295557887373, | |
| "grad_norm": 0.36150550842285156, | |
| "learning_rate": 0.0004028031496062992, | |
| "loss": 3.2593, | |
| "step": 56450 | |
| }, | |
| { | |
| "epoch": 16.467529439197854, | |
| "grad_norm": 0.37937209010124207, | |
| "learning_rate": 0.0004026281714785651, | |
| "loss": 3.273, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 16.482103299521977, | |
| "grad_norm": 0.3345341980457306, | |
| "learning_rate": 0.0004024531933508311, | |
| "loss": 3.274, | |
| "step": 56550 | |
| }, | |
| { | |
| "epoch": 16.4966771598461, | |
| "grad_norm": 0.36195069551467896, | |
| "learning_rate": 0.0004022782152230971, | |
| "loss": 3.2831, | |
| "step": 56600 | |
| }, | |
| { | |
| "epoch": 16.511251020170224, | |
| "grad_norm": 0.3733668625354767, | |
| "learning_rate": 0.0004021032370953631, | |
| "loss": 3.2811, | |
| "step": 56650 | |
| }, | |
| { | |
| "epoch": 16.525824880494344, | |
| "grad_norm": 0.3509543240070343, | |
| "learning_rate": 0.000401928258967629, | |
| "loss": 3.2788, | |
| "step": 56700 | |
| }, | |
| { | |
| "epoch": 16.540398740818468, | |
| "grad_norm": 0.3814429044723511, | |
| "learning_rate": 0.000401753280839895, | |
| "loss": 3.2759, | |
| "step": 56750 | |
| }, | |
| { | |
| "epoch": 16.55497260114259, | |
| "grad_norm": 0.38439804315567017, | |
| "learning_rate": 0.000401578302712161, | |
| "loss": 3.2847, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 16.569546461466715, | |
| "grad_norm": 0.37428414821624756, | |
| "learning_rate": 0.00040140332458442693, | |
| "loss": 3.2765, | |
| "step": 56850 | |
| }, | |
| { | |
| "epoch": 16.584120321790834, | |
| "grad_norm": 0.36497533321380615, | |
| "learning_rate": 0.00040122834645669287, | |
| "loss": 3.2783, | |
| "step": 56900 | |
| }, | |
| { | |
| "epoch": 16.598694182114958, | |
| "grad_norm": 0.3993648588657379, | |
| "learning_rate": 0.00040105336832895887, | |
| "loss": 3.2852, | |
| "step": 56950 | |
| }, | |
| { | |
| "epoch": 16.61326804243908, | |
| "grad_norm": 0.3501555025577545, | |
| "learning_rate": 0.0004008783902012248, | |
| "loss": 3.2897, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 16.61326804243908, | |
| "eval_accuracy": 0.3714454491709839, | |
| "eval_loss": 3.550541639328003, | |
| "eval_runtime": 53.2304, | |
| "eval_samples_per_second": 312.359, | |
| "eval_steps_per_second": 19.538, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 16.627841902763205, | |
| "grad_norm": 0.3381880521774292, | |
| "learning_rate": 0.00040070341207349076, | |
| "loss": 3.2955, | |
| "step": 57050 | |
| }, | |
| { | |
| "epoch": 16.642415763087328, | |
| "grad_norm": 0.35812264680862427, | |
| "learning_rate": 0.00040052843394575675, | |
| "loss": 3.2785, | |
| "step": 57100 | |
| }, | |
| { | |
| "epoch": 16.656989623411448, | |
| "grad_norm": 0.3534041941165924, | |
| "learning_rate": 0.0004003534558180227, | |
| "loss": 3.2976, | |
| "step": 57150 | |
| }, | |
| { | |
| "epoch": 16.67156348373557, | |
| "grad_norm": 0.3645527958869934, | |
| "learning_rate": 0.0004001784776902887, | |
| "loss": 3.2784, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 16.686137344059695, | |
| "grad_norm": 0.3515307307243347, | |
| "learning_rate": 0.00040000349956255464, | |
| "loss": 3.2845, | |
| "step": 57250 | |
| }, | |
| { | |
| "epoch": 16.70071120438382, | |
| "grad_norm": 0.3494560122489929, | |
| "learning_rate": 0.0003998285214348206, | |
| "loss": 3.2923, | |
| "step": 57300 | |
| }, | |
| { | |
| "epoch": 16.71528506470794, | |
| "grad_norm": 0.3853073716163635, | |
| "learning_rate": 0.0003996535433070866, | |
| "loss": 3.3093, | |
| "step": 57350 | |
| }, | |
| { | |
| "epoch": 16.72985892503206, | |
| "grad_norm": 0.357208788394928, | |
| "learning_rate": 0.0003994785651793526, | |
| "loss": 3.3005, | |
| "step": 57400 | |
| }, | |
| { | |
| "epoch": 16.744432785356185, | |
| "grad_norm": 0.38168585300445557, | |
| "learning_rate": 0.00039930358705161847, | |
| "loss": 3.2879, | |
| "step": 57450 | |
| }, | |
| { | |
| "epoch": 16.75900664568031, | |
| "grad_norm": 0.3707534968852997, | |
| "learning_rate": 0.00039912860892388446, | |
| "loss": 3.2997, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 16.773580506004432, | |
| "grad_norm": 0.3866535425186157, | |
| "learning_rate": 0.00039895363079615046, | |
| "loss": 3.2849, | |
| "step": 57550 | |
| }, | |
| { | |
| "epoch": 16.788154366328552, | |
| "grad_norm": 0.3614501655101776, | |
| "learning_rate": 0.00039877865266841646, | |
| "loss": 3.2929, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 16.802728226652675, | |
| "grad_norm": 0.35415467619895935, | |
| "learning_rate": 0.00039860367454068235, | |
| "loss": 3.3009, | |
| "step": 57650 | |
| }, | |
| { | |
| "epoch": 16.8173020869768, | |
| "grad_norm": 0.35893917083740234, | |
| "learning_rate": 0.00039842869641294835, | |
| "loss": 3.2887, | |
| "step": 57700 | |
| }, | |
| { | |
| "epoch": 16.831875947300922, | |
| "grad_norm": 0.3701547384262085, | |
| "learning_rate": 0.00039825371828521434, | |
| "loss": 3.2951, | |
| "step": 57750 | |
| }, | |
| { | |
| "epoch": 16.846449807625042, | |
| "grad_norm": 0.3377358615398407, | |
| "learning_rate": 0.00039807874015748023, | |
| "loss": 3.3029, | |
| "step": 57800 | |
| }, | |
| { | |
| "epoch": 16.861023667949166, | |
| "grad_norm": 0.36560627818107605, | |
| "learning_rate": 0.00039790376202974623, | |
| "loss": 3.2871, | |
| "step": 57850 | |
| }, | |
| { | |
| "epoch": 16.87559752827329, | |
| "grad_norm": 0.3592166006565094, | |
| "learning_rate": 0.00039772878390201223, | |
| "loss": 3.3013, | |
| "step": 57900 | |
| }, | |
| { | |
| "epoch": 16.890171388597413, | |
| "grad_norm": 0.362232506275177, | |
| "learning_rate": 0.0003975538057742782, | |
| "loss": 3.3032, | |
| "step": 57950 | |
| }, | |
| { | |
| "epoch": 16.904745248921536, | |
| "grad_norm": 0.3759749233722687, | |
| "learning_rate": 0.0003973788276465441, | |
| "loss": 3.2938, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 16.904745248921536, | |
| "eval_accuracy": 0.3717375729971187, | |
| "eval_loss": 3.543179512023926, | |
| "eval_runtime": 53.2256, | |
| "eval_samples_per_second": 312.387, | |
| "eval_steps_per_second": 19.539, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 16.919319109245656, | |
| "grad_norm": 0.37933215498924255, | |
| "learning_rate": 0.0003972038495188101, | |
| "loss": 3.3026, | |
| "step": 58050 | |
| }, | |
| { | |
| "epoch": 16.93389296956978, | |
| "grad_norm": 0.3669482469558716, | |
| "learning_rate": 0.0003970288713910761, | |
| "loss": 3.3075, | |
| "step": 58100 | |
| }, | |
| { | |
| "epoch": 16.948466829893903, | |
| "grad_norm": 0.41558346152305603, | |
| "learning_rate": 0.00039685389326334205, | |
| "loss": 3.2956, | |
| "step": 58150 | |
| }, | |
| { | |
| "epoch": 16.963040690218026, | |
| "grad_norm": 0.36796823143959045, | |
| "learning_rate": 0.000396678915135608, | |
| "loss": 3.3017, | |
| "step": 58200 | |
| }, | |
| { | |
| "epoch": 16.977614550542146, | |
| "grad_norm": 0.4123929440975189, | |
| "learning_rate": 0.000396503937007874, | |
| "loss": 3.3156, | |
| "step": 58250 | |
| }, | |
| { | |
| "epoch": 16.99218841086627, | |
| "grad_norm": 0.37400203943252563, | |
| "learning_rate": 0.00039632895888013994, | |
| "loss": 3.3063, | |
| "step": 58300 | |
| }, | |
| { | |
| "epoch": 17.006703975749097, | |
| "grad_norm": 0.3638571798801422, | |
| "learning_rate": 0.00039615398075240594, | |
| "loss": 3.251, | |
| "step": 58350 | |
| }, | |
| { | |
| "epoch": 17.021277836073217, | |
| "grad_norm": 0.37552201747894287, | |
| "learning_rate": 0.0003959790026246719, | |
| "loss": 3.196, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 17.03585169639734, | |
| "grad_norm": 0.39781540632247925, | |
| "learning_rate": 0.0003958040244969378, | |
| "loss": 3.1953, | |
| "step": 58450 | |
| }, | |
| { | |
| "epoch": 17.050425556721464, | |
| "grad_norm": 0.37113073468208313, | |
| "learning_rate": 0.0003956290463692038, | |
| "loss": 3.1901, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 17.064999417045588, | |
| "grad_norm": 0.37127241492271423, | |
| "learning_rate": 0.0003954540682414698, | |
| "loss": 3.201, | |
| "step": 58550 | |
| }, | |
| { | |
| "epoch": 17.07957327736971, | |
| "grad_norm": 0.3953086733818054, | |
| "learning_rate": 0.0003952790901137357, | |
| "loss": 3.218, | |
| "step": 58600 | |
| }, | |
| { | |
| "epoch": 17.09414713769383, | |
| "grad_norm": 0.3666759431362152, | |
| "learning_rate": 0.0003951041119860017, | |
| "loss": 3.214, | |
| "step": 58650 | |
| }, | |
| { | |
| "epoch": 17.108720998017954, | |
| "grad_norm": 0.35260671377182007, | |
| "learning_rate": 0.0003949291338582677, | |
| "loss": 3.2224, | |
| "step": 58700 | |
| }, | |
| { | |
| "epoch": 17.123294858342078, | |
| "grad_norm": 0.36087656021118164, | |
| "learning_rate": 0.0003947541557305336, | |
| "loss": 3.2197, | |
| "step": 58750 | |
| }, | |
| { | |
| "epoch": 17.1378687186662, | |
| "grad_norm": 0.41437220573425293, | |
| "learning_rate": 0.0003945791776027996, | |
| "loss": 3.2232, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 17.15244257899032, | |
| "grad_norm": 0.3841547966003418, | |
| "learning_rate": 0.0003944041994750656, | |
| "loss": 3.2088, | |
| "step": 58850 | |
| }, | |
| { | |
| "epoch": 17.167016439314445, | |
| "grad_norm": 0.38770514726638794, | |
| "learning_rate": 0.0003942292213473316, | |
| "loss": 3.2212, | |
| "step": 58900 | |
| }, | |
| { | |
| "epoch": 17.181590299638568, | |
| "grad_norm": 0.43151119351387024, | |
| "learning_rate": 0.0003940542432195975, | |
| "loss": 3.2283, | |
| "step": 58950 | |
| }, | |
| { | |
| "epoch": 17.19616415996269, | |
| "grad_norm": 0.35698238015174866, | |
| "learning_rate": 0.0003938792650918635, | |
| "loss": 3.2236, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 17.19616415996269, | |
| "eval_accuracy": 0.37119805018527263, | |
| "eval_loss": 3.561183452606201, | |
| "eval_runtime": 53.3167, | |
| "eval_samples_per_second": 311.853, | |
| "eval_steps_per_second": 19.506, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 17.210738020286815, | |
| "grad_norm": 0.36982160806655884, | |
| "learning_rate": 0.00039370428696412947, | |
| "loss": 3.2417, | |
| "step": 59050 | |
| }, | |
| { | |
| "epoch": 17.225311880610935, | |
| "grad_norm": 0.3661283552646637, | |
| "learning_rate": 0.0003935293088363954, | |
| "loss": 3.2369, | |
| "step": 59100 | |
| }, | |
| { | |
| "epoch": 17.23988574093506, | |
| "grad_norm": 0.37872856855392456, | |
| "learning_rate": 0.00039335433070866136, | |
| "loss": 3.2424, | |
| "step": 59150 | |
| }, | |
| { | |
| "epoch": 17.25445960125918, | |
| "grad_norm": 0.37743332982063293, | |
| "learning_rate": 0.00039317935258092736, | |
| "loss": 3.2531, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 17.269033461583305, | |
| "grad_norm": 0.36602476239204407, | |
| "learning_rate": 0.00039300437445319335, | |
| "loss": 3.2485, | |
| "step": 59250 | |
| }, | |
| { | |
| "epoch": 17.283607321907425, | |
| "grad_norm": 0.38170450925827026, | |
| "learning_rate": 0.0003928293963254593, | |
| "loss": 3.2447, | |
| "step": 59300 | |
| }, | |
| { | |
| "epoch": 17.29818118223155, | |
| "grad_norm": 0.3535911440849304, | |
| "learning_rate": 0.00039265441819772524, | |
| "loss": 3.2355, | |
| "step": 59350 | |
| }, | |
| { | |
| "epoch": 17.312755042555672, | |
| "grad_norm": 0.37537553906440735, | |
| "learning_rate": 0.00039247944006999124, | |
| "loss": 3.2474, | |
| "step": 59400 | |
| }, | |
| { | |
| "epoch": 17.327328902879795, | |
| "grad_norm": 0.3766328990459442, | |
| "learning_rate": 0.0003923044619422572, | |
| "loss": 3.2498, | |
| "step": 59450 | |
| }, | |
| { | |
| "epoch": 17.34190276320392, | |
| "grad_norm": 0.4029710292816162, | |
| "learning_rate": 0.0003921294838145232, | |
| "loss": 3.2516, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 17.35647662352804, | |
| "grad_norm": 0.3706645965576172, | |
| "learning_rate": 0.0003919545056867891, | |
| "loss": 3.255, | |
| "step": 59550 | |
| }, | |
| { | |
| "epoch": 17.371050483852162, | |
| "grad_norm": 0.376094788312912, | |
| "learning_rate": 0.00039177952755905507, | |
| "loss": 3.2536, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 17.385624344176286, | |
| "grad_norm": 0.3890036642551422, | |
| "learning_rate": 0.00039160454943132106, | |
| "loss": 3.2558, | |
| "step": 59650 | |
| }, | |
| { | |
| "epoch": 17.40019820450041, | |
| "grad_norm": 0.3617980182170868, | |
| "learning_rate": 0.000391429571303587, | |
| "loss": 3.2597, | |
| "step": 59700 | |
| }, | |
| { | |
| "epoch": 17.41477206482453, | |
| "grad_norm": 0.36625897884368896, | |
| "learning_rate": 0.00039125459317585295, | |
| "loss": 3.26, | |
| "step": 59750 | |
| }, | |
| { | |
| "epoch": 17.429345925148652, | |
| "grad_norm": 0.40066832304000854, | |
| "learning_rate": 0.00039107961504811895, | |
| "loss": 3.2589, | |
| "step": 59800 | |
| }, | |
| { | |
| "epoch": 17.443919785472776, | |
| "grad_norm": 0.36895522475242615, | |
| "learning_rate": 0.00039090463692038495, | |
| "loss": 3.2513, | |
| "step": 59850 | |
| }, | |
| { | |
| "epoch": 17.4584936457969, | |
| "grad_norm": 0.3618486821651459, | |
| "learning_rate": 0.00039072965879265084, | |
| "loss": 3.2505, | |
| "step": 59900 | |
| }, | |
| { | |
| "epoch": 17.473067506121023, | |
| "grad_norm": 0.40019845962524414, | |
| "learning_rate": 0.00039055468066491683, | |
| "loss": 3.2599, | |
| "step": 59950 | |
| }, | |
| { | |
| "epoch": 17.487641366445143, | |
| "grad_norm": 0.3406798541545868, | |
| "learning_rate": 0.00039037970253718283, | |
| "loss": 3.261, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 17.487641366445143, | |
| "eval_accuracy": 0.3714183788728328, | |
| "eval_loss": 3.5528781414031982, | |
| "eval_runtime": 53.3399, | |
| "eval_samples_per_second": 311.718, | |
| "eval_steps_per_second": 19.498, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 17.502215226769266, | |
| "grad_norm": 0.3532922565937042, | |
| "learning_rate": 0.00039020472440944883, | |
| "loss": 3.2684, | |
| "step": 60050 | |
| }, | |
| { | |
| "epoch": 17.51678908709339, | |
| "grad_norm": 0.3447803556919098, | |
| "learning_rate": 0.0003900297462817147, | |
| "loss": 3.2626, | |
| "step": 60100 | |
| }, | |
| { | |
| "epoch": 17.531362947417513, | |
| "grad_norm": 0.36045655608177185, | |
| "learning_rate": 0.0003898547681539807, | |
| "loss": 3.2602, | |
| "step": 60150 | |
| }, | |
| { | |
| "epoch": 17.545936807741633, | |
| "grad_norm": 0.37568119168281555, | |
| "learning_rate": 0.0003896797900262467, | |
| "loss": 3.2687, | |
| "step": 60200 | |
| }, | |
| { | |
| "epoch": 17.560510668065756, | |
| "grad_norm": 0.348172664642334, | |
| "learning_rate": 0.00038950481189851266, | |
| "loss": 3.278, | |
| "step": 60250 | |
| }, | |
| { | |
| "epoch": 17.57508452838988, | |
| "grad_norm": 0.3671204447746277, | |
| "learning_rate": 0.0003893298337707786, | |
| "loss": 3.2537, | |
| "step": 60300 | |
| }, | |
| { | |
| "epoch": 17.589658388714003, | |
| "grad_norm": 0.34588101506233215, | |
| "learning_rate": 0.0003891548556430446, | |
| "loss": 3.2723, | |
| "step": 60350 | |
| }, | |
| { | |
| "epoch": 17.604232249038127, | |
| "grad_norm": 0.3869750201702118, | |
| "learning_rate": 0.00038897987751531054, | |
| "loss": 3.2739, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 17.618806109362247, | |
| "grad_norm": 0.3896041214466095, | |
| "learning_rate": 0.00038880489938757654, | |
| "loss": 3.2726, | |
| "step": 60450 | |
| }, | |
| { | |
| "epoch": 17.63337996968637, | |
| "grad_norm": 0.3611544966697693, | |
| "learning_rate": 0.0003886299212598425, | |
| "loss": 3.2795, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 17.647953830010493, | |
| "grad_norm": 0.3854808509349823, | |
| "learning_rate": 0.0003884549431321085, | |
| "loss": 3.2688, | |
| "step": 60550 | |
| }, | |
| { | |
| "epoch": 17.662527690334617, | |
| "grad_norm": 0.3506561517715454, | |
| "learning_rate": 0.0003882799650043744, | |
| "loss": 3.2754, | |
| "step": 60600 | |
| }, | |
| { | |
| "epoch": 17.677101550658737, | |
| "grad_norm": 0.3766607940196991, | |
| "learning_rate": 0.00038810498687664037, | |
| "loss": 3.2741, | |
| "step": 60650 | |
| }, | |
| { | |
| "epoch": 17.69167541098286, | |
| "grad_norm": 0.3881302773952484, | |
| "learning_rate": 0.00038793000874890637, | |
| "loss": 3.2766, | |
| "step": 60700 | |
| }, | |
| { | |
| "epoch": 17.706249271306984, | |
| "grad_norm": 0.3620976209640503, | |
| "learning_rate": 0.0003877550306211723, | |
| "loss": 3.278, | |
| "step": 60750 | |
| }, | |
| { | |
| "epoch": 17.720823131631107, | |
| "grad_norm": 0.3486970365047455, | |
| "learning_rate": 0.0003875800524934383, | |
| "loss": 3.2781, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 17.73539699195523, | |
| "grad_norm": 0.34223899245262146, | |
| "learning_rate": 0.00038740507436570425, | |
| "loss": 3.2754, | |
| "step": 60850 | |
| }, | |
| { | |
| "epoch": 17.74997085227935, | |
| "grad_norm": 0.36822447180747986, | |
| "learning_rate": 0.0003872300962379702, | |
| "loss": 3.272, | |
| "step": 60900 | |
| }, | |
| { | |
| "epoch": 17.764544712603474, | |
| "grad_norm": 0.34223830699920654, | |
| "learning_rate": 0.0003870551181102362, | |
| "loss": 3.2766, | |
| "step": 60950 | |
| }, | |
| { | |
| "epoch": 17.779118572927597, | |
| "grad_norm": 0.35672491788864136, | |
| "learning_rate": 0.0003868801399825022, | |
| "loss": 3.2844, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 17.779118572927597, | |
| "eval_accuracy": 0.3721375072280638, | |
| "eval_loss": 3.5414795875549316, | |
| "eval_runtime": 53.2321, | |
| "eval_samples_per_second": 312.349, | |
| "eval_steps_per_second": 19.537, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 17.79369243325172, | |
| "grad_norm": 0.35861504077911377, | |
| "learning_rate": 0.0003867051618547681, | |
| "loss": 3.2845, | |
| "step": 61050 | |
| }, | |
| { | |
| "epoch": 17.80826629357584, | |
| "grad_norm": 0.38925936818122864, | |
| "learning_rate": 0.0003865301837270341, | |
| "loss": 3.2804, | |
| "step": 61100 | |
| }, | |
| { | |
| "epoch": 17.822840153899964, | |
| "grad_norm": 0.38915199041366577, | |
| "learning_rate": 0.0003863552055993001, | |
| "loss": 3.2986, | |
| "step": 61150 | |
| }, | |
| { | |
| "epoch": 17.837414014224088, | |
| "grad_norm": 0.3660210371017456, | |
| "learning_rate": 0.00038618022747156607, | |
| "loss": 3.3001, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 17.85198787454821, | |
| "grad_norm": 0.366853266954422, | |
| "learning_rate": 0.00038600524934383196, | |
| "loss": 3.2869, | |
| "step": 61250 | |
| }, | |
| { | |
| "epoch": 17.866561734872334, | |
| "grad_norm": 0.3562902808189392, | |
| "learning_rate": 0.00038583027121609796, | |
| "loss": 3.2901, | |
| "step": 61300 | |
| }, | |
| { | |
| "epoch": 17.881135595196454, | |
| "grad_norm": 0.3574168384075165, | |
| "learning_rate": 0.00038565529308836396, | |
| "loss": 3.2821, | |
| "step": 61350 | |
| }, | |
| { | |
| "epoch": 17.895709455520578, | |
| "grad_norm": 0.3829963207244873, | |
| "learning_rate": 0.00038548031496062984, | |
| "loss": 3.2883, | |
| "step": 61400 | |
| }, | |
| { | |
| "epoch": 17.9102833158447, | |
| "grad_norm": 0.3606160283088684, | |
| "learning_rate": 0.00038530533683289584, | |
| "loss": 3.2845, | |
| "step": 61450 | |
| }, | |
| { | |
| "epoch": 17.924857176168825, | |
| "grad_norm": 0.3578321635723114, | |
| "learning_rate": 0.00038513035870516184, | |
| "loss": 3.2801, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 17.939431036492945, | |
| "grad_norm": 0.3657797574996948, | |
| "learning_rate": 0.0003849553805774278, | |
| "loss": 3.2871, | |
| "step": 61550 | |
| }, | |
| { | |
| "epoch": 17.954004896817068, | |
| "grad_norm": 0.3794346749782562, | |
| "learning_rate": 0.00038478040244969373, | |
| "loss": 3.2791, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 17.96857875714119, | |
| "grad_norm": 0.36017322540283203, | |
| "learning_rate": 0.0003846054243219597, | |
| "loss": 3.2846, | |
| "step": 61650 | |
| }, | |
| { | |
| "epoch": 17.983152617465315, | |
| "grad_norm": 0.36378708481788635, | |
| "learning_rate": 0.00038443044619422567, | |
| "loss": 3.2921, | |
| "step": 61700 | |
| }, | |
| { | |
| "epoch": 17.99772647778944, | |
| "grad_norm": 0.3741123378276825, | |
| "learning_rate": 0.00038425546806649167, | |
| "loss": 3.2942, | |
| "step": 61750 | |
| }, | |
| { | |
| "epoch": 18.012242042672263, | |
| "grad_norm": 0.3481272757053375, | |
| "learning_rate": 0.0003840804899387576, | |
| "loss": 3.2076, | |
| "step": 61800 | |
| }, | |
| { | |
| "epoch": 18.026815902996386, | |
| "grad_norm": 0.3690752685070038, | |
| "learning_rate": 0.0003839055118110236, | |
| "loss": 3.1866, | |
| "step": 61850 | |
| }, | |
| { | |
| "epoch": 18.04138976332051, | |
| "grad_norm": 0.37555626034736633, | |
| "learning_rate": 0.00038373053368328955, | |
| "loss": 3.1798, | |
| "step": 61900 | |
| }, | |
| { | |
| "epoch": 18.05596362364463, | |
| "grad_norm": 0.4028767943382263, | |
| "learning_rate": 0.00038355555555555555, | |
| "loss": 3.1654, | |
| "step": 61950 | |
| }, | |
| { | |
| "epoch": 18.070537483968753, | |
| "grad_norm": 0.3782511353492737, | |
| "learning_rate": 0.0003833805774278215, | |
| "loss": 3.2023, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 18.070537483968753, | |
| "eval_accuracy": 0.3714931164351195, | |
| "eval_loss": 3.555772542953491, | |
| "eval_runtime": 53.4272, | |
| "eval_samples_per_second": 311.208, | |
| "eval_steps_per_second": 19.466, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 18.085111344292876, | |
| "grad_norm": 0.3724522590637207, | |
| "learning_rate": 0.00038320559930008744, | |
| "loss": 3.2086, | |
| "step": 62050 | |
| }, | |
| { | |
| "epoch": 18.099685204617, | |
| "grad_norm": 0.3868659436702728, | |
| "learning_rate": 0.00038303062117235343, | |
| "loss": 3.2042, | |
| "step": 62100 | |
| }, | |
| { | |
| "epoch": 18.114259064941123, | |
| "grad_norm": 0.37153860926628113, | |
| "learning_rate": 0.00038285564304461943, | |
| "loss": 3.1851, | |
| "step": 62150 | |
| }, | |
| { | |
| "epoch": 18.128832925265243, | |
| "grad_norm": 0.37508624792099, | |
| "learning_rate": 0.0003826806649168853, | |
| "loss": 3.2067, | |
| "step": 62200 | |
| }, | |
| { | |
| "epoch": 18.143406785589367, | |
| "grad_norm": 0.38075685501098633, | |
| "learning_rate": 0.0003825056867891513, | |
| "loss": 3.2042, | |
| "step": 62250 | |
| }, | |
| { | |
| "epoch": 18.15798064591349, | |
| "grad_norm": 0.36698952317237854, | |
| "learning_rate": 0.0003823307086614173, | |
| "loss": 3.2204, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 18.172554506237613, | |
| "grad_norm": 0.36773785948753357, | |
| "learning_rate": 0.0003821557305336832, | |
| "loss": 3.2278, | |
| "step": 62350 | |
| }, | |
| { | |
| "epoch": 18.187128366561733, | |
| "grad_norm": 0.4154503047466278, | |
| "learning_rate": 0.0003819807524059492, | |
| "loss": 3.2207, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 18.201702226885857, | |
| "grad_norm": 0.36895978450775146, | |
| "learning_rate": 0.0003818057742782152, | |
| "loss": 3.2159, | |
| "step": 62450 | |
| }, | |
| { | |
| "epoch": 18.21627608720998, | |
| "grad_norm": 0.3558099865913391, | |
| "learning_rate": 0.0003816307961504812, | |
| "loss": 3.2119, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 18.230849947534104, | |
| "grad_norm": 0.36864882707595825, | |
| "learning_rate": 0.0003814558180227471, | |
| "loss": 3.2354, | |
| "step": 62550 | |
| }, | |
| { | |
| "epoch": 18.245423807858227, | |
| "grad_norm": 0.3729044795036316, | |
| "learning_rate": 0.0003812808398950131, | |
| "loss": 3.2194, | |
| "step": 62600 | |
| }, | |
| { | |
| "epoch": 18.259997668182347, | |
| "grad_norm": 0.36386367678642273, | |
| "learning_rate": 0.0003811058617672791, | |
| "loss": 3.2238, | |
| "step": 62650 | |
| }, | |
| { | |
| "epoch": 18.27457152850647, | |
| "grad_norm": 0.37404847145080566, | |
| "learning_rate": 0.000380930883639545, | |
| "loss": 3.2283, | |
| "step": 62700 | |
| }, | |
| { | |
| "epoch": 18.289145388830594, | |
| "grad_norm": 0.38174042105674744, | |
| "learning_rate": 0.00038075590551181097, | |
| "loss": 3.2369, | |
| "step": 62750 | |
| }, | |
| { | |
| "epoch": 18.303719249154717, | |
| "grad_norm": 0.37085598707199097, | |
| "learning_rate": 0.00038058092738407697, | |
| "loss": 3.2269, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 18.318293109478837, | |
| "grad_norm": 0.37157735228538513, | |
| "learning_rate": 0.0003804059492563429, | |
| "loss": 3.2349, | |
| "step": 62850 | |
| }, | |
| { | |
| "epoch": 18.33286696980296, | |
| "grad_norm": 0.42219650745391846, | |
| "learning_rate": 0.0003802309711286089, | |
| "loss": 3.2473, | |
| "step": 62900 | |
| }, | |
| { | |
| "epoch": 18.347440830127084, | |
| "grad_norm": 0.3765549957752228, | |
| "learning_rate": 0.00038005599300087485, | |
| "loss": 3.2328, | |
| "step": 62950 | |
| }, | |
| { | |
| "epoch": 18.362014690451208, | |
| "grad_norm": 0.3612719178199768, | |
| "learning_rate": 0.0003798810148731408, | |
| "loss": 3.2502, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 18.362014690451208, | |
| "eval_accuracy": 0.37206123960544685, | |
| "eval_loss": 3.550529956817627, | |
| "eval_runtime": 53.2833, | |
| "eval_samples_per_second": 312.049, | |
| "eval_steps_per_second": 19.518, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 18.37658855077533, | |
| "grad_norm": 0.3705293536186218, | |
| "learning_rate": 0.0003797060367454068, | |
| "loss": 3.2415, | |
| "step": 63050 | |
| }, | |
| { | |
| "epoch": 18.39116241109945, | |
| "grad_norm": 0.406534880399704, | |
| "learning_rate": 0.0003795310586176728, | |
| "loss": 3.2368, | |
| "step": 63100 | |
| }, | |
| { | |
| "epoch": 18.405736271423574, | |
| "grad_norm": 0.4121636748313904, | |
| "learning_rate": 0.00037935608048993873, | |
| "loss": 3.2456, | |
| "step": 63150 | |
| }, | |
| { | |
| "epoch": 18.420310131747698, | |
| "grad_norm": 0.35870444774627686, | |
| "learning_rate": 0.0003791811023622047, | |
| "loss": 3.2387, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 18.43488399207182, | |
| "grad_norm": 0.3755796551704407, | |
| "learning_rate": 0.0003790061242344707, | |
| "loss": 3.2539, | |
| "step": 63250 | |
| }, | |
| { | |
| "epoch": 18.44945785239594, | |
| "grad_norm": 0.3582393229007721, | |
| "learning_rate": 0.0003788311461067366, | |
| "loss": 3.2442, | |
| "step": 63300 | |
| }, | |
| { | |
| "epoch": 18.464031712720065, | |
| "grad_norm": 0.37926796078681946, | |
| "learning_rate": 0.00037865616797900256, | |
| "loss": 3.2481, | |
| "step": 63350 | |
| }, | |
| { | |
| "epoch": 18.478605573044188, | |
| "grad_norm": 0.3608848452568054, | |
| "learning_rate": 0.00037848118985126856, | |
| "loss": 3.2444, | |
| "step": 63400 | |
| }, | |
| { | |
| "epoch": 18.49317943336831, | |
| "grad_norm": 0.40005603432655334, | |
| "learning_rate": 0.00037830621172353456, | |
| "loss": 3.2428, | |
| "step": 63450 | |
| }, | |
| { | |
| "epoch": 18.507753293692435, | |
| "grad_norm": 0.3767496347427368, | |
| "learning_rate": 0.00037813123359580045, | |
| "loss": 3.2603, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 18.522327154016555, | |
| "grad_norm": 0.36244866251945496, | |
| "learning_rate": 0.00037795625546806644, | |
| "loss": 3.2525, | |
| "step": 63550 | |
| }, | |
| { | |
| "epoch": 18.53690101434068, | |
| "grad_norm": 0.3706715703010559, | |
| "learning_rate": 0.00037778127734033244, | |
| "loss": 3.2555, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 18.5514748746648, | |
| "grad_norm": 0.3899987041950226, | |
| "learning_rate": 0.00037760629921259844, | |
| "loss": 3.2556, | |
| "step": 63650 | |
| }, | |
| { | |
| "epoch": 18.566048734988925, | |
| "grad_norm": 0.38272711634635925, | |
| "learning_rate": 0.00037743132108486433, | |
| "loss": 3.2559, | |
| "step": 63700 | |
| }, | |
| { | |
| "epoch": 18.580622595313045, | |
| "grad_norm": 0.38435348868370056, | |
| "learning_rate": 0.0003772563429571303, | |
| "loss": 3.2729, | |
| "step": 63750 | |
| }, | |
| { | |
| "epoch": 18.59519645563717, | |
| "grad_norm": 0.3879915177822113, | |
| "learning_rate": 0.0003770813648293963, | |
| "loss": 3.2638, | |
| "step": 63800 | |
| }, | |
| { | |
| "epoch": 18.609770315961292, | |
| "grad_norm": 0.3937206268310547, | |
| "learning_rate": 0.00037690638670166227, | |
| "loss": 3.2614, | |
| "step": 63850 | |
| }, | |
| { | |
| "epoch": 18.624344176285415, | |
| "grad_norm": 0.37411096692085266, | |
| "learning_rate": 0.0003767314085739282, | |
| "loss": 3.2676, | |
| "step": 63900 | |
| }, | |
| { | |
| "epoch": 18.63891803660954, | |
| "grad_norm": 0.368111252784729, | |
| "learning_rate": 0.0003765564304461942, | |
| "loss": 3.2642, | |
| "step": 63950 | |
| }, | |
| { | |
| "epoch": 18.65349189693366, | |
| "grad_norm": 0.35708072781562805, | |
| "learning_rate": 0.00037638145231846015, | |
| "loss": 3.2564, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 18.65349189693366, | |
| "eval_accuracy": 0.37207807026907996, | |
| "eval_loss": 3.546232223510742, | |
| "eval_runtime": 53.2641, | |
| "eval_samples_per_second": 312.162, | |
| "eval_steps_per_second": 19.525, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 18.668065757257782, | |
| "grad_norm": 0.3970872461795807, | |
| "learning_rate": 0.00037620647419072615, | |
| "loss": 3.2605, | |
| "step": 64050 | |
| }, | |
| { | |
| "epoch": 18.682639617581906, | |
| "grad_norm": 0.35976195335388184, | |
| "learning_rate": 0.0003760314960629921, | |
| "loss": 3.2583, | |
| "step": 64100 | |
| }, | |
| { | |
| "epoch": 18.69721347790603, | |
| "grad_norm": 0.3930548131465912, | |
| "learning_rate": 0.00037585651793525804, | |
| "loss": 3.2628, | |
| "step": 64150 | |
| }, | |
| { | |
| "epoch": 18.71178733823015, | |
| "grad_norm": 0.40853869915008545, | |
| "learning_rate": 0.00037568153980752404, | |
| "loss": 3.265, | |
| "step": 64200 | |
| }, | |
| { | |
| "epoch": 18.726361198554272, | |
| "grad_norm": 0.3586309850215912, | |
| "learning_rate": 0.00037550656167979, | |
| "loss": 3.2652, | |
| "step": 64250 | |
| }, | |
| { | |
| "epoch": 18.740935058878396, | |
| "grad_norm": 0.34977707266807556, | |
| "learning_rate": 0.0003753315835520559, | |
| "loss": 3.2499, | |
| "step": 64300 | |
| }, | |
| { | |
| "epoch": 18.75550891920252, | |
| "grad_norm": 0.3699282109737396, | |
| "learning_rate": 0.0003751566054243219, | |
| "loss": 3.2593, | |
| "step": 64350 | |
| }, | |
| { | |
| "epoch": 18.770082779526643, | |
| "grad_norm": 0.3860742747783661, | |
| "learning_rate": 0.0003749816272965879, | |
| "loss": 3.2671, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 18.784656639850763, | |
| "grad_norm": 0.35206711292266846, | |
| "learning_rate": 0.00037480664916885386, | |
| "loss": 3.2757, | |
| "step": 64450 | |
| }, | |
| { | |
| "epoch": 18.799230500174886, | |
| "grad_norm": 0.40201523900032043, | |
| "learning_rate": 0.0003746316710411198, | |
| "loss": 3.2733, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 18.81380436049901, | |
| "grad_norm": 0.35394608974456787, | |
| "learning_rate": 0.0003744566929133858, | |
| "loss": 3.2763, | |
| "step": 64550 | |
| }, | |
| { | |
| "epoch": 18.828378220823133, | |
| "grad_norm": 0.364339679479599, | |
| "learning_rate": 0.0003742817147856518, | |
| "loss": 3.2755, | |
| "step": 64600 | |
| }, | |
| { | |
| "epoch": 18.842952081147253, | |
| "grad_norm": 0.3828820288181305, | |
| "learning_rate": 0.0003741067366579177, | |
| "loss": 3.2687, | |
| "step": 64650 | |
| }, | |
| { | |
| "epoch": 18.857525941471376, | |
| "grad_norm": 0.37939324975013733, | |
| "learning_rate": 0.0003739317585301837, | |
| "loss": 3.2795, | |
| "step": 64700 | |
| }, | |
| { | |
| "epoch": 18.8720998017955, | |
| "grad_norm": 0.39449378848075867, | |
| "learning_rate": 0.0003737567804024497, | |
| "loss": 3.2844, | |
| "step": 64750 | |
| }, | |
| { | |
| "epoch": 18.886673662119623, | |
| "grad_norm": 0.3925759494304657, | |
| "learning_rate": 0.0003735818022747157, | |
| "loss": 3.268, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 18.901247522443747, | |
| "grad_norm": 0.3670575022697449, | |
| "learning_rate": 0.00037340682414698157, | |
| "loss": 3.2804, | |
| "step": 64850 | |
| }, | |
| { | |
| "epoch": 18.915821382767867, | |
| "grad_norm": 0.38824647665023804, | |
| "learning_rate": 0.00037323184601924757, | |
| "loss": 3.272, | |
| "step": 64900 | |
| }, | |
| { | |
| "epoch": 18.93039524309199, | |
| "grad_norm": 0.36675015091896057, | |
| "learning_rate": 0.00037305686789151357, | |
| "loss": 3.2985, | |
| "step": 64950 | |
| }, | |
| { | |
| "epoch": 18.944969103416113, | |
| "grad_norm": 0.3709295690059662, | |
| "learning_rate": 0.00037288188976377946, | |
| "loss": 3.2816, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 18.944969103416113, | |
| "eval_accuracy": 0.3725047217073308, | |
| "eval_loss": 3.5384469032287598, | |
| "eval_runtime": 53.2486, | |
| "eval_samples_per_second": 312.252, | |
| "eval_steps_per_second": 19.531, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 18.959542963740237, | |
| "grad_norm": 0.3714747726917267, | |
| "learning_rate": 0.00037270691163604545, | |
| "loss": 3.2805, | |
| "step": 65050 | |
| }, | |
| { | |
| "epoch": 18.974116824064357, | |
| "grad_norm": 0.36215534806251526, | |
| "learning_rate": 0.00037253193350831145, | |
| "loss": 3.2795, | |
| "step": 65100 | |
| }, | |
| { | |
| "epoch": 18.98869068438848, | |
| "grad_norm": 0.3677482008934021, | |
| "learning_rate": 0.0003723569553805774, | |
| "loss": 3.2835, | |
| "step": 65150 | |
| }, | |
| { | |
| "epoch": 19.003206249271308, | |
| "grad_norm": 0.3807225823402405, | |
| "learning_rate": 0.00037218197725284334, | |
| "loss": 3.2563, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 19.017780109595428, | |
| "grad_norm": 0.3954845070838928, | |
| "learning_rate": 0.00037200699912510934, | |
| "loss": 3.1662, | |
| "step": 65250 | |
| }, | |
| { | |
| "epoch": 19.03235396991955, | |
| "grad_norm": 0.3760719299316406, | |
| "learning_rate": 0.0003718320209973753, | |
| "loss": 3.1775, | |
| "step": 65300 | |
| }, | |
| { | |
| "epoch": 19.046927830243675, | |
| "grad_norm": 0.3775408864021301, | |
| "learning_rate": 0.0003716570428696413, | |
| "loss": 3.1867, | |
| "step": 65350 | |
| }, | |
| { | |
| "epoch": 19.0615016905678, | |
| "grad_norm": 0.3811887502670288, | |
| "learning_rate": 0.0003714820647419072, | |
| "loss": 3.1738, | |
| "step": 65400 | |
| }, | |
| { | |
| "epoch": 19.07607555089192, | |
| "grad_norm": 0.36342817544937134, | |
| "learning_rate": 0.00037130708661417316, | |
| "loss": 3.1776, | |
| "step": 65450 | |
| }, | |
| { | |
| "epoch": 19.09064941121604, | |
| "grad_norm": 0.3706771433353424, | |
| "learning_rate": 0.00037113210848643916, | |
| "loss": 3.1833, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 19.105223271540165, | |
| "grad_norm": 0.3676753640174866, | |
| "learning_rate": 0.00037095713035870516, | |
| "loss": 3.1831, | |
| "step": 65550 | |
| }, | |
| { | |
| "epoch": 19.11979713186429, | |
| "grad_norm": 0.3650805354118347, | |
| "learning_rate": 0.00037078215223097105, | |
| "loss": 3.2063, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 19.134370992188412, | |
| "grad_norm": 0.3826095759868622, | |
| "learning_rate": 0.00037060717410323705, | |
| "loss": 3.191, | |
| "step": 65650 | |
| }, | |
| { | |
| "epoch": 19.148944852512532, | |
| "grad_norm": 0.39902034401893616, | |
| "learning_rate": 0.00037043219597550304, | |
| "loss": 3.1941, | |
| "step": 65700 | |
| }, | |
| { | |
| "epoch": 19.163518712836655, | |
| "grad_norm": 0.4051700234413147, | |
| "learning_rate": 0.00037025721784776904, | |
| "loss": 3.2042, | |
| "step": 65750 | |
| }, | |
| { | |
| "epoch": 19.17809257316078, | |
| "grad_norm": 0.3963977098464966, | |
| "learning_rate": 0.00037008223972003493, | |
| "loss": 3.1994, | |
| "step": 65800 | |
| }, | |
| { | |
| "epoch": 19.192666433484902, | |
| "grad_norm": 0.4009312093257904, | |
| "learning_rate": 0.00036990726159230093, | |
| "loss": 3.206, | |
| "step": 65850 | |
| }, | |
| { | |
| "epoch": 19.207240293809026, | |
| "grad_norm": 0.4113180637359619, | |
| "learning_rate": 0.0003697322834645669, | |
| "loss": 3.214, | |
| "step": 65900 | |
| }, | |
| { | |
| "epoch": 19.221814154133146, | |
| "grad_norm": 0.37608832120895386, | |
| "learning_rate": 0.0003695573053368328, | |
| "loss": 3.2084, | |
| "step": 65950 | |
| }, | |
| { | |
| "epoch": 19.23638801445727, | |
| "grad_norm": 0.3920981287956238, | |
| "learning_rate": 0.0003693823272090988, | |
| "loss": 3.2194, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 19.23638801445727, | |
| "eval_accuracy": 0.37180807346926, | |
| "eval_loss": 3.55633544921875, | |
| "eval_runtime": 53.3372, | |
| "eval_samples_per_second": 311.733, | |
| "eval_steps_per_second": 19.499, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 19.250961874781392, | |
| "grad_norm": 0.358112633228302, | |
| "learning_rate": 0.0003692073490813648, | |
| "loss": 3.2184, | |
| "step": 66050 | |
| }, | |
| { | |
| "epoch": 19.265535735105516, | |
| "grad_norm": 0.38964906334877014, | |
| "learning_rate": 0.0003690323709536308, | |
| "loss": 3.2143, | |
| "step": 66100 | |
| }, | |
| { | |
| "epoch": 19.280109595429636, | |
| "grad_norm": 0.39134010672569275, | |
| "learning_rate": 0.0003688573928258967, | |
| "loss": 3.2149, | |
| "step": 66150 | |
| }, | |
| { | |
| "epoch": 19.29468345575376, | |
| "grad_norm": 0.4015119969844818, | |
| "learning_rate": 0.0003686824146981627, | |
| "loss": 3.23, | |
| "step": 66200 | |
| }, | |
| { | |
| "epoch": 19.309257316077883, | |
| "grad_norm": 0.3915894329547882, | |
| "learning_rate": 0.0003685074365704287, | |
| "loss": 3.2236, | |
| "step": 66250 | |
| }, | |
| { | |
| "epoch": 19.323831176402006, | |
| "grad_norm": 0.35716143250465393, | |
| "learning_rate": 0.00036833245844269464, | |
| "loss": 3.2368, | |
| "step": 66300 | |
| }, | |
| { | |
| "epoch": 19.33840503672613, | |
| "grad_norm": 0.40372762084007263, | |
| "learning_rate": 0.0003681574803149606, | |
| "loss": 3.2335, | |
| "step": 66350 | |
| }, | |
| { | |
| "epoch": 19.35297889705025, | |
| "grad_norm": 0.3974008858203888, | |
| "learning_rate": 0.0003679825021872266, | |
| "loss": 3.2347, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 19.367552757374373, | |
| "grad_norm": 0.35377946496009827, | |
| "learning_rate": 0.0003678075240594925, | |
| "loss": 3.2272, | |
| "step": 66450 | |
| }, | |
| { | |
| "epoch": 19.382126617698496, | |
| "grad_norm": 0.3589801788330078, | |
| "learning_rate": 0.0003676325459317585, | |
| "loss": 3.2309, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 19.39670047802262, | |
| "grad_norm": 0.3724297881126404, | |
| "learning_rate": 0.00036745756780402446, | |
| "loss": 3.245, | |
| "step": 66550 | |
| }, | |
| { | |
| "epoch": 19.41127433834674, | |
| "grad_norm": 0.4001477360725403, | |
| "learning_rate": 0.0003672825896762904, | |
| "loss": 3.2319, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 19.425848198670863, | |
| "grad_norm": 0.3749985694885254, | |
| "learning_rate": 0.0003671076115485564, | |
| "loss": 3.2322, | |
| "step": 66650 | |
| }, | |
| { | |
| "epoch": 19.440422058994987, | |
| "grad_norm": 0.37100356817245483, | |
| "learning_rate": 0.0003669326334208224, | |
| "loss": 3.2318, | |
| "step": 66700 | |
| }, | |
| { | |
| "epoch": 19.45499591931911, | |
| "grad_norm": 0.36847561597824097, | |
| "learning_rate": 0.0003667576552930883, | |
| "loss": 3.226, | |
| "step": 66750 | |
| }, | |
| { | |
| "epoch": 19.469569779643233, | |
| "grad_norm": 0.4181145429611206, | |
| "learning_rate": 0.0003665826771653543, | |
| "loss": 3.2353, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 19.484143639967353, | |
| "grad_norm": 0.3576458990573883, | |
| "learning_rate": 0.0003664076990376203, | |
| "loss": 3.2464, | |
| "step": 66850 | |
| }, | |
| { | |
| "epoch": 19.498717500291477, | |
| "grad_norm": 0.39416128396987915, | |
| "learning_rate": 0.0003662327209098862, | |
| "loss": 3.2355, | |
| "step": 66900 | |
| }, | |
| { | |
| "epoch": 19.5132913606156, | |
| "grad_norm": 0.40888020396232605, | |
| "learning_rate": 0.0003660577427821522, | |
| "loss": 3.2547, | |
| "step": 66950 | |
| }, | |
| { | |
| "epoch": 19.527865220939724, | |
| "grad_norm": 0.38048359751701355, | |
| "learning_rate": 0.00036588276465441817, | |
| "loss": 3.2408, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 19.527865220939724, | |
| "eval_accuracy": 0.37236913482267836, | |
| "eval_loss": 3.5473415851593018, | |
| "eval_runtime": 53.2303, | |
| "eval_samples_per_second": 312.359, | |
| "eval_steps_per_second": 19.538, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 19.542439081263844, | |
| "grad_norm": 0.3875918388366699, | |
| "learning_rate": 0.00036570778652668417, | |
| "loss": 3.2431, | |
| "step": 67050 | |
| }, | |
| { | |
| "epoch": 19.557012941587967, | |
| "grad_norm": 0.3789691627025604, | |
| "learning_rate": 0.00036553280839895006, | |
| "loss": 3.2548, | |
| "step": 67100 | |
| }, | |
| { | |
| "epoch": 19.57158680191209, | |
| "grad_norm": 0.3792179822921753, | |
| "learning_rate": 0.00036535783027121606, | |
| "loss": 3.2544, | |
| "step": 67150 | |
| }, | |
| { | |
| "epoch": 19.586160662236214, | |
| "grad_norm": 0.3668409585952759, | |
| "learning_rate": 0.00036518285214348205, | |
| "loss": 3.2473, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 19.600734522560337, | |
| "grad_norm": 0.3650484085083008, | |
| "learning_rate": 0.00036500787401574805, | |
| "loss": 3.2405, | |
| "step": 67250 | |
| }, | |
| { | |
| "epoch": 19.615308382884457, | |
| "grad_norm": 0.3609619140625, | |
| "learning_rate": 0.00036483289588801394, | |
| "loss": 3.249, | |
| "step": 67300 | |
| }, | |
| { | |
| "epoch": 19.62988224320858, | |
| "grad_norm": 0.3582451045513153, | |
| "learning_rate": 0.00036465791776027994, | |
| "loss": 3.2368, | |
| "step": 67350 | |
| }, | |
| { | |
| "epoch": 19.644456103532704, | |
| "grad_norm": 0.37356671690940857, | |
| "learning_rate": 0.00036448293963254594, | |
| "loss": 3.2464, | |
| "step": 67400 | |
| }, | |
| { | |
| "epoch": 19.659029963856828, | |
| "grad_norm": 0.42684417963027954, | |
| "learning_rate": 0.0003643079615048119, | |
| "loss": 3.2604, | |
| "step": 67450 | |
| }, | |
| { | |
| "epoch": 19.673603824180947, | |
| "grad_norm": 0.4055967330932617, | |
| "learning_rate": 0.0003641329833770778, | |
| "loss": 3.26, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 19.68817768450507, | |
| "grad_norm": 0.34891846776008606, | |
| "learning_rate": 0.0003639580052493438, | |
| "loss": 3.2648, | |
| "step": 67550 | |
| }, | |
| { | |
| "epoch": 19.702751544829194, | |
| "grad_norm": 0.3491284251213074, | |
| "learning_rate": 0.00036378302712160976, | |
| "loss": 3.2526, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 19.717325405153318, | |
| "grad_norm": 0.39875558018684387, | |
| "learning_rate": 0.00036360804899387576, | |
| "loss": 3.2551, | |
| "step": 67650 | |
| }, | |
| { | |
| "epoch": 19.73189926547744, | |
| "grad_norm": 0.344468355178833, | |
| "learning_rate": 0.0003634330708661417, | |
| "loss": 3.2551, | |
| "step": 67700 | |
| }, | |
| { | |
| "epoch": 19.74647312580156, | |
| "grad_norm": 0.3823426067829132, | |
| "learning_rate": 0.00036325809273840765, | |
| "loss": 3.2577, | |
| "step": 67750 | |
| }, | |
| { | |
| "epoch": 19.761046986125685, | |
| "grad_norm": 0.4144747257232666, | |
| "learning_rate": 0.00036308311461067365, | |
| "loss": 3.2515, | |
| "step": 67800 | |
| }, | |
| { | |
| "epoch": 19.775620846449808, | |
| "grad_norm": 0.3994449973106384, | |
| "learning_rate": 0.0003629081364829396, | |
| "loss": 3.245, | |
| "step": 67850 | |
| }, | |
| { | |
| "epoch": 19.79019470677393, | |
| "grad_norm": 0.38475626707077026, | |
| "learning_rate": 0.00036273315835520553, | |
| "loss": 3.2694, | |
| "step": 67900 | |
| }, | |
| { | |
| "epoch": 19.80476856709805, | |
| "grad_norm": 0.379226952791214, | |
| "learning_rate": 0.00036255818022747153, | |
| "loss": 3.2528, | |
| "step": 67950 | |
| }, | |
| { | |
| "epoch": 19.819342427422175, | |
| "grad_norm": 0.42125728726387024, | |
| "learning_rate": 0.00036238320209973753, | |
| "loss": 3.2572, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 19.819342427422175, | |
| "eval_accuracy": 0.3727095143976912, | |
| "eval_loss": 3.5435328483581543, | |
| "eval_runtime": 53.3288, | |
| "eval_samples_per_second": 311.783, | |
| "eval_steps_per_second": 19.502, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 19.8339162877463, | |
| "grad_norm": 0.3799435496330261, | |
| "learning_rate": 0.0003622082239720034, | |
| "loss": 3.2545, | |
| "step": 68050 | |
| }, | |
| { | |
| "epoch": 19.84849014807042, | |
| "grad_norm": 0.3758162260055542, | |
| "learning_rate": 0.0003620332458442694, | |
| "loss": 3.2655, | |
| "step": 68100 | |
| }, | |
| { | |
| "epoch": 19.863064008394545, | |
| "grad_norm": 0.4051867723464966, | |
| "learning_rate": 0.0003618582677165354, | |
| "loss": 3.2621, | |
| "step": 68150 | |
| }, | |
| { | |
| "epoch": 19.877637868718665, | |
| "grad_norm": 0.36065474152565, | |
| "learning_rate": 0.0003616832895888014, | |
| "loss": 3.258, | |
| "step": 68200 | |
| }, | |
| { | |
| "epoch": 19.89221172904279, | |
| "grad_norm": 0.3851832449436188, | |
| "learning_rate": 0.0003615083114610673, | |
| "loss": 3.2575, | |
| "step": 68250 | |
| }, | |
| { | |
| "epoch": 19.906785589366912, | |
| "grad_norm": 0.390201210975647, | |
| "learning_rate": 0.0003613333333333333, | |
| "loss": 3.2591, | |
| "step": 68300 | |
| }, | |
| { | |
| "epoch": 19.921359449691035, | |
| "grad_norm": 0.36681830883026123, | |
| "learning_rate": 0.0003611583552055993, | |
| "loss": 3.2666, | |
| "step": 68350 | |
| }, | |
| { | |
| "epoch": 19.935933310015155, | |
| "grad_norm": 0.38141345977783203, | |
| "learning_rate": 0.00036098337707786524, | |
| "loss": 3.2674, | |
| "step": 68400 | |
| }, | |
| { | |
| "epoch": 19.95050717033928, | |
| "grad_norm": 0.3752776086330414, | |
| "learning_rate": 0.0003608083989501312, | |
| "loss": 3.2611, | |
| "step": 68450 | |
| }, | |
| { | |
| "epoch": 19.965081030663402, | |
| "grad_norm": 0.40742582082748413, | |
| "learning_rate": 0.0003606334208223972, | |
| "loss": 3.2726, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 19.979654890987526, | |
| "grad_norm": 0.37358659505844116, | |
| "learning_rate": 0.0003604584426946632, | |
| "loss": 3.2807, | |
| "step": 68550 | |
| }, | |
| { | |
| "epoch": 19.99422875131165, | |
| "grad_norm": 0.37046629190444946, | |
| "learning_rate": 0.00036028346456692907, | |
| "loss": 3.2751, | |
| "step": 68600 | |
| }, | |
| { | |
| "epoch": 20.008744316194473, | |
| "grad_norm": 0.3958808481693268, | |
| "learning_rate": 0.00036010848643919507, | |
| "loss": 3.2098, | |
| "step": 68650 | |
| }, | |
| { | |
| "epoch": 20.023318176518597, | |
| "grad_norm": 0.38605043292045593, | |
| "learning_rate": 0.00035993350831146106, | |
| "loss": 3.1574, | |
| "step": 68700 | |
| }, | |
| { | |
| "epoch": 20.03789203684272, | |
| "grad_norm": 0.3664279878139496, | |
| "learning_rate": 0.000359758530183727, | |
| "loss": 3.1624, | |
| "step": 68750 | |
| }, | |
| { | |
| "epoch": 20.05246589716684, | |
| "grad_norm": 0.4011380970478058, | |
| "learning_rate": 0.00035958355205599295, | |
| "loss": 3.1759, | |
| "step": 68800 | |
| }, | |
| { | |
| "epoch": 20.067039757490964, | |
| "grad_norm": 0.4068359136581421, | |
| "learning_rate": 0.00035940857392825895, | |
| "loss": 3.1689, | |
| "step": 68850 | |
| }, | |
| { | |
| "epoch": 20.081613617815087, | |
| "grad_norm": 0.41633710265159607, | |
| "learning_rate": 0.0003592335958005249, | |
| "loss": 3.1759, | |
| "step": 68900 | |
| }, | |
| { | |
| "epoch": 20.09618747813921, | |
| "grad_norm": 0.38761988282203674, | |
| "learning_rate": 0.0003590586176727909, | |
| "loss": 3.1788, | |
| "step": 68950 | |
| }, | |
| { | |
| "epoch": 20.110761338463334, | |
| "grad_norm": 0.36621686816215515, | |
| "learning_rate": 0.00035888363954505683, | |
| "loss": 3.2013, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 20.110761338463334, | |
| "eval_accuracy": 0.3719890913760268, | |
| "eval_loss": 3.554094076156616, | |
| "eval_runtime": 53.1883, | |
| "eval_samples_per_second": 312.606, | |
| "eval_steps_per_second": 19.553, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 20.125335198787454, | |
| "grad_norm": 0.38778454065322876, | |
| "learning_rate": 0.0003587086614173228, | |
| "loss": 3.1886, | |
| "step": 69050 | |
| }, | |
| { | |
| "epoch": 20.139909059111577, | |
| "grad_norm": 0.39453768730163574, | |
| "learning_rate": 0.0003585336832895888, | |
| "loss": 3.1855, | |
| "step": 69100 | |
| }, | |
| { | |
| "epoch": 20.1544829194357, | |
| "grad_norm": 0.3769824802875519, | |
| "learning_rate": 0.00035835870516185477, | |
| "loss": 3.1867, | |
| "step": 69150 | |
| }, | |
| { | |
| "epoch": 20.169056779759824, | |
| "grad_norm": 0.40915030241012573, | |
| "learning_rate": 0.00035818372703412066, | |
| "loss": 3.1816, | |
| "step": 69200 | |
| }, | |
| { | |
| "epoch": 20.183630640083944, | |
| "grad_norm": 0.40823474526405334, | |
| "learning_rate": 0.00035800874890638666, | |
| "loss": 3.1942, | |
| "step": 69250 | |
| }, | |
| { | |
| "epoch": 20.198204500408067, | |
| "grad_norm": 0.40230390429496765, | |
| "learning_rate": 0.00035783377077865266, | |
| "loss": 3.2091, | |
| "step": 69300 | |
| }, | |
| { | |
| "epoch": 20.21277836073219, | |
| "grad_norm": 0.4020078480243683, | |
| "learning_rate": 0.00035765879265091865, | |
| "loss": 3.1972, | |
| "step": 69350 | |
| }, | |
| { | |
| "epoch": 20.227352221056314, | |
| "grad_norm": 0.3979397416114807, | |
| "learning_rate": 0.00035748381452318454, | |
| "loss": 3.1991, | |
| "step": 69400 | |
| }, | |
| { | |
| "epoch": 20.241926081380438, | |
| "grad_norm": 0.35688549280166626, | |
| "learning_rate": 0.00035730883639545054, | |
| "loss": 3.2108, | |
| "step": 69450 | |
| }, | |
| { | |
| "epoch": 20.256499941704558, | |
| "grad_norm": 0.3942968249320984, | |
| "learning_rate": 0.00035713385826771654, | |
| "loss": 3.2102, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 20.27107380202868, | |
| "grad_norm": 0.4229877293109894, | |
| "learning_rate": 0.00035695888013998243, | |
| "loss": 3.1885, | |
| "step": 69550 | |
| }, | |
| { | |
| "epoch": 20.285647662352805, | |
| "grad_norm": 0.36158671975135803, | |
| "learning_rate": 0.0003567839020122484, | |
| "loss": 3.2207, | |
| "step": 69600 | |
| }, | |
| { | |
| "epoch": 20.300221522676928, | |
| "grad_norm": 0.40409842133522034, | |
| "learning_rate": 0.0003566089238845144, | |
| "loss": 3.2079, | |
| "step": 69650 | |
| }, | |
| { | |
| "epoch": 20.314795383001048, | |
| "grad_norm": 0.385179340839386, | |
| "learning_rate": 0.00035643394575678037, | |
| "loss": 3.2049, | |
| "step": 69700 | |
| }, | |
| { | |
| "epoch": 20.32936924332517, | |
| "grad_norm": 0.4204241931438446, | |
| "learning_rate": 0.0003562589676290463, | |
| "loss": 3.2025, | |
| "step": 69750 | |
| }, | |
| { | |
| "epoch": 20.343943103649295, | |
| "grad_norm": 0.38813483715057373, | |
| "learning_rate": 0.0003560839895013123, | |
| "loss": 3.2236, | |
| "step": 69800 | |
| }, | |
| { | |
| "epoch": 20.35851696397342, | |
| "grad_norm": 0.35402917861938477, | |
| "learning_rate": 0.0003559090113735783, | |
| "loss": 3.212, | |
| "step": 69850 | |
| }, | |
| { | |
| "epoch": 20.37309082429754, | |
| "grad_norm": 0.40892571210861206, | |
| "learning_rate": 0.00035573403324584425, | |
| "loss": 3.2152, | |
| "step": 69900 | |
| }, | |
| { | |
| "epoch": 20.38766468462166, | |
| "grad_norm": 0.3680976927280426, | |
| "learning_rate": 0.0003555590551181102, | |
| "loss": 3.2247, | |
| "step": 69950 | |
| }, | |
| { | |
| "epoch": 20.402238544945785, | |
| "grad_norm": 0.39239105582237244, | |
| "learning_rate": 0.0003553840769903762, | |
| "loss": 3.2131, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 20.402238544945785, | |
| "eval_accuracy": 0.372465057835692, | |
| "eval_loss": 3.5467369556427, | |
| "eval_runtime": 53.4137, | |
| "eval_samples_per_second": 311.287, | |
| "eval_steps_per_second": 19.471, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 20.41681240526991, | |
| "grad_norm": 0.38115379214286804, | |
| "learning_rate": 0.00035520909886264213, | |
| "loss": 3.2223, | |
| "step": 70050 | |
| }, | |
| { | |
| "epoch": 20.431386265594032, | |
| "grad_norm": 0.38737982511520386, | |
| "learning_rate": 0.00035503412073490813, | |
| "loss": 3.2343, | |
| "step": 70100 | |
| }, | |
| { | |
| "epoch": 20.445960125918152, | |
| "grad_norm": 0.3989051282405853, | |
| "learning_rate": 0.0003548591426071741, | |
| "loss": 3.2201, | |
| "step": 70150 | |
| }, | |
| { | |
| "epoch": 20.460533986242275, | |
| "grad_norm": 0.3883934020996094, | |
| "learning_rate": 0.00035468416447944, | |
| "loss": 3.2349, | |
| "step": 70200 | |
| }, | |
| { | |
| "epoch": 20.4751078465664, | |
| "grad_norm": 0.39728063344955444, | |
| "learning_rate": 0.000354509186351706, | |
| "loss": 3.2319, | |
| "step": 70250 | |
| }, | |
| { | |
| "epoch": 20.489681706890522, | |
| "grad_norm": 0.38918405771255493, | |
| "learning_rate": 0.000354334208223972, | |
| "loss": 3.2266, | |
| "step": 70300 | |
| }, | |
| { | |
| "epoch": 20.504255567214642, | |
| "grad_norm": 0.397217333316803, | |
| "learning_rate": 0.0003541592300962379, | |
| "loss": 3.2362, | |
| "step": 70350 | |
| }, | |
| { | |
| "epoch": 20.518829427538765, | |
| "grad_norm": 0.39398711919784546, | |
| "learning_rate": 0.0003539842519685039, | |
| "loss": 3.2298, | |
| "step": 70400 | |
| }, | |
| { | |
| "epoch": 20.53340328786289, | |
| "grad_norm": 0.3995719850063324, | |
| "learning_rate": 0.0003538092738407699, | |
| "loss": 3.2338, | |
| "step": 70450 | |
| }, | |
| { | |
| "epoch": 20.547977148187012, | |
| "grad_norm": 0.4090834856033325, | |
| "learning_rate": 0.0003536342957130358, | |
| "loss": 3.2183, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 20.562551008511136, | |
| "grad_norm": 0.38448628783226013, | |
| "learning_rate": 0.0003534593175853018, | |
| "loss": 3.226, | |
| "step": 70550 | |
| }, | |
| { | |
| "epoch": 20.577124868835256, | |
| "grad_norm": 0.39190101623535156, | |
| "learning_rate": 0.0003532843394575678, | |
| "loss": 3.2334, | |
| "step": 70600 | |
| }, | |
| { | |
| "epoch": 20.59169872915938, | |
| "grad_norm": 0.39090031385421753, | |
| "learning_rate": 0.0003531093613298338, | |
| "loss": 3.248, | |
| "step": 70650 | |
| }, | |
| { | |
| "epoch": 20.606272589483503, | |
| "grad_norm": 0.41073334217071533, | |
| "learning_rate": 0.00035293438320209967, | |
| "loss": 3.2492, | |
| "step": 70700 | |
| }, | |
| { | |
| "epoch": 20.620846449807626, | |
| "grad_norm": 0.3776114583015442, | |
| "learning_rate": 0.00035275940507436567, | |
| "loss": 3.2373, | |
| "step": 70750 | |
| }, | |
| { | |
| "epoch": 20.63542031013175, | |
| "grad_norm": 0.4018295109272003, | |
| "learning_rate": 0.00035258442694663166, | |
| "loss": 3.2414, | |
| "step": 70800 | |
| }, | |
| { | |
| "epoch": 20.64999417045587, | |
| "grad_norm": 0.35556280612945557, | |
| "learning_rate": 0.0003524094488188976, | |
| "loss": 3.2356, | |
| "step": 70850 | |
| }, | |
| { | |
| "epoch": 20.664568030779993, | |
| "grad_norm": 0.3811604380607605, | |
| "learning_rate": 0.00035223447069116355, | |
| "loss": 3.2301, | |
| "step": 70900 | |
| }, | |
| { | |
| "epoch": 20.679141891104116, | |
| "grad_norm": 0.3758731484413147, | |
| "learning_rate": 0.00035205949256342955, | |
| "loss": 3.2462, | |
| "step": 70950 | |
| }, | |
| { | |
| "epoch": 20.69371575142824, | |
| "grad_norm": 0.403804749250412, | |
| "learning_rate": 0.0003518845144356955, | |
| "loss": 3.2408, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 20.69371575142824, | |
| "eval_accuracy": 0.3727085728221033, | |
| "eval_loss": 3.5426127910614014, | |
| "eval_runtime": 53.2562, | |
| "eval_samples_per_second": 312.208, | |
| "eval_steps_per_second": 19.528, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 20.70828961175236, | |
| "grad_norm": 0.41654902696609497, | |
| "learning_rate": 0.0003517095363079615, | |
| "loss": 3.2509, | |
| "step": 71050 | |
| }, | |
| { | |
| "epoch": 20.722863472076483, | |
| "grad_norm": 0.3992166817188263, | |
| "learning_rate": 0.00035153455818022743, | |
| "loss": 3.2378, | |
| "step": 71100 | |
| }, | |
| { | |
| "epoch": 20.737437332400606, | |
| "grad_norm": 0.4075424075126648, | |
| "learning_rate": 0.00035135958005249343, | |
| "loss": 3.2402, | |
| "step": 71150 | |
| }, | |
| { | |
| "epoch": 20.75201119272473, | |
| "grad_norm": 0.404988557100296, | |
| "learning_rate": 0.0003511846019247594, | |
| "loss": 3.251, | |
| "step": 71200 | |
| }, | |
| { | |
| "epoch": 20.76658505304885, | |
| "grad_norm": 0.37374556064605713, | |
| "learning_rate": 0.0003510096237970253, | |
| "loss": 3.2411, | |
| "step": 71250 | |
| }, | |
| { | |
| "epoch": 20.781158913372973, | |
| "grad_norm": 0.379692018032074, | |
| "learning_rate": 0.0003508346456692913, | |
| "loss": 3.2491, | |
| "step": 71300 | |
| }, | |
| { | |
| "epoch": 20.795732773697097, | |
| "grad_norm": 0.3699359595775604, | |
| "learning_rate": 0.00035065966754155726, | |
| "loss": 3.2541, | |
| "step": 71350 | |
| }, | |
| { | |
| "epoch": 20.81030663402122, | |
| "grad_norm": 0.3997096121311188, | |
| "learning_rate": 0.00035048468941382326, | |
| "loss": 3.2405, | |
| "step": 71400 | |
| }, | |
| { | |
| "epoch": 20.824880494345344, | |
| "grad_norm": 0.40189990401268005, | |
| "learning_rate": 0.0003503097112860892, | |
| "loss": 3.2496, | |
| "step": 71450 | |
| }, | |
| { | |
| "epoch": 20.839454354669463, | |
| "grad_norm": 0.3730684518814087, | |
| "learning_rate": 0.00035013473315835514, | |
| "loss": 3.2441, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 20.854028214993587, | |
| "grad_norm": 0.38036414980888367, | |
| "learning_rate": 0.00034995975503062114, | |
| "loss": 3.2635, | |
| "step": 71550 | |
| }, | |
| { | |
| "epoch": 20.86860207531771, | |
| "grad_norm": 0.3716338872909546, | |
| "learning_rate": 0.00034978477690288714, | |
| "loss": 3.2611, | |
| "step": 71600 | |
| }, | |
| { | |
| "epoch": 20.883175935641834, | |
| "grad_norm": 0.39651766419410706, | |
| "learning_rate": 0.00034960979877515303, | |
| "loss": 3.2585, | |
| "step": 71650 | |
| }, | |
| { | |
| "epoch": 20.897749795965957, | |
| "grad_norm": 0.41758492588996887, | |
| "learning_rate": 0.000349434820647419, | |
| "loss": 3.2526, | |
| "step": 71700 | |
| }, | |
| { | |
| "epoch": 20.912323656290077, | |
| "grad_norm": 0.37373974919319153, | |
| "learning_rate": 0.000349259842519685, | |
| "loss": 3.2588, | |
| "step": 71750 | |
| }, | |
| { | |
| "epoch": 20.9268975166142, | |
| "grad_norm": 0.36950206756591797, | |
| "learning_rate": 0.000349084864391951, | |
| "loss": 3.2514, | |
| "step": 71800 | |
| }, | |
| { | |
| "epoch": 20.941471376938324, | |
| "grad_norm": 0.3997727930545807, | |
| "learning_rate": 0.0003489098862642169, | |
| "loss": 3.2569, | |
| "step": 71850 | |
| }, | |
| { | |
| "epoch": 20.956045237262448, | |
| "grad_norm": 0.391072541475296, | |
| "learning_rate": 0.0003487349081364829, | |
| "loss": 3.2528, | |
| "step": 71900 | |
| }, | |
| { | |
| "epoch": 20.970619097586567, | |
| "grad_norm": 0.3755815029144287, | |
| "learning_rate": 0.0003485599300087489, | |
| "loss": 3.2651, | |
| "step": 71950 | |
| }, | |
| { | |
| "epoch": 20.98519295791069, | |
| "grad_norm": 0.4253823161125183, | |
| "learning_rate": 0.00034838495188101485, | |
| "loss": 3.2684, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 20.98519295791069, | |
| "eval_accuracy": 0.37319101261393506, | |
| "eval_loss": 3.5336711406707764, | |
| "eval_runtime": 53.3881, | |
| "eval_samples_per_second": 311.436, | |
| "eval_steps_per_second": 19.48, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 20.999766818234814, | |
| "grad_norm": 0.38008999824523926, | |
| "learning_rate": 0.0003482099737532808, | |
| "loss": 3.2663, | |
| "step": 72050 | |
| }, | |
| { | |
| "epoch": 21.01428238311764, | |
| "grad_norm": 0.38966473937034607, | |
| "learning_rate": 0.0003480349956255468, | |
| "loss": 3.1576, | |
| "step": 72100 | |
| }, | |
| { | |
| "epoch": 21.028856243441762, | |
| "grad_norm": 0.35641705989837646, | |
| "learning_rate": 0.00034786001749781274, | |
| "loss": 3.1555, | |
| "step": 72150 | |
| }, | |
| { | |
| "epoch": 21.043430103765886, | |
| "grad_norm": 0.450893372297287, | |
| "learning_rate": 0.0003476850393700787, | |
| "loss": 3.1528, | |
| "step": 72200 | |
| }, | |
| { | |
| "epoch": 21.05800396409001, | |
| "grad_norm": 0.3972832262516022, | |
| "learning_rate": 0.0003475100612423447, | |
| "loss": 3.1585, | |
| "step": 72250 | |
| }, | |
| { | |
| "epoch": 21.072577824414132, | |
| "grad_norm": 0.391781210899353, | |
| "learning_rate": 0.0003473350831146106, | |
| "loss": 3.1495, | |
| "step": 72300 | |
| }, | |
| { | |
| "epoch": 21.087151684738252, | |
| "grad_norm": 0.3778936564922333, | |
| "learning_rate": 0.0003471601049868766, | |
| "loss": 3.169, | |
| "step": 72350 | |
| }, | |
| { | |
| "epoch": 21.101725545062376, | |
| "grad_norm": 0.3655568063259125, | |
| "learning_rate": 0.00034698512685914256, | |
| "loss": 3.1737, | |
| "step": 72400 | |
| }, | |
| { | |
| "epoch": 21.1162994053865, | |
| "grad_norm": 0.3923577070236206, | |
| "learning_rate": 0.00034681014873140856, | |
| "loss": 3.1725, | |
| "step": 72450 | |
| }, | |
| { | |
| "epoch": 21.130873265710623, | |
| "grad_norm": 0.3961803913116455, | |
| "learning_rate": 0.0003466351706036745, | |
| "loss": 3.1745, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 21.145447126034743, | |
| "grad_norm": 0.4180135428905487, | |
| "learning_rate": 0.0003464601924759405, | |
| "loss": 3.1888, | |
| "step": 72550 | |
| }, | |
| { | |
| "epoch": 21.160020986358866, | |
| "grad_norm": 0.3757016956806183, | |
| "learning_rate": 0.00034628521434820644, | |
| "loss": 3.1843, | |
| "step": 72600 | |
| }, | |
| { | |
| "epoch": 21.17459484668299, | |
| "grad_norm": 0.38864660263061523, | |
| "learning_rate": 0.0003461102362204724, | |
| "loss": 3.2, | |
| "step": 72650 | |
| }, | |
| { | |
| "epoch": 21.189168707007113, | |
| "grad_norm": 0.39377427101135254, | |
| "learning_rate": 0.0003459352580927384, | |
| "loss": 3.1873, | |
| "step": 72700 | |
| }, | |
| { | |
| "epoch": 21.203742567331236, | |
| "grad_norm": 0.41590413451194763, | |
| "learning_rate": 0.0003457602799650044, | |
| "loss": 3.2033, | |
| "step": 72750 | |
| }, | |
| { | |
| "epoch": 21.218316427655356, | |
| "grad_norm": 0.38666868209838867, | |
| "learning_rate": 0.00034558530183727027, | |
| "loss": 3.1979, | |
| "step": 72800 | |
| }, | |
| { | |
| "epoch": 21.23289028797948, | |
| "grad_norm": 0.403987854719162, | |
| "learning_rate": 0.00034541032370953627, | |
| "loss": 3.1999, | |
| "step": 72850 | |
| }, | |
| { | |
| "epoch": 21.247464148303603, | |
| "grad_norm": 0.3956553637981415, | |
| "learning_rate": 0.00034523534558180227, | |
| "loss": 3.1968, | |
| "step": 72900 | |
| }, | |
| { | |
| "epoch": 21.262038008627727, | |
| "grad_norm": 0.43393418192863464, | |
| "learning_rate": 0.00034506036745406826, | |
| "loss": 3.1981, | |
| "step": 72950 | |
| }, | |
| { | |
| "epoch": 21.276611868951846, | |
| "grad_norm": 0.38441672921180725, | |
| "learning_rate": 0.00034488538932633415, | |
| "loss": 3.2104, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 21.276611868951846, | |
| "eval_accuracy": 0.3723660747020178, | |
| "eval_loss": 3.5544207096099854, | |
| "eval_runtime": 53.2491, | |
| "eval_samples_per_second": 312.249, | |
| "eval_steps_per_second": 19.531, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 21.29118572927597, | |
| "grad_norm": 0.38110947608947754, | |
| "learning_rate": 0.00034471041119860015, | |
| "loss": 3.19, | |
| "step": 73050 | |
| }, | |
| { | |
| "epoch": 21.305759589600093, | |
| "grad_norm": 0.39781099557876587, | |
| "learning_rate": 0.00034453543307086615, | |
| "loss": 3.1996, | |
| "step": 73100 | |
| }, | |
| { | |
| "epoch": 21.320333449924217, | |
| "grad_norm": 0.3817616105079651, | |
| "learning_rate": 0.00034436045494313204, | |
| "loss": 3.1977, | |
| "step": 73150 | |
| }, | |
| { | |
| "epoch": 21.33490731024834, | |
| "grad_norm": 0.37882694602012634, | |
| "learning_rate": 0.00034418547681539804, | |
| "loss": 3.2149, | |
| "step": 73200 | |
| }, | |
| { | |
| "epoch": 21.34948117057246, | |
| "grad_norm": 0.40080127120018005, | |
| "learning_rate": 0.00034401049868766403, | |
| "loss": 3.2046, | |
| "step": 73250 | |
| }, | |
| { | |
| "epoch": 21.364055030896584, | |
| "grad_norm": 0.3883497416973114, | |
| "learning_rate": 0.00034383552055993, | |
| "loss": 3.215, | |
| "step": 73300 | |
| }, | |
| { | |
| "epoch": 21.378628891220707, | |
| "grad_norm": 0.39146512746810913, | |
| "learning_rate": 0.0003436605424321959, | |
| "loss": 3.2035, | |
| "step": 73350 | |
| }, | |
| { | |
| "epoch": 21.39320275154483, | |
| "grad_norm": 0.44710758328437805, | |
| "learning_rate": 0.0003434855643044619, | |
| "loss": 3.2015, | |
| "step": 73400 | |
| }, | |
| { | |
| "epoch": 21.40777661186895, | |
| "grad_norm": 0.43041878938674927, | |
| "learning_rate": 0.00034331058617672786, | |
| "loss": 3.2143, | |
| "step": 73450 | |
| }, | |
| { | |
| "epoch": 21.422350472193074, | |
| "grad_norm": 0.4076734781265259, | |
| "learning_rate": 0.00034313560804899386, | |
| "loss": 3.2244, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 21.436924332517197, | |
| "grad_norm": 0.4158762991428375, | |
| "learning_rate": 0.0003429606299212598, | |
| "loss": 3.2085, | |
| "step": 73550 | |
| }, | |
| { | |
| "epoch": 21.45149819284132, | |
| "grad_norm": 0.4004541039466858, | |
| "learning_rate": 0.00034278565179352575, | |
| "loss": 3.2125, | |
| "step": 73600 | |
| }, | |
| { | |
| "epoch": 21.466072053165444, | |
| "grad_norm": 0.37133657932281494, | |
| "learning_rate": 0.00034261067366579174, | |
| "loss": 3.2192, | |
| "step": 73650 | |
| }, | |
| { | |
| "epoch": 21.480645913489564, | |
| "grad_norm": 0.39652833342552185, | |
| "learning_rate": 0.00034243569553805774, | |
| "loss": 3.2189, | |
| "step": 73700 | |
| }, | |
| { | |
| "epoch": 21.495219773813687, | |
| "grad_norm": 0.3788989782333374, | |
| "learning_rate": 0.0003422607174103237, | |
| "loss": 3.2197, | |
| "step": 73750 | |
| }, | |
| { | |
| "epoch": 21.50979363413781, | |
| "grad_norm": 0.405619740486145, | |
| "learning_rate": 0.00034208573928258963, | |
| "loss": 3.2189, | |
| "step": 73800 | |
| }, | |
| { | |
| "epoch": 21.524367494461934, | |
| "grad_norm": 0.39048507809638977, | |
| "learning_rate": 0.0003419107611548556, | |
| "loss": 3.2186, | |
| "step": 73850 | |
| }, | |
| { | |
| "epoch": 21.538941354786054, | |
| "grad_norm": 0.40843072533607483, | |
| "learning_rate": 0.0003417357830271216, | |
| "loss": 3.2242, | |
| "step": 73900 | |
| }, | |
| { | |
| "epoch": 21.553515215110178, | |
| "grad_norm": 0.3887636065483093, | |
| "learning_rate": 0.0003415608048993875, | |
| "loss": 3.217, | |
| "step": 73950 | |
| }, | |
| { | |
| "epoch": 21.5680890754343, | |
| "grad_norm": 0.3871009051799774, | |
| "learning_rate": 0.0003413858267716535, | |
| "loss": 3.2185, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 21.5680890754343, | |
| "eval_accuracy": 0.3726364245926832, | |
| "eval_loss": 3.5461220741271973, | |
| "eval_runtime": 53.2502, | |
| "eval_samples_per_second": 312.243, | |
| "eval_steps_per_second": 19.53, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 21.582662935758425, | |
| "grad_norm": 0.4204442799091339, | |
| "learning_rate": 0.0003412108486439195, | |
| "loss": 3.2321, | |
| "step": 74050 | |
| }, | |
| { | |
| "epoch": 21.597236796082548, | |
| "grad_norm": 0.3793659806251526, | |
| "learning_rate": 0.0003410358705161854, | |
| "loss": 3.2254, | |
| "step": 74100 | |
| }, | |
| { | |
| "epoch": 21.611810656406668, | |
| "grad_norm": 0.40363553166389465, | |
| "learning_rate": 0.0003408608923884514, | |
| "loss": 3.217, | |
| "step": 74150 | |
| }, | |
| { | |
| "epoch": 21.62638451673079, | |
| "grad_norm": 0.3986867070198059, | |
| "learning_rate": 0.0003406859142607174, | |
| "loss": 3.2277, | |
| "step": 74200 | |
| }, | |
| { | |
| "epoch": 21.640958377054915, | |
| "grad_norm": 0.37159422039985657, | |
| "learning_rate": 0.0003405109361329834, | |
| "loss": 3.2243, | |
| "step": 74250 | |
| }, | |
| { | |
| "epoch": 21.655532237379038, | |
| "grad_norm": 0.40285640954971313, | |
| "learning_rate": 0.0003403359580052493, | |
| "loss": 3.2353, | |
| "step": 74300 | |
| }, | |
| { | |
| "epoch": 21.670106097703158, | |
| "grad_norm": 0.4001893699169159, | |
| "learning_rate": 0.0003401609798775153, | |
| "loss": 3.2359, | |
| "step": 74350 | |
| }, | |
| { | |
| "epoch": 21.68467995802728, | |
| "grad_norm": 0.3854397237300873, | |
| "learning_rate": 0.0003399860017497813, | |
| "loss": 3.2281, | |
| "step": 74400 | |
| }, | |
| { | |
| "epoch": 21.699253818351405, | |
| "grad_norm": 0.4364528954029083, | |
| "learning_rate": 0.0003398110236220472, | |
| "loss": 3.2206, | |
| "step": 74450 | |
| }, | |
| { | |
| "epoch": 21.71382767867553, | |
| "grad_norm": 0.4174240827560425, | |
| "learning_rate": 0.00033963604549431316, | |
| "loss": 3.2339, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 21.728401538999652, | |
| "grad_norm": 0.40123292803764343, | |
| "learning_rate": 0.00033946106736657916, | |
| "loss": 3.2358, | |
| "step": 74550 | |
| }, | |
| { | |
| "epoch": 21.74297539932377, | |
| "grad_norm": 0.36441606283187866, | |
| "learning_rate": 0.0003392860892388451, | |
| "loss": 3.2384, | |
| "step": 74600 | |
| }, | |
| { | |
| "epoch": 21.757549259647895, | |
| "grad_norm": 0.431755393743515, | |
| "learning_rate": 0.0003391111111111111, | |
| "loss": 3.2295, | |
| "step": 74650 | |
| }, | |
| { | |
| "epoch": 21.77212311997202, | |
| "grad_norm": 0.4035051167011261, | |
| "learning_rate": 0.00033893613298337705, | |
| "loss": 3.2409, | |
| "step": 74700 | |
| }, | |
| { | |
| "epoch": 21.786696980296142, | |
| "grad_norm": 0.4678764045238495, | |
| "learning_rate": 0.000338761154855643, | |
| "loss": 3.225, | |
| "step": 74750 | |
| }, | |
| { | |
| "epoch": 21.801270840620262, | |
| "grad_norm": 0.3831508159637451, | |
| "learning_rate": 0.000338586176727909, | |
| "loss": 3.2392, | |
| "step": 74800 | |
| }, | |
| { | |
| "epoch": 21.815844700944385, | |
| "grad_norm": 0.4162442684173584, | |
| "learning_rate": 0.00033841119860017493, | |
| "loss": 3.2509, | |
| "step": 74850 | |
| }, | |
| { | |
| "epoch": 21.83041856126851, | |
| "grad_norm": 0.37772423028945923, | |
| "learning_rate": 0.0003382362204724409, | |
| "loss": 3.2573, | |
| "step": 74900 | |
| }, | |
| { | |
| "epoch": 21.844992421592632, | |
| "grad_norm": 0.3931127190589905, | |
| "learning_rate": 0.00033806124234470687, | |
| "loss": 3.2319, | |
| "step": 74950 | |
| }, | |
| { | |
| "epoch": 21.859566281916756, | |
| "grad_norm": 0.3934958279132843, | |
| "learning_rate": 0.00033788626421697287, | |
| "loss": 3.2397, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 21.859566281916756, | |
| "eval_accuracy": 0.37314040292608736, | |
| "eval_loss": 3.540088653564453, | |
| "eval_runtime": 53.2315, | |
| "eval_samples_per_second": 312.353, | |
| "eval_steps_per_second": 19.537, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 21.874140142240876, | |
| "grad_norm": 0.41406193375587463, | |
| "learning_rate": 0.0003377112860892388, | |
| "loss": 3.2408, | |
| "step": 75050 | |
| }, | |
| { | |
| "epoch": 21.888714002565, | |
| "grad_norm": 0.3920379877090454, | |
| "learning_rate": 0.00033753630796150476, | |
| "loss": 3.242, | |
| "step": 75100 | |
| }, | |
| { | |
| "epoch": 21.903287862889123, | |
| "grad_norm": 0.407136470079422, | |
| "learning_rate": 0.00033736132983377075, | |
| "loss": 3.2502, | |
| "step": 75150 | |
| }, | |
| { | |
| "epoch": 21.917861723213246, | |
| "grad_norm": 0.375531941652298, | |
| "learning_rate": 0.00033718635170603675, | |
| "loss": 3.239, | |
| "step": 75200 | |
| }, | |
| { | |
| "epoch": 21.932435583537366, | |
| "grad_norm": 0.3673073649406433, | |
| "learning_rate": 0.00033701137357830264, | |
| "loss": 3.2481, | |
| "step": 75250 | |
| }, | |
| { | |
| "epoch": 21.94700944386149, | |
| "grad_norm": 0.38741302490234375, | |
| "learning_rate": 0.00033683639545056864, | |
| "loss": 3.242, | |
| "step": 75300 | |
| }, | |
| { | |
| "epoch": 21.961583304185613, | |
| "grad_norm": 0.3984997272491455, | |
| "learning_rate": 0.00033666141732283464, | |
| "loss": 3.2439, | |
| "step": 75350 | |
| }, | |
| { | |
| "epoch": 21.976157164509736, | |
| "grad_norm": 0.3742145895957947, | |
| "learning_rate": 0.00033648643919510063, | |
| "loss": 3.2562, | |
| "step": 75400 | |
| }, | |
| { | |
| "epoch": 21.99073102483386, | |
| "grad_norm": 0.3876781463623047, | |
| "learning_rate": 0.0003363114610673665, | |
| "loss": 3.2519, | |
| "step": 75450 | |
| }, | |
| { | |
| "epoch": 22.005246589716684, | |
| "grad_norm": 0.37868696451187134, | |
| "learning_rate": 0.0003361364829396325, | |
| "loss": 3.2046, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 22.019820450040807, | |
| "grad_norm": 0.3847079277038574, | |
| "learning_rate": 0.0003359615048118985, | |
| "loss": 3.1304, | |
| "step": 75550 | |
| }, | |
| { | |
| "epoch": 22.03439431036493, | |
| "grad_norm": 0.37587589025497437, | |
| "learning_rate": 0.00033578652668416446, | |
| "loss": 3.1392, | |
| "step": 75600 | |
| }, | |
| { | |
| "epoch": 22.04896817068905, | |
| "grad_norm": 0.38762593269348145, | |
| "learning_rate": 0.0003356115485564304, | |
| "loss": 3.1518, | |
| "step": 75650 | |
| }, | |
| { | |
| "epoch": 22.063542031013174, | |
| "grad_norm": 0.4253588020801544, | |
| "learning_rate": 0.0003354365704286964, | |
| "loss": 3.1539, | |
| "step": 75700 | |
| }, | |
| { | |
| "epoch": 22.078115891337298, | |
| "grad_norm": 0.40510454773902893, | |
| "learning_rate": 0.00033526159230096235, | |
| "loss": 3.1768, | |
| "step": 75750 | |
| }, | |
| { | |
| "epoch": 22.09268975166142, | |
| "grad_norm": 0.41818878054618835, | |
| "learning_rate": 0.0003350866141732283, | |
| "loss": 3.1706, | |
| "step": 75800 | |
| }, | |
| { | |
| "epoch": 22.10726361198554, | |
| "grad_norm": 0.39923954010009766, | |
| "learning_rate": 0.0003349116360454943, | |
| "loss": 3.1712, | |
| "step": 75850 | |
| }, | |
| { | |
| "epoch": 22.121837472309664, | |
| "grad_norm": 0.39292123913764954, | |
| "learning_rate": 0.00033473665791776023, | |
| "loss": 3.1647, | |
| "step": 75900 | |
| }, | |
| { | |
| "epoch": 22.136411332633788, | |
| "grad_norm": 0.42234423756599426, | |
| "learning_rate": 0.00033456167979002623, | |
| "loss": 3.1826, | |
| "step": 75950 | |
| }, | |
| { | |
| "epoch": 22.15098519295791, | |
| "grad_norm": 0.4201187193393707, | |
| "learning_rate": 0.00033438670166229217, | |
| "loss": 3.1673, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 22.15098519295791, | |
| "eval_accuracy": 0.3727125745183517, | |
| "eval_loss": 3.553342580795288, | |
| "eval_runtime": 53.1416, | |
| "eval_samples_per_second": 312.881, | |
| "eval_steps_per_second": 19.57, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 22.165559053282035, | |
| "grad_norm": 0.38024842739105225, | |
| "learning_rate": 0.0003342117235345581, | |
| "loss": 3.1727, | |
| "step": 76050 | |
| }, | |
| { | |
| "epoch": 22.180132913606155, | |
| "grad_norm": 0.3718349039554596, | |
| "learning_rate": 0.0003340367454068241, | |
| "loss": 3.1828, | |
| "step": 76100 | |
| }, | |
| { | |
| "epoch": 22.194706773930278, | |
| "grad_norm": 0.39751553535461426, | |
| "learning_rate": 0.0003338617672790901, | |
| "loss": 3.1716, | |
| "step": 76150 | |
| }, | |
| { | |
| "epoch": 22.2092806342544, | |
| "grad_norm": 0.4270859360694885, | |
| "learning_rate": 0.000333686789151356, | |
| "loss": 3.1776, | |
| "step": 76200 | |
| }, | |
| { | |
| "epoch": 22.223854494578525, | |
| "grad_norm": 0.3863823413848877, | |
| "learning_rate": 0.000333511811023622, | |
| "loss": 3.1734, | |
| "step": 76250 | |
| }, | |
| { | |
| "epoch": 22.238428354902645, | |
| "grad_norm": 0.39657875895500183, | |
| "learning_rate": 0.000333336832895888, | |
| "loss": 3.1724, | |
| "step": 76300 | |
| }, | |
| { | |
| "epoch": 22.25300221522677, | |
| "grad_norm": 0.4099148213863373, | |
| "learning_rate": 0.000333161854768154, | |
| "loss": 3.1777, | |
| "step": 76350 | |
| }, | |
| { | |
| "epoch": 22.267576075550892, | |
| "grad_norm": 0.37436968088150024, | |
| "learning_rate": 0.0003329868766404199, | |
| "loss": 3.1932, | |
| "step": 76400 | |
| }, | |
| { | |
| "epoch": 22.282149935875015, | |
| "grad_norm": 0.4043516516685486, | |
| "learning_rate": 0.0003328118985126859, | |
| "loss": 3.1922, | |
| "step": 76450 | |
| }, | |
| { | |
| "epoch": 22.29672379619914, | |
| "grad_norm": 0.4059845805168152, | |
| "learning_rate": 0.0003326369203849519, | |
| "loss": 3.1931, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 22.31129765652326, | |
| "grad_norm": 0.44433534145355225, | |
| "learning_rate": 0.0003324619422572179, | |
| "loss": 3.181, | |
| "step": 76550 | |
| }, | |
| { | |
| "epoch": 22.325871516847382, | |
| "grad_norm": 0.42227113246917725, | |
| "learning_rate": 0.00033228696412948377, | |
| "loss": 3.2026, | |
| "step": 76600 | |
| }, | |
| { | |
| "epoch": 22.340445377171505, | |
| "grad_norm": 0.4291574954986572, | |
| "learning_rate": 0.00033211198600174976, | |
| "loss": 3.1957, | |
| "step": 76650 | |
| }, | |
| { | |
| "epoch": 22.35501923749563, | |
| "grad_norm": 0.4012506306171417, | |
| "learning_rate": 0.00033193700787401576, | |
| "loss": 3.2053, | |
| "step": 76700 | |
| }, | |
| { | |
| "epoch": 22.36959309781975, | |
| "grad_norm": 0.42267000675201416, | |
| "learning_rate": 0.00033176202974628165, | |
| "loss": 3.2001, | |
| "step": 76750 | |
| }, | |
| { | |
| "epoch": 22.384166958143872, | |
| "grad_norm": 0.39716577529907227, | |
| "learning_rate": 0.00033158705161854765, | |
| "loss": 3.2073, | |
| "step": 76800 | |
| }, | |
| { | |
| "epoch": 22.398740818467996, | |
| "grad_norm": 0.3808940351009369, | |
| "learning_rate": 0.00033141207349081365, | |
| "loss": 3.2027, | |
| "step": 76850 | |
| }, | |
| { | |
| "epoch": 22.41331467879212, | |
| "grad_norm": 0.4222249388694763, | |
| "learning_rate": 0.0003312370953630796, | |
| "loss": 3.2069, | |
| "step": 76900 | |
| }, | |
| { | |
| "epoch": 22.427888539116243, | |
| "grad_norm": 0.3741794526576996, | |
| "learning_rate": 0.00033106211723534553, | |
| "loss": 3.1951, | |
| "step": 76950 | |
| }, | |
| { | |
| "epoch": 22.442462399440362, | |
| "grad_norm": 0.37816691398620605, | |
| "learning_rate": 0.00033088713910761153, | |
| "loss": 3.1903, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 22.442462399440362, | |
| "eval_accuracy": 0.3728844120631369, | |
| "eval_loss": 3.5475516319274902, | |
| "eval_runtime": 53.1477, | |
| "eval_samples_per_second": 312.845, | |
| "eval_steps_per_second": 19.568, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 22.457036259764486, | |
| "grad_norm": 0.3930409550666809, | |
| "learning_rate": 0.0003307121609798775, | |
| "loss": 3.2086, | |
| "step": 77050 | |
| }, | |
| { | |
| "epoch": 22.47161012008861, | |
| "grad_norm": 0.4193686246871948, | |
| "learning_rate": 0.00033053718285214347, | |
| "loss": 3.2081, | |
| "step": 77100 | |
| }, | |
| { | |
| "epoch": 22.486183980412733, | |
| "grad_norm": 0.4014568328857422, | |
| "learning_rate": 0.0003303622047244094, | |
| "loss": 3.1979, | |
| "step": 77150 | |
| }, | |
| { | |
| "epoch": 22.500757840736853, | |
| "grad_norm": 0.403713583946228, | |
| "learning_rate": 0.00033018722659667536, | |
| "loss": 3.2081, | |
| "step": 77200 | |
| }, | |
| { | |
| "epoch": 22.515331701060976, | |
| "grad_norm": 0.3767714500427246, | |
| "learning_rate": 0.00033001224846894136, | |
| "loss": 3.2019, | |
| "step": 77250 | |
| }, | |
| { | |
| "epoch": 22.5299055613851, | |
| "grad_norm": 0.40091049671173096, | |
| "learning_rate": 0.00032983727034120735, | |
| "loss": 3.2092, | |
| "step": 77300 | |
| }, | |
| { | |
| "epoch": 22.544479421709223, | |
| "grad_norm": 0.407742440700531, | |
| "learning_rate": 0.00032966229221347324, | |
| "loss": 3.2054, | |
| "step": 77350 | |
| }, | |
| { | |
| "epoch": 22.559053282033346, | |
| "grad_norm": 0.4064652621746063, | |
| "learning_rate": 0.00032948731408573924, | |
| "loss": 3.2105, | |
| "step": 77400 | |
| }, | |
| { | |
| "epoch": 22.573627142357466, | |
| "grad_norm": 0.41998764872550964, | |
| "learning_rate": 0.00032931233595800524, | |
| "loss": 3.2185, | |
| "step": 77450 | |
| }, | |
| { | |
| "epoch": 22.58820100268159, | |
| "grad_norm": 0.37851375341415405, | |
| "learning_rate": 0.00032913735783027124, | |
| "loss": 3.2182, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 22.602774863005713, | |
| "grad_norm": 0.3938767611980438, | |
| "learning_rate": 0.0003289623797025371, | |
| "loss": 3.2215, | |
| "step": 77550 | |
| }, | |
| { | |
| "epoch": 22.617348723329837, | |
| "grad_norm": 0.38179537653923035, | |
| "learning_rate": 0.0003287874015748031, | |
| "loss": 3.22, | |
| "step": 77600 | |
| }, | |
| { | |
| "epoch": 22.631922583653957, | |
| "grad_norm": 0.37935879826545715, | |
| "learning_rate": 0.0003286124234470691, | |
| "loss": 3.2212, | |
| "step": 77650 | |
| }, | |
| { | |
| "epoch": 22.64649644397808, | |
| "grad_norm": 0.4034980535507202, | |
| "learning_rate": 0.000328437445319335, | |
| "loss": 3.218, | |
| "step": 77700 | |
| }, | |
| { | |
| "epoch": 22.661070304302203, | |
| "grad_norm": 0.37066003680229187, | |
| "learning_rate": 0.000328262467191601, | |
| "loss": 3.2231, | |
| "step": 77750 | |
| }, | |
| { | |
| "epoch": 22.675644164626327, | |
| "grad_norm": 0.4011686146259308, | |
| "learning_rate": 0.000328087489063867, | |
| "loss": 3.2241, | |
| "step": 77800 | |
| }, | |
| { | |
| "epoch": 22.69021802495045, | |
| "grad_norm": 0.3722177743911743, | |
| "learning_rate": 0.000327912510936133, | |
| "loss": 3.2249, | |
| "step": 77850 | |
| }, | |
| { | |
| "epoch": 22.70479188527457, | |
| "grad_norm": 0.39132416248321533, | |
| "learning_rate": 0.0003277375328083989, | |
| "loss": 3.2244, | |
| "step": 77900 | |
| }, | |
| { | |
| "epoch": 22.719365745598694, | |
| "grad_norm": 0.381059467792511, | |
| "learning_rate": 0.0003275625546806649, | |
| "loss": 3.2381, | |
| "step": 77950 | |
| }, | |
| { | |
| "epoch": 22.733939605922817, | |
| "grad_norm": 0.4152250587940216, | |
| "learning_rate": 0.0003273875765529309, | |
| "loss": 3.2327, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 22.733939605922817, | |
| "eval_accuracy": 0.37290053654507904, | |
| "eval_loss": 3.5416500568389893, | |
| "eval_runtime": 53.2893, | |
| "eval_samples_per_second": 312.014, | |
| "eval_steps_per_second": 19.516, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 22.74851346624694, | |
| "grad_norm": 0.37700963020324707, | |
| "learning_rate": 0.00032721259842519683, | |
| "loss": 3.2255, | |
| "step": 78050 | |
| }, | |
| { | |
| "epoch": 22.76308732657106, | |
| "grad_norm": 0.3865543603897095, | |
| "learning_rate": 0.0003270376202974628, | |
| "loss": 3.2261, | |
| "step": 78100 | |
| }, | |
| { | |
| "epoch": 22.777661186895184, | |
| "grad_norm": 0.4093526303768158, | |
| "learning_rate": 0.00032686264216972877, | |
| "loss": 3.2279, | |
| "step": 78150 | |
| }, | |
| { | |
| "epoch": 22.792235047219307, | |
| "grad_norm": 0.39011725783348083, | |
| "learning_rate": 0.0003266876640419947, | |
| "loss": 3.224, | |
| "step": 78200 | |
| }, | |
| { | |
| "epoch": 22.80680890754343, | |
| "grad_norm": 0.4137217402458191, | |
| "learning_rate": 0.0003265126859142607, | |
| "loss": 3.2247, | |
| "step": 78250 | |
| }, | |
| { | |
| "epoch": 22.821382767867554, | |
| "grad_norm": 0.38537049293518066, | |
| "learning_rate": 0.00032633770778652666, | |
| "loss": 3.2353, | |
| "step": 78300 | |
| }, | |
| { | |
| "epoch": 22.835956628191674, | |
| "grad_norm": 0.39159658551216125, | |
| "learning_rate": 0.0003261627296587926, | |
| "loss": 3.2266, | |
| "step": 78350 | |
| }, | |
| { | |
| "epoch": 22.850530488515798, | |
| "grad_norm": 0.40833067893981934, | |
| "learning_rate": 0.0003259877515310586, | |
| "loss": 3.2335, | |
| "step": 78400 | |
| }, | |
| { | |
| "epoch": 22.86510434883992, | |
| "grad_norm": 0.3539735972881317, | |
| "learning_rate": 0.00032581277340332454, | |
| "loss": 3.2215, | |
| "step": 78450 | |
| }, | |
| { | |
| "epoch": 22.879678209164044, | |
| "grad_norm": 0.4081200659275055, | |
| "learning_rate": 0.0003256377952755905, | |
| "loss": 3.2293, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 22.894252069488164, | |
| "grad_norm": 0.39085066318511963, | |
| "learning_rate": 0.0003254628171478565, | |
| "loss": 3.2384, | |
| "step": 78550 | |
| }, | |
| { | |
| "epoch": 22.908825929812288, | |
| "grad_norm": 0.383390873670578, | |
| "learning_rate": 0.0003252878390201225, | |
| "loss": 3.232, | |
| "step": 78600 | |
| }, | |
| { | |
| "epoch": 22.92339979013641, | |
| "grad_norm": 0.4007982611656189, | |
| "learning_rate": 0.00032511286089238837, | |
| "loss": 3.2366, | |
| "step": 78650 | |
| }, | |
| { | |
| "epoch": 22.937973650460535, | |
| "grad_norm": 0.4163041412830353, | |
| "learning_rate": 0.00032493788276465437, | |
| "loss": 3.2437, | |
| "step": 78700 | |
| }, | |
| { | |
| "epoch": 22.952547510784658, | |
| "grad_norm": 0.39729535579681396, | |
| "learning_rate": 0.00032476290463692036, | |
| "loss": 3.2472, | |
| "step": 78750 | |
| }, | |
| { | |
| "epoch": 22.967121371108778, | |
| "grad_norm": 0.38961970806121826, | |
| "learning_rate": 0.00032458792650918636, | |
| "loss": 3.2443, | |
| "step": 78800 | |
| }, | |
| { | |
| "epoch": 22.9816952314329, | |
| "grad_norm": 0.3606325685977936, | |
| "learning_rate": 0.00032441294838145225, | |
| "loss": 3.2433, | |
| "step": 78850 | |
| }, | |
| { | |
| "epoch": 22.996269091757025, | |
| "grad_norm": 0.4011639952659607, | |
| "learning_rate": 0.00032423797025371825, | |
| "loss": 3.2327, | |
| "step": 78900 | |
| }, | |
| { | |
| "epoch": 23.01078465663985, | |
| "grad_norm": 0.38173967599868774, | |
| "learning_rate": 0.00032406299212598425, | |
| "loss": 3.1527, | |
| "step": 78950 | |
| }, | |
| { | |
| "epoch": 23.025358516963973, | |
| "grad_norm": 0.42250046133995056, | |
| "learning_rate": 0.0003238880139982502, | |
| "loss": 3.1185, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 23.025358516963973, | |
| "eval_accuracy": 0.3727990817754867, | |
| "eval_loss": 3.5551207065582275, | |
| "eval_runtime": 53.2206, | |
| "eval_samples_per_second": 312.417, | |
| "eval_steps_per_second": 19.541, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 23.039932377288096, | |
| "grad_norm": 0.39397215843200684, | |
| "learning_rate": 0.00032371303587051613, | |
| "loss": 3.1487, | |
| "step": 79050 | |
| }, | |
| { | |
| "epoch": 23.05450623761222, | |
| "grad_norm": 0.4081815779209137, | |
| "learning_rate": 0.00032353805774278213, | |
| "loss": 3.1599, | |
| "step": 79100 | |
| }, | |
| { | |
| "epoch": 23.069080097936343, | |
| "grad_norm": 0.41130968928337097, | |
| "learning_rate": 0.00032336307961504813, | |
| "loss": 3.1422, | |
| "step": 79150 | |
| }, | |
| { | |
| "epoch": 23.083653958260463, | |
| "grad_norm": 0.4249158501625061, | |
| "learning_rate": 0.0003231881014873141, | |
| "loss": 3.1564, | |
| "step": 79200 | |
| }, | |
| { | |
| "epoch": 23.098227818584586, | |
| "grad_norm": null, | |
| "learning_rate": 0.00032301312335958, | |
| "loss": 3.1428, | |
| "step": 79250 | |
| }, | |
| { | |
| "epoch": 23.11280167890871, | |
| "grad_norm": 0.40779754519462585, | |
| "learning_rate": 0.000322838145231846, | |
| "loss": 3.1596, | |
| "step": 79300 | |
| }, | |
| { | |
| "epoch": 23.127375539232833, | |
| "grad_norm": 0.418041855096817, | |
| "learning_rate": 0.00032266316710411196, | |
| "loss": 3.1649, | |
| "step": 79350 | |
| }, | |
| { | |
| "epoch": 23.141949399556953, | |
| "grad_norm": 0.3876676559448242, | |
| "learning_rate": 0.0003224881889763779, | |
| "loss": 3.1583, | |
| "step": 79400 | |
| }, | |
| { | |
| "epoch": 23.156523259881077, | |
| "grad_norm": 0.411615788936615, | |
| "learning_rate": 0.0003223132108486439, | |
| "loss": 3.1741, | |
| "step": 79450 | |
| }, | |
| { | |
| "epoch": 23.1710971202052, | |
| "grad_norm": 0.4377012848854065, | |
| "learning_rate": 0.00032213823272090984, | |
| "loss": 3.163, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 23.185670980529324, | |
| "grad_norm": 0.3975127041339874, | |
| "learning_rate": 0.00032196325459317584, | |
| "loss": 3.1631, | |
| "step": 79550 | |
| }, | |
| { | |
| "epoch": 23.200244840853447, | |
| "grad_norm": 0.49955064058303833, | |
| "learning_rate": 0.0003217882764654418, | |
| "loss": 3.1762, | |
| "step": 79600 | |
| }, | |
| { | |
| "epoch": 23.214818701177567, | |
| "grad_norm": 0.3728475272655487, | |
| "learning_rate": 0.0003216132983377077, | |
| "loss": 3.18, | |
| "step": 79650 | |
| }, | |
| { | |
| "epoch": 23.22939256150169, | |
| "grad_norm": 0.4319852888584137, | |
| "learning_rate": 0.0003214383202099737, | |
| "loss": 3.1784, | |
| "step": 79700 | |
| }, | |
| { | |
| "epoch": 23.243966421825814, | |
| "grad_norm": 0.39504387974739075, | |
| "learning_rate": 0.0003212633420822397, | |
| "loss": 3.1737, | |
| "step": 79750 | |
| }, | |
| { | |
| "epoch": 23.258540282149937, | |
| "grad_norm": 0.44203776121139526, | |
| "learning_rate": 0.0003210883639545056, | |
| "loss": 3.1716, | |
| "step": 79800 | |
| }, | |
| { | |
| "epoch": 23.273114142474057, | |
| "grad_norm": 0.3983067274093628, | |
| "learning_rate": 0.0003209133858267716, | |
| "loss": 3.1892, | |
| "step": 79850 | |
| }, | |
| { | |
| "epoch": 23.28768800279818, | |
| "grad_norm": 0.39879894256591797, | |
| "learning_rate": 0.0003207384076990376, | |
| "loss": 3.1892, | |
| "step": 79900 | |
| }, | |
| { | |
| "epoch": 23.302261863122304, | |
| "grad_norm": 0.4163643419742584, | |
| "learning_rate": 0.0003205634295713036, | |
| "loss": 3.1799, | |
| "step": 79950 | |
| }, | |
| { | |
| "epoch": 23.316835723446427, | |
| "grad_norm": 0.4006156325340271, | |
| "learning_rate": 0.0003203884514435695, | |
| "loss": 3.1744, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 23.316835723446427, | |
| "eval_accuracy": 0.37262594956426826, | |
| "eval_loss": 3.5548195838928223, | |
| "eval_runtime": 53.2607, | |
| "eval_samples_per_second": 312.182, | |
| "eval_steps_per_second": 19.527, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 23.33140958377055, | |
| "grad_norm": 0.44247502088546753, | |
| "learning_rate": 0.0003202134733158355, | |
| "loss": 3.1736, | |
| "step": 80050 | |
| }, | |
| { | |
| "epoch": 23.34598344409467, | |
| "grad_norm": 0.41587111353874207, | |
| "learning_rate": 0.0003200384951881015, | |
| "loss": 3.1803, | |
| "step": 80100 | |
| }, | |
| { | |
| "epoch": 23.360557304418794, | |
| "grad_norm": 0.3835608959197998, | |
| "learning_rate": 0.00031986351706036743, | |
| "loss": 3.1808, | |
| "step": 80150 | |
| }, | |
| { | |
| "epoch": 23.375131164742918, | |
| "grad_norm": 0.382169634103775, | |
| "learning_rate": 0.0003196885389326334, | |
| "loss": 3.1914, | |
| "step": 80200 | |
| }, | |
| { | |
| "epoch": 23.38970502506704, | |
| "grad_norm": 0.4157732427120209, | |
| "learning_rate": 0.0003195135608048994, | |
| "loss": 3.1982, | |
| "step": 80250 | |
| }, | |
| { | |
| "epoch": 23.40427888539116, | |
| "grad_norm": 0.4157496988773346, | |
| "learning_rate": 0.0003193385826771653, | |
| "loss": 3.1861, | |
| "step": 80300 | |
| }, | |
| { | |
| "epoch": 23.418852745715284, | |
| "grad_norm": 0.4726715087890625, | |
| "learning_rate": 0.00031916360454943126, | |
| "loss": 3.1849, | |
| "step": 80350 | |
| }, | |
| { | |
| "epoch": 23.433426606039408, | |
| "grad_norm": 0.4136442244052887, | |
| "learning_rate": 0.00031898862642169726, | |
| "loss": 3.1953, | |
| "step": 80400 | |
| }, | |
| { | |
| "epoch": 23.44800046636353, | |
| "grad_norm": 0.4338325560092926, | |
| "learning_rate": 0.00031881364829396326, | |
| "loss": 3.1965, | |
| "step": 80450 | |
| }, | |
| { | |
| "epoch": 23.462574326687655, | |
| "grad_norm": 0.4061412811279297, | |
| "learning_rate": 0.0003186386701662292, | |
| "loss": 3.1974, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 23.477148187011775, | |
| "grad_norm": 0.39773857593536377, | |
| "learning_rate": 0.00031846369203849514, | |
| "loss": 3.1902, | |
| "step": 80550 | |
| }, | |
| { | |
| "epoch": 23.491722047335898, | |
| "grad_norm": 0.39488843083381653, | |
| "learning_rate": 0.00031828871391076114, | |
| "loss": 3.1955, | |
| "step": 80600 | |
| }, | |
| { | |
| "epoch": 23.50629590766002, | |
| "grad_norm": 0.39714646339416504, | |
| "learning_rate": 0.0003181137357830271, | |
| "loss": 3.1985, | |
| "step": 80650 | |
| }, | |
| { | |
| "epoch": 23.520869767984145, | |
| "grad_norm": 0.38533398509025574, | |
| "learning_rate": 0.0003179387576552931, | |
| "loss": 3.2036, | |
| "step": 80700 | |
| }, | |
| { | |
| "epoch": 23.535443628308265, | |
| "grad_norm": 0.40813782811164856, | |
| "learning_rate": 0.000317763779527559, | |
| "loss": 3.1977, | |
| "step": 80750 | |
| }, | |
| { | |
| "epoch": 23.55001748863239, | |
| "grad_norm": 0.40301379561424255, | |
| "learning_rate": 0.00031758880139982497, | |
| "loss": 3.2099, | |
| "step": 80800 | |
| }, | |
| { | |
| "epoch": 23.56459134895651, | |
| "grad_norm": 0.4391363561153412, | |
| "learning_rate": 0.00031741382327209097, | |
| "loss": 3.2059, | |
| "step": 80850 | |
| }, | |
| { | |
| "epoch": 23.579165209280635, | |
| "grad_norm": 0.4028854966163635, | |
| "learning_rate": 0.00031723884514435696, | |
| "loss": 3.1987, | |
| "step": 80900 | |
| }, | |
| { | |
| "epoch": 23.59373906960476, | |
| "grad_norm": 0.4159344732761383, | |
| "learning_rate": 0.00031706386701662285, | |
| "loss": 3.2068, | |
| "step": 80950 | |
| }, | |
| { | |
| "epoch": 23.60831292992888, | |
| "grad_norm": 0.38993582129478455, | |
| "learning_rate": 0.00031688888888888885, | |
| "loss": 3.2041, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 23.60831292992888, | |
| "eval_accuracy": 0.3734721906238609, | |
| "eval_loss": 3.5403220653533936, | |
| "eval_runtime": 53.2707, | |
| "eval_samples_per_second": 312.123, | |
| "eval_steps_per_second": 19.523, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 23.622886790253002, | |
| "grad_norm": 0.40576037764549255, | |
| "learning_rate": 0.00031671391076115485, | |
| "loss": 3.2149, | |
| "step": 81050 | |
| }, | |
| { | |
| "epoch": 23.637460650577125, | |
| "grad_norm": 0.4368989169597626, | |
| "learning_rate": 0.00031653893263342085, | |
| "loss": 3.204, | |
| "step": 81100 | |
| }, | |
| { | |
| "epoch": 23.65203451090125, | |
| "grad_norm": 0.43579620122909546, | |
| "learning_rate": 0.00031636395450568674, | |
| "loss": 3.2224, | |
| "step": 81150 | |
| }, | |
| { | |
| "epoch": 23.66660837122537, | |
| "grad_norm": 0.42136499285697937, | |
| "learning_rate": 0.00031618897637795273, | |
| "loss": 3.2131, | |
| "step": 81200 | |
| }, | |
| { | |
| "epoch": 23.681182231549492, | |
| "grad_norm": 0.4127443730831146, | |
| "learning_rate": 0.00031601399825021873, | |
| "loss": 3.2246, | |
| "step": 81250 | |
| }, | |
| { | |
| "epoch": 23.695756091873616, | |
| "grad_norm": 0.4025081694126129, | |
| "learning_rate": 0.0003158390201224846, | |
| "loss": 3.2164, | |
| "step": 81300 | |
| }, | |
| { | |
| "epoch": 23.71032995219774, | |
| "grad_norm": 0.40500345826148987, | |
| "learning_rate": 0.0003156640419947506, | |
| "loss": 3.2144, | |
| "step": 81350 | |
| }, | |
| { | |
| "epoch": 23.72490381252186, | |
| "grad_norm": 0.42702171206474304, | |
| "learning_rate": 0.0003154890638670166, | |
| "loss": 3.215, | |
| "step": 81400 | |
| }, | |
| { | |
| "epoch": 23.739477672845982, | |
| "grad_norm": 0.4396904408931732, | |
| "learning_rate": 0.00031531408573928256, | |
| "loss": 3.2153, | |
| "step": 81450 | |
| }, | |
| { | |
| "epoch": 23.754051533170106, | |
| "grad_norm": 0.38937604427337646, | |
| "learning_rate": 0.0003151391076115485, | |
| "loss": 3.2199, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 23.76862539349423, | |
| "grad_norm": 0.43401476740837097, | |
| "learning_rate": 0.0003149641294838145, | |
| "loss": 3.2106, | |
| "step": 81550 | |
| }, | |
| { | |
| "epoch": 23.783199253818353, | |
| "grad_norm": 0.40791040658950806, | |
| "learning_rate": 0.00031478915135608044, | |
| "loss": 3.2293, | |
| "step": 81600 | |
| }, | |
| { | |
| "epoch": 23.797773114142473, | |
| "grad_norm": 0.39792293310165405, | |
| "learning_rate": 0.00031461417322834644, | |
| "loss": 3.2349, | |
| "step": 81650 | |
| }, | |
| { | |
| "epoch": 23.812346974466596, | |
| "grad_norm": 0.4132463335990906, | |
| "learning_rate": 0.0003144391951006124, | |
| "loss": 3.2196, | |
| "step": 81700 | |
| }, | |
| { | |
| "epoch": 23.82692083479072, | |
| "grad_norm": 0.3725283443927765, | |
| "learning_rate": 0.0003142642169728784, | |
| "loss": 3.2128, | |
| "step": 81750 | |
| }, | |
| { | |
| "epoch": 23.841494695114843, | |
| "grad_norm": 0.41264012455940247, | |
| "learning_rate": 0.0003140892388451443, | |
| "loss": 3.2286, | |
| "step": 81800 | |
| }, | |
| { | |
| "epoch": 23.856068555438966, | |
| "grad_norm": 0.39688318967819214, | |
| "learning_rate": 0.0003139142607174103, | |
| "loss": 3.2239, | |
| "step": 81850 | |
| }, | |
| { | |
| "epoch": 23.870642415763086, | |
| "grad_norm": 0.3925932049751282, | |
| "learning_rate": 0.00031373928258967627, | |
| "loss": 3.2283, | |
| "step": 81900 | |
| }, | |
| { | |
| "epoch": 23.88521627608721, | |
| "grad_norm": 0.4429666996002197, | |
| "learning_rate": 0.0003135643044619422, | |
| "loss": 3.2317, | |
| "step": 81950 | |
| }, | |
| { | |
| "epoch": 23.899790136411333, | |
| "grad_norm": 0.4149000942707062, | |
| "learning_rate": 0.0003133893263342082, | |
| "loss": 3.2131, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 23.899790136411333, | |
| "eval_accuracy": 0.3737786734777106, | |
| "eval_loss": 3.535808563232422, | |
| "eval_runtime": 53.3201, | |
| "eval_samples_per_second": 311.833, | |
| "eval_steps_per_second": 19.505, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 23.914363996735457, | |
| "grad_norm": 0.41211676597595215, | |
| "learning_rate": 0.00031321434820647415, | |
| "loss": 3.2245, | |
| "step": 82050 | |
| }, | |
| { | |
| "epoch": 23.928937857059577, | |
| "grad_norm": 0.42245087027549744, | |
| "learning_rate": 0.0003130393700787401, | |
| "loss": 3.2254, | |
| "step": 82100 | |
| }, | |
| { | |
| "epoch": 23.9435117173837, | |
| "grad_norm": 0.40680941939353943, | |
| "learning_rate": 0.0003128643919510061, | |
| "loss": 3.2294, | |
| "step": 82150 | |
| }, | |
| { | |
| "epoch": 23.958085577707823, | |
| "grad_norm": 0.40813785791397095, | |
| "learning_rate": 0.0003126894138232721, | |
| "loss": 3.2326, | |
| "step": 82200 | |
| }, | |
| { | |
| "epoch": 23.972659438031947, | |
| "grad_norm": 0.40311017632484436, | |
| "learning_rate": 0.000312514435695538, | |
| "loss": 3.2207, | |
| "step": 82250 | |
| }, | |
| { | |
| "epoch": 23.987233298356067, | |
| "grad_norm": 0.4265156388282776, | |
| "learning_rate": 0.000312339457567804, | |
| "loss": 3.2262, | |
| "step": 82300 | |
| }, | |
| { | |
| "epoch": 24.001748863238895, | |
| "grad_norm": 0.40735989809036255, | |
| "learning_rate": 0.00031216447944007, | |
| "loss": 3.2165, | |
| "step": 82350 | |
| }, | |
| { | |
| "epoch": 24.016322723563018, | |
| "grad_norm": 0.4180150628089905, | |
| "learning_rate": 0.000311989501312336, | |
| "loss": 3.1257, | |
| "step": 82400 | |
| }, | |
| { | |
| "epoch": 24.03089658388714, | |
| "grad_norm": 0.4029780924320221, | |
| "learning_rate": 0.00031181452318460186, | |
| "loss": 3.1385, | |
| "step": 82450 | |
| }, | |
| { | |
| "epoch": 24.04547044421126, | |
| "grad_norm": 0.42957931756973267, | |
| "learning_rate": 0.00031163954505686786, | |
| "loss": 3.1267, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 24.060044304535385, | |
| "grad_norm": 0.4051404893398285, | |
| "learning_rate": 0.00031146456692913386, | |
| "loss": 3.1293, | |
| "step": 82550 | |
| }, | |
| { | |
| "epoch": 24.07461816485951, | |
| "grad_norm": 0.3836676776409149, | |
| "learning_rate": 0.0003112895888013998, | |
| "loss": 3.1363, | |
| "step": 82600 | |
| }, | |
| { | |
| "epoch": 24.089192025183632, | |
| "grad_norm": 0.4129314422607422, | |
| "learning_rate": 0.00031111461067366575, | |
| "loss": 3.1348, | |
| "step": 82650 | |
| }, | |
| { | |
| "epoch": 24.10376588550775, | |
| "grad_norm": 0.4051026999950409, | |
| "learning_rate": 0.00031093963254593174, | |
| "loss": 3.1671, | |
| "step": 82700 | |
| }, | |
| { | |
| "epoch": 24.118339745831875, | |
| "grad_norm": 0.39646244049072266, | |
| "learning_rate": 0.0003107646544181977, | |
| "loss": 3.1398, | |
| "step": 82750 | |
| }, | |
| { | |
| "epoch": 24.132913606156, | |
| "grad_norm": 0.3891238272190094, | |
| "learning_rate": 0.0003105896762904637, | |
| "loss": 3.1521, | |
| "step": 82800 | |
| }, | |
| { | |
| "epoch": 24.147487466480122, | |
| "grad_norm": 0.3894284665584564, | |
| "learning_rate": 0.00031041469816272963, | |
| "loss": 3.146, | |
| "step": 82850 | |
| }, | |
| { | |
| "epoch": 24.162061326804245, | |
| "grad_norm": 0.4276731312274933, | |
| "learning_rate": 0.00031023972003499557, | |
| "loss": 3.1619, | |
| "step": 82900 | |
| }, | |
| { | |
| "epoch": 24.176635187128365, | |
| "grad_norm": 0.45556432008743286, | |
| "learning_rate": 0.00031006474190726157, | |
| "loss": 3.1422, | |
| "step": 82950 | |
| }, | |
| { | |
| "epoch": 24.19120904745249, | |
| "grad_norm": 0.41922351717948914, | |
| "learning_rate": 0.0003098897637795275, | |
| "loss": 3.1545, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 24.19120904745249, | |
| "eval_accuracy": 0.3726754999795796, | |
| "eval_loss": 3.552673101425171, | |
| "eval_runtime": 53.3097, | |
| "eval_samples_per_second": 311.894, | |
| "eval_steps_per_second": 19.509, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 24.205782907776612, | |
| "grad_norm": 0.38522177934646606, | |
| "learning_rate": 0.0003097147856517935, | |
| "loss": 3.1641, | |
| "step": 83050 | |
| }, | |
| { | |
| "epoch": 24.220356768100736, | |
| "grad_norm": 0.41649022698402405, | |
| "learning_rate": 0.00030953980752405945, | |
| "loss": 3.1551, | |
| "step": 83100 | |
| }, | |
| { | |
| "epoch": 24.234930628424856, | |
| "grad_norm": 0.3905382752418518, | |
| "learning_rate": 0.00030936482939632545, | |
| "loss": 3.1687, | |
| "step": 83150 | |
| }, | |
| { | |
| "epoch": 24.24950448874898, | |
| "grad_norm": 0.4067355990409851, | |
| "learning_rate": 0.0003091898512685914, | |
| "loss": 3.1718, | |
| "step": 83200 | |
| }, | |
| { | |
| "epoch": 24.264078349073102, | |
| "grad_norm": 0.414569228887558, | |
| "learning_rate": 0.00030901487314085734, | |
| "loss": 3.1636, | |
| "step": 83250 | |
| }, | |
| { | |
| "epoch": 24.278652209397226, | |
| "grad_norm": 0.4139001667499542, | |
| "learning_rate": 0.00030883989501312334, | |
| "loss": 3.1865, | |
| "step": 83300 | |
| }, | |
| { | |
| "epoch": 24.29322606972135, | |
| "grad_norm": 0.40517884492874146, | |
| "learning_rate": 0.00030866491688538933, | |
| "loss": 3.1784, | |
| "step": 83350 | |
| }, | |
| { | |
| "epoch": 24.30779993004547, | |
| "grad_norm": 0.3915081024169922, | |
| "learning_rate": 0.0003084899387576552, | |
| "loss": 3.1592, | |
| "step": 83400 | |
| }, | |
| { | |
| "epoch": 24.322373790369593, | |
| "grad_norm": 0.400414377450943, | |
| "learning_rate": 0.0003083149606299212, | |
| "loss": 3.1815, | |
| "step": 83450 | |
| }, | |
| { | |
| "epoch": 24.336947650693716, | |
| "grad_norm": 0.4337230324745178, | |
| "learning_rate": 0.0003081399825021872, | |
| "loss": 3.1742, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 24.35152151101784, | |
| "grad_norm": 0.3856227695941925, | |
| "learning_rate": 0.0003079650043744532, | |
| "loss": 3.1731, | |
| "step": 83550 | |
| }, | |
| { | |
| "epoch": 24.36609537134196, | |
| "grad_norm": 0.40683674812316895, | |
| "learning_rate": 0.0003077900262467191, | |
| "loss": 3.186, | |
| "step": 83600 | |
| }, | |
| { | |
| "epoch": 24.380669231666083, | |
| "grad_norm": 0.420723557472229, | |
| "learning_rate": 0.0003076150481189851, | |
| "loss": 3.1901, | |
| "step": 83650 | |
| }, | |
| { | |
| "epoch": 24.395243091990206, | |
| "grad_norm": 0.39648476243019104, | |
| "learning_rate": 0.0003074400699912511, | |
| "loss": 3.1861, | |
| "step": 83700 | |
| }, | |
| { | |
| "epoch": 24.40981695231433, | |
| "grad_norm": 0.4028700292110443, | |
| "learning_rate": 0.00030726509186351704, | |
| "loss": 3.1852, | |
| "step": 83750 | |
| }, | |
| { | |
| "epoch": 24.424390812638453, | |
| "grad_norm": 0.4025539457798004, | |
| "learning_rate": 0.000307090113735783, | |
| "loss": 3.1875, | |
| "step": 83800 | |
| }, | |
| { | |
| "epoch": 24.438964672962573, | |
| "grad_norm": 0.4205414354801178, | |
| "learning_rate": 0.000306915135608049, | |
| "loss": 3.1882, | |
| "step": 83850 | |
| }, | |
| { | |
| "epoch": 24.453538533286697, | |
| "grad_norm": 0.4485682547092438, | |
| "learning_rate": 0.00030674015748031493, | |
| "loss": 3.1828, | |
| "step": 83900 | |
| }, | |
| { | |
| "epoch": 24.46811239361082, | |
| "grad_norm": 0.4024691581726074, | |
| "learning_rate": 0.00030656517935258087, | |
| "loss": 3.1875, | |
| "step": 83950 | |
| }, | |
| { | |
| "epoch": 24.482686253934943, | |
| "grad_norm": 0.41920554637908936, | |
| "learning_rate": 0.00030639020122484687, | |
| "loss": 3.1889, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 24.482686253934943, | |
| "eval_accuracy": 0.3733111811983362, | |
| "eval_loss": 3.546642303466797, | |
| "eval_runtime": 53.2322, | |
| "eval_samples_per_second": 312.348, | |
| "eval_steps_per_second": 19.537, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 24.497260114259063, | |
| "grad_norm": 0.41416135430336, | |
| "learning_rate": 0.0003062152230971128, | |
| "loss": 3.199, | |
| "step": 84050 | |
| }, | |
| { | |
| "epoch": 24.511833974583187, | |
| "grad_norm": 0.3936392366886139, | |
| "learning_rate": 0.0003060402449693788, | |
| "loss": 3.1838, | |
| "step": 84100 | |
| }, | |
| { | |
| "epoch": 24.52640783490731, | |
| "grad_norm": 0.4029703438282013, | |
| "learning_rate": 0.00030586526684164475, | |
| "loss": 3.1963, | |
| "step": 84150 | |
| }, | |
| { | |
| "epoch": 24.540981695231434, | |
| "grad_norm": 0.3771865665912628, | |
| "learning_rate": 0.0003056902887139107, | |
| "loss": 3.1984, | |
| "step": 84200 | |
| }, | |
| { | |
| "epoch": 24.555555555555557, | |
| "grad_norm": 0.3756367266178131, | |
| "learning_rate": 0.0003055153105861767, | |
| "loss": 3.1901, | |
| "step": 84250 | |
| }, | |
| { | |
| "epoch": 24.570129415879677, | |
| "grad_norm": 0.40955060720443726, | |
| "learning_rate": 0.0003053403324584427, | |
| "loss": 3.2019, | |
| "step": 84300 | |
| }, | |
| { | |
| "epoch": 24.5847032762038, | |
| "grad_norm": 0.43558764457702637, | |
| "learning_rate": 0.00030516535433070864, | |
| "loss": 3.1851, | |
| "step": 84350 | |
| }, | |
| { | |
| "epoch": 24.599277136527924, | |
| "grad_norm": 0.46225878596305847, | |
| "learning_rate": 0.0003049903762029746, | |
| "loss": 3.2034, | |
| "step": 84400 | |
| }, | |
| { | |
| "epoch": 24.613850996852047, | |
| "grad_norm": 0.38759079575538635, | |
| "learning_rate": 0.0003048153980752406, | |
| "loss": 3.1924, | |
| "step": 84450 | |
| }, | |
| { | |
| "epoch": 24.628424857176167, | |
| "grad_norm": 0.40002742409706116, | |
| "learning_rate": 0.0003046404199475066, | |
| "loss": 3.2004, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 24.64299871750029, | |
| "grad_norm": 0.4547050893306732, | |
| "learning_rate": 0.00030446544181977247, | |
| "loss": 3.2056, | |
| "step": 84550 | |
| }, | |
| { | |
| "epoch": 24.657572577824414, | |
| "grad_norm": 0.42415475845336914, | |
| "learning_rate": 0.00030429046369203846, | |
| "loss": 3.2092, | |
| "step": 84600 | |
| }, | |
| { | |
| "epoch": 24.672146438148538, | |
| "grad_norm": 0.41034477949142456, | |
| "learning_rate": 0.00030411548556430446, | |
| "loss": 3.2004, | |
| "step": 84650 | |
| }, | |
| { | |
| "epoch": 24.68672029847266, | |
| "grad_norm": 0.411905974149704, | |
| "learning_rate": 0.00030394050743657046, | |
| "loss": 3.1921, | |
| "step": 84700 | |
| }, | |
| { | |
| "epoch": 24.70129415879678, | |
| "grad_norm": 0.38791701197624207, | |
| "learning_rate": 0.00030376552930883635, | |
| "loss": 3.2034, | |
| "step": 84750 | |
| }, | |
| { | |
| "epoch": 24.715868019120904, | |
| "grad_norm": 0.41622650623321533, | |
| "learning_rate": 0.00030359055118110235, | |
| "loss": 3.199, | |
| "step": 84800 | |
| }, | |
| { | |
| "epoch": 24.730441879445028, | |
| "grad_norm": 0.4126710891723633, | |
| "learning_rate": 0.00030341557305336834, | |
| "loss": 3.2097, | |
| "step": 84850 | |
| }, | |
| { | |
| "epoch": 24.74501573976915, | |
| "grad_norm": 0.3979971408843994, | |
| "learning_rate": 0.00030324059492563423, | |
| "loss": 3.199, | |
| "step": 84900 | |
| }, | |
| { | |
| "epoch": 24.75958960009327, | |
| "grad_norm": 0.4198929965496063, | |
| "learning_rate": 0.00030306561679790023, | |
| "loss": 3.2083, | |
| "step": 84950 | |
| }, | |
| { | |
| "epoch": 24.774163460417395, | |
| "grad_norm": 0.432871013879776, | |
| "learning_rate": 0.00030289063867016623, | |
| "loss": 3.2161, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 24.774163460417395, | |
| "eval_accuracy": 0.3738008005040254, | |
| "eval_loss": 3.5413074493408203, | |
| "eval_runtime": 53.3055, | |
| "eval_samples_per_second": 311.919, | |
| "eval_steps_per_second": 19.51, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 24.788737320741518, | |
| "grad_norm": 0.41536158323287964, | |
| "learning_rate": 0.00030271566054243217, | |
| "loss": 3.1999, | |
| "step": 85050 | |
| }, | |
| { | |
| "epoch": 24.80331118106564, | |
| "grad_norm": 0.39557045698165894, | |
| "learning_rate": 0.0003025406824146981, | |
| "loss": 3.2129, | |
| "step": 85100 | |
| }, | |
| { | |
| "epoch": 24.817885041389765, | |
| "grad_norm": 0.43374064564704895, | |
| "learning_rate": 0.0003023657042869641, | |
| "loss": 3.2197, | |
| "step": 85150 | |
| }, | |
| { | |
| "epoch": 24.832458901713885, | |
| "grad_norm": 0.3977198898792267, | |
| "learning_rate": 0.00030219072615923006, | |
| "loss": 3.211, | |
| "step": 85200 | |
| }, | |
| { | |
| "epoch": 24.84703276203801, | |
| "grad_norm": 0.4073224663734436, | |
| "learning_rate": 0.00030201574803149605, | |
| "loss": 3.2171, | |
| "step": 85250 | |
| }, | |
| { | |
| "epoch": 24.86160662236213, | |
| "grad_norm": 0.4387545585632324, | |
| "learning_rate": 0.000301840769903762, | |
| "loss": 3.2168, | |
| "step": 85300 | |
| }, | |
| { | |
| "epoch": 24.876180482686255, | |
| "grad_norm": 0.4089738726615906, | |
| "learning_rate": 0.00030166579177602794, | |
| "loss": 3.2198, | |
| "step": 85350 | |
| }, | |
| { | |
| "epoch": 24.890754343010375, | |
| "grad_norm": 0.4149879217147827, | |
| "learning_rate": 0.00030149081364829394, | |
| "loss": 3.2195, | |
| "step": 85400 | |
| }, | |
| { | |
| "epoch": 24.9053282033345, | |
| "grad_norm": 0.43615013360977173, | |
| "learning_rate": 0.00030131583552055994, | |
| "loss": 3.2236, | |
| "step": 85450 | |
| }, | |
| { | |
| "epoch": 24.919902063658622, | |
| "grad_norm": 0.43617773056030273, | |
| "learning_rate": 0.0003011408573928258, | |
| "loss": 3.2136, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 24.934475923982745, | |
| "grad_norm": 0.41982802748680115, | |
| "learning_rate": 0.0003009658792650918, | |
| "loss": 3.2235, | |
| "step": 85550 | |
| }, | |
| { | |
| "epoch": 24.94904978430687, | |
| "grad_norm": 0.3977769911289215, | |
| "learning_rate": 0.0003007909011373578, | |
| "loss": 3.2156, | |
| "step": 85600 | |
| }, | |
| { | |
| "epoch": 24.96362364463099, | |
| "grad_norm": 0.4145846366882324, | |
| "learning_rate": 0.00030061592300962376, | |
| "loss": 3.2121, | |
| "step": 85650 | |
| }, | |
| { | |
| "epoch": 24.978197504955112, | |
| "grad_norm": 0.5370997190475464, | |
| "learning_rate": 0.0003004409448818897, | |
| "loss": 3.2193, | |
| "step": 85700 | |
| }, | |
| { | |
| "epoch": 24.992771365279236, | |
| "grad_norm": 0.3931005895137787, | |
| "learning_rate": 0.0003002659667541557, | |
| "loss": 3.2151, | |
| "step": 85750 | |
| }, | |
| { | |
| "epoch": 25.00728693016206, | |
| "grad_norm": 0.3954836428165436, | |
| "learning_rate": 0.0003000909886264217, | |
| "loss": 3.162, | |
| "step": 85800 | |
| }, | |
| { | |
| "epoch": 25.021860790486183, | |
| "grad_norm": 0.42345207929611206, | |
| "learning_rate": 0.00029991601049868765, | |
| "loss": 3.1196, | |
| "step": 85850 | |
| }, | |
| { | |
| "epoch": 25.036434650810307, | |
| "grad_norm": 0.4194379150867462, | |
| "learning_rate": 0.0002997410323709536, | |
| "loss": 3.1185, | |
| "step": 85900 | |
| }, | |
| { | |
| "epoch": 25.05100851113443, | |
| "grad_norm": 0.4132949113845825, | |
| "learning_rate": 0.0002995660542432196, | |
| "loss": 3.1275, | |
| "step": 85950 | |
| }, | |
| { | |
| "epoch": 25.065582371458554, | |
| "grad_norm": 0.3960988521575928, | |
| "learning_rate": 0.00029939107611548553, | |
| "loss": 3.1333, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 25.065582371458554, | |
| "eval_accuracy": 0.3730515417299827, | |
| "eval_loss": 3.5527703762054443, | |
| "eval_runtime": 53.3809, | |
| "eval_samples_per_second": 311.479, | |
| "eval_steps_per_second": 19.483, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 25.080156231782674, | |
| "grad_norm": 0.39942601323127747, | |
| "learning_rate": 0.00029921609798775153, | |
| "loss": 3.137, | |
| "step": 86050 | |
| }, | |
| { | |
| "epoch": 25.094730092106797, | |
| "grad_norm": 0.41080349683761597, | |
| "learning_rate": 0.00029904111986001747, | |
| "loss": 3.1285, | |
| "step": 86100 | |
| }, | |
| { | |
| "epoch": 25.10930395243092, | |
| "grad_norm": 0.41935205459594727, | |
| "learning_rate": 0.00029886614173228347, | |
| "loss": 3.1401, | |
| "step": 86150 | |
| }, | |
| { | |
| "epoch": 25.123877812755044, | |
| "grad_norm": 0.4359089732170105, | |
| "learning_rate": 0.0002986911636045494, | |
| "loss": 3.1532, | |
| "step": 86200 | |
| }, | |
| { | |
| "epoch": 25.138451673079164, | |
| "grad_norm": 0.4485057294368744, | |
| "learning_rate": 0.0002985161854768154, | |
| "loss": 3.1389, | |
| "step": 86250 | |
| }, | |
| { | |
| "epoch": 25.153025533403287, | |
| "grad_norm": 0.3963627815246582, | |
| "learning_rate": 0.00029834120734908135, | |
| "loss": 3.1384, | |
| "step": 86300 | |
| }, | |
| { | |
| "epoch": 25.16759939372741, | |
| "grad_norm": 0.4120327830314636, | |
| "learning_rate": 0.0002981662292213473, | |
| "loss": 3.1528, | |
| "step": 86350 | |
| }, | |
| { | |
| "epoch": 25.182173254051534, | |
| "grad_norm": 0.4055855870246887, | |
| "learning_rate": 0.0002979912510936133, | |
| "loss": 3.1445, | |
| "step": 86400 | |
| }, | |
| { | |
| "epoch": 25.196747114375654, | |
| "grad_norm": 0.40137195587158203, | |
| "learning_rate": 0.00029781627296587924, | |
| "loss": 3.1552, | |
| "step": 86450 | |
| }, | |
| { | |
| "epoch": 25.211320974699778, | |
| "grad_norm": 0.4002479612827301, | |
| "learning_rate": 0.0002976412948381452, | |
| "loss": 3.1534, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 25.2258948350239, | |
| "grad_norm": 0.42092686891555786, | |
| "learning_rate": 0.0002974663167104112, | |
| "loss": 3.16, | |
| "step": 86550 | |
| }, | |
| { | |
| "epoch": 25.240468695348024, | |
| "grad_norm": 0.4151979982852936, | |
| "learning_rate": 0.0002972913385826771, | |
| "loss": 3.1554, | |
| "step": 86600 | |
| }, | |
| { | |
| "epoch": 25.255042555672148, | |
| "grad_norm": 0.40470385551452637, | |
| "learning_rate": 0.00029711636045494307, | |
| "loss": 3.1603, | |
| "step": 86650 | |
| }, | |
| { | |
| "epoch": 25.269616415996268, | |
| "grad_norm": 0.4268498718738556, | |
| "learning_rate": 0.00029694138232720906, | |
| "loss": 3.1603, | |
| "step": 86700 | |
| }, | |
| { | |
| "epoch": 25.28419027632039, | |
| "grad_norm": 0.40396055579185486, | |
| "learning_rate": 0.000296766404199475, | |
| "loss": 3.1637, | |
| "step": 86750 | |
| }, | |
| { | |
| "epoch": 25.298764136644515, | |
| "grad_norm": 0.4064796268939972, | |
| "learning_rate": 0.000296591426071741, | |
| "loss": 3.1593, | |
| "step": 86800 | |
| }, | |
| { | |
| "epoch": 25.313337996968638, | |
| "grad_norm": 0.39150217175483704, | |
| "learning_rate": 0.00029641644794400695, | |
| "loss": 3.1734, | |
| "step": 86850 | |
| }, | |
| { | |
| "epoch": 25.327911857292758, | |
| "grad_norm": 0.43401914834976196, | |
| "learning_rate": 0.00029624146981627295, | |
| "loss": 3.1573, | |
| "step": 86900 | |
| }, | |
| { | |
| "epoch": 25.34248571761688, | |
| "grad_norm": 0.43742623925209045, | |
| "learning_rate": 0.0002960664916885389, | |
| "loss": 3.1624, | |
| "step": 86950 | |
| }, | |
| { | |
| "epoch": 25.357059577941005, | |
| "grad_norm": 0.4239753186702728, | |
| "learning_rate": 0.0002958915135608049, | |
| "loss": 3.1656, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 25.357059577941005, | |
| "eval_accuracy": 0.37326174847997334, | |
| "eval_loss": 3.553415298461914, | |
| "eval_runtime": 53.3829, | |
| "eval_samples_per_second": 311.467, | |
| "eval_steps_per_second": 19.482, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 25.37163343826513, | |
| "grad_norm": 0.44977104663848877, | |
| "learning_rate": 0.00029571653543307083, | |
| "loss": 3.1688, | |
| "step": 87050 | |
| }, | |
| { | |
| "epoch": 25.38620729858925, | |
| "grad_norm": 0.4430921971797943, | |
| "learning_rate": 0.00029554155730533683, | |
| "loss": 3.1682, | |
| "step": 87100 | |
| }, | |
| { | |
| "epoch": 25.40078115891337, | |
| "grad_norm": 0.4145093262195587, | |
| "learning_rate": 0.0002953665791776028, | |
| "loss": 3.1794, | |
| "step": 87150 | |
| }, | |
| { | |
| "epoch": 25.415355019237495, | |
| "grad_norm": 0.3987514078617096, | |
| "learning_rate": 0.00029519160104986877, | |
| "loss": 3.1737, | |
| "step": 87200 | |
| }, | |
| { | |
| "epoch": 25.42992887956162, | |
| "grad_norm": 0.3947341740131378, | |
| "learning_rate": 0.0002950166229221347, | |
| "loss": 3.1772, | |
| "step": 87250 | |
| }, | |
| { | |
| "epoch": 25.444502739885742, | |
| "grad_norm": 0.4111432433128357, | |
| "learning_rate": 0.0002948416447944007, | |
| "loss": 3.1711, | |
| "step": 87300 | |
| }, | |
| { | |
| "epoch": 25.459076600209862, | |
| "grad_norm": 0.42870181798934937, | |
| "learning_rate": 0.00029466666666666666, | |
| "loss": 3.1795, | |
| "step": 87350 | |
| }, | |
| { | |
| "epoch": 25.473650460533985, | |
| "grad_norm": 0.43575218319892883, | |
| "learning_rate": 0.0002944916885389326, | |
| "loss": 3.1767, | |
| "step": 87400 | |
| }, | |
| { | |
| "epoch": 25.48822432085811, | |
| "grad_norm": 0.43777981400489807, | |
| "learning_rate": 0.0002943167104111986, | |
| "loss": 3.1797, | |
| "step": 87450 | |
| }, | |
| { | |
| "epoch": 25.502798181182232, | |
| "grad_norm": 0.404784232378006, | |
| "learning_rate": 0.00029414173228346454, | |
| "loss": 3.1845, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 25.517372041506356, | |
| "grad_norm": 0.418813556432724, | |
| "learning_rate": 0.00029396675415573054, | |
| "loss": 3.1882, | |
| "step": 87550 | |
| }, | |
| { | |
| "epoch": 25.531945901830476, | |
| "grad_norm": 0.4354451894760132, | |
| "learning_rate": 0.0002937917760279965, | |
| "loss": 3.1846, | |
| "step": 87600 | |
| }, | |
| { | |
| "epoch": 25.5465197621546, | |
| "grad_norm": 0.4002273380756378, | |
| "learning_rate": 0.0002936167979002624, | |
| "loss": 3.1922, | |
| "step": 87650 | |
| }, | |
| { | |
| "epoch": 25.561093622478722, | |
| "grad_norm": 0.42676427960395813, | |
| "learning_rate": 0.0002934418197725284, | |
| "loss": 3.1858, | |
| "step": 87700 | |
| }, | |
| { | |
| "epoch": 25.575667482802846, | |
| "grad_norm": 0.39750683307647705, | |
| "learning_rate": 0.00029326684164479437, | |
| "loss": 3.1836, | |
| "step": 87750 | |
| }, | |
| { | |
| "epoch": 25.59024134312697, | |
| "grad_norm": 0.44498196244239807, | |
| "learning_rate": 0.0002930918635170603, | |
| "loss": 3.1909, | |
| "step": 87800 | |
| }, | |
| { | |
| "epoch": 25.60481520345109, | |
| "grad_norm": 0.42401590943336487, | |
| "learning_rate": 0.0002929168853893263, | |
| "loss": 3.1926, | |
| "step": 87850 | |
| }, | |
| { | |
| "epoch": 25.619389063775213, | |
| "grad_norm": 0.41390058398246765, | |
| "learning_rate": 0.00029274190726159225, | |
| "loss": 3.1983, | |
| "step": 87900 | |
| }, | |
| { | |
| "epoch": 25.633962924099336, | |
| "grad_norm": 0.4257187247276306, | |
| "learning_rate": 0.00029256692913385825, | |
| "loss": 3.2074, | |
| "step": 87950 | |
| }, | |
| { | |
| "epoch": 25.64853678442346, | |
| "grad_norm": 0.44523632526397705, | |
| "learning_rate": 0.0002923919510061242, | |
| "loss": 3.2035, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 25.64853678442346, | |
| "eval_accuracy": 0.3735054988602816, | |
| "eval_loss": 3.5426251888275146, | |
| "eval_runtime": 53.1684, | |
| "eval_samples_per_second": 312.723, | |
| "eval_steps_per_second": 19.56, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 25.66311064474758, | |
| "grad_norm": 0.41291964054107666, | |
| "learning_rate": 0.0002922169728783902, | |
| "loss": 3.1788, | |
| "step": 88050 | |
| }, | |
| { | |
| "epoch": 25.677684505071703, | |
| "grad_norm": 0.4161721467971802, | |
| "learning_rate": 0.00029204199475065613, | |
| "loss": 3.193, | |
| "step": 88100 | |
| }, | |
| { | |
| "epoch": 25.692258365395826, | |
| "grad_norm": 0.46007800102233887, | |
| "learning_rate": 0.00029186701662292213, | |
| "loss": 3.1947, | |
| "step": 88150 | |
| }, | |
| { | |
| "epoch": 25.70683222571995, | |
| "grad_norm": 0.4044644236564636, | |
| "learning_rate": 0.0002916920384951881, | |
| "loss": 3.1954, | |
| "step": 88200 | |
| }, | |
| { | |
| "epoch": 25.72140608604407, | |
| "grad_norm": 0.4262496829032898, | |
| "learning_rate": 0.00029151706036745407, | |
| "loss": 3.2019, | |
| "step": 88250 | |
| }, | |
| { | |
| "epoch": 25.735979946368193, | |
| "grad_norm": 0.43586814403533936, | |
| "learning_rate": 0.00029134208223972, | |
| "loss": 3.1869, | |
| "step": 88300 | |
| }, | |
| { | |
| "epoch": 25.750553806692317, | |
| "grad_norm": 0.41651099920272827, | |
| "learning_rate": 0.00029116710411198596, | |
| "loss": 3.2099, | |
| "step": 88350 | |
| }, | |
| { | |
| "epoch": 25.76512766701644, | |
| "grad_norm": 0.4524548053741455, | |
| "learning_rate": 0.00029099212598425196, | |
| "loss": 3.2114, | |
| "step": 88400 | |
| }, | |
| { | |
| "epoch": 25.779701527340563, | |
| "grad_norm": 0.42696666717529297, | |
| "learning_rate": 0.0002908171478565179, | |
| "loss": 3.1968, | |
| "step": 88450 | |
| }, | |
| { | |
| "epoch": 25.794275387664683, | |
| "grad_norm": 0.4079219102859497, | |
| "learning_rate": 0.0002906421697287839, | |
| "loss": 3.1967, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 25.808849247988807, | |
| "grad_norm": 0.40614786744117737, | |
| "learning_rate": 0.00029046719160104984, | |
| "loss": 3.2124, | |
| "step": 88550 | |
| }, | |
| { | |
| "epoch": 25.82342310831293, | |
| "grad_norm": 0.45513415336608887, | |
| "learning_rate": 0.00029029221347331584, | |
| "loss": 3.1972, | |
| "step": 88600 | |
| }, | |
| { | |
| "epoch": 25.837996968637054, | |
| "grad_norm": 0.4304491877555847, | |
| "learning_rate": 0.0002901172353455818, | |
| "loss": 3.2128, | |
| "step": 88650 | |
| }, | |
| { | |
| "epoch": 25.852570828961174, | |
| "grad_norm": 0.39539921283721924, | |
| "learning_rate": 0.0002899422572178477, | |
| "loss": 3.1988, | |
| "step": 88700 | |
| }, | |
| { | |
| "epoch": 25.867144689285297, | |
| "grad_norm": 0.38909876346588135, | |
| "learning_rate": 0.0002897672790901137, | |
| "loss": 3.1997, | |
| "step": 88750 | |
| }, | |
| { | |
| "epoch": 25.88171854960942, | |
| "grad_norm": 0.40517860651016235, | |
| "learning_rate": 0.00028959230096237967, | |
| "loss": 3.2029, | |
| "step": 88800 | |
| }, | |
| { | |
| "epoch": 25.896292409933544, | |
| "grad_norm": 0.38918161392211914, | |
| "learning_rate": 0.00028941732283464566, | |
| "loss": 3.2022, | |
| "step": 88850 | |
| }, | |
| { | |
| "epoch": 25.910866270257667, | |
| "grad_norm": 0.40442124009132385, | |
| "learning_rate": 0.0002892423447069116, | |
| "loss": 3.207, | |
| "step": 88900 | |
| }, | |
| { | |
| "epoch": 25.925440130581787, | |
| "grad_norm": 0.4054309129714966, | |
| "learning_rate": 0.00028906736657917755, | |
| "loss": 3.2071, | |
| "step": 88950 | |
| }, | |
| { | |
| "epoch": 25.94001399090591, | |
| "grad_norm": 0.42414960265159607, | |
| "learning_rate": 0.00028889238845144355, | |
| "loss": 3.2018, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 25.94001399090591, | |
| "eval_accuracy": 0.3743577424642469, | |
| "eval_loss": 3.5337586402893066, | |
| "eval_runtime": 53.3199, | |
| "eval_samples_per_second": 311.835, | |
| "eval_steps_per_second": 19.505, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 25.954587851230034, | |
| "grad_norm": 0.38889071345329285, | |
| "learning_rate": 0.0002887174103237095, | |
| "loss": 3.2059, | |
| "step": 89050 | |
| }, | |
| { | |
| "epoch": 25.969161711554158, | |
| "grad_norm": 0.4038815200328827, | |
| "learning_rate": 0.0002885424321959755, | |
| "loss": 3.2054, | |
| "step": 89100 | |
| }, | |
| { | |
| "epoch": 25.983735571878277, | |
| "grad_norm": 0.4206139147281647, | |
| "learning_rate": 0.00028836745406824143, | |
| "loss": 3.1969, | |
| "step": 89150 | |
| }, | |
| { | |
| "epoch": 25.9983094322024, | |
| "grad_norm": 0.42321598529815674, | |
| "learning_rate": 0.00028819247594050743, | |
| "loss": 3.2222, | |
| "step": 89200 | |
| }, | |
| { | |
| "epoch": 26.01282499708523, | |
| "grad_norm": 0.4227030277252197, | |
| "learning_rate": 0.0002880174978127734, | |
| "loss": 3.1136, | |
| "step": 89250 | |
| }, | |
| { | |
| "epoch": 26.027398857409352, | |
| "grad_norm": 0.4692704677581787, | |
| "learning_rate": 0.0002878425196850393, | |
| "loss": 3.1086, | |
| "step": 89300 | |
| }, | |
| { | |
| "epoch": 26.041972717733472, | |
| "grad_norm": 0.3992956578731537, | |
| "learning_rate": 0.0002876675415573053, | |
| "loss": 3.1058, | |
| "step": 89350 | |
| }, | |
| { | |
| "epoch": 26.056546578057596, | |
| "grad_norm": 0.39758923649787903, | |
| "learning_rate": 0.00028749256342957126, | |
| "loss": 3.1025, | |
| "step": 89400 | |
| }, | |
| { | |
| "epoch": 26.07112043838172, | |
| "grad_norm": 0.42007169127464294, | |
| "learning_rate": 0.00028731758530183726, | |
| "loss": 3.124, | |
| "step": 89450 | |
| }, | |
| { | |
| "epoch": 26.085694298705842, | |
| "grad_norm": 0.45216524600982666, | |
| "learning_rate": 0.0002871426071741032, | |
| "loss": 3.1132, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 26.100268159029962, | |
| "grad_norm": 0.42939022183418274, | |
| "learning_rate": 0.0002869676290463692, | |
| "loss": 3.1365, | |
| "step": 89550 | |
| }, | |
| { | |
| "epoch": 26.114842019354086, | |
| "grad_norm": 0.42322081327438354, | |
| "learning_rate": 0.00028679265091863514, | |
| "loss": 3.1188, | |
| "step": 89600 | |
| }, | |
| { | |
| "epoch": 26.12941587967821, | |
| "grad_norm": 0.433601438999176, | |
| "learning_rate": 0.00028661767279090114, | |
| "loss": 3.1403, | |
| "step": 89650 | |
| }, | |
| { | |
| "epoch": 26.143989740002333, | |
| "grad_norm": 0.43289831280708313, | |
| "learning_rate": 0.0002864426946631671, | |
| "loss": 3.1435, | |
| "step": 89700 | |
| }, | |
| { | |
| "epoch": 26.158563600326456, | |
| "grad_norm": 0.43380987644195557, | |
| "learning_rate": 0.0002862677165354331, | |
| "loss": 3.1476, | |
| "step": 89750 | |
| }, | |
| { | |
| "epoch": 26.173137460650576, | |
| "grad_norm": 0.4708380699157715, | |
| "learning_rate": 0.000286092738407699, | |
| "loss": 3.1418, | |
| "step": 89800 | |
| }, | |
| { | |
| "epoch": 26.1877113209747, | |
| "grad_norm": 0.42673295736312866, | |
| "learning_rate": 0.00028591776027996497, | |
| "loss": 3.149, | |
| "step": 89850 | |
| }, | |
| { | |
| "epoch": 26.202285181298823, | |
| "grad_norm": 0.46042966842651367, | |
| "learning_rate": 0.00028574278215223097, | |
| "loss": 3.152, | |
| "step": 89900 | |
| }, | |
| { | |
| "epoch": 26.216859041622946, | |
| "grad_norm": 0.43242865800857544, | |
| "learning_rate": 0.0002855678040244969, | |
| "loss": 3.132, | |
| "step": 89950 | |
| }, | |
| { | |
| "epoch": 26.231432901947066, | |
| "grad_norm": 0.4106438457965851, | |
| "learning_rate": 0.00028539282589676285, | |
| "loss": 3.158, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 26.231432901947066, | |
| "eval_accuracy": 0.3732618661769218, | |
| "eval_loss": 3.551401138305664, | |
| "eval_runtime": 53.2441, | |
| "eval_samples_per_second": 312.279, | |
| "eval_steps_per_second": 19.533, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 26.24600676227119, | |
| "grad_norm": 0.442765474319458, | |
| "learning_rate": 0.00028521784776902885, | |
| "loss": 3.1519, | |
| "step": 90050 | |
| }, | |
| { | |
| "epoch": 26.260580622595313, | |
| "grad_norm": 0.42735040187835693, | |
| "learning_rate": 0.0002850428696412948, | |
| "loss": 3.1545, | |
| "step": 90100 | |
| }, | |
| { | |
| "epoch": 26.275154482919437, | |
| "grad_norm": 0.4304802417755127, | |
| "learning_rate": 0.00028486789151356074, | |
| "loss": 3.1398, | |
| "step": 90150 | |
| }, | |
| { | |
| "epoch": 26.28972834324356, | |
| "grad_norm": 0.4221126139163971, | |
| "learning_rate": 0.00028469291338582673, | |
| "loss": 3.1707, | |
| "step": 90200 | |
| }, | |
| { | |
| "epoch": 26.30430220356768, | |
| "grad_norm": 0.4367322623729706, | |
| "learning_rate": 0.0002845179352580927, | |
| "loss": 3.1604, | |
| "step": 90250 | |
| }, | |
| { | |
| "epoch": 26.318876063891803, | |
| "grad_norm": 0.45300185680389404, | |
| "learning_rate": 0.0002843429571303587, | |
| "loss": 3.1601, | |
| "step": 90300 | |
| }, | |
| { | |
| "epoch": 26.333449924215927, | |
| "grad_norm": 0.4125306308269501, | |
| "learning_rate": 0.0002841679790026246, | |
| "loss": 3.1538, | |
| "step": 90350 | |
| }, | |
| { | |
| "epoch": 26.34802378454005, | |
| "grad_norm": 0.4720957279205322, | |
| "learning_rate": 0.0002839930008748906, | |
| "loss": 3.1676, | |
| "step": 90400 | |
| }, | |
| { | |
| "epoch": 26.36259764486417, | |
| "grad_norm": 0.41910240054130554, | |
| "learning_rate": 0.00028381802274715656, | |
| "loss": 3.1667, | |
| "step": 90450 | |
| }, | |
| { | |
| "epoch": 26.377171505188294, | |
| "grad_norm": 0.41693606972694397, | |
| "learning_rate": 0.00028364304461942256, | |
| "loss": 3.164, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 26.391745365512417, | |
| "grad_norm": 0.44004714488983154, | |
| "learning_rate": 0.0002834680664916885, | |
| "loss": 3.1756, | |
| "step": 90550 | |
| }, | |
| { | |
| "epoch": 26.40631922583654, | |
| "grad_norm": 0.41405948996543884, | |
| "learning_rate": 0.0002832930883639545, | |
| "loss": 3.1642, | |
| "step": 90600 | |
| }, | |
| { | |
| "epoch": 26.420893086160664, | |
| "grad_norm": 0.4055291414260864, | |
| "learning_rate": 0.00028311811023622044, | |
| "loss": 3.1812, | |
| "step": 90650 | |
| }, | |
| { | |
| "epoch": 26.435466946484784, | |
| "grad_norm": 0.4173514246940613, | |
| "learning_rate": 0.00028294313210848644, | |
| "loss": 3.1768, | |
| "step": 90700 | |
| }, | |
| { | |
| "epoch": 26.450040806808907, | |
| "grad_norm": 0.4086509644985199, | |
| "learning_rate": 0.0002827681539807524, | |
| "loss": 3.1814, | |
| "step": 90750 | |
| }, | |
| { | |
| "epoch": 26.46461466713303, | |
| "grad_norm": 0.43321284651756287, | |
| "learning_rate": 0.0002825931758530184, | |
| "loss": 3.1673, | |
| "step": 90800 | |
| }, | |
| { | |
| "epoch": 26.479188527457154, | |
| "grad_norm": 0.40561938285827637, | |
| "learning_rate": 0.0002824181977252843, | |
| "loss": 3.1713, | |
| "step": 90850 | |
| }, | |
| { | |
| "epoch": 26.493762387781274, | |
| "grad_norm": 0.3966347873210907, | |
| "learning_rate": 0.0002822432195975503, | |
| "loss": 3.1821, | |
| "step": 90900 | |
| }, | |
| { | |
| "epoch": 26.508336248105397, | |
| "grad_norm": 0.42379072308540344, | |
| "learning_rate": 0.00028206824146981627, | |
| "loss": 3.1734, | |
| "step": 90950 | |
| }, | |
| { | |
| "epoch": 26.52291010842952, | |
| "grad_norm": 0.446698933839798, | |
| "learning_rate": 0.0002818932633420822, | |
| "loss": 3.1658, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 26.52291010842952, | |
| "eval_accuracy": 0.37363249386769476, | |
| "eval_loss": 3.5465731620788574, | |
| "eval_runtime": 53.3076, | |
| "eval_samples_per_second": 311.907, | |
| "eval_steps_per_second": 19.509, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 26.537483968753644, | |
| "grad_norm": 0.4048849940299988, | |
| "learning_rate": 0.0002817182852143482, | |
| "loss": 3.1809, | |
| "step": 91050 | |
| }, | |
| { | |
| "epoch": 26.552057829077768, | |
| "grad_norm": 0.41570523381233215, | |
| "learning_rate": 0.00028154330708661415, | |
| "loss": 3.1688, | |
| "step": 91100 | |
| }, | |
| { | |
| "epoch": 26.566631689401888, | |
| "grad_norm": 0.416762113571167, | |
| "learning_rate": 0.0002813683289588801, | |
| "loss": 3.1737, | |
| "step": 91150 | |
| }, | |
| { | |
| "epoch": 26.58120554972601, | |
| "grad_norm": 0.4728192389011383, | |
| "learning_rate": 0.0002811933508311461, | |
| "loss": 3.1878, | |
| "step": 91200 | |
| }, | |
| { | |
| "epoch": 26.595779410050135, | |
| "grad_norm": 0.4537615478038788, | |
| "learning_rate": 0.00028101837270341204, | |
| "loss": 3.1861, | |
| "step": 91250 | |
| }, | |
| { | |
| "epoch": 26.610353270374258, | |
| "grad_norm": 0.43289709091186523, | |
| "learning_rate": 0.000280843394575678, | |
| "loss": 3.1801, | |
| "step": 91300 | |
| }, | |
| { | |
| "epoch": 26.624927130698378, | |
| "grad_norm": 0.4409210979938507, | |
| "learning_rate": 0.000280668416447944, | |
| "loss": 3.1817, | |
| "step": 91350 | |
| }, | |
| { | |
| "epoch": 26.6395009910225, | |
| "grad_norm": 0.42330750823020935, | |
| "learning_rate": 0.0002804934383202099, | |
| "loss": 3.1697, | |
| "step": 91400 | |
| }, | |
| { | |
| "epoch": 26.654074851346625, | |
| "grad_norm": 0.4004875123500824, | |
| "learning_rate": 0.0002803184601924759, | |
| "loss": 3.1854, | |
| "step": 91450 | |
| }, | |
| { | |
| "epoch": 26.66864871167075, | |
| "grad_norm": 0.4062268137931824, | |
| "learning_rate": 0.00028014348206474186, | |
| "loss": 3.1803, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 26.68322257199487, | |
| "grad_norm": 0.41472315788269043, | |
| "learning_rate": 0.00027996850393700786, | |
| "loss": 3.1903, | |
| "step": 91550 | |
| }, | |
| { | |
| "epoch": 26.69779643231899, | |
| "grad_norm": 0.4198923408985138, | |
| "learning_rate": 0.0002797935258092738, | |
| "loss": 3.1957, | |
| "step": 91600 | |
| }, | |
| { | |
| "epoch": 26.712370292643115, | |
| "grad_norm": 0.4187171459197998, | |
| "learning_rate": 0.0002796185476815398, | |
| "loss": 3.1835, | |
| "step": 91650 | |
| }, | |
| { | |
| "epoch": 26.72694415296724, | |
| "grad_norm": 0.4212145507335663, | |
| "learning_rate": 0.00027944356955380574, | |
| "loss": 3.1809, | |
| "step": 91700 | |
| }, | |
| { | |
| "epoch": 26.741518013291362, | |
| "grad_norm": 0.39935165643692017, | |
| "learning_rate": 0.00027926859142607174, | |
| "loss": 3.1862, | |
| "step": 91750 | |
| }, | |
| { | |
| "epoch": 26.756091873615482, | |
| "grad_norm": 0.42284080386161804, | |
| "learning_rate": 0.0002790936132983377, | |
| "loss": 3.1944, | |
| "step": 91800 | |
| }, | |
| { | |
| "epoch": 26.770665733939605, | |
| "grad_norm": 0.40127700567245483, | |
| "learning_rate": 0.0002789186351706037, | |
| "loss": 3.196, | |
| "step": 91850 | |
| }, | |
| { | |
| "epoch": 26.78523959426373, | |
| "grad_norm": 0.4029678702354431, | |
| "learning_rate": 0.0002787436570428696, | |
| "loss": 3.2019, | |
| "step": 91900 | |
| }, | |
| { | |
| "epoch": 26.799813454587852, | |
| "grad_norm": 0.41483286023139954, | |
| "learning_rate": 0.00027856867891513557, | |
| "loss": 3.195, | |
| "step": 91950 | |
| }, | |
| { | |
| "epoch": 26.814387314911976, | |
| "grad_norm": 0.42294371128082275, | |
| "learning_rate": 0.00027839370078740157, | |
| "loss": 3.199, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 26.814387314911976, | |
| "eval_accuracy": 0.3742275696392247, | |
| "eval_loss": 3.536498546600342, | |
| "eval_runtime": 53.2794, | |
| "eval_samples_per_second": 312.072, | |
| "eval_steps_per_second": 19.52, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 26.814387314911976, | |
| "step": 92000, | |
| "total_flos": 1.922898754142208e+18, | |
| "train_loss": 3.4146219946819802, | |
| "train_runtime": 40914.164, | |
| "train_samples_per_second": 335.396, | |
| "train_steps_per_second": 4.193 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 171550, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 10000, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 20, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 18 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.922898754142208e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |