{
  "best_global_step": 79000,
  "best_metric": 3.5276875495910645,
  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/swap_take_to_drop_2128/checkpoint-40000",
  "epoch": 32.01426408942711,
  "eval_steps": 1000,
  "global_step": 110000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014555193292966931,
      "grad_norm": 0.7827480435371399,
      "learning_rate": 0.000294,
      "loss": 8.4453,
      "step": 50
    },
    {
      "epoch": 0.029110386585933862,
      "grad_norm": 0.6805397272109985,
      "learning_rate": 0.0005939999999999999,
      "loss": 6.713,
      "step": 100
    },
| { | |
| "epoch": 0.04366557987890079, | |
| "grad_norm": 0.4572669565677643, | |
| "learning_rate": 0.0005998287711124053, | |
| "loss": 6.3258, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.058220773171867725, | |
| "grad_norm": 0.5754550695419312, | |
| "learning_rate": 0.000599654047757717, | |
| "loss": 6.1466, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.07277596646483465, | |
| "grad_norm": 0.4736722707748413, | |
| "learning_rate": 0.0005994793244030285, | |
| "loss": 5.9838, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.08733115975780158, | |
| "grad_norm": 0.43187662959098816, | |
| "learning_rate": 0.00059930460104834, | |
| "loss": 5.8607, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.10188635305076851, | |
| "grad_norm": 0.4714319705963135, | |
| "learning_rate": 0.0005991298776936517, | |
| "loss": 5.7321, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.11644154634373545, | |
| "grad_norm": 0.4762209951877594, | |
| "learning_rate": 0.0005989551543389632, | |
| "loss": 5.6365, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.1309967396367024, | |
| "grad_norm": 0.49314582347869873, | |
| "learning_rate": 0.0005987804309842748, | |
| "loss": 5.5101, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.1455519329296693, | |
| "grad_norm": 0.41656768321990967, | |
| "learning_rate": 0.0005986057076295864, | |
| "loss": 5.4381, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.16010712622263623, | |
| "grad_norm": 0.40143629908561707, | |
| "learning_rate": 0.0005984309842748981, | |
| "loss": 5.3319, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.17466231951560315, | |
| "grad_norm": 0.4198455214500427, | |
| "learning_rate": 0.0005982562609202096, | |
| "loss": 5.2531, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.1892175128085701, | |
| "grad_norm": 0.40385374426841736, | |
| "learning_rate": 0.0005980815375655212, | |
| "loss": 5.1872, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.20377270610153703, | |
| "grad_norm": 0.420478880405426, | |
| "learning_rate": 0.0005979068142108328, | |
| "loss": 5.143, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.21832789939450395, | |
| "grad_norm": 0.4238859713077545, | |
| "learning_rate": 0.0005977320908561445, | |
| "loss": 5.088, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.2328830926874709, | |
| "grad_norm": 0.47646620869636536, | |
| "learning_rate": 0.000597557367501456, | |
| "loss": 5.0438, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.24743828598043782, | |
| "grad_norm": 0.418918639421463, | |
| "learning_rate": 0.0005973826441467675, | |
| "loss": 4.9679, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.2619934792734048, | |
| "grad_norm": 0.48757559061050415, | |
| "learning_rate": 0.0005972079207920792, | |
| "loss": 4.9244, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.27654867256637167, | |
| "grad_norm": 0.41967347264289856, | |
| "learning_rate": 0.0005970331974373907, | |
| "loss": 4.8743, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.2911038658593386, | |
| "grad_norm": 0.4787144064903259, | |
| "learning_rate": 0.0005968584740827023, | |
| "loss": 4.829, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2911038658593386, | |
| "eval_accuracy": 0.2549372927185605, | |
| "eval_loss": 4.751561641693115, | |
| "eval_runtime": 180.2912, | |
| "eval_samples_per_second": 92.356, | |
| "eval_steps_per_second": 5.774, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.30565905915230557, | |
| "grad_norm": 0.5646753907203674, | |
| "learning_rate": 0.0005966837507280139, | |
| "loss": 4.777, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.32021425244527246, | |
| "grad_norm": 0.418170690536499, | |
| "learning_rate": 0.0005965090273733256, | |
| "loss": 4.7342, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.3347694457382394, | |
| "grad_norm": 0.4552096128463745, | |
| "learning_rate": 0.0005963343040186371, | |
| "loss": 4.7047, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.3493246390312063, | |
| "grad_norm": 0.4131508469581604, | |
| "learning_rate": 0.0005961595806639486, | |
| "loss": 4.665, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.36387983232417326, | |
| "grad_norm": 0.40163981914520264, | |
| "learning_rate": 0.0005959848573092603, | |
| "loss": 4.6263, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.3784350256171402, | |
| "grad_norm": 0.4545515775680542, | |
| "learning_rate": 0.0005958101339545718, | |
| "loss": 4.6074, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.3929902189101071, | |
| "grad_norm": 0.4808027744293213, | |
| "learning_rate": 0.0005956354105998835, | |
| "loss": 4.5906, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.40754541220307405, | |
| "grad_norm": 0.4855094254016876, | |
| "learning_rate": 0.000595460687245195, | |
| "loss": 4.5764, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.422100605496041, | |
| "grad_norm": 0.42071059346199036, | |
| "learning_rate": 0.0005952859638905067, | |
| "loss": 4.5262, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.4366557987890079, | |
| "grad_norm": 0.3910202383995056, | |
| "learning_rate": 0.0005951112405358182, | |
| "loss": 4.5007, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.45121099208197485, | |
| "grad_norm": 0.39215177297592163, | |
| "learning_rate": 0.0005949365171811299, | |
| "loss": 4.4985, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.4657661853749418, | |
| "grad_norm": 0.40318185091018677, | |
| "learning_rate": 0.0005947617938264414, | |
| "loss": 4.4596, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.4803213786679087, | |
| "grad_norm": 0.42347994446754456, | |
| "learning_rate": 0.000594587070471753, | |
| "loss": 4.4476, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.49487657196087564, | |
| "grad_norm": 0.40894490480422974, | |
| "learning_rate": 0.0005944123471170646, | |
| "loss": 4.4272, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5094317652538426, | |
| "grad_norm": 0.5144431591033936, | |
| "learning_rate": 0.0005942376237623762, | |
| "loss": 4.4239, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5239869585468095, | |
| "grad_norm": 0.40016046166419983, | |
| "learning_rate": 0.0005940629004076878, | |
| "loss": 4.3961, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5385421518397764, | |
| "grad_norm": 0.39825013279914856, | |
| "learning_rate": 0.0005938881770529993, | |
| "loss": 4.3878, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.5530973451327433, | |
| "grad_norm": 0.3824250400066376, | |
| "learning_rate": 0.000593713453698311, | |
| "loss": 4.3812, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.5676525384257103, | |
| "grad_norm": 0.4112911522388458, | |
| "learning_rate": 0.0005935387303436226, | |
| "loss": 4.3516, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.5822077317186772, | |
| "grad_norm": 0.3895181715488434, | |
| "learning_rate": 0.0005933640069889342, | |
| "loss": 4.3354, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5822077317186772, | |
| "eval_accuracy": 0.2993394612853891, | |
| "eval_loss": 4.287642478942871, | |
| "eval_runtime": 179.6408, | |
| "eval_samples_per_second": 92.691, | |
| "eval_steps_per_second": 5.795, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5967629250116442, | |
| "grad_norm": 0.4498955309391022, | |
| "learning_rate": 0.0005931892836342457, | |
| "loss": 4.3271, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6113181183046111, | |
| "grad_norm": 0.36283108592033386, | |
| "learning_rate": 0.0005930145602795573, | |
| "loss": 4.3172, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.625873311597578, | |
| "grad_norm": 0.3976347744464874, | |
| "learning_rate": 0.000592839836924869, | |
| "loss": 4.3016, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.6404285048905449, | |
| "grad_norm": 0.38537269830703735, | |
| "learning_rate": 0.0005926651135701805, | |
| "loss": 4.2918, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.6549836981835119, | |
| "grad_norm": 0.38066738843917847, | |
| "learning_rate": 0.0005924903902154921, | |
| "loss": 4.2714, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.6695388914764788, | |
| "grad_norm": 0.3778422474861145, | |
| "learning_rate": 0.0005923156668608037, | |
| "loss": 4.2645, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.6840940847694458, | |
| "grad_norm": 0.41107630729675293, | |
| "learning_rate": 0.0005921409435061153, | |
| "loss": 4.2787, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.6986492780624126, | |
| "grad_norm": 0.38569191098213196, | |
| "learning_rate": 0.0005919662201514268, | |
| "loss": 4.259, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7132044713553796, | |
| "grad_norm": 0.3875567615032196, | |
| "learning_rate": 0.0005917914967967384, | |
| "loss": 4.2298, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.7277596646483465, | |
| "grad_norm": 0.4167780876159668, | |
| "learning_rate": 0.0005916167734420501, | |
| "loss": 4.2352, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.7423148579413135, | |
| "grad_norm": 0.3778788149356842, | |
| "learning_rate": 0.0005914420500873616, | |
| "loss": 4.2102, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.7568700512342804, | |
| "grad_norm": 0.40821921825408936, | |
| "learning_rate": 0.0005912673267326732, | |
| "loss": 4.2131, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.7714252445272474, | |
| "grad_norm": 0.3879176676273346, | |
| "learning_rate": 0.0005910926033779848, | |
| "loss": 4.2034, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.7859804378202142, | |
| "grad_norm": 0.41103458404541016, | |
| "learning_rate": 0.0005909178800232964, | |
| "loss": 4.1869, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8005356311131812, | |
| "grad_norm": 0.3972039818763733, | |
| "learning_rate": 0.000590743156668608, | |
| "loss": 4.1829, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.8150908244061481, | |
| "grad_norm": 0.37364527583122253, | |
| "learning_rate": 0.0005905684333139196, | |
| "loss": 4.165, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.8296460176991151, | |
| "grad_norm": 0.3501448631286621, | |
| "learning_rate": 0.0005903937099592312, | |
| "loss": 4.1794, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.844201210992082, | |
| "grad_norm": 0.3832891583442688, | |
| "learning_rate": 0.0005902189866045427, | |
| "loss": 4.1615, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.858756404285049, | |
| "grad_norm": 0.3367897570133209, | |
| "learning_rate": 0.0005900442632498543, | |
| "loss": 4.1633, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.8733115975780158, | |
| "grad_norm": 0.37943506240844727, | |
| "learning_rate": 0.0005898695398951659, | |
| "loss": 4.147, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8733115975780158, | |
| "eval_accuracy": 0.31569479616122914, | |
| "eval_loss": 4.09618616104126, | |
| "eval_runtime": 179.6956, | |
| "eval_samples_per_second": 92.662, | |
| "eval_steps_per_second": 5.793, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8878667908709827, | |
| "grad_norm": 0.368857204914093, | |
| "learning_rate": 0.0005896948165404776, | |
| "loss": 4.1638, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.9024219841639497, | |
| "grad_norm": 0.3786642849445343, | |
| "learning_rate": 0.0005895200931857891, | |
| "loss": 4.1341, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.9169771774569166, | |
| "grad_norm": 0.3719625771045685, | |
| "learning_rate": 0.0005893453698311007, | |
| "loss": 4.1374, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.9315323707498836, | |
| "grad_norm": 0.34554851055145264, | |
| "learning_rate": 0.0005891706464764123, | |
| "loss": 4.1151, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.9460875640428504, | |
| "grad_norm": 0.3568631708621979, | |
| "learning_rate": 0.0005889959231217238, | |
| "loss": 4.1143, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.9606427573358174, | |
| "grad_norm": 0.34819287061691284, | |
| "learning_rate": 0.0005888211997670355, | |
| "loss": 4.1009, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.9751979506287843, | |
| "grad_norm": 0.3499997854232788, | |
| "learning_rate": 0.000588646476412347, | |
| "loss": 4.0893, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.9897531439217513, | |
| "grad_norm": 0.3431825041770935, | |
| "learning_rate": 0.0005884717530576587, | |
| "loss": 4.1051, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.0040754541220307, | |
| "grad_norm": 0.36185380816459656, | |
| "learning_rate": 0.0005882970297029702, | |
| "loss": 4.0674, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.0186306474149978, | |
| "grad_norm": 0.3525213599205017, | |
| "learning_rate": 0.0005881223063482818, | |
| "loss": 4.027, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.0331858407079646, | |
| "grad_norm": 0.3817584216594696, | |
| "learning_rate": 0.0005879475829935934, | |
| "loss": 4.0201, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.0477410340009314, | |
| "grad_norm": 0.3375789523124695, | |
| "learning_rate": 0.0005877728596389051, | |
| "loss": 4.0083, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.0622962272938985, | |
| "grad_norm": 0.40628930926322937, | |
| "learning_rate": 0.0005875981362842166, | |
| "loss": 4.0242, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.0768514205868653, | |
| "grad_norm": 0.3391870856285095, | |
| "learning_rate": 0.0005874234129295281, | |
| "loss": 4.0181, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.0914066138798324, | |
| "grad_norm": 0.35236597061157227, | |
| "learning_rate": 0.0005872486895748398, | |
| "loss": 4.0097, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.1059618071727992, | |
| "grad_norm": 0.35760727524757385, | |
| "learning_rate": 0.0005870739662201513, | |
| "loss": 4.0013, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.120517000465766, | |
| "grad_norm": 0.3544367253780365, | |
| "learning_rate": 0.000586899242865463, | |
| "loss": 4.0004, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.1350721937587331, | |
| "grad_norm": 0.3396351933479309, | |
| "learning_rate": 0.0005867245195107746, | |
| "loss": 4.0188, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.1496273870517, | |
| "grad_norm": 0.3459094762802124, | |
| "learning_rate": 0.0005865497961560862, | |
| "loss": 4.0019, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.164182580344667, | |
| "grad_norm": 0.35661768913269043, | |
| "learning_rate": 0.0005863750728013977, | |
| "loss": 3.9906, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.164182580344667, | |
| "eval_accuracy": 0.32550056936103106, | |
| "eval_loss": 3.994142532348633, | |
| "eval_runtime": 179.8284, | |
| "eval_samples_per_second": 92.594, | |
| "eval_steps_per_second": 5.789, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.1787377736376339, | |
| "grad_norm": 0.3275563418865204, | |
| "learning_rate": 0.0005862003494467094, | |
| "loss": 3.9935, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.193292966930601, | |
| "grad_norm": 0.33437496423721313, | |
| "learning_rate": 0.0005860256260920209, | |
| "loss": 3.9752, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.2078481602235678, | |
| "grad_norm": 0.3431974947452545, | |
| "learning_rate": 0.0005858509027373325, | |
| "loss": 3.9743, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.2224033535165346, | |
| "grad_norm": 0.33818677067756653, | |
| "learning_rate": 0.0005856761793826441, | |
| "loss": 3.9812, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.2369585468095017, | |
| "grad_norm": 0.3403140902519226, | |
| "learning_rate": 0.0005855014560279557, | |
| "loss": 3.972, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.2515137401024685, | |
| "grad_norm": 0.3182094097137451, | |
| "learning_rate": 0.0005853267326732673, | |
| "loss": 3.9708, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.2660689333954354, | |
| "grad_norm": 0.3568837642669678, | |
| "learning_rate": 0.0005851520093185788, | |
| "loss": 3.9688, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.2806241266884024, | |
| "grad_norm": 0.36247938871383667, | |
| "learning_rate": 0.0005849772859638905, | |
| "loss": 3.9566, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.2951793199813695, | |
| "grad_norm": 0.3376859128475189, | |
| "learning_rate": 0.0005848025626092021, | |
| "loss": 3.9535, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.3097345132743363, | |
| "grad_norm": 0.3505348563194275, | |
| "learning_rate": 0.0005846278392545136, | |
| "loss": 3.9686, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.3242897065673032, | |
| "grad_norm": 0.32666268944740295, | |
| "learning_rate": 0.0005844531158998252, | |
| "loss": 3.9632, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.3388448998602702, | |
| "grad_norm": 0.3384810984134674, | |
| "learning_rate": 0.0005842783925451368, | |
| "loss": 3.9473, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.353400093153237, | |
| "grad_norm": 0.3483339846134186, | |
| "learning_rate": 0.0005841036691904484, | |
| "loss": 3.9521, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.367955286446204, | |
| "grad_norm": 0.33106961846351624, | |
| "learning_rate": 0.00058392894583576, | |
| "loss": 3.9545, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.382510479739171, | |
| "grad_norm": 0.3450721502304077, | |
| "learning_rate": 0.0005837542224810716, | |
| "loss": 3.9544, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.3970656730321378, | |
| "grad_norm": 0.37358760833740234, | |
| "learning_rate": 0.0005835794991263832, | |
| "loss": 3.9441, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.4116208663251049, | |
| "grad_norm": 0.33683931827545166, | |
| "learning_rate": 0.0005834047757716948, | |
| "loss": 3.9405, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.4261760596180717, | |
| "grad_norm": 0.3543245196342468, | |
| "learning_rate": 0.0005832300524170063, | |
| "loss": 3.9497, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.4407312529110388, | |
| "grad_norm": 0.33687421679496765, | |
| "learning_rate": 0.0005830553290623179, | |
| "loss": 3.9257, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.4552864462040056, | |
| "grad_norm": 0.3444598317146301, | |
| "learning_rate": 0.0005828806057076296, | |
| "loss": 3.9338, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.4552864462040056, | |
| "eval_accuracy": 0.3319364821327351, | |
| "eval_loss": 3.918074607849121, | |
| "eval_runtime": 179.5997, | |
| "eval_samples_per_second": 92.712, | |
| "eval_steps_per_second": 5.796, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.4698416394969724, | |
| "grad_norm": 0.3072463274002075, | |
| "learning_rate": 0.0005827058823529411, | |
| "loss": 3.926, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.4843968327899395, | |
| "grad_norm": 0.34945112466812134, | |
| "learning_rate": 0.0005825311589982527, | |
| "loss": 3.9185, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.4989520260829063, | |
| "grad_norm": 0.3230547606945038, | |
| "learning_rate": 0.0005823564356435643, | |
| "loss": 3.9159, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.5135072193758732, | |
| "grad_norm": 0.315425306558609, | |
| "learning_rate": 0.0005821817122888759, | |
| "loss": 3.9278, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.5280624126688402, | |
| "grad_norm": 0.32981154322624207, | |
| "learning_rate": 0.0005820069889341875, | |
| "loss": 3.9159, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.5426176059618073, | |
| "grad_norm": 0.3316299319267273, | |
| "learning_rate": 0.000581832265579499, | |
| "loss": 3.9159, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.5571727992547741, | |
| "grad_norm": 0.3105960190296173, | |
| "learning_rate": 0.0005816575422248107, | |
| "loss": 3.918, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.571727992547741, | |
| "grad_norm": 0.3287831246852875, | |
| "learning_rate": 0.0005814828188701222, | |
| "loss": 3.9092, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.586283185840708, | |
| "grad_norm": 0.33905884623527527, | |
| "learning_rate": 0.0005813080955154338, | |
| "loss": 3.9084, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.6008383791336749, | |
| "grad_norm": 0.314873069524765, | |
| "learning_rate": 0.0005811333721607454, | |
| "loss": 3.9073, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.6153935724266417, | |
| "grad_norm": 0.3269849419593811, | |
| "learning_rate": 0.0005809586488060571, | |
| "loss": 3.9036, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.6299487657196088, | |
| "grad_norm": 0.3468124568462372, | |
| "learning_rate": 0.0005807839254513686, | |
| "loss": 3.8978, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.6445039590125758, | |
| "grad_norm": 0.32440632581710815, | |
| "learning_rate": 0.0005806092020966802, | |
| "loss": 3.9003, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.6590591523055425, | |
| "grad_norm": 0.3260248899459839, | |
| "learning_rate": 0.0005804344787419918, | |
| "loss": 3.8929, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.6736143455985095, | |
| "grad_norm": 0.35749199986457825, | |
| "learning_rate": 0.0005802597553873033, | |
| "loss": 3.9124, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.6881695388914766, | |
| "grad_norm": 0.3142864406108856, | |
| "learning_rate": 0.000580085032032615, | |
| "loss": 3.8886, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.7027247321844434, | |
| "grad_norm": 0.34212082624435425, | |
| "learning_rate": 0.0005799103086779265, | |
| "loss": 3.8873, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.7172799254774103, | |
| "grad_norm": 0.29918429255485535, | |
| "learning_rate": 0.0005797355853232382, | |
| "loss": 3.8782, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.7318351187703773, | |
| "grad_norm": 0.33171242475509644, | |
| "learning_rate": 0.0005795608619685497, | |
| "loss": 3.8914, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.7463903120633442, | |
| "grad_norm": 0.31623977422714233, | |
| "learning_rate": 0.0005793861386138614, | |
| "loss": 3.8809, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.7463903120633442, | |
| "eval_accuracy": 0.33669140185512153, | |
| "eval_loss": 3.8595082759857178, | |
| "eval_runtime": 179.6511, | |
| "eval_samples_per_second": 92.685, | |
| "eval_steps_per_second": 5.795, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.760945505356311, | |
| "grad_norm": 0.3388454020023346, | |
| "learning_rate": 0.0005792114152591729, | |
| "loss": 3.87, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.775500698649278, | |
| "grad_norm": 0.3301540017127991, | |
| "learning_rate": 0.0005790366919044846, | |
| "loss": 3.871, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.7900558919422451, | |
| "grad_norm": 0.32335081696510315, | |
| "learning_rate": 0.0005788619685497961, | |
| "loss": 3.8808, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.804611085235212, | |
| "grad_norm": 0.3080996870994568, | |
| "learning_rate": 0.0005786872451951077, | |
| "loss": 3.8738, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.8191662785281788, | |
| "grad_norm": 0.30384325981140137, | |
| "learning_rate": 0.0005785125218404193, | |
| "loss": 3.876, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.8337214718211459, | |
| "grad_norm": 0.32148781418800354, | |
| "learning_rate": 0.0005783377984857308, | |
| "loss": 3.8791, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.8482766651141127, | |
| "grad_norm": 0.31021323800086975, | |
| "learning_rate": 0.0005781630751310425, | |
| "loss": 3.8709, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.8628318584070795, | |
| "grad_norm": 0.33548709750175476, | |
| "learning_rate": 0.0005779883517763541, | |
| "loss": 3.8577, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.8773870517000466, | |
| "grad_norm": 0.3398282825946808, | |
| "learning_rate": 0.0005778136284216657, | |
| "loss": 3.8553, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.8919422449930137, | |
| "grad_norm": 0.34088537096977234, | |
| "learning_rate": 0.0005776389050669772, | |
| "loss": 3.8711, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.9064974382859803, | |
| "grad_norm": 0.34938573837280273, | |
| "learning_rate": 0.0005774641817122889, | |
| "loss": 3.8549, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.9210526315789473, | |
| "grad_norm": 0.31073132157325745, | |
| "learning_rate": 0.0005772894583576004, | |
| "loss": 3.8497, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.9356078248719144, | |
| "grad_norm": 0.34067502617836, | |
| "learning_rate": 0.000577114735002912, | |
| "loss": 3.8482, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.9501630181648812, | |
| "grad_norm": 0.30622148513793945, | |
| "learning_rate": 0.0005769400116482236, | |
| "loss": 3.8414, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.964718211457848, | |
| "grad_norm": 0.33704185485839844, | |
| "learning_rate": 0.0005767652882935352, | |
| "loss": 3.8477, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.9792734047508151, | |
| "grad_norm": 0.3220314383506775, | |
| "learning_rate": 0.0005765905649388468, | |
| "loss": 3.8507, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.993828598043782, | |
| "grad_norm": 0.3205111622810364, | |
| "learning_rate": 0.0005764158415841583, | |
| "loss": 3.8551, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.0081509082440614, | |
| "grad_norm": 0.3426797688007355, | |
| "learning_rate": 0.00057624111822947, | |
| "loss": 3.7992, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.0227061015370285, | |
| "grad_norm": 0.3363146185874939, | |
| "learning_rate": 0.0005760663948747816, | |
| "loss": 3.752, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.0372612948299955, | |
| "grad_norm": 0.3153912127017975, | |
| "learning_rate": 0.0005758916715200931, | |
| "loss": 3.7521, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.0372612948299955, | |
| "eval_accuracy": 0.34137921348611727, | |
| "eval_loss": 3.8178396224975586, | |
| "eval_runtime": 179.5868, | |
| "eval_samples_per_second": 92.718, | |
| "eval_steps_per_second": 5.797, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.051816488122962, | |
| "grad_norm": 0.3598702847957611, | |
| "learning_rate": 0.0005757169481654047, | |
| "loss": 3.75, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.066371681415929, | |
| "grad_norm": 0.3275192975997925, | |
| "learning_rate": 0.0005755422248107163, | |
| "loss": 3.7405, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.0809268747088963, | |
| "grad_norm": 0.3213796019554138, | |
| "learning_rate": 0.0005753675014560279, | |
| "loss": 3.7502, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.095482068001863, | |
| "grad_norm": 0.32875528931617737, | |
| "learning_rate": 0.0005751927781013395, | |
| "loss": 3.7484, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.11003726129483, | |
| "grad_norm": 0.32091593742370605, | |
| "learning_rate": 0.0005750180547466511, | |
| "loss": 3.7539, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.124592454587797, | |
| "grad_norm": 0.33569371700286865, | |
| "learning_rate": 0.0005748433313919627, | |
| "loss": 3.7465, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.139147647880764, | |
| "grad_norm": 0.3289715647697449, | |
| "learning_rate": 0.0005746686080372743, | |
| "loss": 3.7561, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.1537028411737307, | |
| "grad_norm": 0.3335973620414734, | |
| "learning_rate": 0.0005744938846825858, | |
| "loss": 3.7641, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.1682580344666977, | |
| "grad_norm": 0.3361070156097412, | |
| "learning_rate": 0.0005743191613278974, | |
| "loss": 3.767, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.182813227759665, | |
| "grad_norm": 0.32386162877082825, | |
| "learning_rate": 0.0005741444379732091, | |
| "loss": 3.7544, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.1973684210526314, | |
| "grad_norm": 0.3267822563648224, | |
| "learning_rate": 0.0005739697146185206, | |
| "loss": 3.7638, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.2119236143455985, | |
| "grad_norm": 0.3390170931816101, | |
| "learning_rate": 0.0005737949912638322, | |
| "loss": 3.7489, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.2264788076385655, | |
| "grad_norm": 0.3106227517127991, | |
| "learning_rate": 0.0005736202679091438, | |
| "loss": 3.7501, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.241034000931532, | |
| "grad_norm": 0.31411436200141907, | |
| "learning_rate": 0.0005734455445544554, | |
| "loss": 3.7464, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.255589194224499, | |
| "grad_norm": 0.32941755652427673, | |
| "learning_rate": 0.000573270821199767, | |
| "loss": 3.7533, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.2701443875174663, | |
| "grad_norm": 0.32073238492012024, | |
| "learning_rate": 0.0005730960978450785, | |
| "loss": 3.7581, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.2846995808104333, | |
| "grad_norm": 0.3364821970462799, | |
| "learning_rate": 0.0005729213744903902, | |
| "loss": 3.7547, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.2992547741034, | |
| "grad_norm": 0.3143947124481201, | |
| "learning_rate": 0.0005727466511357017, | |
| "loss": 3.7595, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.313809967396367, | |
| "grad_norm": 0.3281038999557495, | |
| "learning_rate": 0.0005725719277810134, | |
| "loss": 3.7521, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.328365160689334, | |
| "grad_norm": 0.32708603143692017, | |
| "learning_rate": 0.0005723972044263249, | |
| "loss": 3.7646, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.328365160689334, | |
| "eval_accuracy": 0.3444594866336783, | |
| "eval_loss": 3.7855265140533447, | |
| "eval_runtime": 179.7743, | |
| "eval_samples_per_second": 92.622, | |
| "eval_steps_per_second": 5.791, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.3429203539823007, | |
| "grad_norm": 0.33208343386650085, | |
| "learning_rate": 0.0005722224810716366, | |
| "loss": 3.7352, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.3574755472752678, | |
| "grad_norm": 0.30804917216300964, | |
| "learning_rate": 0.0005720477577169481, | |
| "loss": 3.7543, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.372030740568235, | |
| "grad_norm": 0.3458641767501831, | |
| "learning_rate": 0.0005718730343622598, | |
| "loss": 3.7548, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.386585933861202, | |
| "grad_norm": 0.30380886793136597, | |
| "learning_rate": 0.0005716983110075713, | |
| "loss": 3.7549, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.4011411271541685, | |
| "grad_norm": 0.3295377492904663, | |
| "learning_rate": 0.0005715235876528828, | |
| "loss": 3.7513, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.4156963204471356, | |
| "grad_norm": 0.3375111222267151, | |
| "learning_rate": 0.0005713488642981945, | |
| "loss": 3.7578, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.4302515137401026, | |
| "grad_norm": 0.3401114344596863, | |
| "learning_rate": 0.0005711741409435061, | |
| "loss": 3.7534, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.4448067070330692, | |
| "grad_norm": 0.31809717416763306, | |
| "learning_rate": 0.0005709994175888177, | |
| "loss": 3.7562, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.4593619003260363, | |
| "grad_norm": 0.3254973888397217, | |
| "learning_rate": 0.0005708246942341292, | |
| "loss": 3.7595, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 2.4739170936190034, | |
| "grad_norm": 0.31873786449432373, | |
| "learning_rate": 0.0005706499708794409, | |
| "loss": 3.7628, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.4884722869119704, | |
| "grad_norm": 0.3236052989959717, | |
| "learning_rate": 0.0005704752475247524, | |
| "loss": 3.7373, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 2.503027480204937, | |
| "grad_norm": 0.33082205057144165, | |
| "learning_rate": 0.0005703005241700641, | |
| "loss": 3.7584, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.517582673497904, | |
| "grad_norm": 0.3304481506347656, | |
| "learning_rate": 0.0005701258008153756, | |
| "loss": 3.7469, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 2.5321378667908707, | |
| "grad_norm": 0.3295728266239166, | |
| "learning_rate": 0.0005699510774606872, | |
| "loss": 3.7305, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.546693060083838, | |
| "grad_norm": 0.31206071376800537, | |
| "learning_rate": 0.0005697763541059988, | |
| "loss": 3.7431, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 2.561248253376805, | |
| "grad_norm": 0.34510451555252075, | |
| "learning_rate": 0.0005696016307513103, | |
| "loss": 3.7496, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.575803446669772, | |
| "grad_norm": 0.3276890814304352, | |
| "learning_rate": 0.000569426907396622, | |
| "loss": 3.7304, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 2.590358639962739, | |
| "grad_norm": 0.32492634654045105, | |
| "learning_rate": 0.0005692521840419336, | |
| "loss": 3.7435, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 2.6049138332557056, | |
| "grad_norm": 0.3226850628852844, | |
| "learning_rate": 0.0005690774606872452, | |
| "loss": 3.7377, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 2.6194690265486726, | |
| "grad_norm": 0.3230507969856262, | |
| "learning_rate": 0.0005689027373325567, | |
| "loss": 3.7479, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.6194690265486726, | |
| "eval_accuracy": 0.3471528598918208, | |
| "eval_loss": 3.756319999694824, | |
| "eval_runtime": 179.7301, | |
| "eval_samples_per_second": 92.644, | |
| "eval_steps_per_second": 5.792, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.6340242198416393, | |
| "grad_norm": 0.31811442971229553, | |
| "learning_rate": 0.0005687280139778683, | |
| "loss": 3.7453, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 2.6485794131346063, | |
| "grad_norm": 0.3335292935371399, | |
| "learning_rate": 0.0005685532906231799, | |
| "loss": 3.7324, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 2.6631346064275734, | |
| "grad_norm": 0.34354859590530396, | |
| "learning_rate": 0.0005683785672684915, | |
| "loss": 3.7301, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 2.6776897997205404, | |
| "grad_norm": 0.33733129501342773, | |
| "learning_rate": 0.0005682038439138031, | |
| "loss": 3.7482, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 2.692244993013507, | |
| "grad_norm": 0.30945345759391785, | |
| "learning_rate": 0.0005680291205591147, | |
| "loss": 3.7516, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 2.706800186306474, | |
| "grad_norm": 0.3279203176498413, | |
| "learning_rate": 0.0005678543972044263, | |
| "loss": 3.7446, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 2.721355379599441, | |
| "grad_norm": 0.3235558569431305, | |
| "learning_rate": 0.0005676796738497378, | |
| "loss": 3.7371, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 2.735910572892408, | |
| "grad_norm": 0.3249393105506897, | |
| "learning_rate": 0.0005675049504950495, | |
| "loss": 3.7315, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 2.750465766185375, | |
| "grad_norm": 0.35973361134529114, | |
| "learning_rate": 0.0005673302271403611, | |
| "loss": 3.744, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 2.765020959478342, | |
| "grad_norm": 0.3144098222255707, | |
| "learning_rate": 0.0005671555037856726, | |
| "loss": 3.7254, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.779576152771309, | |
| "grad_norm": 0.330697625875473, | |
| "learning_rate": 0.0005669807804309842, | |
| "loss": 3.7328, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 2.7941313460642756, | |
| "grad_norm": 0.33324456214904785, | |
| "learning_rate": 0.0005668060570762958, | |
| "loss": 3.734, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 2.8086865393572427, | |
| "grad_norm": 0.3099815547466278, | |
| "learning_rate": 0.0005666313337216074, | |
| "loss": 3.7258, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 2.8232417326502097, | |
| "grad_norm": 0.31584757566452026, | |
| "learning_rate": 0.000566456610366919, | |
| "loss": 3.7359, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 2.8377969259431763, | |
| "grad_norm": 0.3102036118507385, | |
| "learning_rate": 0.0005662818870122306, | |
| "loss": 3.712, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 2.8523521192361434, | |
| "grad_norm": 0.32213059067726135, | |
| "learning_rate": 0.0005661071636575422, | |
| "loss": 3.743, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 2.8669073125291105, | |
| "grad_norm": 0.33274680376052856, | |
| "learning_rate": 0.0005659324403028537, | |
| "loss": 3.7324, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 2.8814625058220775, | |
| "grad_norm": 0.30774420499801636, | |
| "learning_rate": 0.0005657577169481653, | |
| "loss": 3.7203, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 2.896017699115044, | |
| "grad_norm": 0.33334290981292725, | |
| "learning_rate": 0.0005655829935934769, | |
| "loss": 3.7227, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 2.910572892408011, | |
| "grad_norm": 0.3120776414871216, | |
| "learning_rate": 0.0005654082702387886, | |
| "loss": 3.731, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.910572892408011, | |
| "eval_accuracy": 0.34952984964379236, | |
| "eval_loss": 3.729276180267334, | |
| "eval_runtime": 179.9069, | |
| "eval_samples_per_second": 92.553, | |
| "eval_steps_per_second": 5.786, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.9251280857009783, | |
| "grad_norm": 0.3083525598049164, | |
| "learning_rate": 0.0005652335468841001, | |
| "loss": 3.7183, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 2.939683278993945, | |
| "grad_norm": 0.33243975043296814, | |
| "learning_rate": 0.0005650588235294117, | |
| "loss": 3.7295, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 2.954238472286912, | |
| "grad_norm": 0.31787440180778503, | |
| "learning_rate": 0.0005648841001747233, | |
| "loss": 3.7152, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 2.968793665579879, | |
| "grad_norm": 0.318163126707077, | |
| "learning_rate": 0.0005647093768200349, | |
| "loss": 3.7209, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 2.983348858872846, | |
| "grad_norm": 0.32628071308135986, | |
| "learning_rate": 0.0005645346534653465, | |
| "loss": 3.7096, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 2.9979040521658127, | |
| "grad_norm": 0.32128211855888367, | |
| "learning_rate": 0.0005643599301106582, | |
| "loss": 3.7138, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.0122263623660923, | |
| "grad_norm": 0.32485687732696533, | |
| "learning_rate": 0.0005641852067559697, | |
| "loss": 3.6386, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 3.026781555659059, | |
| "grad_norm": 0.3298698663711548, | |
| "learning_rate": 0.0005640104834012812, | |
| "loss": 3.607, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.041336748952026, | |
| "grad_norm": 0.3299548923969269, | |
| "learning_rate": 0.0005638357600465929, | |
| "loss": 3.6174, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 3.055891942244993, | |
| "grad_norm": 0.32817062735557556, | |
| "learning_rate": 0.0005636610366919044, | |
| "loss": 3.6033, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.07044713553796, | |
| "grad_norm": 0.32066047191619873, | |
| "learning_rate": 0.0005634863133372161, | |
| "loss": 3.6116, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 3.0850023288309267, | |
| "grad_norm": 0.35941895842552185, | |
| "learning_rate": 0.0005633115899825276, | |
| "loss": 3.6274, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 3.099557522123894, | |
| "grad_norm": 0.31813332438468933, | |
| "learning_rate": 0.0005631368666278393, | |
| "loss": 3.6327, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 3.114112715416861, | |
| "grad_norm": 0.3107002079486847, | |
| "learning_rate": 0.0005629621432731508, | |
| "loss": 3.6317, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 3.1286679087098275, | |
| "grad_norm": 0.33238351345062256, | |
| "learning_rate": 0.0005627874199184623, | |
| "loss": 3.6321, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 3.1432231020027945, | |
| "grad_norm": 0.31337371468544006, | |
| "learning_rate": 0.000562612696563774, | |
| "loss": 3.6279, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.1577782952957616, | |
| "grad_norm": 0.3272722661495209, | |
| "learning_rate": 0.0005624379732090856, | |
| "loss": 3.6337, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 3.1723334885887287, | |
| "grad_norm": 0.3187547028064728, | |
| "learning_rate": 0.0005622632498543972, | |
| "loss": 3.6355, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.1868886818816953, | |
| "grad_norm": 0.32575252652168274, | |
| "learning_rate": 0.0005620885264997087, | |
| "loss": 3.6275, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 3.2014438751746623, | |
| "grad_norm": 0.32410597801208496, | |
| "learning_rate": 0.0005619138031450204, | |
| "loss": 3.6314, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.2014438751746623, | |
| "eval_accuracy": 0.35139030688847517, | |
| "eval_loss": 3.717708110809326, | |
| "eval_runtime": 179.6945, | |
| "eval_samples_per_second": 92.663, | |
| "eval_steps_per_second": 5.793, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.2159990684676294, | |
| "grad_norm": 0.3231126070022583, | |
| "learning_rate": 0.0005617390797903319, | |
| "loss": 3.6337, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 3.230554261760596, | |
| "grad_norm": 0.3237737715244293, | |
| "learning_rate": 0.0005615643564356436, | |
| "loss": 3.6431, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.245109455053563, | |
| "grad_norm": 0.32808512449264526, | |
| "learning_rate": 0.0005613896330809551, | |
| "loss": 3.6507, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 3.25966464834653, | |
| "grad_norm": 0.32704418897628784, | |
| "learning_rate": 0.0005612149097262667, | |
| "loss": 3.6319, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.274219841639497, | |
| "grad_norm": 0.3302861154079437, | |
| "learning_rate": 0.0005610401863715783, | |
| "loss": 3.6513, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 3.288775034932464, | |
| "grad_norm": 0.321476548910141, | |
| "learning_rate": 0.0005608654630168898, | |
| "loss": 3.6426, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.303330228225431, | |
| "grad_norm": 0.32909634709358215, | |
| "learning_rate": 0.0005606907396622015, | |
| "loss": 3.6423, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 3.317885421518398, | |
| "grad_norm": 0.341126412153244, | |
| "learning_rate": 0.0005605160163075131, | |
| "loss": 3.6457, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.3324406148113646, | |
| "grad_norm": 0.3450850546360016, | |
| "learning_rate": 0.0005603412929528247, | |
| "loss": 3.6463, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 3.3469958081043316, | |
| "grad_norm": 0.3120056092739105, | |
| "learning_rate": 0.0005601665695981362, | |
| "loss": 3.6387, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.3615510013972987, | |
| "grad_norm": 0.31879013776779175, | |
| "learning_rate": 0.0005599918462434478, | |
| "loss": 3.6382, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 3.3761061946902653, | |
| "grad_norm": 0.3154982924461365, | |
| "learning_rate": 0.0005598171228887594, | |
| "loss": 3.6479, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 3.3906613879832324, | |
| "grad_norm": 0.32268548011779785, | |
| "learning_rate": 0.0005596423995340709, | |
| "loss": 3.6382, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 3.4052165812761994, | |
| "grad_norm": 0.33158648014068604, | |
| "learning_rate": 0.0005594676761793826, | |
| "loss": 3.6418, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 3.419771774569166, | |
| "grad_norm": 0.32132261991500854, | |
| "learning_rate": 0.0005592929528246942, | |
| "loss": 3.6579, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 3.434326967862133, | |
| "grad_norm": 0.3326365351676941, | |
| "learning_rate": 0.0005591182294700058, | |
| "loss": 3.6439, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 3.4488821611551, | |
| "grad_norm": 0.31670433282852173, | |
| "learning_rate": 0.0005589435061153173, | |
| "loss": 3.6436, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 3.463437354448067, | |
| "grad_norm": 0.32499897480010986, | |
| "learning_rate": 0.000558768782760629, | |
| "loss": 3.6526, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 3.477992547741034, | |
| "grad_norm": 0.3316957354545593, | |
| "learning_rate": 0.0005585940594059406, | |
| "loss": 3.6533, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 3.492547741034001, | |
| "grad_norm": 0.31926876306533813, | |
| "learning_rate": 0.0005584193360512521, | |
| "loss": 3.6512, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.492547741034001, | |
| "eval_accuracy": 0.35302381890640605, | |
| "eval_loss": 3.6994526386260986, | |
| "eval_runtime": 179.7698, | |
| "eval_samples_per_second": 92.624, | |
| "eval_steps_per_second": 5.791, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.507102934326968, | |
| "grad_norm": 0.3377520442008972, | |
| "learning_rate": 0.0005582446126965637, | |
| "loss": 3.6448, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 3.5216581276199346, | |
| "grad_norm": 0.331097811460495, | |
| "learning_rate": 0.0005580698893418753, | |
| "loss": 3.6499, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 3.5362133209129016, | |
| "grad_norm": 0.3121351897716522, | |
| "learning_rate": 0.0005578951659871869, | |
| "loss": 3.643, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 3.5507685142058687, | |
| "grad_norm": 0.3216916024684906, | |
| "learning_rate": 0.0005577204426324985, | |
| "loss": 3.6516, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 3.5653237074988358, | |
| "grad_norm": 0.3219372630119324, | |
| "learning_rate": 0.0005575457192778101, | |
| "loss": 3.6361, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 3.5798789007918024, | |
| "grad_norm": 0.30819830298423767, | |
| "learning_rate": 0.0005573709959231217, | |
| "loss": 3.6477, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 3.5944340940847694, | |
| "grad_norm": 0.31696081161499023, | |
| "learning_rate": 0.0005571962725684332, | |
| "loss": 3.6524, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 3.6089892873777365, | |
| "grad_norm": 0.32272869348526, | |
| "learning_rate": 0.0005570215492137449, | |
| "loss": 3.6434, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 3.623544480670703, | |
| "grad_norm": 0.32395613193511963, | |
| "learning_rate": 0.0005568468258590564, | |
| "loss": 3.648, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 3.63809967396367, | |
| "grad_norm": 0.32271820306777954, | |
| "learning_rate": 0.0005566721025043681, | |
| "loss": 3.635, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 3.6526548672566372, | |
| "grad_norm": 0.3275328278541565, | |
| "learning_rate": 0.0005564973791496796, | |
| "loss": 3.6529, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 3.6672100605496043, | |
| "grad_norm": 0.3326575458049774, | |
| "learning_rate": 0.0005563226557949913, | |
| "loss": 3.635, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 3.681765253842571, | |
| "grad_norm": 0.3293660581111908, | |
| "learning_rate": 0.0005561479324403028, | |
| "loss": 3.6402, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 3.696320447135538, | |
| "grad_norm": 0.31002599000930786, | |
| "learning_rate": 0.0005559732090856144, | |
| "loss": 3.6294, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 3.710875640428505, | |
| "grad_norm": 0.32257211208343506, | |
| "learning_rate": 0.000555798485730926, | |
| "loss": 3.6576, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 3.7254308337214717, | |
| "grad_norm": 0.3172012269496918, | |
| "learning_rate": 0.0005556237623762376, | |
| "loss": 3.65, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 3.7399860270144387, | |
| "grad_norm": 0.31903111934661865, | |
| "learning_rate": 0.0005554490390215492, | |
| "loss": 3.6345, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 3.754541220307406, | |
| "grad_norm": 0.312339186668396, | |
| "learning_rate": 0.0005552743156668607, | |
| "loss": 3.6369, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 3.769096413600373, | |
| "grad_norm": 0.3321901261806488, | |
| "learning_rate": 0.0005550995923121724, | |
| "loss": 3.6356, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 3.7836516068933395, | |
| "grad_norm": 0.3350261449813843, | |
| "learning_rate": 0.0005549248689574839, | |
| "loss": 3.6358, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.7836516068933395, | |
| "eval_accuracy": 0.35473207829057946, | |
| "eval_loss": 3.6826794147491455, | |
| "eval_runtime": 179.7386, | |
| "eval_samples_per_second": 92.64, | |
| "eval_steps_per_second": 5.792, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.7982068001863065, | |
| "grad_norm": 0.3085910677909851, | |
| "learning_rate": 0.0005547501456027955, | |
| "loss": 3.6481, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 3.812761993479273, | |
| "grad_norm": 0.3271644115447998, | |
| "learning_rate": 0.0005545754222481071, | |
| "loss": 3.6547, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 3.82731718677224, | |
| "grad_norm": 0.32669100165367126, | |
| "learning_rate": 0.0005544006988934188, | |
| "loss": 3.6391, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 3.8418723800652073, | |
| "grad_norm": 0.3179662227630615, | |
| "learning_rate": 0.0005542259755387303, | |
| "loss": 3.6481, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 3.8564275733581743, | |
| "grad_norm": 0.3170970380306244, | |
| "learning_rate": 0.0005540512521840418, | |
| "loss": 3.6454, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 3.8709827666511414, | |
| "grad_norm": 0.32422661781311035, | |
| "learning_rate": 0.0005538765288293535, | |
| "loss": 3.6388, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 3.885537959944108, | |
| "grad_norm": 0.31919288635253906, | |
| "learning_rate": 0.0005537018054746651, | |
| "loss": 3.6434, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 3.900093153237075, | |
| "grad_norm": 0.3215365707874298, | |
| "learning_rate": 0.0005535270821199767, | |
| "loss": 3.6406, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 3.9146483465300417, | |
| "grad_norm": 0.3166106939315796, | |
| "learning_rate": 0.0005533523587652882, | |
| "loss": 3.6334, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 3.9292035398230087, | |
| "grad_norm": 0.30573785305023193, | |
| "learning_rate": 0.0005531776354105999, | |
| "loss": 3.6395, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.943758733115976, | |
| "grad_norm": 0.3205771744251251, | |
| "learning_rate": 0.0005530029120559114, | |
| "loss": 3.6447, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 3.958313926408943, | |
| "grad_norm": 0.3078632354736328, | |
| "learning_rate": 0.0005528281887012229, | |
| "loss": 3.6433, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 3.9728691197019095, | |
| "grad_norm": 0.3331320881843567, | |
| "learning_rate": 0.0005526534653465346, | |
| "loss": 3.6405, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 3.9874243129948765, | |
| "grad_norm": 0.3237683176994324, | |
| "learning_rate": 0.0005524787419918462, | |
| "loss": 3.636, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 4.001746623195156, | |
| "grad_norm": 0.32297882437705994, | |
| "learning_rate": 0.0005523040186371578, | |
| "loss": 3.6296, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 4.016301816488123, | |
| "grad_norm": 0.3407275378704071, | |
| "learning_rate": 0.0005521292952824693, | |
| "loss": 3.5279, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 4.03085700978109, | |
| "grad_norm": 0.3329641819000244, | |
| "learning_rate": 0.000551954571927781, | |
| "loss": 3.5292, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 4.045412203074057, | |
| "grad_norm": 0.3195856213569641, | |
| "learning_rate": 0.0005517798485730926, | |
| "loss": 3.5221, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 4.059967396367024, | |
| "grad_norm": 0.3252445459365845, | |
| "learning_rate": 0.0005516051252184042, | |
| "loss": 3.5307, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 4.074522589659991, | |
| "grad_norm": 0.31203794479370117, | |
| "learning_rate": 0.0005514304018637157, | |
| "loss": 3.5343, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.074522589659991, | |
| "eval_accuracy": 0.3562800304301699, | |
| "eval_loss": 3.6766884326934814, | |
| "eval_runtime": 179.6441, | |
| "eval_samples_per_second": 92.689, | |
| "eval_steps_per_second": 5.795, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.089077782952957, | |
| "grad_norm": 0.3372914791107178, | |
| "learning_rate": 0.0005512556785090273, | |
| "loss": 3.5412, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 4.103632976245924, | |
| "grad_norm": 0.31727397441864014, | |
| "learning_rate": 0.0005510809551543389, | |
| "loss": 3.5477, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 4.118188169538891, | |
| "grad_norm": 0.31574997305870056, | |
| "learning_rate": 0.0005509062317996504, | |
| "loss": 3.5506, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 4.132743362831858, | |
| "grad_norm": 0.34020867943763733, | |
| "learning_rate": 0.0005507315084449621, | |
| "loss": 3.5478, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 4.1472985561248255, | |
| "grad_norm": 0.3455803394317627, | |
| "learning_rate": 0.0005505567850902737, | |
| "loss": 3.5489, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 4.1618537494177925, | |
| "grad_norm": 0.3063943088054657, | |
| "learning_rate": 0.0005503820617355853, | |
| "loss": 3.5644, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 4.17640894271076, | |
| "grad_norm": 0.3226618766784668, | |
| "learning_rate": 0.0005502073383808969, | |
| "loss": 3.5668, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 4.190964136003726, | |
| "grad_norm": 0.33126798272132874, | |
| "learning_rate": 0.0005500326150262085, | |
| "loss": 3.556, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 4.205519329296693, | |
| "grad_norm": 0.34566444158554077, | |
| "learning_rate": 0.00054985789167152, | |
| "loss": 3.5627, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 4.22007452258966, | |
| "grad_norm": 0.32717248797416687, | |
| "learning_rate": 0.0005496831683168316, | |
| "loss": 3.5734, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.234629715882627, | |
| "grad_norm": 0.34188687801361084, | |
| "learning_rate": 0.0005495084449621433, | |
| "loss": 3.5546, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 4.249184909175594, | |
| "grad_norm": 0.3105984032154083, | |
| "learning_rate": 0.0005493337216074548, | |
| "loss": 3.5519, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 4.263740102468561, | |
| "grad_norm": 0.30693769454956055, | |
| "learning_rate": 0.0005491589982527664, | |
| "loss": 3.5647, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 4.278295295761528, | |
| "grad_norm": 0.3240891396999359, | |
| "learning_rate": 0.000548984274898078, | |
| "loss": 3.5612, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 4.292850489054494, | |
| "grad_norm": 0.3513204753398895, | |
| "learning_rate": 0.0005488095515433897, | |
| "loss": 3.5624, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 4.307405682347461, | |
| "grad_norm": 0.3214813470840454, | |
| "learning_rate": 0.0005486348281887012, | |
| "loss": 3.5711, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 4.321960875640428, | |
| "grad_norm": 0.32047712802886963, | |
| "learning_rate": 0.0005484601048340127, | |
| "loss": 3.5641, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 4.3365160689333955, | |
| "grad_norm": 0.33672577142715454, | |
| "learning_rate": 0.0005482853814793244, | |
| "loss": 3.5647, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 4.3510712622263625, | |
| "grad_norm": 0.3281972408294678, | |
| "learning_rate": 0.0005481106581246359, | |
| "loss": 3.572, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 4.36562645551933, | |
| "grad_norm": 0.32151132822036743, | |
| "learning_rate": 0.0005479359347699475, | |
| "loss": 3.5705, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.36562645551933, | |
| "eval_accuracy": 0.35709931327620176, | |
| "eval_loss": 3.6636641025543213, | |
| "eval_runtime": 180.0045, | |
| "eval_samples_per_second": 92.503, | |
| "eval_steps_per_second": 5.783, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.380181648812297, | |
| "grad_norm": 0.31978142261505127, | |
| "learning_rate": 0.0005477612114152591, | |
| "loss": 3.5752, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 4.394736842105263, | |
| "grad_norm": 0.32059603929519653, | |
| "learning_rate": 0.0005475864880605708, | |
| "loss": 3.5737, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 4.40929203539823, | |
| "grad_norm": 0.31743520498275757, | |
| "learning_rate": 0.0005474117647058823, | |
| "loss": 3.5712, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 4.423847228691197, | |
| "grad_norm": 0.32264477014541626, | |
| "learning_rate": 0.0005472370413511939, | |
| "loss": 3.5719, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 4.438402421984164, | |
| "grad_norm": 0.3091832101345062, | |
| "learning_rate": 0.0005470623179965055, | |
| "loss": 3.5736, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 4.452957615277131, | |
| "grad_norm": 0.32630136609077454, | |
| "learning_rate": 0.0005468875946418171, | |
| "loss": 3.5729, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 4.467512808570098, | |
| "grad_norm": 0.30980727076530457, | |
| "learning_rate": 0.0005467128712871287, | |
| "loss": 3.5674, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 4.482068001863064, | |
| "grad_norm": 0.33094680309295654, | |
| "learning_rate": 0.0005465381479324402, | |
| "loss": 3.5672, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 4.496623195156031, | |
| "grad_norm": 0.32282018661499023, | |
| "learning_rate": 0.0005463634245777519, | |
| "loss": 3.5749, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 4.511178388448998, | |
| "grad_norm": 0.32736220955848694, | |
| "learning_rate": 0.0005461887012230634, | |
| "loss": 3.5846, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 4.5257335817419655, | |
| "grad_norm": 0.3186218738555908, | |
| "learning_rate": 0.000546013977868375, | |
| "loss": 3.5792, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 4.5402887750349326, | |
| "grad_norm": 0.31435152888298035, | |
| "learning_rate": 0.0005458392545136866, | |
| "loss": 3.5753, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 4.5548439683279, | |
| "grad_norm": 0.3342227339744568, | |
| "learning_rate": 0.0005456645311589983, | |
| "loss": 3.5673, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 4.569399161620867, | |
| "grad_norm": 0.31737950444221497, | |
| "learning_rate": 0.0005454898078043098, | |
| "loss": 3.5851, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 4.583954354913834, | |
| "grad_norm": 0.34083765745162964, | |
| "learning_rate": 0.0005453150844496213, | |
| "loss": 3.5792, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 4.5985095482068, | |
| "grad_norm": 0.33823180198669434, | |
| "learning_rate": 0.000545140361094933, | |
| "loss": 3.5838, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 4.613064741499767, | |
| "grad_norm": 0.3152465522289276, | |
| "learning_rate": 0.0005449656377402445, | |
| "loss": 3.5888, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 4.627619934792734, | |
| "grad_norm": 0.309803307056427, | |
| "learning_rate": 0.0005447909143855562, | |
| "loss": 3.5648, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 4.642175128085701, | |
| "grad_norm": 0.31014177203178406, | |
| "learning_rate": 0.0005446161910308677, | |
| "loss": 3.5716, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 4.656730321378668, | |
| "grad_norm": 0.3170645534992218, | |
| "learning_rate": 0.0005444414676761794, | |
| "loss": 3.5819, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.656730321378668, | |
| "eval_accuracy": 0.3584822570789928, | |
| "eval_loss": 3.6490225791931152, | |
| "eval_runtime": 179.6718, | |
| "eval_samples_per_second": 92.675, | |
| "eval_steps_per_second": 5.794, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.671285514671635, | |
| "grad_norm": 0.32944896817207336, | |
| "learning_rate": 0.0005442667443214909, | |
| "loss": 3.5842, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 4.685840707964601, | |
| "grad_norm": 0.33239537477493286, | |
| "learning_rate": 0.0005440920209668024, | |
| "loss": 3.5775, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 4.7003959012575685, | |
| "grad_norm": 0.34176382422447205, | |
| "learning_rate": 0.0005439172976121141, | |
| "loss": 3.5865, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 4.7149510945505355, | |
| "grad_norm": 0.31231263279914856, | |
| "learning_rate": 0.0005437425742574257, | |
| "loss": 3.565, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 4.729506287843503, | |
| "grad_norm": 0.32177722454071045, | |
| "learning_rate": 0.0005435678509027373, | |
| "loss": 3.5874, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 4.74406148113647, | |
| "grad_norm": 0.32989418506622314, | |
| "learning_rate": 0.0005433931275480488, | |
| "loss": 3.5831, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 4.758616674429437, | |
| "grad_norm": 0.3280948996543884, | |
| "learning_rate": 0.0005432184041933605, | |
| "loss": 3.5809, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 4.773171867722404, | |
| "grad_norm": 0.3182077705860138, | |
| "learning_rate": 0.000543043680838672, | |
| "loss": 3.5832, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 4.78772706101537, | |
| "grad_norm": 0.325810045003891, | |
| "learning_rate": 0.0005428689574839837, | |
| "loss": 3.5741, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 4.802282254308337, | |
| "grad_norm": null, | |
| "learning_rate": 0.0005426942341292952, | |
| "loss": 3.5867, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 4.816837447601304, | |
| "grad_norm": 0.31739383935928345, | |
| "learning_rate": 0.0005425195107746068, | |
| "loss": 3.5814, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 4.831392640894271, | |
| "grad_norm": 0.31025153398513794, | |
| "learning_rate": 0.0005423447874199184, | |
| "loss": 3.586, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 4.845947834187238, | |
| "grad_norm": 0.31311795115470886, | |
| "learning_rate": 0.00054217006406523, | |
| "loss": 3.5723, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 4.860503027480205, | |
| "grad_norm": 0.3323357105255127, | |
| "learning_rate": 0.0005419953407105417, | |
| "loss": 3.583, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 4.875058220773171, | |
| "grad_norm": 0.3083381950855255, | |
| "learning_rate": 0.0005418206173558532, | |
| "loss": 3.5868, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 4.8896134140661385, | |
| "grad_norm": 0.3021372854709625, | |
| "learning_rate": 0.0005416458940011648, | |
| "loss": 3.5871, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 4.9041686073591055, | |
| "grad_norm": 0.3238731324672699, | |
| "learning_rate": 0.0005414711706464764, | |
| "loss": 3.5791, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 4.918723800652073, | |
| "grad_norm": 0.32329273223876953, | |
| "learning_rate": 0.000541296447291788, | |
| "loss": 3.5852, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 4.93327899394504, | |
| "grad_norm": 0.3102934658527374, | |
| "learning_rate": 0.0005411217239370995, | |
| "loss": 3.5786, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 4.947834187238007, | |
| "grad_norm": 0.312805712223053, | |
| "learning_rate": 0.0005409470005824111, | |
| "loss": 3.5651, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.947834187238007, | |
| "eval_accuracy": 0.359419537339659, | |
| "eval_loss": 3.635596513748169, | |
| "eval_runtime": 179.8855, | |
| "eval_samples_per_second": 92.564, | |
| "eval_steps_per_second": 5.787, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.962389380530974, | |
| "grad_norm": 0.3215927183628082, | |
| "learning_rate": 0.0005407722772277228, | |
| "loss": 3.5664, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 4.976944573823941, | |
| "grad_norm": 0.3170582950115204, | |
| "learning_rate": 0.0005405975538730343, | |
| "loss": 3.5686, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 4.991499767116907, | |
| "grad_norm": 0.32551613450050354, | |
| "learning_rate": 0.0005404228305183459, | |
| "loss": 3.5772, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 5.005822077317187, | |
| "grad_norm": 0.33707520365715027, | |
| "learning_rate": 0.0005402481071636575, | |
| "loss": 3.5305, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 5.020377270610154, | |
| "grad_norm": 0.3136267364025116, | |
| "learning_rate": 0.0005400733838089692, | |
| "loss": 3.4635, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 5.034932463903121, | |
| "grad_norm": 0.3290776014328003, | |
| "learning_rate": 0.0005398986604542807, | |
| "loss": 3.4722, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 5.049487657196088, | |
| "grad_norm": 0.3506677746772766, | |
| "learning_rate": 0.0005397239370995922, | |
| "loss": 3.4764, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 5.064042850489055, | |
| "grad_norm": 0.31434836983680725, | |
| "learning_rate": 0.0005395492137449039, | |
| "loss": 3.4737, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 5.078598043782021, | |
| "grad_norm": 0.3276131749153137, | |
| "learning_rate": 0.0005393744903902154, | |
| "loss": 3.4842, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 5.093153237074988, | |
| "grad_norm": 0.3244301974773407, | |
| "learning_rate": 0.000539199767035527, | |
| "loss": 3.4819, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 5.107708430367955, | |
| "grad_norm": 0.32069942355155945, | |
| "learning_rate": 0.0005390250436808386, | |
| "loss": 3.4811, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 5.122263623660922, | |
| "grad_norm": 0.3195439577102661, | |
| "learning_rate": 0.0005388503203261503, | |
| "loss": 3.5033, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 5.136818816953889, | |
| "grad_norm": 0.33659523725509644, | |
| "learning_rate": 0.0005386755969714618, | |
| "loss": 3.485, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 5.151374010246856, | |
| "grad_norm": 0.326090931892395, | |
| "learning_rate": 0.0005385008736167733, | |
| "loss": 3.4878, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 5.165929203539823, | |
| "grad_norm": 0.3459244668483734, | |
| "learning_rate": 0.000538326150262085, | |
| "loss": 3.498, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 5.18048439683279, | |
| "grad_norm": 0.3431548774242401, | |
| "learning_rate": 0.0005381514269073965, | |
| "loss": 3.4766, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 5.195039590125757, | |
| "grad_norm": 0.3117247223854065, | |
| "learning_rate": 0.0005379767035527082, | |
| "loss": 3.4959, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 5.209594783418724, | |
| "grad_norm": 0.3133803904056549, | |
| "learning_rate": 0.0005378019801980197, | |
| "loss": 3.5106, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 5.224149976711691, | |
| "grad_norm": 0.3180356025695801, | |
| "learning_rate": 0.0005376272568433314, | |
| "loss": 3.4932, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 5.238705170004658, | |
| "grad_norm": 0.3265446424484253, | |
| "learning_rate": 0.0005374525334886429, | |
| "loss": 3.504, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.238705170004658, | |
| "eval_accuracy": 0.36018628548017134, | |
| "eval_loss": 3.6369755268096924, | |
| "eval_runtime": 179.8999, | |
| "eval_samples_per_second": 92.557, | |
| "eval_steps_per_second": 5.787, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.253260363297625, | |
| "grad_norm": 0.31579458713531494, | |
| "learning_rate": 0.0005372778101339545, | |
| "loss": 3.5122, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 5.267815556590591, | |
| "grad_norm": 0.3140856623649597, | |
| "learning_rate": 0.0005371030867792661, | |
| "loss": 3.503, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 5.282370749883558, | |
| "grad_norm": 0.32965609431266785, | |
| "learning_rate": 0.0005369283634245778, | |
| "loss": 3.5008, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 5.296925943176525, | |
| "grad_norm": 0.31670889258384705, | |
| "learning_rate": 0.0005367536400698893, | |
| "loss": 3.504, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 5.311481136469492, | |
| "grad_norm": 0.3167295455932617, | |
| "learning_rate": 0.0005365789167152008, | |
| "loss": 3.5188, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 5.326036329762459, | |
| "grad_norm": 0.3241732120513916, | |
| "learning_rate": 0.0005364041933605125, | |
| "loss": 3.5228, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 5.340591523055426, | |
| "grad_norm": 0.3414795398712158, | |
| "learning_rate": 0.000536229470005824, | |
| "loss": 3.5113, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 5.3551467163483935, | |
| "grad_norm": 0.3266086280345917, | |
| "learning_rate": 0.0005360547466511357, | |
| "loss": 3.5104, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 5.36970190964136, | |
| "grad_norm": 0.3137924373149872, | |
| "learning_rate": 0.0005358800232964472, | |
| "loss": 3.5278, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 5.384257102934327, | |
| "grad_norm": 0.33976447582244873, | |
| "learning_rate": 0.0005357052999417589, | |
| "loss": 3.5211, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 5.398812296227294, | |
| "grad_norm": 0.33553993701934814, | |
| "learning_rate": 0.0005355305765870704, | |
| "loss": 3.5045, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 5.413367489520261, | |
| "grad_norm": 0.3117082417011261, | |
| "learning_rate": 0.000535355853232382, | |
| "loss": 3.5246, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 5.427922682813228, | |
| "grad_norm": 0.3177870213985443, | |
| "learning_rate": 0.0005351811298776936, | |
| "loss": 3.5214, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 5.442477876106195, | |
| "grad_norm": 0.321214884519577, | |
| "learning_rate": 0.0005350064065230052, | |
| "loss": 3.5248, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 5.457033069399162, | |
| "grad_norm": 0.34071120619773865, | |
| "learning_rate": 0.0005348316831683168, | |
| "loss": 3.5203, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 5.471588262692128, | |
| "grad_norm": 0.3280022442340851, | |
| "learning_rate": 0.0005346569598136284, | |
| "loss": 3.5155, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 5.486143455985095, | |
| "grad_norm": 0.35156744718551636, | |
| "learning_rate": 0.00053448223645894, | |
| "loss": 3.5253, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 5.500698649278062, | |
| "grad_norm": 0.3191376328468323, | |
| "learning_rate": 0.0005343075131042515, | |
| "loss": 3.5277, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 5.515253842571029, | |
| "grad_norm": 0.3046784996986389, | |
| "learning_rate": 0.0005341327897495632, | |
| "loss": 3.5235, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 5.529809035863996, | |
| "grad_norm": 0.3302401304244995, | |
| "learning_rate": 0.0005339580663948748, | |
| "loss": 3.5158, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.529809035863996, | |
| "eval_accuracy": 0.36118879339534155, | |
| "eval_loss": 3.6266047954559326, | |
| "eval_runtime": 179.8074, | |
| "eval_samples_per_second": 92.605, | |
| "eval_steps_per_second": 5.79, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.5443642291569635, | |
| "grad_norm": 0.33206748962402344, | |
| "learning_rate": 0.0005337833430401863, | |
| "loss": 3.5214, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 5.5589194224499305, | |
| "grad_norm": 0.3455377221107483, | |
| "learning_rate": 0.0005336086196854979, | |
| "loss": 3.5253, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 5.573474615742897, | |
| "grad_norm": 0.34505409002304077, | |
| "learning_rate": 0.0005334338963308095, | |
| "loss": 3.5184, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 5.588029809035864, | |
| "grad_norm": 0.33611926436424255, | |
| "learning_rate": 0.0005332591729761211, | |
| "loss": 3.5453, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 5.602585002328831, | |
| "grad_norm": 0.3275839388370514, | |
| "learning_rate": 0.0005330844496214327, | |
| "loss": 3.5193, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 5.617140195621798, | |
| "grad_norm": 0.3343163728713989, | |
| "learning_rate": 0.0005329097262667443, | |
| "loss": 3.5281, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 5.631695388914765, | |
| "grad_norm": 0.3136567175388336, | |
| "learning_rate": 0.0005327350029120559, | |
| "loss": 3.532, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 5.646250582207732, | |
| "grad_norm": 0.32538020610809326, | |
| "learning_rate": 0.0005325602795573674, | |
| "loss": 3.535, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 5.660805775500698, | |
| "grad_norm": 0.3288552165031433, | |
| "learning_rate": 0.000532385556202679, | |
| "loss": 3.5314, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 5.675360968793665, | |
| "grad_norm": 0.3208260238170624, | |
| "learning_rate": 0.0005322108328479906, | |
| "loss": 3.5216, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 5.689916162086632, | |
| "grad_norm": 0.3328034579753876, | |
| "learning_rate": 0.0005320361094933023, | |
| "loss": 3.5281, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 5.704471355379599, | |
| "grad_norm": 0.328398197889328, | |
| "learning_rate": 0.0005318613861386138, | |
| "loss": 3.5225, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 5.719026548672566, | |
| "grad_norm": 0.3146987855434418, | |
| "learning_rate": 0.0005316866627839254, | |
| "loss": 3.5241, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 5.7335817419655335, | |
| "grad_norm": 0.3119967579841614, | |
| "learning_rate": 0.000531511939429237, | |
| "loss": 3.5245, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 5.748136935258501, | |
| "grad_norm": 0.34043416380882263, | |
| "learning_rate": 0.0005313372160745486, | |
| "loss": 3.529, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 5.762692128551468, | |
| "grad_norm": 0.3133246898651123, | |
| "learning_rate": 0.0005311624927198602, | |
| "loss": 3.5376, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 5.777247321844434, | |
| "grad_norm": 0.3169836103916168, | |
| "learning_rate": 0.0005309877693651717, | |
| "loss": 3.5233, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 5.791802515137401, | |
| "grad_norm": 0.35480526089668274, | |
| "learning_rate": 0.0005308130460104834, | |
| "loss": 3.5258, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 5.806357708430368, | |
| "grad_norm": 0.31685981154441833, | |
| "learning_rate": 0.0005306383226557949, | |
| "loss": 3.5233, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 5.820912901723335, | |
| "grad_norm": 0.3257889747619629, | |
| "learning_rate": 0.0005304635993011065, | |
| "loss": 3.5296, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.820912901723335, | |
| "eval_accuracy": 0.3616800575319665, | |
| "eval_loss": 3.6151504516601562, | |
| "eval_runtime": 179.7812, | |
| "eval_samples_per_second": 92.618, | |
| "eval_steps_per_second": 5.79, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.835468095016302, | |
| "grad_norm": 0.33038684725761414, | |
| "learning_rate": 0.0005302888759464181, | |
| "loss": 3.4297, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 5.850023288309269, | |
| "grad_norm": 0.3143859803676605, | |
| "learning_rate": 0.0005301141525917298, | |
| "loss": 3.433, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 5.864578481602235, | |
| "grad_norm": 0.30916011333465576, | |
| "learning_rate": 0.0005299394292370413, | |
| "loss": 3.4304, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 5.879133674895202, | |
| "grad_norm": 0.31968605518341064, | |
| "learning_rate": 0.0005297647058823528, | |
| "loss": 3.4407, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 5.893688868188169, | |
| "grad_norm": 0.33643290400505066, | |
| "learning_rate": 0.0005295899825276645, | |
| "loss": 3.4537, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 5.9082440614811365, | |
| "grad_norm": 0.32637345790863037, | |
| "learning_rate": 0.000529415259172976, | |
| "loss": 3.448, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 5.9227992547741035, | |
| "grad_norm": 0.33530595898628235, | |
| "learning_rate": 0.0005292405358182877, | |
| "loss": 3.4441, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 5.937354448067071, | |
| "grad_norm": 0.3074623644351959, | |
| "learning_rate": 0.0005290658124635992, | |
| "loss": 3.4434, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 5.951909641360038, | |
| "grad_norm": 0.33070844411849976, | |
| "learning_rate": 0.0005288910891089109, | |
| "loss": 3.4484, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 5.966464834653004, | |
| "grad_norm": 0.35226529836654663, | |
| "learning_rate": 0.0005287163657542224, | |
| "loss": 3.4417, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 5.981020027945971, | |
| "grad_norm": 0.3253940939903259, | |
| "learning_rate": 0.000528541642399534, | |
| "loss": 3.4547, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 5.995575221238938, | |
| "grad_norm": 0.3224802613258362, | |
| "learning_rate": 0.0005283669190448456, | |
| "loss": 3.4531, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 6.010188635305076, | |
| "grad_norm": 0.32806479930877686, | |
| "learning_rate": 0.0005281921956901572, | |
| "loss": 3.4962, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 6.0247438285980435, | |
| "grad_norm": 0.32519295811653137, | |
| "learning_rate": 0.0005280174723354688, | |
| "loss": 3.4279, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 6.0392990218910105, | |
| "grad_norm": 0.32187598943710327, | |
| "learning_rate": 0.0005278427489807804, | |
| "loss": 3.4395, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 6.053854215183978, | |
| "grad_norm": 0.32578256726264954, | |
| "learning_rate": 0.000527668025626092, | |
| "loss": 3.4385, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 6.068409408476945, | |
| "grad_norm": 0.3390205502510071, | |
| "learning_rate": 0.0005274933022714035, | |
| "loss": 3.4391, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 6.082964601769912, | |
| "grad_norm": 0.35239386558532715, | |
| "learning_rate": 0.0005273185789167152, | |
| "loss": 3.4394, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 6.097519795062879, | |
| "grad_norm": 0.3355068266391754, | |
| "learning_rate": 0.0005271438555620268, | |
| "loss": 3.4479, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 6.112074988355845, | |
| "grad_norm": 0.3424798846244812, | |
| "learning_rate": 0.0005269691322073384, | |
| "loss": 3.4522, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.112074988355845, | |
| "eval_accuracy": 0.3620054906406543, | |
| "eval_loss": 3.6232964992523193, | |
| "eval_runtime": 180.5945, | |
| "eval_samples_per_second": 92.201, | |
| "eval_steps_per_second": 5.764, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.126630181648812, | |
| "grad_norm": 0.3279637396335602, | |
| "learning_rate": 0.0005267944088526499, | |
| "loss": 3.4444, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 6.141185374941779, | |
| "grad_norm": 0.3577294945716858, | |
| "learning_rate": 0.0005266196854979615, | |
| "loss": 3.4538, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 6.155740568234746, | |
| "grad_norm": 0.3284136652946472, | |
| "learning_rate": 0.0005264449621432731, | |
| "loss": 3.4434, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 6.170295761527713, | |
| "grad_norm": 0.32023024559020996, | |
| "learning_rate": 0.0005262702387885847, | |
| "loss": 3.4613, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 6.18485095482068, | |
| "grad_norm": 0.33147501945495605, | |
| "learning_rate": 0.0005260955154338963, | |
| "loss": 3.4673, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 6.199406148113647, | |
| "grad_norm": 0.31914031505584717, | |
| "learning_rate": 0.0005259207920792079, | |
| "loss": 3.455, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 6.2139613414066135, | |
| "grad_norm": 0.31116756796836853, | |
| "learning_rate": 0.0005257460687245195, | |
| "loss": 3.4751, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 6.2285165346995806, | |
| "grad_norm": 0.35089877247810364, | |
| "learning_rate": 0.000525571345369831, | |
| "loss": 3.4747, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 6.243071727992548, | |
| "grad_norm": 0.34410837292671204, | |
| "learning_rate": 0.0005253966220151426, | |
| "loss": 3.4672, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 6.257626921285515, | |
| "grad_norm": 0.3280709683895111, | |
| "learning_rate": 0.0005252218986604543, | |
| "loss": 3.4755, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 6.272182114578482, | |
| "grad_norm": 0.32187557220458984, | |
| "learning_rate": 0.0005250471753057658, | |
| "loss": 3.4698, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 6.286737307871449, | |
| "grad_norm": 0.3259919583797455, | |
| "learning_rate": 0.0005248724519510774, | |
| "loss": 3.4712, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 6.301292501164416, | |
| "grad_norm": 0.3311648666858673, | |
| "learning_rate": 0.000524697728596389, | |
| "loss": 3.4683, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 6.315847694457382, | |
| "grad_norm": 0.32754069566726685, | |
| "learning_rate": 0.0005245230052417006, | |
| "loss": 3.4672, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 6.330402887750349, | |
| "grad_norm": 0.3217179775238037, | |
| "learning_rate": 0.0005243482818870122, | |
| "loss": 3.4792, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 6.344958081043316, | |
| "grad_norm": 0.3179493546485901, | |
| "learning_rate": 0.0005241735585323238, | |
| "loss": 3.4861, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 6.359513274336283, | |
| "grad_norm": 0.33075499534606934, | |
| "learning_rate": 0.0005239988351776354, | |
| "loss": 3.4686, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 6.37406846762925, | |
| "grad_norm": 0.328039288520813, | |
| "learning_rate": 0.0005238241118229469, | |
| "loss": 3.4802, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 6.388623660922217, | |
| "grad_norm": 0.31766510009765625, | |
| "learning_rate": 0.0005236493884682585, | |
| "loss": 3.4669, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 6.403178854215184, | |
| "grad_norm": 0.31190410256385803, | |
| "learning_rate": 0.0005234746651135701, | |
| "loss": 3.4725, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.403178854215184, | |
| "eval_accuracy": 0.3630159904125925, | |
| "eval_loss": 3.6138389110565186, | |
| "eval_runtime": 180.2927, | |
| "eval_samples_per_second": 92.355, | |
| "eval_steps_per_second": 5.774, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.417734047508151, | |
| "grad_norm": 0.31264427304267883, | |
| "learning_rate": 0.0005232999417588818, | |
| "loss": 3.4819, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 6.432289240801118, | |
| "grad_norm": 0.3015226721763611, | |
| "learning_rate": 0.0005231252184041933, | |
| "loss": 3.4723, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 6.446844434094085, | |
| "grad_norm": 0.33731818199157715, | |
| "learning_rate": 0.0005229504950495049, | |
| "loss": 3.4691, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 6.461399627387052, | |
| "grad_norm": 0.35695183277130127, | |
| "learning_rate": 0.0005227757716948165, | |
| "loss": 3.47, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 6.475954820680019, | |
| "grad_norm": 0.327970027923584, | |
| "learning_rate": 0.000522601048340128, | |
| "loss": 3.4796, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 6.490510013972986, | |
| "grad_norm": 0.32647427916526794, | |
| "learning_rate": 0.0005224263249854397, | |
| "loss": 3.4752, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 6.505065207265952, | |
| "grad_norm": 0.3130800724029541, | |
| "learning_rate": 0.0005222516016307512, | |
| "loss": 3.4955, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 6.519620400558919, | |
| "grad_norm": 0.3222516179084778, | |
| "learning_rate": 0.0005220768782760629, | |
| "loss": 3.4757, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 6.534175593851886, | |
| "grad_norm": 0.3477848768234253, | |
| "learning_rate": 0.0005219021549213744, | |
| "loss": 3.4844, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 6.548730787144853, | |
| "grad_norm": 0.31548774242401123, | |
| "learning_rate": 0.000521727431566686, | |
| "loss": 3.4899, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 6.56328598043782, | |
| "grad_norm": 0.31945446133613586, | |
| "learning_rate": 0.0005215527082119976, | |
| "loss": 3.4746, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 6.577841173730787, | |
| "grad_norm": 0.32317468523979187, | |
| "learning_rate": 0.0005213779848573093, | |
| "loss": 3.4789, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 6.592396367023754, | |
| "grad_norm": 0.3340426981449127, | |
| "learning_rate": 0.0005212032615026208, | |
| "loss": 3.4807, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 6.6069515603167215, | |
| "grad_norm": 0.3395540714263916, | |
| "learning_rate": 0.0005210285381479323, | |
| "loss": 3.4845, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 6.621506753609688, | |
| "grad_norm": 0.32293739914894104, | |
| "learning_rate": 0.000520853814793244, | |
| "loss": 3.504, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 6.636061946902655, | |
| "grad_norm": 0.326775461435318, | |
| "learning_rate": 0.0005206790914385555, | |
| "loss": 3.4941, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 6.650617140195622, | |
| "grad_norm": 0.3333836495876312, | |
| "learning_rate": 0.0005205043680838672, | |
| "loss": 3.4877, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 6.665172333488589, | |
| "grad_norm": 0.3150651454925537, | |
| "learning_rate": 0.0005203296447291787, | |
| "loss": 3.4789, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 6.679727526781556, | |
| "grad_norm": 0.321943998336792, | |
| "learning_rate": 0.0005201549213744904, | |
| "loss": 3.5005, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 6.694282720074523, | |
| "grad_norm": 0.32264676690101624, | |
| "learning_rate": 0.0005199801980198019, | |
| "loss": 3.4922, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.694282720074523, | |
| "eval_accuracy": 0.36367884441512005, | |
| "eval_loss": 3.6030995845794678, | |
| "eval_runtime": 180.3566, | |
| "eval_samples_per_second": 92.323, | |
| "eval_steps_per_second": 5.772, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.708837913367489, | |
| "grad_norm": 0.3198070228099823, | |
| "learning_rate": 0.0005198054746651136, | |
| "loss": 3.5006, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 6.723393106660456, | |
| "grad_norm": 0.3150603175163269, | |
| "learning_rate": 0.0005196307513104251, | |
| "loss": 3.491, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 6.737948299953423, | |
| "grad_norm": 0.30650636553764343, | |
| "learning_rate": 0.0005194560279557367, | |
| "loss": 3.4876, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 6.75250349324639, | |
| "grad_norm": 0.3138138949871063, | |
| "learning_rate": 0.0005192813046010483, | |
| "loss": 3.481, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 6.767058686539357, | |
| "grad_norm": 0.33389389514923096, | |
| "learning_rate": 0.0005191065812463599, | |
| "loss": 3.4909, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 6.781613879832324, | |
| "grad_norm": 0.3149167597293854, | |
| "learning_rate": 0.0005189318578916715, | |
| "loss": 3.4881, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 6.7961690731252915, | |
| "grad_norm": 0.3338867723941803, | |
| "learning_rate": 0.000518757134536983, | |
| "loss": 3.4817, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 6.810724266418258, | |
| "grad_norm": 0.330256849527359, | |
| "learning_rate": 0.0005185824111822947, | |
| "loss": 3.4843, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 6.825279459711225, | |
| "grad_norm": 0.33427727222442627, | |
| "learning_rate": 0.0005184076878276063, | |
| "loss": 3.4739, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 6.839834653004192, | |
| "grad_norm": 0.32662829756736755, | |
| "learning_rate": 0.0005182329644729179, | |
| "loss": 3.4982, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 6.854389846297159, | |
| "grad_norm": 0.30232712626457214, | |
| "learning_rate": 0.0005180582411182294, | |
| "loss": 3.4917, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 6.868945039590126, | |
| "grad_norm": 0.329941064119339, | |
| "learning_rate": 0.000517883517763541, | |
| "loss": 3.508, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 6.883500232883093, | |
| "grad_norm": 0.32964077591896057, | |
| "learning_rate": 0.0005177087944088526, | |
| "loss": 3.5106, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 6.898055426176059, | |
| "grad_norm": 0.3354266881942749, | |
| "learning_rate": 0.0005175340710541642, | |
| "loss": 3.4899, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 6.912610619469026, | |
| "grad_norm": 0.3470557928085327, | |
| "learning_rate": 0.0005173593476994758, | |
| "loss": 3.4932, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 6.927165812761993, | |
| "grad_norm": 0.31392523646354675, | |
| "learning_rate": 0.0005171846243447874, | |
| "loss": 3.4967, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 6.94172100605496, | |
| "grad_norm": 0.3317205607891083, | |
| "learning_rate": 0.000517009900990099, | |
| "loss": 3.4931, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 6.956276199347927, | |
| "grad_norm": 0.3179618716239929, | |
| "learning_rate": 0.0005168351776354105, | |
| "loss": 3.4857, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 6.9708313926408945, | |
| "grad_norm": 0.3277272582054138, | |
| "learning_rate": 0.0005166604542807221, | |
| "loss": 3.5021, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 6.9853865859338615, | |
| "grad_norm": 0.3022093176841736, | |
| "learning_rate": 0.0005164857309260338, | |
| "loss": 3.5007, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 6.9853865859338615, | |
| "eval_accuracy": 0.3649192275964456, | |
| "eval_loss": 3.59055233001709, | |
| "eval_runtime": 180.2837, | |
| "eval_samples_per_second": 92.36, | |
| "eval_steps_per_second": 5.774, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 6.999941779226829, | |
| "grad_norm": 0.32829391956329346, | |
| "learning_rate": 0.0005163110075713453, | |
| "loss": 3.497, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 7.014264089427107, | |
| "grad_norm": 0.3092605769634247, | |
| "learning_rate": 0.0005161362842166569, | |
| "loss": 3.3899, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 7.028819282720074, | |
| "grad_norm": 0.3218209743499756, | |
| "learning_rate": 0.0005159615608619685, | |
| "loss": 3.3821, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 7.0433744760130415, | |
| "grad_norm": 0.34114161133766174, | |
| "learning_rate": 0.0005157868375072801, | |
| "loss": 3.3843, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 7.0579296693060085, | |
| "grad_norm": 0.3404031991958618, | |
| "learning_rate": 0.0005156121141525917, | |
| "loss": 3.384, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 7.072484862598976, | |
| "grad_norm": 0.32840147614479065, | |
| "learning_rate": 0.0005154373907979033, | |
| "loss": 3.3942, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 7.087040055891943, | |
| "grad_norm": 0.3449382781982422, | |
| "learning_rate": 0.0005152626674432149, | |
| "loss": 3.3896, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 7.101595249184909, | |
| "grad_norm": 0.3175411522388458, | |
| "learning_rate": 0.0005150879440885264, | |
| "loss": 3.4117, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 7.116150442477876, | |
| "grad_norm": 0.3453741669654846, | |
| "learning_rate": 0.000514913220733838, | |
| "loss": 3.4078, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 7.130705635770843, | |
| "grad_norm": 0.3288978934288025, | |
| "learning_rate": 0.0005147384973791496, | |
| "loss": 3.4023, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 7.14526082906381, | |
| "grad_norm": 0.34818190336227417, | |
| "learning_rate": 0.0005145637740244613, | |
| "loss": 3.4102, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 7.159816022356777, | |
| "grad_norm": 0.3382578492164612, | |
| "learning_rate": 0.0005143890506697728, | |
| "loss": 3.4096, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 7.174371215649744, | |
| "grad_norm": 0.3537672758102417, | |
| "learning_rate": 0.0005142143273150844, | |
| "loss": 3.4195, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 7.188926408942711, | |
| "grad_norm": 0.35966384410858154, | |
| "learning_rate": 0.000514039603960396, | |
| "loss": 3.4171, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 7.203481602235677, | |
| "grad_norm": 0.31200113892555237, | |
| "learning_rate": 0.0005138648806057075, | |
| "loss": 3.4291, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 7.218036795528644, | |
| "grad_norm": 0.309789776802063, | |
| "learning_rate": 0.0005136901572510192, | |
| "loss": 3.4037, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 7.2325919888216115, | |
| "grad_norm": 0.3593338131904602, | |
| "learning_rate": 0.0005135154338963307, | |
| "loss": 3.4226, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 7.2471471821145785, | |
| "grad_norm": 0.31558388471603394, | |
| "learning_rate": 0.0005133407105416424, | |
| "loss": 3.4255, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 7.261702375407546, | |
| "grad_norm": 0.3474031388759613, | |
| "learning_rate": 0.0005131659871869539, | |
| "loss": 3.4232, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 7.276257568700513, | |
| "grad_norm": 0.3396109640598297, | |
| "learning_rate": 0.0005129912638322656, | |
| "loss": 3.4275, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 7.276257568700513, | |
| "eval_accuracy": 0.3646410404645337, | |
| "eval_loss": 3.600318193435669, | |
| "eval_runtime": 180.3373, | |
| "eval_samples_per_second": 92.333, | |
| "eval_steps_per_second": 5.773, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 7.290812761993479, | |
| "grad_norm": 0.37299227714538574, | |
| "learning_rate": 0.0005128165404775771, | |
| "loss": 3.4347, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 7.305367955286446, | |
| "grad_norm": 0.3420005738735199, | |
| "learning_rate": 0.0005126418171228888, | |
| "loss": 3.4225, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 7.319923148579413, | |
| "grad_norm": 0.3567899465560913, | |
| "learning_rate": 0.0005124670937682003, | |
| "loss": 3.4305, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 7.33447834187238, | |
| "grad_norm": 0.34419989585876465, | |
| "learning_rate": 0.000512292370413512, | |
| "loss": 3.4381, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 7.349033535165347, | |
| "grad_norm": 0.35814324021339417, | |
| "learning_rate": 0.0005121176470588235, | |
| "loss": 3.4428, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 7.363588728458314, | |
| "grad_norm": 0.32788124680519104, | |
| "learning_rate": 0.000511942923704135, | |
| "loss": 3.4401, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 7.378143921751281, | |
| "grad_norm": 0.3420645296573639, | |
| "learning_rate": 0.0005117682003494467, | |
| "loss": 3.4471, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 7.392699115044248, | |
| "grad_norm": 0.31772691011428833, | |
| "learning_rate": 0.0005115934769947583, | |
| "loss": 3.4461, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 7.407254308337214, | |
| "grad_norm": 0.31962013244628906, | |
| "learning_rate": 0.0005114187536400699, | |
| "loss": 3.4413, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 7.4218095016301815, | |
| "grad_norm": 0.3371577560901642, | |
| "learning_rate": 0.0005112440302853814, | |
| "loss": 3.4317, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 7.4363646949231486, | |
| "grad_norm": 0.35055968165397644, | |
| "learning_rate": 0.0005110693069306931, | |
| "loss": 3.439, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 7.450919888216116, | |
| "grad_norm": 0.3442050516605377, | |
| "learning_rate": 0.0005108945835760046, | |
| "loss": 3.4426, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 7.465475081509083, | |
| "grad_norm": 0.33924394845962524, | |
| "learning_rate": 0.0005107198602213162, | |
| "loss": 3.4455, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 7.48003027480205, | |
| "grad_norm": 0.34456804394721985, | |
| "learning_rate": 0.0005105451368666278, | |
| "loss": 3.4428, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 7.494585468095016, | |
| "grad_norm": 0.32214027643203735, | |
| "learning_rate": 0.0005103704135119394, | |
| "loss": 3.4539, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 7.509140661387983, | |
| "grad_norm": 0.32853496074676514, | |
| "learning_rate": 0.000510195690157251, | |
| "loss": 3.4325, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 7.52369585468095, | |
| "grad_norm": 0.3431142568588257, | |
| "learning_rate": 0.0005100209668025625, | |
| "loss": 3.4522, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 7.538251047973917, | |
| "grad_norm": 0.3387904763221741, | |
| "learning_rate": 0.0005098462434478742, | |
| "loss": 3.4527, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 7.552806241266884, | |
| "grad_norm": 0.3341459035873413, | |
| "learning_rate": 0.0005096715200931858, | |
| "loss": 3.4396, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 7.567361434559851, | |
| "grad_norm": 0.36085036396980286, | |
| "learning_rate": 0.0005094967967384974, | |
| "loss": 3.4465, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.567361434559851, | |
| "eval_accuracy": 0.36556268959358, | |
| "eval_loss": 3.5897607803344727, | |
| "eval_runtime": 180.2095, | |
| "eval_samples_per_second": 92.398, | |
| "eval_steps_per_second": 5.777, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.581916627852818, | |
| "grad_norm": 0.32997265458106995, | |
| "learning_rate": 0.0005093220733838089, | |
| "loss": 3.4519, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 7.5964718211457845, | |
| "grad_norm": 0.338478684425354, | |
| "learning_rate": 0.0005091473500291205, | |
| "loss": 3.4398, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 7.6110270144387515, | |
| "grad_norm": 0.3238486051559448, | |
| "learning_rate": 0.0005089726266744321, | |
| "loss": 3.4585, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 7.625582207731719, | |
| "grad_norm": 0.31277045607566833, | |
| "learning_rate": 0.0005087979033197437, | |
| "loss": 3.4462, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 7.640137401024686, | |
| "grad_norm": 0.32050615549087524, | |
| "learning_rate": 0.0005086231799650553, | |
| "loss": 3.4574, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 7.654692594317653, | |
| "grad_norm": 0.31899574398994446, | |
| "learning_rate": 0.0005084484566103669, | |
| "loss": 3.4587, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 7.66924778761062, | |
| "grad_norm": 0.345290869474411, | |
| "learning_rate": 0.0005082737332556785, | |
| "loss": 3.454, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 7.683802980903586, | |
| "grad_norm": 0.3660203218460083, | |
| "learning_rate": 0.00050809900990099, | |
| "loss": 3.4574, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 7.698358174196553, | |
| "grad_norm": 0.3448614180088043, | |
| "learning_rate": 0.0005079242865463016, | |
| "loss": 3.4539, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 7.71291336748952, | |
| "grad_norm": 0.32432374358177185, | |
| "learning_rate": 0.0005077495631916133, | |
| "loss": 3.4523, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 7.727468560782487, | |
| "grad_norm": 0.3127756118774414, | |
| "learning_rate": 0.0005075748398369248, | |
| "loss": 3.4497, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 7.742023754075454, | |
| "grad_norm": 0.3349662125110626, | |
| "learning_rate": 0.0005074001164822364, | |
| "loss": 3.4636, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 7.756578947368421, | |
| "grad_norm": 0.32090121507644653, | |
| "learning_rate": 0.000507225393127548, | |
| "loss": 3.4561, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 7.771134140661388, | |
| "grad_norm": 0.33404263854026794, | |
| "learning_rate": 0.0005070506697728596, | |
| "loss": 3.4588, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 7.785689333954355, | |
| "grad_norm": 0.3287113606929779, | |
| "learning_rate": 0.0005068759464181711, | |
| "loss": 3.4641, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 7.8002445272473215, | |
| "grad_norm": 0.3299543857574463, | |
| "learning_rate": 0.0005067012230634828, | |
| "loss": 3.4645, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 7.814799720540289, | |
| "grad_norm": 0.34061795473098755, | |
| "learning_rate": 0.0005065264997087944, | |
| "loss": 3.4586, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 7.829354913833256, | |
| "grad_norm": 0.3335813283920288, | |
| "learning_rate": 0.0005063517763541059, | |
| "loss": 3.4558, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 7.843910107126223, | |
| "grad_norm": 0.35924962162971497, | |
| "learning_rate": 0.0005061770529994175, | |
| "loss": 3.4696, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 7.85846530041919, | |
| "grad_norm": 0.3391464948654175, | |
| "learning_rate": 0.0005060023296447291, | |
| "loss": 3.4704, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.85846530041919, | |
| "eval_accuracy": 0.3658695533880125, | |
| "eval_loss": 3.582665205001831, | |
| "eval_runtime": 180.2723, | |
| "eval_samples_per_second": 92.366, | |
| "eval_steps_per_second": 5.775, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.873020493712157, | |
| "grad_norm": 0.340207576751709, | |
| "learning_rate": 0.0005058276062900408, | |
| "loss": 3.4537, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 7.887575687005123, | |
| "grad_norm": 0.3075796663761139, | |
| "learning_rate": 0.0005056528829353523, | |
| "loss": 3.4566, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 7.90213088029809, | |
| "grad_norm": 0.33511102199554443, | |
| "learning_rate": 0.000505478159580664, | |
| "loss": 3.4713, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 7.916686073591057, | |
| "grad_norm": 0.3101419508457184, | |
| "learning_rate": 0.0005053034362259755, | |
| "loss": 3.4553, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 7.931241266884024, | |
| "grad_norm": 0.30531278252601624, | |
| "learning_rate": 0.000505128712871287, | |
| "loss": 3.4531, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 7.945796460176991, | |
| "grad_norm": 0.32350555062294006, | |
| "learning_rate": 0.0005049539895165987, | |
| "loss": 3.4673, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 7.960351653469958, | |
| "grad_norm": 0.306485652923584, | |
| "learning_rate": 0.0005047792661619103, | |
| "loss": 3.462, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 7.974906846762925, | |
| "grad_norm": 0.32470259070396423, | |
| "learning_rate": 0.0005046045428072219, | |
| "loss": 3.456, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 7.989462040055892, | |
| "grad_norm": 0.32083654403686523, | |
| "learning_rate": 0.0005044298194525334, | |
| "loss": 3.4741, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 8.003784350256172, | |
| "grad_norm": 0.3293924927711487, | |
| "learning_rate": 0.0005042550960978451, | |
| "loss": 3.4383, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 8.01833954354914, | |
| "grad_norm": 0.31325265765190125, | |
| "learning_rate": 0.0005040803727431566, | |
| "loss": 3.3534, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 8.032894736842104, | |
| "grad_norm": 0.3100528120994568, | |
| "learning_rate": 0.0005039056493884683, | |
| "loss": 3.3611, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 8.047449930135071, | |
| "grad_norm": 0.32878878712654114, | |
| "learning_rate": 0.0005037309260337798, | |
| "loss": 3.3492, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 8.062005123428039, | |
| "grad_norm": 0.361114501953125, | |
| "learning_rate": 0.0005035562026790914, | |
| "loss": 3.3572, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 8.076560316721006, | |
| "grad_norm": 0.31694483757019043, | |
| "learning_rate": 0.000503381479324403, | |
| "loss": 3.3778, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 8.091115510013973, | |
| "grad_norm": 0.35615530610084534, | |
| "learning_rate": 0.0005032067559697145, | |
| "loss": 3.3625, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 8.10567070330694, | |
| "grad_norm": 0.33051422238349915, | |
| "learning_rate": 0.0005030320326150262, | |
| "loss": 3.374, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 8.120225896599907, | |
| "grad_norm": 0.34486663341522217, | |
| "learning_rate": 0.0005028573092603378, | |
| "loss": 3.3689, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 8.134781089892874, | |
| "grad_norm": 0.34573471546173096, | |
| "learning_rate": 0.0005026825859056494, | |
| "loss": 3.3794, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 8.149336283185841, | |
| "grad_norm": 0.34255653619766235, | |
| "learning_rate": 0.0005025078625509609, | |
| "loss": 3.3769, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 8.149336283185841, | |
| "eval_accuracy": 0.36599906847857727, | |
| "eval_loss": 3.592573404312134, | |
| "eval_runtime": 180.5633, | |
| "eval_samples_per_second": 92.217, | |
| "eval_steps_per_second": 5.765, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 8.163891476478808, | |
| "grad_norm": 0.3379361033439636, | |
| "learning_rate": 0.0005023331391962726, | |
| "loss": 3.3726, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 8.178446669771775, | |
| "grad_norm": 0.31844353675842285, | |
| "learning_rate": 0.0005021584158415841, | |
| "loss": 3.3737, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 8.193001863064742, | |
| "grad_norm": 0.331997275352478, | |
| "learning_rate": 0.0005019836924868956, | |
| "loss": 3.3947, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 8.20755705635771, | |
| "grad_norm": 0.35097721219062805, | |
| "learning_rate": 0.0005018089691322073, | |
| "loss": 3.3742, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 8.222112249650676, | |
| "grad_norm": 0.32892441749572754, | |
| "learning_rate": 0.0005016342457775189, | |
| "loss": 3.3823, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 8.236667442943642, | |
| "grad_norm": 0.34736308455467224, | |
| "learning_rate": 0.0005014595224228305, | |
| "loss": 3.3932, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 8.251222636236609, | |
| "grad_norm": 0.3155169188976288, | |
| "learning_rate": 0.000501284799068142, | |
| "loss": 3.3995, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 8.265777829529576, | |
| "grad_norm": 0.3510957360267639, | |
| "learning_rate": 0.0005011100757134537, | |
| "loss": 3.3842, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 8.280333022822543, | |
| "grad_norm": 0.343882292509079, | |
| "learning_rate": 0.0005009353523587653, | |
| "loss": 3.3941, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 8.29488821611551, | |
| "grad_norm": 0.3476276397705078, | |
| "learning_rate": 0.0005007606290040768, | |
| "loss": 3.4072, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 8.309443409408477, | |
| "grad_norm": 0.3502870202064514, | |
| "learning_rate": 0.0005005859056493884, | |
| "loss": 3.4028, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 8.323998602701444, | |
| "grad_norm": 0.31015148758888245, | |
| "learning_rate": 0.0005004111822947, | |
| "loss": 3.4104, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 8.338553795994411, | |
| "grad_norm": 0.3566758930683136, | |
| "learning_rate": 0.0005002364589400116, | |
| "loss": 3.4121, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 8.353108989287378, | |
| "grad_norm": 0.34552037715911865, | |
| "learning_rate": 0.0005000617355853231, | |
| "loss": 3.3951, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 8.367664182580345, | |
| "grad_norm": 0.33820486068725586, | |
| "learning_rate": 0.0004998870122306348, | |
| "loss": 3.4104, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 8.382219375873312, | |
| "grad_norm": 0.3079693913459778, | |
| "learning_rate": 0.0004997122888759464, | |
| "loss": 3.4178, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 8.39677456916628, | |
| "grad_norm": 0.41044849157333374, | |
| "learning_rate": 0.000499537565521258, | |
| "loss": 3.4165, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 8.411329762459246, | |
| "grad_norm": 0.3613148033618927, | |
| "learning_rate": 0.0004993628421665695, | |
| "loss": 3.4185, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 8.425884955752213, | |
| "grad_norm": 0.3372962474822998, | |
| "learning_rate": 0.0004991881188118811, | |
| "loss": 3.4082, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 8.440440149045179, | |
| "grad_norm": 0.31549227237701416, | |
| "learning_rate": 0.0004990133954571928, | |
| "loss": 3.4175, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.440440149045179, | |
| "eval_accuracy": 0.3664678848998685, | |
| "eval_loss": 3.586700201034546, | |
| "eval_runtime": 180.5552, | |
| "eval_samples_per_second": 92.221, | |
| "eval_steps_per_second": 5.766, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.454995342338146, | |
| "grad_norm": 0.37046492099761963, | |
| "learning_rate": 0.0004988386721025043, | |
| "loss": 3.4178, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 8.469550535631113, | |
| "grad_norm": 0.3361333906650543, | |
| "learning_rate": 0.0004986639487478159, | |
| "loss": 3.4274, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 8.48410572892408, | |
| "grad_norm": 0.34720394015312195, | |
| "learning_rate": 0.0004984892253931275, | |
| "loss": 3.4087, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 8.498660922217047, | |
| "grad_norm": 0.36103421449661255, | |
| "learning_rate": 0.0004983145020384391, | |
| "loss": 3.4123, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 8.513216115510014, | |
| "grad_norm": 0.3321627676486969, | |
| "learning_rate": 0.0004981397786837507, | |
| "loss": 3.4189, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 8.527771308802981, | |
| "grad_norm": 0.3531418740749359, | |
| "learning_rate": 0.0004979650553290622, | |
| "loss": 3.4256, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 8.542326502095948, | |
| "grad_norm": 0.3398493826389313, | |
| "learning_rate": 0.0004977903319743739, | |
| "loss": 3.425, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 8.556881695388915, | |
| "grad_norm": 0.3529471755027771, | |
| "learning_rate": 0.0004976156086196854, | |
| "loss": 3.4124, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 8.571436888681882, | |
| "grad_norm": 0.35845470428466797, | |
| "learning_rate": 0.0004974408852649971, | |
| "loss": 3.4284, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 8.58599208197485, | |
| "grad_norm": 0.3319138288497925, | |
| "learning_rate": 0.0004972661619103086, | |
| "loss": 3.4267, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 8.600547275267816, | |
| "grad_norm": 0.3284785747528076, | |
| "learning_rate": 0.0004970914385556202, | |
| "loss": 3.4119, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 8.615102468560782, | |
| "grad_norm": 0.353296160697937, | |
| "learning_rate": 0.0004969167152009318, | |
| "loss": 3.4199, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 8.629657661853749, | |
| "grad_norm": 0.3219950497150421, | |
| "learning_rate": 0.0004967419918462435, | |
| "loss": 3.4246, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 8.644212855146716, | |
| "grad_norm": 0.35093748569488525, | |
| "learning_rate": 0.000496567268491555, | |
| "loss": 3.4237, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 8.658768048439683, | |
| "grad_norm": 0.3354974091053009, | |
| "learning_rate": 0.0004963925451368665, | |
| "loss": 3.4218, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 8.67332324173265, | |
| "grad_norm": 0.3358497619628906, | |
| "learning_rate": 0.0004962178217821782, | |
| "loss": 3.4226, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 8.687878435025617, | |
| "grad_norm": 0.34513646364212036, | |
| "learning_rate": 0.0004960430984274898, | |
| "loss": 3.4209, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 8.702433628318584, | |
| "grad_norm": 0.35030463337898254, | |
| "learning_rate": 0.0004958683750728014, | |
| "loss": 3.4282, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 8.716988821611551, | |
| "grad_norm": 0.3723340928554535, | |
| "learning_rate": 0.0004956936517181129, | |
| "loss": 3.4377, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 8.731544014904518, | |
| "grad_norm": 0.34282436966896057, | |
| "learning_rate": 0.0004955189283634246, | |
| "loss": 3.4279, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.731544014904518, | |
| "eval_accuracy": 0.3671618836383304, | |
| "eval_loss": 3.574834108352661, | |
| "eval_runtime": 180.2587, | |
| "eval_samples_per_second": 92.373, | |
| "eval_steps_per_second": 5.775, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.746099208197485, | |
| "grad_norm": 0.34552201628685, | |
| "learning_rate": 0.0004953442050087361, | |
| "loss": 3.4266, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 8.760654401490452, | |
| "grad_norm": 0.3571280539035797, | |
| "learning_rate": 0.0004951694816540476, | |
| "loss": 3.4341, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 8.77520959478342, | |
| "grad_norm": 0.33385321497917175, | |
| "learning_rate": 0.0004949947582993593, | |
| "loss": 3.4197, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 8.789764788076386, | |
| "grad_norm": 0.33117714524269104, | |
| "learning_rate": 0.0004948200349446709, | |
| "loss": 3.4211, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 8.804319981369353, | |
| "grad_norm": 0.33476123213768005, | |
| "learning_rate": 0.0004946453115899825, | |
| "loss": 3.4288, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 8.81887517466232, | |
| "grad_norm": 0.3372209668159485, | |
| "learning_rate": 0.000494470588235294, | |
| "loss": 3.433, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 8.833430367955286, | |
| "grad_norm": 0.3290836811065674, | |
| "learning_rate": 0.0004942958648806057, | |
| "loss": 3.4334, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 8.847985561248253, | |
| "grad_norm": 0.3452501595020294, | |
| "learning_rate": 0.0004941211415259173, | |
| "loss": 3.421, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 8.86254075454122, | |
| "grad_norm": 0.31123703718185425, | |
| "learning_rate": 0.0004939464181712289, | |
| "loss": 3.4225, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 8.877095947834187, | |
| "grad_norm": 0.31685128808021545, | |
| "learning_rate": 0.0004937716948165404, | |
| "loss": 3.4394, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 8.891651141127154, | |
| "grad_norm": 0.32032787799835205, | |
| "learning_rate": 0.000493596971461852, | |
| "loss": 3.427, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 8.906206334420121, | |
| "grad_norm": 0.3401283621788025, | |
| "learning_rate": 0.0004934222481071636, | |
| "loss": 3.4334, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 8.920761527713088, | |
| "grad_norm": 0.3419210612773895, | |
| "learning_rate": 0.0004932475247524751, | |
| "loss": 3.4397, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 8.935316721006055, | |
| "grad_norm": 0.3617963492870331, | |
| "learning_rate": 0.0004930728013977868, | |
| "loss": 3.4322, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 8.949871914299022, | |
| "grad_norm": 0.34712788462638855, | |
| "learning_rate": 0.0004928980780430984, | |
| "loss": 3.427, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 8.96442710759199, | |
| "grad_norm": 0.32096177339553833, | |
| "learning_rate": 0.00049272335468841, | |
| "loss": 3.4275, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 8.978982300884956, | |
| "grad_norm": 0.3254910707473755, | |
| "learning_rate": 0.0004925486313337215, | |
| "loss": 3.446, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 8.993537494177923, | |
| "grad_norm": 0.32205885648727417, | |
| "learning_rate": 0.0004923739079790332, | |
| "loss": 3.4478, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 9.007859804378203, | |
| "grad_norm": 0.3276841938495636, | |
| "learning_rate": 0.0004921991846243447, | |
| "loss": 3.381, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 9.022414997671168, | |
| "grad_norm": 0.33040741086006165, | |
| "learning_rate": 0.0004920244612696563, | |
| "loss": 3.3299, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 9.022414997671168, | |
| "eval_accuracy": 0.3670517605531587, | |
| "eval_loss": 3.581254482269287, | |
| "eval_runtime": 180.2002, | |
| "eval_samples_per_second": 92.403, | |
| "eval_steps_per_second": 5.777, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 9.036970190964135, | |
| "grad_norm": 0.32714518904685974, | |
| "learning_rate": 0.0004918497379149679, | |
| "loss": 3.3307, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 9.051525384257102, | |
| "grad_norm": 0.33463945984840393, | |
| "learning_rate": 0.0004916750145602795, | |
| "loss": 3.3372, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 9.06608057755007, | |
| "grad_norm": 0.344443678855896, | |
| "learning_rate": 0.0004915002912055911, | |
| "loss": 3.3291, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 9.080635770843037, | |
| "grad_norm": 0.3225387930870056, | |
| "learning_rate": 0.0004913255678509026, | |
| "loss": 3.3442, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 9.095190964136004, | |
| "grad_norm": 0.3648475706577301, | |
| "learning_rate": 0.0004911508444962143, | |
| "loss": 3.3391, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 9.10974615742897, | |
| "grad_norm": 0.36045387387275696, | |
| "learning_rate": 0.0004909761211415259, | |
| "loss": 3.3471, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 9.124301350721938, | |
| "grad_norm": 0.33283770084381104, | |
| "learning_rate": 0.0004908013977868375, | |
| "loss": 3.3555, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 9.138856544014905, | |
| "grad_norm": 0.3415498733520508, | |
| "learning_rate": 0.0004906266744321491, | |
| "loss": 3.3437, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 9.153411737307872, | |
| "grad_norm": 0.33745819330215454, | |
| "learning_rate": 0.0004904519510774606, | |
| "loss": 3.3669, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 9.167966930600839, | |
| "grad_norm": 0.3298662006855011, | |
| "learning_rate": 0.0004902772277227722, | |
| "loss": 3.3517, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 9.182522123893806, | |
| "grad_norm": 0.3514326512813568, | |
| "learning_rate": 0.0004901025043680838, | |
| "loss": 3.3599, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 9.197077317186773, | |
| "grad_norm": 0.3235764801502228, | |
| "learning_rate": 0.0004899277810133955, | |
| "loss": 3.3571, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 9.211632510479738, | |
| "grad_norm": 0.35297152400016785, | |
| "learning_rate": 0.000489753057658707, | |
| "loss": 3.3693, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 9.226187703772705, | |
| "grad_norm": 0.3648125231266022, | |
| "learning_rate": 0.0004895783343040186, | |
| "loss": 3.3592, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 9.240742897065672, | |
| "grad_norm": 0.3410831391811371, | |
| "learning_rate": 0.0004894036109493302, | |
| "loss": 3.3658, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 9.25529809035864, | |
| "grad_norm": 0.3517493009567261, | |
| "learning_rate": 0.0004892288875946419, | |
| "loss": 3.3714, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 9.269853283651607, | |
| "grad_norm": 0.3432004451751709, | |
| "learning_rate": 0.0004890541642399534, | |
| "loss": 3.3696, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 9.284408476944574, | |
| "grad_norm": 0.35906535387039185, | |
| "learning_rate": 0.0004888794408852649, | |
| "loss": 3.3712, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 9.29896367023754, | |
| "grad_norm": 0.34101489186286926, | |
| "learning_rate": 0.0004887047175305766, | |
| "loss": 3.3777, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 9.313518863530508, | |
| "grad_norm": 0.31470558047294617, | |
| "learning_rate": 0.0004885299941758881, | |
| "loss": 3.3795, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 9.313518863530508, | |
| "eval_accuracy": 0.36712709555592826, | |
| "eval_loss": 3.5808510780334473, | |
| "eval_runtime": 180.3177, | |
| "eval_samples_per_second": 92.343, | |
| "eval_steps_per_second": 5.773, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 9.328074056823475, | |
| "grad_norm": 0.35451540350914, | |
| "learning_rate": 0.0004883552708211997, | |
| "loss": 3.3783, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 9.342629250116442, | |
| "grad_norm": 0.35627683997154236, | |
| "learning_rate": 0.00048818054746651137, | |
| "loss": 3.3681, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 9.357184443409409, | |
| "grad_norm": 0.3487967550754547, | |
| "learning_rate": 0.0004880058241118229, | |
| "loss": 3.3829, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 9.371739636702376, | |
| "grad_norm": 0.34083083271980286, | |
| "learning_rate": 0.0004878311007571345, | |
| "loss": 3.3811, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 9.386294829995343, | |
| "grad_norm": 0.341313898563385, | |
| "learning_rate": 0.0004876563774024461, | |
| "loss": 3.3812, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 9.40085002328831, | |
| "grad_norm": 0.34539201855659485, | |
| "learning_rate": 0.00048748165404775763, | |
| "loss": 3.379, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 9.415405216581275, | |
| "grad_norm": 0.3151034414768219, | |
| "learning_rate": 0.0004873069306930693, | |
| "loss": 3.3874, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 9.429960409874242, | |
| "grad_norm": 0.33578553795814514, | |
| "learning_rate": 0.0004871322073383809, | |
| "loss": 3.3969, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 9.44451560316721, | |
| "grad_norm": 0.34011614322662354, | |
| "learning_rate": 0.00048695748398369247, | |
| "loss": 3.3715, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 9.459070796460177, | |
| "grad_norm": 0.35364067554473877, | |
| "learning_rate": 0.000486782760629004, | |
| "loss": 3.3848, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 9.473625989753144, | |
| "grad_norm": 0.3555489778518677, | |
| "learning_rate": 0.0004866080372743156, | |
| "loss": 3.3787, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 9.48818118304611, | |
| "grad_norm": 0.3360406458377838, | |
| "learning_rate": 0.0004864333139196272, | |
| "loss": 3.3842, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 9.502736376339078, | |
| "grad_norm": 0.3316383361816406, | |
| "learning_rate": 0.00048625859056493885, | |
| "loss": 3.3827, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 9.517291569632045, | |
| "grad_norm": 0.3541034758090973, | |
| "learning_rate": 0.0004860838672102504, | |
| "loss": 3.3777, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 9.531846762925012, | |
| "grad_norm": 0.33557403087615967, | |
| "learning_rate": 0.000485909143855562, | |
| "loss": 3.3938, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 9.546401956217979, | |
| "grad_norm": 0.3402416706085205, | |
| "learning_rate": 0.0004857344205008736, | |
| "loss": 3.3942, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 9.560957149510946, | |
| "grad_norm": 0.33399122953414917, | |
| "learning_rate": 0.00048555969714618517, | |
| "loss": 3.3911, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 9.575512342803913, | |
| "grad_norm": 0.32051223516464233, | |
| "learning_rate": 0.0004853849737914967, | |
| "loss": 3.401, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 9.59006753609688, | |
| "grad_norm": 0.34346646070480347, | |
| "learning_rate": 0.00048521025043680836, | |
| "loss": 3.3957, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 9.604622729389845, | |
| "grad_norm": 0.3783892095088959, | |
| "learning_rate": 0.00048503552708211995, | |
| "loss": 3.3921, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.604622729389845, | |
| "eval_accuracy": 0.3674388954971881, | |
| "eval_loss": 3.57379150390625, | |
| "eval_runtime": 180.9734, | |
| "eval_samples_per_second": 92.008, | |
| "eval_steps_per_second": 5.752, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.619177922682812, | |
| "grad_norm": 0.32997605204582214, | |
| "learning_rate": 0.00048486080372743155, | |
| "loss": 3.3998, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 9.63373311597578, | |
| "grad_norm": 0.35459256172180176, | |
| "learning_rate": 0.0004846860803727431, | |
| "loss": 3.4032, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 9.648288309268747, | |
| "grad_norm": 0.35381874442100525, | |
| "learning_rate": 0.0004845113570180547, | |
| "loss": 3.401, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 9.662843502561714, | |
| "grad_norm": 0.32483237981796265, | |
| "learning_rate": 0.00048433663366336633, | |
| "loss": 3.3912, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 9.67739869585468, | |
| "grad_norm": 0.3188256025314331, | |
| "learning_rate": 0.0004841619103086779, | |
| "loss": 3.4016, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 9.691953889147648, | |
| "grad_norm": 0.32839906215667725, | |
| "learning_rate": 0.00048398718695398947, | |
| "loss": 3.4143, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 9.706509082440615, | |
| "grad_norm": 0.33581817150115967, | |
| "learning_rate": 0.00048381246359930106, | |
| "loss": 3.3985, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 9.721064275733582, | |
| "grad_norm": 0.35263094305992126, | |
| "learning_rate": 0.00048363774024461265, | |
| "loss": 3.4065, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 9.735619469026549, | |
| "grad_norm": 0.35038137435913086, | |
| "learning_rate": 0.0004834630168899242, | |
| "loss": 3.3968, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 9.750174662319516, | |
| "grad_norm": 0.3345571458339691, | |
| "learning_rate": 0.00048328829353523584, | |
| "loss": 3.3877, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 9.764729855612483, | |
| "grad_norm": 0.35425201058387756, | |
| "learning_rate": 0.00048311357018054744, | |
| "loss": 3.4068, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 9.77928504890545, | |
| "grad_norm": 0.33486849069595337, | |
| "learning_rate": 0.00048293884682585903, | |
| "loss": 3.4103, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 9.793840242198417, | |
| "grad_norm": 0.36015525460243225, | |
| "learning_rate": 0.00048276412347117057, | |
| "loss": 3.4063, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 9.808395435491384, | |
| "grad_norm": 0.3651643693447113, | |
| "learning_rate": 0.00048258940011648217, | |
| "loss": 3.4086, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 9.82295062878435, | |
| "grad_norm": 0.3266768753528595, | |
| "learning_rate": 0.0004824146767617938, | |
| "loss": 3.4094, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 9.837505822077317, | |
| "grad_norm": 0.3396150469779968, | |
| "learning_rate": 0.0004822399534071054, | |
| "loss": 3.4146, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 9.852061015370284, | |
| "grad_norm": 0.3410688638687134, | |
| "learning_rate": 0.00048206523005241695, | |
| "loss": 3.4082, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 9.86661620866325, | |
| "grad_norm": 0.3330535590648651, | |
| "learning_rate": 0.00048189050669772854, | |
| "loss": 3.4214, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 9.881171401956218, | |
| "grad_norm": 0.3465299904346466, | |
| "learning_rate": 0.00048171578334304014, | |
| "loss": 3.4047, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 9.895726595249185, | |
| "grad_norm": 0.448885440826416, | |
| "learning_rate": 0.00048154105998835173, | |
| "loss": 3.4213, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.895726595249185, | |
| "eval_accuracy": 0.36859089814484325, | |
| "eval_loss": 3.5626511573791504, | |
| "eval_runtime": 180.5271, | |
| "eval_samples_per_second": 92.235, | |
| "eval_steps_per_second": 5.766, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.910281788542152, | |
| "grad_norm": 0.3240843117237091, | |
| "learning_rate": 0.0004813663366336633, | |
| "loss": 3.4087, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 9.924836981835119, | |
| "grad_norm": 0.355328768491745, | |
| "learning_rate": 0.0004811916132789749, | |
| "loss": 3.4124, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 9.939392175128086, | |
| "grad_norm": 0.3710843622684479, | |
| "learning_rate": 0.0004810168899242865, | |
| "loss": 3.4133, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 9.953947368421053, | |
| "grad_norm": 0.31903794407844543, | |
| "learning_rate": 0.0004808421665695981, | |
| "loss": 3.4152, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 9.96850256171402, | |
| "grad_norm": 0.35480058193206787, | |
| "learning_rate": 0.00048066744321490965, | |
| "loss": 3.4115, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 9.983057755006987, | |
| "grad_norm": 0.3243095874786377, | |
| "learning_rate": 0.00048049271986022124, | |
| "loss": 3.4012, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 9.997612948299953, | |
| "grad_norm": 0.3363344967365265, | |
| "learning_rate": 0.0004803179965055329, | |
| "loss": 3.4014, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 10.011935258500232, | |
| "grad_norm": 0.3793693482875824, | |
| "learning_rate": 0.0004801432731508445, | |
| "loss": 3.3167, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 10.0264904517932, | |
| "grad_norm": 0.3262706995010376, | |
| "learning_rate": 0.000479968549796156, | |
| "loss": 3.2971, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 10.041045645086166, | |
| "grad_norm": 0.3725760579109192, | |
| "learning_rate": 0.0004797938264414676, | |
| "loss": 3.3046, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 10.055600838379133, | |
| "grad_norm": 0.35947728157043457, | |
| "learning_rate": 0.0004796191030867792, | |
| "loss": 3.3032, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 10.0701560316721, | |
| "grad_norm": 0.36128759384155273, | |
| "learning_rate": 0.00047944437973209086, | |
| "loss": 3.3065, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 10.084711224965067, | |
| "grad_norm": 0.36060285568237305, | |
| "learning_rate": 0.0004792696563774024, | |
| "loss": 3.3144, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 10.099266418258035, | |
| "grad_norm": 0.3805783689022064, | |
| "learning_rate": 0.000479094933022714, | |
| "loss": 3.3184, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 10.113821611551002, | |
| "grad_norm": 0.32521340250968933, | |
| "learning_rate": 0.0004789202096680256, | |
| "loss": 3.299, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 10.128376804843969, | |
| "grad_norm": 0.3500252068042755, | |
| "learning_rate": 0.00047874548631333713, | |
| "loss": 3.3276, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 10.142931998136936, | |
| "grad_norm": 0.3359842896461487, | |
| "learning_rate": 0.0004785707629586487, | |
| "loss": 3.3341, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 10.157487191429903, | |
| "grad_norm": 0.3367213010787964, | |
| "learning_rate": 0.0004783960396039604, | |
| "loss": 3.3178, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 10.17204238472287, | |
| "grad_norm": 0.35683926939964294, | |
| "learning_rate": 0.00047822131624927197, | |
| "loss": 3.3337, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 10.186597578015837, | |
| "grad_norm": 0.3428564965724945, | |
| "learning_rate": 0.0004780465928945835, | |
| "loss": 3.3333, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 10.186597578015837, | |
| "eval_accuracy": 0.36825829587052533, | |
| "eval_loss": 3.570888042449951, | |
| "eval_runtime": 180.6456, | |
| "eval_samples_per_second": 92.175, | |
| "eval_steps_per_second": 5.763, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 10.201152771308802, | |
| "grad_norm": 0.3613561689853668, | |
| "learning_rate": 0.0004778718695398951, | |
| "loss": 3.3577, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 10.21570796460177, | |
| "grad_norm": 0.3598746955394745, | |
| "learning_rate": 0.0004776971461852067, | |
| "loss": 3.3371, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 10.230263157894736, | |
| "grad_norm": 0.36579689383506775, | |
| "learning_rate": 0.00047752242283051835, | |
| "loss": 3.3352, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 10.244818351187703, | |
| "grad_norm": 0.34611940383911133, | |
| "learning_rate": 0.00047734769947582994, | |
| "loss": 3.3614, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 10.25937354448067, | |
| "grad_norm": 0.35718485713005066, | |
| "learning_rate": 0.0004771729761211415, | |
| "loss": 3.3429, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 10.273928737773637, | |
| "grad_norm": 0.32261791825294495, | |
| "learning_rate": 0.0004769982527664531, | |
| "loss": 3.3394, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 10.288483931066605, | |
| "grad_norm": 0.38552016019821167, | |
| "learning_rate": 0.00047682352941176467, | |
| "loss": 3.3625, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 10.303039124359572, | |
| "grad_norm": 0.3692517876625061, | |
| "learning_rate": 0.0004766488060570762, | |
| "loss": 3.3471, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 10.317594317652539, | |
| "grad_norm": 0.31940123438835144, | |
| "learning_rate": 0.00047647408270238786, | |
| "loss": 3.3573, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 10.332149510945506, | |
| "grad_norm": 0.38608184456825256, | |
| "learning_rate": 0.00047629935934769945, | |
| "loss": 3.3487, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 10.346704704238473, | |
| "grad_norm": 0.361963152885437, | |
| "learning_rate": 0.00047612463599301105, | |
| "loss": 3.3672, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 10.36125989753144, | |
| "grad_norm": 0.3653804361820221, | |
| "learning_rate": 0.0004759499126383226, | |
| "loss": 3.3551, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 10.375815090824407, | |
| "grad_norm": 0.3686809539794922, | |
| "learning_rate": 0.0004757751892836342, | |
| "loss": 3.3597, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 10.390370284117374, | |
| "grad_norm": 0.395720511674881, | |
| "learning_rate": 0.0004756004659289458, | |
| "loss": 3.3614, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 10.40492547741034, | |
| "grad_norm": 0.33828699588775635, | |
| "learning_rate": 0.0004754257425742574, | |
| "loss": 3.3525, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 10.419480670703306, | |
| "grad_norm": 0.3415243625640869, | |
| "learning_rate": 0.00047525101921956896, | |
| "loss": 3.3599, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 10.434035863996273, | |
| "grad_norm": 0.30961814522743225, | |
| "learning_rate": 0.00047507629586488056, | |
| "loss": 3.373, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 10.44859105728924, | |
| "grad_norm": 0.3361090123653412, | |
| "learning_rate": 0.00047490157251019215, | |
| "loss": 3.3601, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 10.463146250582207, | |
| "grad_norm": 0.33400392532348633, | |
| "learning_rate": 0.0004747268491555037, | |
| "loss": 3.362, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 10.477701443875175, | |
| "grad_norm": 0.3338032066822052, | |
| "learning_rate": 0.00047455212580081534, | |
| "loss": 3.3679, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.477701443875175, | |
| "eval_accuracy": 0.368512507432133, | |
| "eval_loss": 3.5674049854278564, | |
| "eval_runtime": 180.6425, | |
| "eval_samples_per_second": 92.177, | |
| "eval_steps_per_second": 5.763, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.492256637168142, | |
| "grad_norm": 0.34610387682914734, | |
| "learning_rate": 0.00047437740244612694, | |
| "loss": 3.3661, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 10.506811830461109, | |
| "grad_norm": 0.3551473915576935, | |
| "learning_rate": 0.00047420267909143853, | |
| "loss": 3.3608, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 10.521367023754076, | |
| "grad_norm": 0.3263005018234253, | |
| "learning_rate": 0.0004740279557367501, | |
| "loss": 3.3779, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 10.535922217047043, | |
| "grad_norm": 0.34398841857910156, | |
| "learning_rate": 0.00047385323238206166, | |
| "loss": 3.3734, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 10.55047741034001, | |
| "grad_norm": 0.33675917983055115, | |
| "learning_rate": 0.00047367850902737326, | |
| "loss": 3.3728, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 10.565032603632977, | |
| "grad_norm": 0.35929790139198303, | |
| "learning_rate": 0.0004735037856726849, | |
| "loss": 3.3782, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 10.579587796925944, | |
| "grad_norm": 0.32853153347969055, | |
| "learning_rate": 0.0004733290623179965, | |
| "loss": 3.3742, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 10.59414299021891, | |
| "grad_norm": 0.3404340445995331, | |
| "learning_rate": 0.00047315433896330804, | |
| "loss": 3.3737, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 10.608698183511876, | |
| "grad_norm": 0.3543095886707306, | |
| "learning_rate": 0.00047297961560861964, | |
| "loss": 3.3802, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 10.623253376804843, | |
| "grad_norm": 0.3367573022842407, | |
| "learning_rate": 0.00047280489225393123, | |
| "loss": 3.3668, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 10.63780857009781, | |
| "grad_norm": 0.36491137742996216, | |
| "learning_rate": 0.0004726301688992429, | |
| "loss": 3.3744, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 10.652363763390778, | |
| "grad_norm": 0.3399256765842438, | |
| "learning_rate": 0.0004724554455445544, | |
| "loss": 3.3815, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 10.666918956683745, | |
| "grad_norm": 0.3620223104953766, | |
| "learning_rate": 0.000472280722189866, | |
| "loss": 3.381, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 10.681474149976712, | |
| "grad_norm": 0.31678059697151184, | |
| "learning_rate": 0.0004721059988351776, | |
| "loss": 3.3765, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 10.696029343269679, | |
| "grad_norm": 0.32897821068763733, | |
| "learning_rate": 0.00047193127548048915, | |
| "loss": 3.3831, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 10.710584536562646, | |
| "grad_norm": 0.35301482677459717, | |
| "learning_rate": 0.00047175655212580074, | |
| "loss": 3.3833, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 10.725139729855613, | |
| "grad_norm": 0.3511241376399994, | |
| "learning_rate": 0.0004715818287711124, | |
| "loss": 3.3883, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 10.73969492314858, | |
| "grad_norm": 0.361494779586792, | |
| "learning_rate": 0.000471407105416424, | |
| "loss": 3.393, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 10.754250116441547, | |
| "grad_norm": 0.3511395752429962, | |
| "learning_rate": 0.0004712323820617355, | |
| "loss": 3.3868, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 10.768805309734514, | |
| "grad_norm": 0.38783833384513855, | |
| "learning_rate": 0.0004710576587070471, | |
| "loss": 3.3889, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 10.768805309734514, | |
| "eval_accuracy": 0.3694185254295594, | |
| "eval_loss": 3.558671712875366, | |
| "eval_runtime": 180.6697, | |
| "eval_samples_per_second": 92.163, | |
| "eval_steps_per_second": 5.762, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 10.783360503027481, | |
| "grad_norm": 0.3379640281200409, | |
| "learning_rate": 0.0004708829353523587, | |
| "loss": 3.3804, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 10.797915696320446, | |
| "grad_norm": 0.3452807068824768, | |
| "learning_rate": 0.0004707082119976703, | |
| "loss": 3.3788, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 10.812470889613413, | |
| "grad_norm": 0.3475739061832428, | |
| "learning_rate": 0.0004705334886429819, | |
| "loss": 3.394, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 10.82702608290638, | |
| "grad_norm": 0.3917761743068695, | |
| "learning_rate": 0.0004703587652882935, | |
| "loss": 3.3778, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 10.841581276199348, | |
| "grad_norm": 0.3523595333099365, | |
| "learning_rate": 0.0004701840419336051, | |
| "loss": 3.3752, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 10.856136469492315, | |
| "grad_norm": 0.343047171831131, | |
| "learning_rate": 0.0004700093185789167, | |
| "loss": 3.3919, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 10.870691662785282, | |
| "grad_norm": 0.344882607460022, | |
| "learning_rate": 0.0004698345952242282, | |
| "loss": 3.3884, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 10.885246856078249, | |
| "grad_norm": 0.334264874458313, | |
| "learning_rate": 0.00046965987186953987, | |
| "loss": 3.3936, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 10.899802049371216, | |
| "grad_norm": 0.339690625667572, | |
| "learning_rate": 0.00046948514851485147, | |
| "loss": 3.3879, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 10.914357242664183, | |
| "grad_norm": 0.3376231789588928, | |
| "learning_rate": 0.00046931042516016306, | |
| "loss": 3.3762, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 10.92891243595715, | |
| "grad_norm": 0.3520675301551819, | |
| "learning_rate": 0.0004691357018054746, | |
| "loss": 3.3798, | |
| "step": 37550 | |
| }, | |
| { | |
| "epoch": 10.943467629250117, | |
| "grad_norm": 0.3569328784942627, | |
| "learning_rate": 0.0004689609784507862, | |
| "loss": 3.3976, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 10.958022822543084, | |
| "grad_norm": 0.3501778542995453, | |
| "learning_rate": 0.0004687862550960978, | |
| "loss": 3.4026, | |
| "step": 37650 | |
| }, | |
| { | |
| "epoch": 10.972578015836051, | |
| "grad_norm": 0.3711495101451874, | |
| "learning_rate": 0.00046861153174140944, | |
| "loss": 3.378, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 10.987133209129016, | |
| "grad_norm": 0.35237112641334534, | |
| "learning_rate": 0.000468436808386721, | |
| "loss": 3.3852, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 11.001455519329296, | |
| "grad_norm": 0.37270471453666687, | |
| "learning_rate": 0.0004682620850320326, | |
| "loss": 3.3759, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 11.016010712622263, | |
| "grad_norm": 0.34398648142814636, | |
| "learning_rate": 0.00046808736167734417, | |
| "loss": 3.2872, | |
| "step": 37850 | |
| }, | |
| { | |
| "epoch": 11.03056590591523, | |
| "grad_norm": 0.3692329227924347, | |
| "learning_rate": 0.0004679126383226557, | |
| "loss": 3.282, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 11.045121099208197, | |
| "grad_norm": 0.34833037853240967, | |
| "learning_rate": 0.00046773791496796736, | |
| "loss": 3.2985, | |
| "step": 37950 | |
| }, | |
| { | |
| "epoch": 11.059676292501164, | |
| "grad_norm": 0.3402170240879059, | |
| "learning_rate": 0.00046756319161327895, | |
| "loss": 3.2762, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 11.059676292501164, | |
| "eval_accuracy": 0.3692147330819738, | |
| "eval_loss": 3.5703163146972656, | |
| "eval_runtime": 180.5498, | |
| "eval_samples_per_second": 92.224, | |
| "eval_steps_per_second": 5.766, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 11.074231485794131, | |
| "grad_norm": 0.3203711211681366, | |
| "learning_rate": 0.00046738846825859054, | |
| "loss": 3.2972, | |
| "step": 38050 | |
| }, | |
| { | |
| "epoch": 11.088786679087098, | |
| "grad_norm": 0.38126954436302185, | |
| "learning_rate": 0.0004672137449039021, | |
| "loss": 3.291, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 11.103341872380065, | |
| "grad_norm": 0.34463387727737427, | |
| "learning_rate": 0.0004670390215492137, | |
| "loss": 3.296, | |
| "step": 38150 | |
| }, | |
| { | |
| "epoch": 11.117897065673032, | |
| "grad_norm": 0.33119046688079834, | |
| "learning_rate": 0.0004668642981945253, | |
| "loss": 3.2821, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 11.132452258966, | |
| "grad_norm": 0.35025256872177124, | |
| "learning_rate": 0.0004666895748398369, | |
| "loss": 3.2883, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 11.147007452258967, | |
| "grad_norm": 0.34867775440216064, | |
| "learning_rate": 0.00046651485148514846, | |
| "loss": 3.3033, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 11.161562645551934, | |
| "grad_norm": 0.35319674015045166, | |
| "learning_rate": 0.00046634012813046006, | |
| "loss": 3.3273, | |
| "step": 38350 | |
| }, | |
| { | |
| "epoch": 11.1761178388449, | |
| "grad_norm": 0.3690769076347351, | |
| "learning_rate": 0.00046616540477577165, | |
| "loss": 3.3217, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 11.190673032137866, | |
| "grad_norm": 0.36780601739883423, | |
| "learning_rate": 0.00046599068142108324, | |
| "loss": 3.3193, | |
| "step": 38450 | |
| }, | |
| { | |
| "epoch": 11.205228225430833, | |
| "grad_norm": 0.34421318769454956, | |
| "learning_rate": 0.0004658159580663948, | |
| "loss": 3.3123, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 11.2197834187238, | |
| "grad_norm": 0.3453294038772583, | |
| "learning_rate": 0.00046564123471170643, | |
| "loss": 3.3184, | |
| "step": 38550 | |
| }, | |
| { | |
| "epoch": 11.234338612016767, | |
| "grad_norm": 0.3701744079589844, | |
| "learning_rate": 0.00046546651135701803, | |
| "loss": 3.3203, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 11.248893805309734, | |
| "grad_norm": 0.3486950695514679, | |
| "learning_rate": 0.0004652917880023296, | |
| "loss": 3.3184, | |
| "step": 38650 | |
| }, | |
| { | |
| "epoch": 11.263448998602701, | |
| "grad_norm": 0.3991883397102356, | |
| "learning_rate": 0.00046511706464764116, | |
| "loss": 3.331, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 11.278004191895668, | |
| "grad_norm": 0.3387545645236969, | |
| "learning_rate": 0.00046494234129295276, | |
| "loss": 3.3285, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 11.292559385188635, | |
| "grad_norm": 0.3553421199321747, | |
| "learning_rate": 0.0004647676179382644, | |
| "loss": 3.3234, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 11.307114578481603, | |
| "grad_norm": 0.36746877431869507, | |
| "learning_rate": 0.000464592894583576, | |
| "loss": 3.3292, | |
| "step": 38850 | |
| }, | |
| { | |
| "epoch": 11.32166977177457, | |
| "grad_norm": 0.3457062840461731, | |
| "learning_rate": 0.00046441817122888754, | |
| "loss": 3.3332, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 11.336224965067537, | |
| "grad_norm": 0.3688143789768219, | |
| "learning_rate": 0.00046424344787419913, | |
| "loss": 3.3425, | |
| "step": 38950 | |
| }, | |
| { | |
| "epoch": 11.350780158360504, | |
| "grad_norm": 0.3513636887073517, | |
| "learning_rate": 0.00046406872451951073, | |
| "loss": 3.3422, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 11.350780158360504, | |
| "eval_accuracy": 0.3692253105394609, | |
| "eval_loss": 3.5667049884796143, | |
| "eval_runtime": 180.4706, | |
| "eval_samples_per_second": 92.264, | |
| "eval_steps_per_second": 5.768, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 11.36533535165347, | |
| "grad_norm": 0.35480257868766785, | |
| "learning_rate": 0.00046389400116482227, | |
| "loss": 3.3307, | |
| "step": 39050 | |
| }, | |
| { | |
| "epoch": 11.379890544946436, | |
| "grad_norm": 0.33527329564094543, | |
| "learning_rate": 0.0004637192778101339, | |
| "loss": 3.3369, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 11.394445738239403, | |
| "grad_norm": 0.3475622534751892, | |
| "learning_rate": 0.0004635445544554455, | |
| "loss": 3.3318, | |
| "step": 39150 | |
| }, | |
| { | |
| "epoch": 11.40900093153237, | |
| "grad_norm": 0.36372116208076477, | |
| "learning_rate": 0.0004633698311007571, | |
| "loss": 3.3451, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 11.423556124825337, | |
| "grad_norm": 0.34316396713256836, | |
| "learning_rate": 0.0004631951077460687, | |
| "loss": 3.3298, | |
| "step": 39250 | |
| }, | |
| { | |
| "epoch": 11.438111318118304, | |
| "grad_norm": 0.3609369099140167, | |
| "learning_rate": 0.00046302038439138024, | |
| "loss": 3.3359, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 11.452666511411271, | |
| "grad_norm": 0.3469187021255493, | |
| "learning_rate": 0.0004628456610366919, | |
| "loss": 3.3443, | |
| "step": 39350 | |
| }, | |
| { | |
| "epoch": 11.467221704704238, | |
| "grad_norm": 0.3519226312637329, | |
| "learning_rate": 0.0004626709376820035, | |
| "loss": 3.3389, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 11.481776897997205, | |
| "grad_norm": 0.35190141201019287, | |
| "learning_rate": 0.0004624962143273151, | |
| "loss": 3.351, | |
| "step": 39450 | |
| }, | |
| { | |
| "epoch": 11.496332091290173, | |
| "grad_norm": 0.33876270055770874, | |
| "learning_rate": 0.0004623214909726266, | |
| "loss": 3.3338, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 11.51088728458314, | |
| "grad_norm": 0.36089715361595154, | |
| "learning_rate": 0.0004621467676179382, | |
| "loss": 3.3567, | |
| "step": 39550 | |
| }, | |
| { | |
| "epoch": 11.525442477876107, | |
| "grad_norm": 0.34660300612449646, | |
| "learning_rate": 0.0004619720442632498, | |
| "loss": 3.3594, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 11.539997671169074, | |
| "grad_norm": 0.33915892243385315, | |
| "learning_rate": 0.00046179732090856145, | |
| "loss": 3.3471, | |
| "step": 39650 | |
| }, | |
| { | |
| "epoch": 11.55455286446204, | |
| "grad_norm": 0.3671376407146454, | |
| "learning_rate": 0.000461622597553873, | |
| "loss": 3.3468, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 11.569108057755006, | |
| "grad_norm": 0.3596802055835724, | |
| "learning_rate": 0.0004614478741991846, | |
| "loss": 3.3557, | |
| "step": 39750 | |
| }, | |
| { | |
| "epoch": 11.583663251047973, | |
| "grad_norm": 0.3604518175125122, | |
| "learning_rate": 0.0004612731508444962, | |
| "loss": 3.3504, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 11.59821844434094, | |
| "grad_norm": 0.3901963233947754, | |
| "learning_rate": 0.0004610984274898077, | |
| "loss": 3.3521, | |
| "step": 39850 | |
| }, | |
| { | |
| "epoch": 11.612773637633907, | |
| "grad_norm": 0.3487982153892517, | |
| "learning_rate": 0.00046092370413511937, | |
| "loss": 3.363, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 11.627328830926874, | |
| "grad_norm": 0.3493058383464813, | |
| "learning_rate": 0.00046074898078043096, | |
| "loss": 3.3527, | |
| "step": 39950 | |
| }, | |
| { | |
| "epoch": 11.641884024219841, | |
| "grad_norm": 0.3378664553165436, | |
| "learning_rate": 0.00046057425742574256, | |
| "loss": 3.3645, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 11.641884024219841, | |
| "eval_accuracy": 0.3694677693705273, | |
| "eval_loss": 3.5579540729522705, | |
| "eval_runtime": 180.5445, | |
| "eval_samples_per_second": 92.227, | |
| "eval_steps_per_second": 5.766, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 11.656439217512808, | |
| "grad_norm": 0.343606173992157, | |
| "learning_rate": 0.0004603995340710541, | |
| "loss": 3.3446, | |
| "step": 40050 | |
| }, | |
| { | |
| "epoch": 11.670994410805775, | |
| "grad_norm": 0.3760487139225006, | |
| "learning_rate": 0.0004602248107163657, | |
| "loss": 3.3721, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 11.685549604098743, | |
| "grad_norm": 0.3491581082344055, | |
| "learning_rate": 0.0004600500873616773, | |
| "loss": 3.3623, | |
| "step": 40150 | |
| }, | |
| { | |
| "epoch": 11.70010479739171, | |
| "grad_norm": 0.3621687889099121, | |
| "learning_rate": 0.00045987536400698894, | |
| "loss": 3.3651, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 11.714659990684677, | |
| "grad_norm": 0.3522111177444458, | |
| "learning_rate": 0.0004597006406523005, | |
| "loss": 3.3669, | |
| "step": 40250 | |
| }, | |
| { | |
| "epoch": 11.729215183977644, | |
| "grad_norm": 0.3602718114852905, | |
| "learning_rate": 0.00045952591729761207, | |
| "loss": 3.3611, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 11.74377037727061, | |
| "grad_norm": 0.3839702904224396, | |
| "learning_rate": 0.00045935119394292367, | |
| "loss": 3.3626, | |
| "step": 40350 | |
| }, | |
| { | |
| "epoch": 11.758325570563578, | |
| "grad_norm": 0.34963271021842957, | |
| "learning_rate": 0.00045917647058823526, | |
| "loss": 3.3643, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 11.772880763856545, | |
| "grad_norm": 0.3682020306587219, | |
| "learning_rate": 0.0004590017472335468, | |
| "loss": 3.3573, | |
| "step": 40450 | |
| }, | |
| { | |
| "epoch": 11.78743595714951, | |
| "grad_norm": 0.33553260564804077, | |
| "learning_rate": 0.00045882702387885845, | |
| "loss": 3.3682, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 11.801991150442477, | |
| "grad_norm": 0.34221509099006653, | |
| "learning_rate": 0.00045865230052417004, | |
| "loss": 3.3647, | |
| "step": 40550 | |
| }, | |
| { | |
| "epoch": 11.816546343735444, | |
| "grad_norm": 0.3339173197746277, | |
| "learning_rate": 0.00045847757716948164, | |
| "loss": 3.3798, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 11.831101537028411, | |
| "grad_norm": 0.3425685167312622, | |
| "learning_rate": 0.0004583028538147932, | |
| "loss": 3.3615, | |
| "step": 40650 | |
| }, | |
| { | |
| "epoch": 11.845656730321378, | |
| "grad_norm": 0.3418058454990387, | |
| "learning_rate": 0.00045812813046010477, | |
| "loss": 3.3696, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 11.860211923614346, | |
| "grad_norm": 0.3560492694377899, | |
| "learning_rate": 0.0004579534071054164, | |
| "loss": 3.3693, | |
| "step": 40750 | |
| }, | |
| { | |
| "epoch": 11.874767116907313, | |
| "grad_norm": 0.3407421112060547, | |
| "learning_rate": 0.000457778683750728, | |
| "loss": 3.3802, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 11.88932231020028, | |
| "grad_norm": 0.34812286496162415, | |
| "learning_rate": 0.00045760396039603955, | |
| "loss": 3.3717, | |
| "step": 40850 | |
| }, | |
| { | |
| "epoch": 11.903877503493247, | |
| "grad_norm": 0.37924692034721375, | |
| "learning_rate": 0.00045742923704135115, | |
| "loss": 3.373, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 11.918432696786214, | |
| "grad_norm": 0.34084251523017883, | |
| "learning_rate": 0.00045725451368666274, | |
| "loss": 3.3726, | |
| "step": 40950 | |
| }, | |
| { | |
| "epoch": 11.93298789007918, | |
| "grad_norm": 0.3382170498371124, | |
| "learning_rate": 0.0004570797903319743, | |
| "loss": 3.3777, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 11.93298789007918, | |
| "eval_accuracy": 0.37041985807167543, | |
| "eval_loss": 3.5492429733276367, | |
| "eval_runtime": 180.9818, | |
| "eval_samples_per_second": 92.004, | |
| "eval_steps_per_second": 5.752, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 11.947543083372148, | |
| "grad_norm": 0.3312283754348755, | |
| "learning_rate": 0.00045690506697728593, | |
| "loss": 3.3721, | |
| "step": 41050 | |
| }, | |
| { | |
| "epoch": 11.962098276665115, | |
| "grad_norm": 0.4082726538181305, | |
| "learning_rate": 0.0004567303436225975, | |
| "loss": 3.3703, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 11.97665346995808, | |
| "grad_norm": 0.3326382040977478, | |
| "learning_rate": 0.0004565556202679091, | |
| "loss": 3.3689, | |
| "step": 41150 | |
| }, | |
| { | |
| "epoch": 11.991208663251047, | |
| "grad_norm": 0.38740527629852295, | |
| "learning_rate": 0.00045638089691322066, | |
| "loss": 3.3696, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 12.005530973451327, | |
| "grad_norm": 0.37978094816207886, | |
| "learning_rate": 0.00045620617355853225, | |
| "loss": 3.33, | |
| "step": 41250 | |
| }, | |
| { | |
| "epoch": 12.020086166744294, | |
| "grad_norm": 0.36496874690055847, | |
| "learning_rate": 0.0004560314502038439, | |
| "loss": 3.2598, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 12.034641360037261, | |
| "grad_norm": 0.36001056432724, | |
| "learning_rate": 0.0004558567268491555, | |
| "loss": 3.2643, | |
| "step": 41350 | |
| }, | |
| { | |
| "epoch": 12.049196553330228, | |
| "grad_norm": 0.35342076420783997, | |
| "learning_rate": 0.00045568200349446704, | |
| "loss": 3.2643, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 12.063751746623195, | |
| "grad_norm": 0.3439677953720093, | |
| "learning_rate": 0.00045550728013977863, | |
| "loss": 3.2643, | |
| "step": 41450 | |
| }, | |
| { | |
| "epoch": 12.078306939916162, | |
| "grad_norm": 0.39456239342689514, | |
| "learning_rate": 0.0004553325567850902, | |
| "loss": 3.2815, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 12.09286213320913, | |
| "grad_norm": 0.3336825966835022, | |
| "learning_rate": 0.0004551578334304018, | |
| "loss": 3.2807, | |
| "step": 41550 | |
| }, | |
| { | |
| "epoch": 12.107417326502096, | |
| "grad_norm": 0.3326915204524994, | |
| "learning_rate": 0.00045498311007571347, | |
| "loss": 3.2888, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 12.121972519795063, | |
| "grad_norm": 0.3774108588695526, | |
| "learning_rate": 0.000454808386721025, | |
| "loss": 3.29, | |
| "step": 41650 | |
| }, | |
| { | |
| "epoch": 12.13652771308803, | |
| "grad_norm": 0.3388764560222626, | |
| "learning_rate": 0.0004546336633663366, | |
| "loss": 3.288, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 12.151082906380998, | |
| "grad_norm": 0.34907984733581543, | |
| "learning_rate": 0.0004544589400116482, | |
| "loss": 3.2881, | |
| "step": 41750 | |
| }, | |
| { | |
| "epoch": 12.165638099673963, | |
| "grad_norm": 0.35436078906059265, | |
| "learning_rate": 0.00045428421665695974, | |
| "loss": 3.2909, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 12.18019329296693, | |
| "grad_norm": 0.3528887629508972, | |
| "learning_rate": 0.00045410949330227133, | |
| "loss": 3.2997, | |
| "step": 41850 | |
| }, | |
| { | |
| "epoch": 12.194748486259897, | |
| "grad_norm": 0.3703976273536682, | |
| "learning_rate": 0.000453934769947583, | |
| "loss": 3.3017, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 12.209303679552864, | |
| "grad_norm": 0.37421178817749023, | |
| "learning_rate": 0.0004537600465928946, | |
| "loss": 3.3006, | |
| "step": 41950 | |
| }, | |
| { | |
| "epoch": 12.223858872845831, | |
| "grad_norm": 0.355501264333725, | |
| "learning_rate": 0.0004535853232382061, | |
| "loss": 3.3127, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 12.223858872845831, | |
| "eval_accuracy": 0.369873238574201, | |
| "eval_loss": 3.5645577907562256, | |
| "eval_runtime": 180.7762, | |
| "eval_samples_per_second": 92.108, | |
| "eval_steps_per_second": 5.759, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 12.238414066138798, | |
| "grad_norm": 0.355353444814682, | |
| "learning_rate": 0.0004534105998835177, | |
| "loss": 3.3006, | |
| "step": 42050 | |
| }, | |
| { | |
| "epoch": 12.252969259431765, | |
| "grad_norm": 0.38428860902786255, | |
| "learning_rate": 0.0004532358765288293, | |
| "loss": 3.2968, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 12.267524452724732, | |
| "grad_norm": 0.4032050371170044, | |
| "learning_rate": 0.00045306115317414095, | |
| "loss": 3.3104, | |
| "step": 42150 | |
| }, | |
| { | |
| "epoch": 12.2820796460177, | |
| "grad_norm": 0.3453782796859741, | |
| "learning_rate": 0.0004528864298194525, | |
| "loss": 3.3114, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 12.296634839310666, | |
| "grad_norm": 0.3356723487377167, | |
| "learning_rate": 0.0004527117064647641, | |
| "loss": 3.307, | |
| "step": 42250 | |
| }, | |
| { | |
| "epoch": 12.311190032603633, | |
| "grad_norm": 0.3533400595188141, | |
| "learning_rate": 0.0004525369831100757, | |
| "loss": 3.3036, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 12.3257452258966, | |
| "grad_norm": 0.35451340675354004, | |
| "learning_rate": 0.0004523622597553872, | |
| "loss": 3.3313, | |
| "step": 42350 | |
| }, | |
| { | |
| "epoch": 12.340300419189568, | |
| "grad_norm": 0.3433496654033661, | |
| "learning_rate": 0.0004521875364006988, | |
| "loss": 3.3092, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 12.354855612482535, | |
| "grad_norm": 0.7035194635391235, | |
| "learning_rate": 0.00045201281304601046, | |
| "loss": 3.3169, | |
| "step": 42450 | |
| }, | |
| { | |
| "epoch": 12.3694108057755, | |
| "grad_norm": 0.3605223000049591, | |
| "learning_rate": 0.00045183808969132206, | |
| "loss": 3.3173, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 12.383965999068467, | |
| "grad_norm": 0.35129112005233765, | |
| "learning_rate": 0.00045166336633663365, | |
| "loss": 3.3261, | |
| "step": 42550 | |
| }, | |
| { | |
| "epoch": 12.398521192361434, | |
| "grad_norm": 0.3602702021598816, | |
| "learning_rate": 0.0004514886429819452, | |
| "loss": 3.3134, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 12.413076385654401, | |
| "grad_norm": 0.37585994601249695, | |
| "learning_rate": 0.0004513139196272568, | |
| "loss": 3.3272, | |
| "step": 42650 | |
| }, | |
| { | |
| "epoch": 12.427631578947368, | |
| "grad_norm": 0.36394479870796204, | |
| "learning_rate": 0.00045113919627256843, | |
| "loss": 3.3157, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 12.442186772240335, | |
| "grad_norm": 0.36557355523109436, | |
| "learning_rate": 0.00045096447291788003, | |
| "loss": 3.3223, | |
| "step": 42750 | |
| }, | |
| { | |
| "epoch": 12.456741965533302, | |
| "grad_norm": 0.34626543521881104, | |
| "learning_rate": 0.00045078974956319157, | |
| "loss": 3.3323, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 12.47129715882627, | |
| "grad_norm": 0.3926179111003876, | |
| "learning_rate": 0.00045061502620850316, | |
| "loss": 3.3222, | |
| "step": 42850 | |
| }, | |
| { | |
| "epoch": 12.485852352119236, | |
| "grad_norm": 0.34034961462020874, | |
| "learning_rate": 0.00045044030285381476, | |
| "loss": 3.3259, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 12.500407545412203, | |
| "grad_norm": 0.36732086539268494, | |
| "learning_rate": 0.0004502655794991263, | |
| "loss": 3.3292, | |
| "step": 42950 | |
| }, | |
| { | |
| "epoch": 12.51496273870517, | |
| "grad_norm": 0.3459414541721344, | |
| "learning_rate": 0.00045009085614443795, | |
| "loss": 3.3324, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 12.51496273870517, | |
| "eval_accuracy": 0.3703144360787203, | |
| "eval_loss": 3.554603099822998, | |
| "eval_runtime": 180.4822, | |
| "eval_samples_per_second": 92.258, | |
| "eval_steps_per_second": 5.768, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 12.529517931998138, | |
| "grad_norm": 0.38607484102249146, | |
| "learning_rate": 0.00044991613278974954, | |
| "loss": 3.3345, | |
| "step": 43050 | |
| }, | |
| { | |
| "epoch": 12.544073125291105, | |
| "grad_norm": 0.3506922423839569, | |
| "learning_rate": 0.00044974140943506113, | |
| "loss": 3.3365, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 12.55862831858407, | |
| "grad_norm": 0.3726440668106079, | |
| "learning_rate": 0.0004495666860803727, | |
| "loss": 3.3272, | |
| "step": 43150 | |
| }, | |
| { | |
| "epoch": 12.573183511877037, | |
| "grad_norm": 0.38131991028785706, | |
| "learning_rate": 0.00044939196272568427, | |
| "loss": 3.3338, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 12.587738705170004, | |
| "grad_norm": 0.3584553003311157, | |
| "learning_rate": 0.00044921723937099586, | |
| "loss": 3.3322, | |
| "step": 43250 | |
| }, | |
| { | |
| "epoch": 12.602293898462971, | |
| "grad_norm": 0.35820427536964417, | |
| "learning_rate": 0.0004490425160163075, | |
| "loss": 3.3464, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 12.616849091755938, | |
| "grad_norm": 0.3539363443851471, | |
| "learning_rate": 0.00044886779266161905, | |
| "loss": 3.332, | |
| "step": 43350 | |
| }, | |
| { | |
| "epoch": 12.631404285048905, | |
| "grad_norm": 0.3747362494468689, | |
| "learning_rate": 0.00044869306930693065, | |
| "loss": 3.3428, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 12.645959478341872, | |
| "grad_norm": 0.3429812490940094, | |
| "learning_rate": 0.00044851834595224224, | |
| "loss": 3.3402, | |
| "step": 43450 | |
| }, | |
| { | |
| "epoch": 12.66051467163484, | |
| "grad_norm": 0.3554392457008362, | |
| "learning_rate": 0.00044834362259755383, | |
| "loss": 3.3499, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 12.675069864927806, | |
| "grad_norm": 0.3396950364112854, | |
| "learning_rate": 0.00044816889924286543, | |
| "loss": 3.3369, | |
| "step": 43550 | |
| }, | |
| { | |
| "epoch": 12.689625058220773, | |
| "grad_norm": 0.343290239572525, | |
| "learning_rate": 0.000447994175888177, | |
| "loss": 3.3475, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 12.70418025151374, | |
| "grad_norm": 0.4013109803199768, | |
| "learning_rate": 0.0004478194525334886, | |
| "loss": 3.3497, | |
| "step": 43650 | |
| }, | |
| { | |
| "epoch": 12.718735444806708, | |
| "grad_norm": 0.36353474855422974, | |
| "learning_rate": 0.0004476447291788002, | |
| "loss": 3.3382, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 12.733290638099675, | |
| "grad_norm": 0.3667754828929901, | |
| "learning_rate": 0.00044747000582411175, | |
| "loss": 3.3366, | |
| "step": 43750 | |
| }, | |
| { | |
| "epoch": 12.747845831392642, | |
| "grad_norm": 0.34893324971199036, | |
| "learning_rate": 0.00044729528246942335, | |
| "loss": 3.3385, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 12.762401024685607, | |
| "grad_norm": 0.3607388138771057, | |
| "learning_rate": 0.000447120559114735, | |
| "loss": 3.3321, | |
| "step": 43850 | |
| }, | |
| { | |
| "epoch": 12.776956217978574, | |
| "grad_norm": 0.3520786762237549, | |
| "learning_rate": 0.0004469458357600466, | |
| "loss": 3.3459, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 12.791511411271541, | |
| "grad_norm": 0.35246145725250244, | |
| "learning_rate": 0.00044677111240535813, | |
| "loss": 3.3461, | |
| "step": 43950 | |
| }, | |
| { | |
| "epoch": 12.806066604564508, | |
| "grad_norm": 0.37881430983543396, | |
| "learning_rate": 0.0004465963890506697, | |
| "loss": 3.35, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 12.806066604564508, | |
| "eval_accuracy": 0.3708386078608608, | |
| "eval_loss": 3.550057888031006, | |
| "eval_runtime": 180.7088, | |
| "eval_samples_per_second": 92.143, | |
| "eval_steps_per_second": 5.761, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 12.820621797857475, | |
| "grad_norm": 0.3498731255531311, | |
| "learning_rate": 0.0004464216656959813, | |
| "loss": 3.3612, | |
| "step": 44050 | |
| }, | |
| { | |
| "epoch": 12.835176991150442, | |
| "grad_norm": 0.34110793471336365, | |
| "learning_rate": 0.00044624694234129297, | |
| "loss": 3.3573, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 12.84973218444341, | |
| "grad_norm": 0.3420650064945221, | |
| "learning_rate": 0.0004460722189866045, | |
| "loss": 3.3488, | |
| "step": 44150 | |
| }, | |
| { | |
| "epoch": 12.864287377736376, | |
| "grad_norm": 0.3443625867366791, | |
| "learning_rate": 0.0004458974956319161, | |
| "loss": 3.3514, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 12.878842571029343, | |
| "grad_norm": 0.351570188999176, | |
| "learning_rate": 0.0004457227722772277, | |
| "loss": 3.3507, | |
| "step": 44250 | |
| }, | |
| { | |
| "epoch": 12.89339776432231, | |
| "grad_norm": 0.3458472192287445, | |
| "learning_rate": 0.00044554804892253923, | |
| "loss": 3.3491, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 12.907952957615278, | |
| "grad_norm": 0.35336291790008545, | |
| "learning_rate": 0.00044537332556785083, | |
| "loss": 3.3476, | |
| "step": 44350 | |
| }, | |
| { | |
| "epoch": 12.922508150908245, | |
| "grad_norm": 0.3397476375102997, | |
| "learning_rate": 0.0004451986022131625, | |
| "loss": 3.3578, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 12.937063344201212, | |
| "grad_norm": 0.3396618962287903, | |
| "learning_rate": 0.00044502387885847407, | |
| "loss": 3.3574, | |
| "step": 44450 | |
| }, | |
| { | |
| "epoch": 12.951618537494177, | |
| "grad_norm": 0.37106412649154663, | |
| "learning_rate": 0.0004448491555037856, | |
| "loss": 3.3514, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 12.966173730787144, | |
| "grad_norm": 0.3617645800113678, | |
| "learning_rate": 0.0004446744321490972, | |
| "loss": 3.3492, | |
| "step": 44550 | |
| }, | |
| { | |
| "epoch": 12.980728924080111, | |
| "grad_norm": 0.36867889761924744, | |
| "learning_rate": 0.0004444997087944088, | |
| "loss": 3.3587, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 12.995284117373078, | |
| "grad_norm": 0.35698699951171875, | |
| "learning_rate": 0.0004443249854397204, | |
| "loss": 3.3498, | |
| "step": 44650 | |
| }, | |
| { | |
| "epoch": 13.009606427573358, | |
| "grad_norm": 0.3502655327320099, | |
| "learning_rate": 0.000444150262085032, | |
| "loss": 3.2829, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 13.024161620866325, | |
| "grad_norm": 0.36120232939720154, | |
| "learning_rate": 0.0004439755387303436, | |
| "loss": 3.2469, | |
| "step": 44750 | |
| }, | |
| { | |
| "epoch": 13.038716814159292, | |
| "grad_norm": 0.3971077501773834, | |
| "learning_rate": 0.0004438008153756552, | |
| "loss": 3.2521, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 13.053272007452259, | |
| "grad_norm": 0.37181058526039124, | |
| "learning_rate": 0.00044362609202096677, | |
| "loss": 3.2576, | |
| "step": 44850 | |
| }, | |
| { | |
| "epoch": 13.067827200745226, | |
| "grad_norm": 0.3654915988445282, | |
| "learning_rate": 0.0004434513686662783, | |
| "loss": 3.2463, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 13.082382394038193, | |
| "grad_norm": 0.3972369134426117, | |
| "learning_rate": 0.00044327664531158996, | |
| "loss": 3.2479, | |
| "step": 44950 | |
| }, | |
| { | |
| "epoch": 13.09693758733116, | |
| "grad_norm": 0.3511376678943634, | |
| "learning_rate": 0.00044310192195690155, | |
| "loss": 3.2664, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 13.09693758733116, | |
| "eval_accuracy": 0.37047803408785474, | |
| "eval_loss": 3.562286615371704, | |
| "eval_runtime": 180.8347, | |
| "eval_samples_per_second": 92.079, | |
| "eval_steps_per_second": 5.757, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 13.111492780624127, | |
| "grad_norm": 0.34787827730178833, | |
| "learning_rate": 0.00044292719860221315, | |
| "loss": 3.2656, | |
| "step": 45050 | |
| }, | |
| { | |
| "epoch": 13.126047973917094, | |
| "grad_norm": 0.335764616727829, | |
| "learning_rate": 0.0004427524752475247, | |
| "loss": 3.2611, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 13.140603167210061, | |
| "grad_norm": 0.35691389441490173, | |
| "learning_rate": 0.0004425777518928363, | |
| "loss": 3.2656, | |
| "step": 45150 | |
| }, | |
| { | |
| "epoch": 13.155158360503027, | |
| "grad_norm": 0.3829896152019501, | |
| "learning_rate": 0.0004424030285381479, | |
| "loss": 3.2774, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 13.169713553795994, | |
| "grad_norm": 0.3497374653816223, | |
| "learning_rate": 0.0004422283051834595, | |
| "loss": 3.2691, | |
| "step": 45250 | |
| }, | |
| { | |
| "epoch": 13.18426874708896, | |
| "grad_norm": 0.39746323227882385, | |
| "learning_rate": 0.00044205358182877107, | |
| "loss": 3.2923, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 13.198823940381928, | |
| "grad_norm": 0.36261340975761414, | |
| "learning_rate": 0.00044187885847408266, | |
| "loss": 3.2917, | |
| "step": 45350 | |
| }, | |
| { | |
| "epoch": 13.213379133674895, | |
| "grad_norm": 0.35540542006492615, | |
| "learning_rate": 0.00044170413511939425, | |
| "loss": 3.289, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 13.227934326967862, | |
| "grad_norm": 0.3456279933452606, | |
| "learning_rate": 0.0004415294117647058, | |
| "loss": 3.2888, | |
| "step": 45450 | |
| }, | |
| { | |
| "epoch": 13.242489520260829, | |
| "grad_norm": 0.4017365276813507, | |
| "learning_rate": 0.00044135468841001744, | |
| "loss": 3.2956, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 13.257044713553796, | |
| "grad_norm": 0.36163267493247986, | |
| "learning_rate": 0.00044117996505532904, | |
| "loss": 3.2815, | |
| "step": 45550 | |
| }, | |
| { | |
| "epoch": 13.271599906846763, | |
| "grad_norm": 0.3932439684867859, | |
| "learning_rate": 0.00044100524170064063, | |
| "loss": 3.2868, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 13.28615510013973, | |
| "grad_norm": 0.368018239736557, | |
| "learning_rate": 0.0004408305183459522, | |
| "loss": 3.2923, | |
| "step": 45650 | |
| }, | |
| { | |
| "epoch": 13.300710293432697, | |
| "grad_norm": 0.36368754506111145, | |
| "learning_rate": 0.00044065579499126377, | |
| "loss": 3.297, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 13.315265486725664, | |
| "grad_norm": 0.3517613708972931, | |
| "learning_rate": 0.00044048107163657536, | |
| "loss": 3.3054, | |
| "step": 45750 | |
| }, | |
| { | |
| "epoch": 13.329820680018631, | |
| "grad_norm": 0.35036543011665344, | |
| "learning_rate": 0.000440306348281887, | |
| "loss": 3.3059, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 13.344375873311598, | |
| "grad_norm": 0.3734825849533081, | |
| "learning_rate": 0.0004401316249271986, | |
| "loss": 3.3042, | |
| "step": 45850 | |
| }, | |
| { | |
| "epoch": 13.358931066604564, | |
| "grad_norm": 0.34895241260528564, | |
| "learning_rate": 0.00043995690157251014, | |
| "loss": 3.3034, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 13.37348625989753, | |
| "grad_norm": 0.34659481048583984, | |
| "learning_rate": 0.00043978217821782174, | |
| "loss": 3.3088, | |
| "step": 45950 | |
| }, | |
| { | |
| "epoch": 13.388041453190498, | |
| "grad_norm": 0.3693874180316925, | |
| "learning_rate": 0.00043960745486313333, | |
| "loss": 3.3057, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 13.388041453190498, | |
| "eval_accuracy": 0.3705000116939669, | |
| "eval_loss": 3.5553836822509766, | |
| "eval_runtime": 180.7589, | |
| "eval_samples_per_second": 92.117, | |
| "eval_steps_per_second": 5.759, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 13.402596646483465, | |
| "grad_norm": 0.353637158870697, | |
| "learning_rate": 0.00043943273150844487, | |
| "loss": 3.3021, | |
| "step": 46050 | |
| }, | |
| { | |
| "epoch": 13.417151839776432, | |
| "grad_norm": 0.36062633991241455, | |
| "learning_rate": 0.0004392580081537565, | |
| "loss": 3.3003, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 13.431707033069399, | |
| "grad_norm": 0.3554173409938812, | |
| "learning_rate": 0.0004390832847990681, | |
| "loss": 3.3094, | |
| "step": 46150 | |
| }, | |
| { | |
| "epoch": 13.446262226362366, | |
| "grad_norm": 0.34017348289489746, | |
| "learning_rate": 0.0004389085614443797, | |
| "loss": 3.307, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 13.460817419655333, | |
| "grad_norm": 0.3515809178352356, | |
| "learning_rate": 0.00043873383808969125, | |
| "loss": 3.2939, | |
| "step": 46250 | |
| }, | |
| { | |
| "epoch": 13.4753726129483, | |
| "grad_norm": 0.35182228684425354, | |
| "learning_rate": 0.00043855911473500284, | |
| "loss": 3.3089, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 13.489927806241267, | |
| "grad_norm": 0.36773136258125305, | |
| "learning_rate": 0.0004383843913803145, | |
| "loss": 3.3088, | |
| "step": 46350 | |
| }, | |
| { | |
| "epoch": 13.504482999534234, | |
| "grad_norm": 0.3656975030899048, | |
| "learning_rate": 0.0004382096680256261, | |
| "loss": 3.3023, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 13.519038192827201, | |
| "grad_norm": 0.395908385515213, | |
| "learning_rate": 0.0004380349446709376, | |
| "loss": 3.3161, | |
| "step": 46450 | |
| }, | |
| { | |
| "epoch": 13.533593386120168, | |
| "grad_norm": 0.3659190535545349, | |
| "learning_rate": 0.0004378602213162492, | |
| "loss": 3.3189, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 13.548148579413134, | |
| "grad_norm": 0.4184473156929016, | |
| "learning_rate": 0.0004376854979615608, | |
| "loss": 3.3149, | |
| "step": 46550 | |
| }, | |
| { | |
| "epoch": 13.5627037727061, | |
| "grad_norm": 0.35925260186195374, | |
| "learning_rate": 0.0004375107746068724, | |
| "loss": 3.3237, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 13.577258965999068, | |
| "grad_norm": 0.3621493875980377, | |
| "learning_rate": 0.000437336051252184, | |
| "loss": 3.3211, | |
| "step": 46650 | |
| }, | |
| { | |
| "epoch": 13.591814159292035, | |
| "grad_norm": 0.40040937066078186, | |
| "learning_rate": 0.0004371613278974956, | |
| "loss": 3.3179, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 13.606369352585002, | |
| "grad_norm": 0.3536069393157959, | |
| "learning_rate": 0.0004369866045428072, | |
| "loss": 3.3133, | |
| "step": 46750 | |
| }, | |
| { | |
| "epoch": 13.620924545877969, | |
| "grad_norm": 0.3616696000099182, | |
| "learning_rate": 0.0004368118811881188, | |
| "loss": 3.3149, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 13.635479739170936, | |
| "grad_norm": 0.3452049195766449, | |
| "learning_rate": 0.0004366371578334303, | |
| "loss": 3.3215, | |
| "step": 46850 | |
| }, | |
| { | |
| "epoch": 13.650034932463903, | |
| "grad_norm": 0.3689325749874115, | |
| "learning_rate": 0.000436462434478742, | |
| "loss": 3.3393, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 13.66459012575687, | |
| "grad_norm": 0.3743153214454651, | |
| "learning_rate": 0.00043628771112405357, | |
| "loss": 3.3319, | |
| "step": 46950 | |
| }, | |
| { | |
| "epoch": 13.679145319049837, | |
| "grad_norm": 0.3641968071460724, | |
| "learning_rate": 0.00043611298776936516, | |
| "loss": 3.3299, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 13.679145319049837, | |
| "eval_accuracy": 0.3709644796049578, | |
| "eval_loss": 3.550139904022217, | |
| "eval_runtime": 180.6262, | |
| "eval_samples_per_second": 92.185, | |
| "eval_steps_per_second": 5.763, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 13.693700512342804, | |
| "grad_norm": 0.36270520091056824, | |
| "learning_rate": 0.0004359382644146767, | |
| "loss": 3.3307, | |
| "step": 47050 | |
| }, | |
| { | |
| "epoch": 13.708255705635771, | |
| "grad_norm": 0.35284462571144104, | |
| "learning_rate": 0.0004357635410599883, | |
| "loss": 3.3265, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 13.722810898928739, | |
| "grad_norm": 0.3471984565258026, | |
| "learning_rate": 0.0004355888177052999, | |
| "loss": 3.3359, | |
| "step": 47150 | |
| }, | |
| { | |
| "epoch": 13.737366092221706, | |
| "grad_norm": 0.34575363993644714, | |
| "learning_rate": 0.00043541409435061154, | |
| "loss": 3.3259, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 13.75192128551467, | |
| "grad_norm": 0.34385809302330017, | |
| "learning_rate": 0.0004352393709959231, | |
| "loss": 3.3322, | |
| "step": 47250 | |
| }, | |
| { | |
| "epoch": 13.766476478807638, | |
| "grad_norm": 0.34404346346855164, | |
| "learning_rate": 0.0004350646476412347, | |
| "loss": 3.3379, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 13.781031672100605, | |
| "grad_norm": 0.3680616021156311, | |
| "learning_rate": 0.00043488992428654627, | |
| "loss": 3.3403, | |
| "step": 47350 | |
| }, | |
| { | |
| "epoch": 13.795586865393572, | |
| "grad_norm": 0.36942896246910095, | |
| "learning_rate": 0.0004347152009318578, | |
| "loss": 3.3231, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 13.810142058686539, | |
| "grad_norm": 0.34883928298950195, | |
| "learning_rate": 0.00043454047757716946, | |
| "loss": 3.3337, | |
| "step": 47450 | |
| }, | |
| { | |
| "epoch": 13.824697251979506, | |
| "grad_norm": 0.38166287541389465, | |
| "learning_rate": 0.00043436575422248105, | |
| "loss": 3.3319, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 13.839252445272473, | |
| "grad_norm": 0.353601336479187, | |
| "learning_rate": 0.00043419103086779265, | |
| "loss": 3.3325, | |
| "step": 47550 | |
| }, | |
| { | |
| "epoch": 13.85380763856544, | |
| "grad_norm": 0.3711412847042084, | |
| "learning_rate": 0.0004340163075131042, | |
| "loss": 3.3234, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 13.868362831858407, | |
| "grad_norm": 0.3460991084575653, | |
| "learning_rate": 0.0004338415841584158, | |
| "loss": 3.3401, | |
| "step": 47650 | |
| }, | |
| { | |
| "epoch": 13.882918025151374, | |
| "grad_norm": 0.3756689429283142, | |
| "learning_rate": 0.0004336668608037274, | |
| "loss": 3.3471, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 13.897473218444341, | |
| "grad_norm": 0.34957924485206604, | |
| "learning_rate": 0.000433492137449039, | |
| "loss": 3.3332, | |
| "step": 47750 | |
| }, | |
| { | |
| "epoch": 13.912028411737309, | |
| "grad_norm": 0.34652629494667053, | |
| "learning_rate": 0.00043331741409435056, | |
| "loss": 3.3283, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 13.926583605030276, | |
| "grad_norm": 0.35131871700286865, | |
| "learning_rate": 0.00043314269073966216, | |
| "loss": 3.3328, | |
| "step": 47850 | |
| }, | |
| { | |
| "epoch": 13.94113879832324, | |
| "grad_norm": 0.3381422460079193, | |
| "learning_rate": 0.00043296796738497375, | |
| "loss": 3.3268, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 13.955693991616208, | |
| "grad_norm": 0.33312922716140747, | |
| "learning_rate": 0.00043279324403028535, | |
| "loss": 3.3311, | |
| "step": 47950 | |
| }, | |
| { | |
| "epoch": 13.970249184909175, | |
| "grad_norm": 0.36296528577804565, | |
| "learning_rate": 0.0004326185206755969, | |
| "loss": 3.3344, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 13.970249184909175, | |
| "eval_accuracy": 0.37150863102901854, | |
| "eval_loss": 3.5411489009857178, | |
| "eval_runtime": 180.6801, | |
| "eval_samples_per_second": 92.157, | |
| "eval_steps_per_second": 5.762, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 13.984804378202142, | |
| "grad_norm": 0.3755042254924774, | |
| "learning_rate": 0.00043244379732090854, | |
| "loss": 3.339, | |
| "step": 48050 | |
| }, | |
| { | |
| "epoch": 13.99935957149511, | |
| "grad_norm": 0.35796603560447693, | |
| "learning_rate": 0.00043226907396622013, | |
| "loss": 3.3514, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 14.013681881695389, | |
| "grad_norm": 0.35690686106681824, | |
| "learning_rate": 0.0004320943506115317, | |
| "loss": 3.2215, | |
| "step": 48150 | |
| }, | |
| { | |
| "epoch": 14.028237074988356, | |
| "grad_norm": 0.3859853148460388, | |
| "learning_rate": 0.00043191962725684326, | |
| "loss": 3.2391, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 14.042792268281323, | |
| "grad_norm": 0.356228232383728, | |
| "learning_rate": 0.00043174490390215486, | |
| "loss": 3.244, | |
| "step": 48250 | |
| }, | |
| { | |
| "epoch": 14.05734746157429, | |
| "grad_norm": 0.3733968734741211, | |
| "learning_rate": 0.0004315701805474665, | |
| "loss": 3.2455, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 14.071902654867257, | |
| "grad_norm": 0.3651208281517029, | |
| "learning_rate": 0.0004313954571927781, | |
| "loss": 3.2535, | |
| "step": 48350 | |
| }, | |
| { | |
| "epoch": 14.086457848160224, | |
| "grad_norm": 0.3808946907520294, | |
| "learning_rate": 0.00043122073383808964, | |
| "loss": 3.2467, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 14.101013041453191, | |
| "grad_norm": 0.34354254603385925, | |
| "learning_rate": 0.00043104601048340124, | |
| "loss": 3.2369, | |
| "step": 48450 | |
| }, | |
| { | |
| "epoch": 14.115568234746158, | |
| "grad_norm": 0.3820490837097168, | |
| "learning_rate": 0.00043087128712871283, | |
| "loss": 3.2512, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 14.130123428039123, | |
| "grad_norm": 0.3820575475692749, | |
| "learning_rate": 0.00043069656377402437, | |
| "loss": 3.24, | |
| "step": 48550 | |
| }, | |
| { | |
| "epoch": 14.14467862133209, | |
| "grad_norm": 0.35366949439048767, | |
| "learning_rate": 0.000430521840419336, | |
| "loss": 3.2534, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 14.159233814625058, | |
| "grad_norm": 0.39843541383743286, | |
| "learning_rate": 0.0004303471170646476, | |
| "loss": 3.266, | |
| "step": 48650 | |
| }, | |
| { | |
| "epoch": 14.173789007918025, | |
| "grad_norm": 0.37052369117736816, | |
| "learning_rate": 0.0004301723937099592, | |
| "loss": 3.2609, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 14.188344201210992, | |
| "grad_norm": 0.3426913321018219, | |
| "learning_rate": 0.00042999767035527075, | |
| "loss": 3.2588, | |
| "step": 48750 | |
| }, | |
| { | |
| "epoch": 14.202899394503959, | |
| "grad_norm": 0.38335371017456055, | |
| "learning_rate": 0.00042982294700058234, | |
| "loss": 3.2661, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 14.217454587796926, | |
| "grad_norm": 0.36299335956573486, | |
| "learning_rate": 0.000429648223645894, | |
| "loss": 3.2673, | |
| "step": 48850 | |
| }, | |
| { | |
| "epoch": 14.232009781089893, | |
| "grad_norm": 0.37311017513275146, | |
| "learning_rate": 0.0004294735002912056, | |
| "loss": 3.2794, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 14.24656497438286, | |
| "grad_norm": 0.37185418605804443, | |
| "learning_rate": 0.0004292987769365172, | |
| "loss": 3.2617, | |
| "step": 48950 | |
| }, | |
| { | |
| "epoch": 14.261120167675827, | |
| "grad_norm": 0.38323771953582764, | |
| "learning_rate": 0.0004291240535818287, | |
| "loss": 3.2717, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 14.261120167675827, | |
| "eval_accuracy": 0.37077866893510036, | |
| "eval_loss": 3.561283826828003, | |
| "eval_runtime": 180.7027, | |
| "eval_samples_per_second": 92.146, | |
| "eval_steps_per_second": 5.761, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 14.275675360968794, | |
| "grad_norm": 0.38571175932884216, | |
| "learning_rate": 0.0004289493302271403, | |
| "loss": 3.278, | |
| "step": 49050 | |
| }, | |
| { | |
| "epoch": 14.290230554261761, | |
| "grad_norm": 0.36952513456344604, | |
| "learning_rate": 0.0004287746068724519, | |
| "loss": 3.2728, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 14.304785747554728, | |
| "grad_norm": 0.3557230234146118, | |
| "learning_rate": 0.00042859988351776356, | |
| "loss": 3.2858, | |
| "step": 49150 | |
| }, | |
| { | |
| "epoch": 14.319340940847695, | |
| "grad_norm": 0.3847562074661255, | |
| "learning_rate": 0.0004284251601630751, | |
| "loss": 3.2943, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 14.33389613414066, | |
| "grad_norm": 0.35433831810951233, | |
| "learning_rate": 0.0004282504368083867, | |
| "loss": 3.2786, | |
| "step": 49250 | |
| }, | |
| { | |
| "epoch": 14.348451327433628, | |
| "grad_norm": 0.35621702671051025, | |
| "learning_rate": 0.0004280757134536983, | |
| "loss": 3.2851, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 14.363006520726595, | |
| "grad_norm": 0.38574540615081787, | |
| "learning_rate": 0.0004279009900990098, | |
| "loss": 3.2885, | |
| "step": 49350 | |
| }, | |
| { | |
| "epoch": 14.377561714019562, | |
| "grad_norm": 0.3726487457752228, | |
| "learning_rate": 0.0004277262667443214, | |
| "loss": 3.2838, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 14.392116907312529, | |
| "grad_norm": 0.37835362553596497, | |
| "learning_rate": 0.00042755154338963307, | |
| "loss": 3.2886, | |
| "step": 49450 | |
| }, | |
| { | |
| "epoch": 14.406672100605496, | |
| "grad_norm": 0.40398108959198, | |
| "learning_rate": 0.00042737682003494466, | |
| "loss": 3.2928, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 14.421227293898463, | |
| "grad_norm": 0.3870435059070587, | |
| "learning_rate": 0.0004272020966802562, | |
| "loss": 3.279, | |
| "step": 49550 | |
| }, | |
| { | |
| "epoch": 14.43578248719143, | |
| "grad_norm": 0.35079923272132874, | |
| "learning_rate": 0.0004270273733255678, | |
| "loss": 3.2766, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 14.450337680484397, | |
| "grad_norm": 0.382772833108902, | |
| "learning_rate": 0.0004268526499708794, | |
| "loss": 3.2812, | |
| "step": 49650 | |
| }, | |
| { | |
| "epoch": 14.464892873777364, | |
| "grad_norm": 0.3519676625728607, | |
| "learning_rate": 0.00042667792661619104, | |
| "loss": 3.2985, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 14.479448067070331, | |
| "grad_norm": 0.363188236951828, | |
| "learning_rate": 0.0004265032032615026, | |
| "loss": 3.2961, | |
| "step": 49750 | |
| }, | |
| { | |
| "epoch": 14.494003260363298, | |
| "grad_norm": 0.3657895028591156, | |
| "learning_rate": 0.0004263284799068142, | |
| "loss": 3.29, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 14.508558453656265, | |
| "grad_norm": 0.35007017850875854, | |
| "learning_rate": 0.00042615375655212577, | |
| "loss": 3.3003, | |
| "step": 49850 | |
| }, | |
| { | |
| "epoch": 14.52311364694923, | |
| "grad_norm": 0.36784571409225464, | |
| "learning_rate": 0.00042597903319743736, | |
| "loss": 3.297, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 14.537668840242198, | |
| "grad_norm": 0.38216352462768555, | |
| "learning_rate": 0.0004258043098427489, | |
| "loss": 3.3064, | |
| "step": 49950 | |
| }, | |
| { | |
| "epoch": 14.552224033535165, | |
| "grad_norm": 0.3852609694004059, | |
| "learning_rate": 0.00042562958648806055, | |
| "loss": 3.3047, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 14.552224033535165, | |
| "eval_accuracy": 0.3711253744860678, | |
| "eval_loss": 3.5540521144866943, | |
| "eval_runtime": 180.6741, | |
| "eval_samples_per_second": 92.16, | |
| "eval_steps_per_second": 5.762, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 14.566779226828132, | |
| "grad_norm": 0.37750688195228577, | |
| "learning_rate": 0.00042545486313337214, | |
| "loss": 3.3066, | |
| "step": 50050 | |
| }, | |
| { | |
| "epoch": 14.581334420121099, | |
| "grad_norm": 0.3378167748451233, | |
| "learning_rate": 0.00042528013977868374, | |
| "loss": 3.3001, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 14.595889613414066, | |
| "grad_norm": 0.3360258936882019, | |
| "learning_rate": 0.0004251054164239953, | |
| "loss": 3.3126, | |
| "step": 50150 | |
| }, | |
| { | |
| "epoch": 14.610444806707033, | |
| "grad_norm": 0.37378090620040894, | |
| "learning_rate": 0.0004249306930693069, | |
| "loss": 3.312, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 14.625, | |
| "grad_norm": 0.3751259744167328, | |
| "learning_rate": 0.0004247559697146185, | |
| "loss": 3.3087, | |
| "step": 50250 | |
| }, | |
| { | |
| "epoch": 14.639555193292967, | |
| "grad_norm": 0.3533448278903961, | |
| "learning_rate": 0.0004245812463599301, | |
| "loss": 3.3096, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 14.654110386585934, | |
| "grad_norm": 0.34902751445770264, | |
| "learning_rate": 0.00042440652300524166, | |
| "loss": 3.3138, | |
| "step": 50350 | |
| }, | |
| { | |
| "epoch": 14.668665579878901, | |
| "grad_norm": 0.3788563907146454, | |
| "learning_rate": 0.00042423179965055325, | |
| "loss": 3.3182, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 14.683220773171868, | |
| "grad_norm": 0.36749276518821716, | |
| "learning_rate": 0.00042405707629586484, | |
| "loss": 3.3093, | |
| "step": 50450 | |
| }, | |
| { | |
| "epoch": 14.697775966464835, | |
| "grad_norm": 0.35430875420570374, | |
| "learning_rate": 0.0004238823529411764, | |
| "loss": 3.3083, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 14.712331159757802, | |
| "grad_norm": 0.3698976933956146, | |
| "learning_rate": 0.00042370762958648803, | |
| "loss": 3.3094, | |
| "step": 50550 | |
| }, | |
| { | |
| "epoch": 14.72688635305077, | |
| "grad_norm": 0.3699832260608673, | |
| "learning_rate": 0.00042353290623179963, | |
| "loss": 3.3171, | |
| "step": 50600 | |
| }, | |
| { | |
| "epoch": 14.741441546343735, | |
| "grad_norm": 0.38030651211738586, | |
| "learning_rate": 0.0004233581828771112, | |
| "loss": 3.3168, | |
| "step": 50650 | |
| }, | |
| { | |
| "epoch": 14.755996739636702, | |
| "grad_norm": 0.3591087758541107, | |
| "learning_rate": 0.00042318345952242276, | |
| "loss": 3.3236, | |
| "step": 50700 | |
| }, | |
| { | |
| "epoch": 14.770551932929669, | |
| "grad_norm": 0.35677772760391235, | |
| "learning_rate": 0.00042300873616773436, | |
| "loss": 3.3107, | |
| "step": 50750 | |
| }, | |
| { | |
| "epoch": 14.785107126222636, | |
| "grad_norm": 0.33428385853767395, | |
| "learning_rate": 0.00042283401281304595, | |
| "loss": 3.3119, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 14.799662319515603, | |
| "grad_norm": 0.35604846477508545, | |
| "learning_rate": 0.0004226592894583576, | |
| "loss": 3.3181, | |
| "step": 50850 | |
| }, | |
| { | |
| "epoch": 14.81421751280857, | |
| "grad_norm": 0.36865541338920593, | |
| "learning_rate": 0.00042248456610366914, | |
| "loss": 3.3018, | |
| "step": 50900 | |
| }, | |
| { | |
| "epoch": 14.828772706101537, | |
| "grad_norm": 0.38723063468933105, | |
| "learning_rate": 0.00042230984274898073, | |
| "loss": 3.3145, | |
| "step": 50950 | |
| }, | |
| { | |
| "epoch": 14.843327899394504, | |
| "grad_norm": 0.3579923212528229, | |
| "learning_rate": 0.00042213511939429233, | |
| "loss": 3.3198, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 14.843327899394504, | |
| "eval_accuracy": 0.37172370599792376, | |
| "eval_loss": 3.5439720153808594, | |
| "eval_runtime": 180.7444, | |
| "eval_samples_per_second": 92.125, | |
| "eval_steps_per_second": 5.76, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 14.857883092687471, | |
| "grad_norm": 0.35483837127685547, | |
| "learning_rate": 0.0004219603960396039, | |
| "loss": 3.3159, | |
| "step": 51050 | |
| }, | |
| { | |
| "epoch": 14.872438285980438, | |
| "grad_norm": 0.3707719147205353, | |
| "learning_rate": 0.0004217856726849155, | |
| "loss": 3.3257, | |
| "step": 51100 | |
| }, | |
| { | |
| "epoch": 14.886993479273405, | |
| "grad_norm": 0.38554805517196655, | |
| "learning_rate": 0.0004216109493302271, | |
| "loss": 3.3335, | |
| "step": 51150 | |
| }, | |
| { | |
| "epoch": 14.901548672566372, | |
| "grad_norm": 0.3630807101726532, | |
| "learning_rate": 0.0004214362259755387, | |
| "loss": 3.3156, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 14.916103865859338, | |
| "grad_norm": 0.34905824065208435, | |
| "learning_rate": 0.0004212615026208503, | |
| "loss": 3.3278, | |
| "step": 51250 | |
| }, | |
| { | |
| "epoch": 14.930659059152305, | |
| "grad_norm": 0.3572143316268921, | |
| "learning_rate": 0.00042108677926616184, | |
| "loss": 3.3322, | |
| "step": 51300 | |
| }, | |
| { | |
| "epoch": 14.945214252445272, | |
| "grad_norm": 0.3421500623226166, | |
| "learning_rate": 0.00042091205591147343, | |
| "loss": 3.3142, | |
| "step": 51350 | |
| }, | |
| { | |
| "epoch": 14.959769445738239, | |
| "grad_norm": 0.357888787984848, | |
| "learning_rate": 0.0004207373325567851, | |
| "loss": 3.3227, | |
| "step": 51400 | |
| }, | |
| { | |
| "epoch": 14.974324639031206, | |
| "grad_norm": 0.3665570914745331, | |
| "learning_rate": 0.0004205626092020967, | |
| "loss": 3.3336, | |
| "step": 51450 | |
| }, | |
| { | |
| "epoch": 14.988879832324173, | |
| "grad_norm": 0.35188835859298706, | |
| "learning_rate": 0.0004203878858474082, | |
| "loss": 3.3368, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 15.003202142524453, | |
| "grad_norm": 0.3578493595123291, | |
| "learning_rate": 0.0004202131624927198, | |
| "loss": 3.3033, | |
| "step": 51550 | |
| }, | |
| { | |
| "epoch": 15.01775733581742, | |
| "grad_norm": 0.34845006465911865, | |
| "learning_rate": 0.0004200384391380314, | |
| "loss": 3.2108, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 15.032312529110387, | |
| "grad_norm": 0.35948559641838074, | |
| "learning_rate": 0.00041986371578334305, | |
| "loss": 3.2219, | |
| "step": 51650 | |
| }, | |
| { | |
| "epoch": 15.046867722403354, | |
| "grad_norm": 0.3708672821521759, | |
| "learning_rate": 0.0004196889924286546, | |
| "loss": 3.2238, | |
| "step": 51700 | |
| }, | |
| { | |
| "epoch": 15.06142291569632, | |
| "grad_norm": 0.36989542841911316, | |
| "learning_rate": 0.0004195142690739662, | |
| "loss": 3.2232, | |
| "step": 51750 | |
| }, | |
| { | |
| "epoch": 15.075978108989288, | |
| "grad_norm": 0.39784368872642517, | |
| "learning_rate": 0.0004193395457192778, | |
| "loss": 3.2288, | |
| "step": 51800 | |
| }, | |
| { | |
| "epoch": 15.090533302282255, | |
| "grad_norm": 0.37010207772254944, | |
| "learning_rate": 0.0004191648223645893, | |
| "loss": 3.2277, | |
| "step": 51850 | |
| }, | |
| { | |
| "epoch": 15.105088495575222, | |
| "grad_norm": 0.38173454999923706, | |
| "learning_rate": 0.0004189900990099009, | |
| "loss": 3.2347, | |
| "step": 51900 | |
| }, | |
| { | |
| "epoch": 15.119643688868187, | |
| "grad_norm": 0.3663221597671509, | |
| "learning_rate": 0.00041881537565521256, | |
| "loss": 3.2352, | |
| "step": 51950 | |
| }, | |
| { | |
| "epoch": 15.134198882161154, | |
| "grad_norm": 0.3820110261440277, | |
| "learning_rate": 0.00041864065230052416, | |
| "loss": 3.2314, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 15.134198882161154, | |
| "eval_accuracy": 0.3710468662460521, | |
| "eval_loss": 3.5597763061523438, | |
| "eval_runtime": 180.8133, | |
| "eval_samples_per_second": 92.089, | |
| "eval_steps_per_second": 5.757, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 15.148754075454121, | |
| "grad_norm": 0.3669772148132324, | |
| "learning_rate": 0.00041846592894583575, | |
| "loss": 3.2492, | |
| "step": 52050 | |
| }, | |
| { | |
| "epoch": 15.163309268747089, | |
| "grad_norm": 0.34591078758239746, | |
| "learning_rate": 0.0004182912055911473, | |
| "loss": 3.2431, | |
| "step": 52100 | |
| }, | |
| { | |
| "epoch": 15.177864462040056, | |
| "grad_norm": 0.3824208676815033, | |
| "learning_rate": 0.0004181164822364589, | |
| "loss": 3.2466, | |
| "step": 52150 | |
| }, | |
| { | |
| "epoch": 15.192419655333023, | |
| "grad_norm": 0.39810559153556824, | |
| "learning_rate": 0.0004179417588817705, | |
| "loss": 3.2503, | |
| "step": 52200 | |
| }, | |
| { | |
| "epoch": 15.20697484862599, | |
| "grad_norm": 0.3922621011734009, | |
| "learning_rate": 0.00041776703552708213, | |
| "loss": 3.2585, | |
| "step": 52250 | |
| }, | |
| { | |
| "epoch": 15.221530041918957, | |
| "grad_norm": 0.3696410357952118, | |
| "learning_rate": 0.00041759231217239367, | |
| "loss": 3.2627, | |
| "step": 52300 | |
| }, | |
| { | |
| "epoch": 15.236085235211924, | |
| "grad_norm": 0.3916743993759155, | |
| "learning_rate": 0.00041741758881770527, | |
| "loss": 3.2578, | |
| "step": 52350 | |
| }, | |
| { | |
| "epoch": 15.25064042850489, | |
| "grad_norm": 0.38373348116874695, | |
| "learning_rate": 0.00041724286546301686, | |
| "loss": 3.26, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 15.265195621797858, | |
| "grad_norm": 0.3440963923931122, | |
| "learning_rate": 0.0004170681421083284, | |
| "loss": 3.2547, | |
| "step": 52450 | |
| }, | |
| { | |
| "epoch": 15.279750815090825, | |
| "grad_norm": 0.3764485716819763, | |
| "learning_rate": 0.00041689341875364005, | |
| "loss": 3.2616, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 15.294306008383792, | |
| "grad_norm": 0.3800085783004761, | |
| "learning_rate": 0.00041671869539895164, | |
| "loss": 3.2645, | |
| "step": 52550 | |
| }, | |
| { | |
| "epoch": 15.30886120167676, | |
| "grad_norm": 0.3482251763343811, | |
| "learning_rate": 0.00041654397204426324, | |
| "loss": 3.2679, | |
| "step": 52600 | |
| }, | |
| { | |
| "epoch": 15.323416394969724, | |
| "grad_norm": 0.40797919034957886, | |
| "learning_rate": 0.0004163692486895748, | |
| "loss": 3.274, | |
| "step": 52650 | |
| }, | |
| { | |
| "epoch": 15.337971588262691, | |
| "grad_norm": 0.37061765789985657, | |
| "learning_rate": 0.00041619452533488637, | |
| "loss": 3.2729, | |
| "step": 52700 | |
| }, | |
| { | |
| "epoch": 15.352526781555659, | |
| "grad_norm": 0.3923170268535614, | |
| "learning_rate": 0.00041601980198019797, | |
| "loss": 3.2706, | |
| "step": 52750 | |
| }, | |
| { | |
| "epoch": 15.367081974848626, | |
| "grad_norm": 0.3743267357349396, | |
| "learning_rate": 0.0004158450786255096, | |
| "loss": 3.2751, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 15.381637168141593, | |
| "grad_norm": 0.3853559195995331, | |
| "learning_rate": 0.00041567035527082115, | |
| "loss": 3.2698, | |
| "step": 52850 | |
| }, | |
| { | |
| "epoch": 15.39619236143456, | |
| "grad_norm": 0.39672213792800903, | |
| "learning_rate": 0.00041549563191613275, | |
| "loss": 3.2775, | |
| "step": 52900 | |
| }, | |
| { | |
| "epoch": 15.410747554727527, | |
| "grad_norm": 0.3728732168674469, | |
| "learning_rate": 0.00041532090856144434, | |
| "loss": 3.2878, | |
| "step": 52950 | |
| }, | |
| { | |
| "epoch": 15.425302748020494, | |
| "grad_norm": 0.363398939371109, | |
| "learning_rate": 0.00041514618520675594, | |
| "loss": 3.288, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 15.425302748020494, | |
| "eval_accuracy": 0.371279217728853, | |
| "eval_loss": 3.5508575439453125, | |
| "eval_runtime": 180.7239, | |
| "eval_samples_per_second": 92.135, | |
| "eval_steps_per_second": 5.76, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 15.439857941313461, | |
| "grad_norm": 0.3929230570793152, | |
| "learning_rate": 0.00041497146185206753, | |
| "loss": 3.2737, | |
| "step": 53050 | |
| }, | |
| { | |
| "epoch": 15.454413134606428, | |
| "grad_norm": 0.3794131875038147, | |
| "learning_rate": 0.0004147967384973791, | |
| "loss": 3.278, | |
| "step": 53100 | |
| }, | |
| { | |
| "epoch": 15.468968327899395, | |
| "grad_norm": 0.40206944942474365, | |
| "learning_rate": 0.0004146220151426907, | |
| "loss": 3.2781, | |
| "step": 53150 | |
| }, | |
| { | |
| "epoch": 15.483523521192362, | |
| "grad_norm": 0.411923348903656, | |
| "learning_rate": 0.0004144472917880023, | |
| "loss": 3.2788, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 15.49807871448533, | |
| "grad_norm": 0.35818853974342346, | |
| "learning_rate": 0.00041427256843331385, | |
| "loss": 3.2878, | |
| "step": 53250 | |
| }, | |
| { | |
| "epoch": 15.512633907778294, | |
| "grad_norm": 0.36999696493148804, | |
| "learning_rate": 0.00041409784507862545, | |
| "loss": 3.2931, | |
| "step": 53300 | |
| }, | |
| { | |
| "epoch": 15.527189101071261, | |
| "grad_norm": 0.37897634506225586, | |
| "learning_rate": 0.0004139231217239371, | |
| "loss": 3.2671, | |
| "step": 53350 | |
| }, | |
| { | |
| "epoch": 15.541744294364229, | |
| "grad_norm": 0.37530139088630676, | |
| "learning_rate": 0.0004137483983692487, | |
| "loss": 3.2885, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 15.556299487657196, | |
| "grad_norm": 0.3707473576068878, | |
| "learning_rate": 0.00041357367501456023, | |
| "loss": 3.286, | |
| "step": 53450 | |
| }, | |
| { | |
| "epoch": 15.570854680950163, | |
| "grad_norm": 0.3612443506717682, | |
| "learning_rate": 0.0004133989516598718, | |
| "loss": 3.2783, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 15.58540987424313, | |
| "grad_norm": 0.3725733458995819, | |
| "learning_rate": 0.0004132242283051834, | |
| "loss": 3.2967, | |
| "step": 53550 | |
| }, | |
| { | |
| "epoch": 15.599965067536097, | |
| "grad_norm": 0.372572660446167, | |
| "learning_rate": 0.00041304950495049496, | |
| "loss": 3.3004, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 15.614520260829064, | |
| "grad_norm": 0.3715369999408722, | |
| "learning_rate": 0.0004128747815958066, | |
| "loss": 3.3003, | |
| "step": 53650 | |
| }, | |
| { | |
| "epoch": 15.629075454122031, | |
| "grad_norm": 0.3832106292247772, | |
| "learning_rate": 0.0004127000582411182, | |
| "loss": 3.2956, | |
| "step": 53700 | |
| }, | |
| { | |
| "epoch": 15.643630647414998, | |
| "grad_norm": 0.357073575258255, | |
| "learning_rate": 0.0004125253348864298, | |
| "loss": 3.2928, | |
| "step": 53750 | |
| }, | |
| { | |
| "epoch": 15.658185840707965, | |
| "grad_norm": 0.3932480812072754, | |
| "learning_rate": 0.00041235061153174134, | |
| "loss": 3.2989, | |
| "step": 53800 | |
| }, | |
| { | |
| "epoch": 15.672741034000932, | |
| "grad_norm": 0.3740988075733185, | |
| "learning_rate": 0.00041217588817705293, | |
| "loss": 3.2842, | |
| "step": 53850 | |
| }, | |
| { | |
| "epoch": 15.6872962272939, | |
| "grad_norm": 0.3591715693473816, | |
| "learning_rate": 0.0004120011648223646, | |
| "loss": 3.3048, | |
| "step": 53900 | |
| }, | |
| { | |
| "epoch": 15.701851420586866, | |
| "grad_norm": 0.38362252712249756, | |
| "learning_rate": 0.0004118264414676762, | |
| "loss": 3.286, | |
| "step": 53950 | |
| }, | |
| { | |
| "epoch": 15.716406613879832, | |
| "grad_norm": 0.35855767130851746, | |
| "learning_rate": 0.0004116517181129877, | |
| "loss": 3.292, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 15.716406613879832, | |
| "eval_accuracy": 0.37204878652469525, | |
| "eval_loss": 3.5418312549591064, | |
| "eval_runtime": 180.4949, | |
| "eval_samples_per_second": 92.252, | |
| "eval_steps_per_second": 5.767, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 15.730961807172799, | |
| "grad_norm": 0.3673221468925476, | |
| "learning_rate": 0.0004114769947582993, | |
| "loss": 3.2939, | |
| "step": 54050 | |
| }, | |
| { | |
| "epoch": 15.745517000465766, | |
| "grad_norm": 0.3466639220714569, | |
| "learning_rate": 0.0004113022714036109, | |
| "loss": 3.3044, | |
| "step": 54100 | |
| }, | |
| { | |
| "epoch": 15.760072193758733, | |
| "grad_norm": 0.3751138746738434, | |
| "learning_rate": 0.0004111275480489225, | |
| "loss": 3.2979, | |
| "step": 54150 | |
| }, | |
| { | |
| "epoch": 15.7746273870517, | |
| "grad_norm": 0.3524673879146576, | |
| "learning_rate": 0.0004109528246942341, | |
| "loss": 3.3023, | |
| "step": 54200 | |
| }, | |
| { | |
| "epoch": 15.789182580344667, | |
| "grad_norm": 0.3760589063167572, | |
| "learning_rate": 0.0004107781013395457, | |
| "loss": 3.3052, | |
| "step": 54250 | |
| }, | |
| { | |
| "epoch": 15.803737773637634, | |
| "grad_norm": 0.37704402208328247, | |
| "learning_rate": 0.0004106033779848573, | |
| "loss": 3.2998, | |
| "step": 54300 | |
| }, | |
| { | |
| "epoch": 15.818292966930601, | |
| "grad_norm": 0.34257006645202637, | |
| "learning_rate": 0.0004104286546301689, | |
| "loss": 3.3111, | |
| "step": 54350 | |
| }, | |
| { | |
| "epoch": 15.832848160223568, | |
| "grad_norm": 0.3559759855270386, | |
| "learning_rate": 0.0004102539312754804, | |
| "loss": 3.2984, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 15.847403353516535, | |
| "grad_norm": 0.3894140124320984, | |
| "learning_rate": 0.00041007920792079206, | |
| "loss": 3.3095, | |
| "step": 54450 | |
| }, | |
| { | |
| "epoch": 15.861958546809502, | |
| "grad_norm": 0.3747352361679077, | |
| "learning_rate": 0.00040990448456610366, | |
| "loss": 3.3113, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 15.87651374010247, | |
| "grad_norm": 0.3795801103115082, | |
| "learning_rate": 0.00040972976121141525, | |
| "loss": 3.3034, | |
| "step": 54550 | |
| }, | |
| { | |
| "epoch": 15.891068933395436, | |
| "grad_norm": 0.3692949712276459, | |
| "learning_rate": 0.0004095550378567268, | |
| "loss": 3.3077, | |
| "step": 54600 | |
| }, | |
| { | |
| "epoch": 15.905624126688402, | |
| "grad_norm": 0.37650200724601746, | |
| "learning_rate": 0.0004093803145020384, | |
| "loss": 3.3151, | |
| "step": 54650 | |
| }, | |
| { | |
| "epoch": 15.920179319981369, | |
| "grad_norm": 0.3589628338813782, | |
| "learning_rate": 0.00040920559114735, | |
| "loss": 3.3076, | |
| "step": 54700 | |
| }, | |
| { | |
| "epoch": 15.934734513274336, | |
| "grad_norm": 0.38173508644104004, | |
| "learning_rate": 0.00040903086779266163, | |
| "loss": 3.32, | |
| "step": 54750 | |
| }, | |
| { | |
| "epoch": 15.949289706567303, | |
| "grad_norm": 0.3892667293548584, | |
| "learning_rate": 0.00040885614443797317, | |
| "loss": 3.3163, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 15.96384489986027, | |
| "grad_norm": 0.364224374294281, | |
| "learning_rate": 0.00040868142108328476, | |
| "loss": 3.3101, | |
| "step": 54850 | |
| }, | |
| { | |
| "epoch": 15.978400093153237, | |
| "grad_norm": 0.38242313265800476, | |
| "learning_rate": 0.00040850669772859636, | |
| "loss": 3.3056, | |
| "step": 54900 | |
| }, | |
| { | |
| "epoch": 15.992955286446204, | |
| "grad_norm": 0.34566983580589294, | |
| "learning_rate": 0.0004083319743739079, | |
| "loss": 3.3088, | |
| "step": 54950 | |
| }, | |
| { | |
| "epoch": 16.00727759664648, | |
| "grad_norm": 0.3479236364364624, | |
| "learning_rate": 0.0004081572510192195, | |
| "loss": 3.2576, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 16.00727759664648, | |
| "eval_accuracy": 0.3714727852008677, | |
| "eval_loss": 3.550360679626465, | |
| "eval_runtime": 180.7157, | |
| "eval_samples_per_second": 92.139, | |
| "eval_steps_per_second": 5.76, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 16.02183278993945, | |
| "grad_norm": 0.3490423858165741, | |
| "learning_rate": 0.00040798252766453114, | |
| "loss": 3.2051, | |
| "step": 55050 | |
| }, | |
| { | |
| "epoch": 16.036387983232416, | |
| "grad_norm": 0.35485896468162537, | |
| "learning_rate": 0.00040780780430984273, | |
| "loss": 3.1993, | |
| "step": 55100 | |
| }, | |
| { | |
| "epoch": 16.050943176525383, | |
| "grad_norm": 0.3467007875442505, | |
| "learning_rate": 0.0004076330809551543, | |
| "loss": 3.2033, | |
| "step": 55150 | |
| }, | |
| { | |
| "epoch": 16.06549836981835, | |
| "grad_norm": 0.36979350447654724, | |
| "learning_rate": 0.00040745835760046587, | |
| "loss": 3.2182, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 16.080053563111317, | |
| "grad_norm": 0.3978710174560547, | |
| "learning_rate": 0.00040728363424577746, | |
| "loss": 3.2076, | |
| "step": 55250 | |
| }, | |
| { | |
| "epoch": 16.094608756404284, | |
| "grad_norm": 0.36152997612953186, | |
| "learning_rate": 0.0004071089108910891, | |
| "loss": 3.227, | |
| "step": 55300 | |
| }, | |
| { | |
| "epoch": 16.10916394969725, | |
| "grad_norm": 0.40970709919929504, | |
| "learning_rate": 0.0004069341875364007, | |
| "loss": 3.2334, | |
| "step": 55350 | |
| }, | |
| { | |
| "epoch": 16.12371914299022, | |
| "grad_norm": 0.3361974060535431, | |
| "learning_rate": 0.00040675946418171225, | |
| "loss": 3.2195, | |
| "step": 55400 | |
| }, | |
| { | |
| "epoch": 16.138274336283185, | |
| "grad_norm": 0.3803759217262268, | |
| "learning_rate": 0.00040658474082702384, | |
| "loss": 3.2233, | |
| "step": 55450 | |
| }, | |
| { | |
| "epoch": 16.152829529576152, | |
| "grad_norm": 0.39719876646995544, | |
| "learning_rate": 0.00040641001747233543, | |
| "loss": 3.2228, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 16.16738472286912, | |
| "grad_norm": 0.3793357312679291, | |
| "learning_rate": 0.000406235294117647, | |
| "loss": 3.2222, | |
| "step": 55550 | |
| }, | |
| { | |
| "epoch": 16.181939916162086, | |
| "grad_norm": 0.3996943533420563, | |
| "learning_rate": 0.0004060605707629586, | |
| "loss": 3.2343, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 16.196495109455054, | |
| "grad_norm": 0.4160076677799225, | |
| "learning_rate": 0.0004058858474082702, | |
| "loss": 3.2444, | |
| "step": 55650 | |
| }, | |
| { | |
| "epoch": 16.21105030274802, | |
| "grad_norm": 0.42159706354141235, | |
| "learning_rate": 0.0004057111240535818, | |
| "loss": 3.2443, | |
| "step": 55700 | |
| }, | |
| { | |
| "epoch": 16.225605496040988, | |
| "grad_norm": 0.41101568937301636, | |
| "learning_rate": 0.00040553640069889335, | |
| "loss": 3.2388, | |
| "step": 55750 | |
| }, | |
| { | |
| "epoch": 16.240160689333955, | |
| "grad_norm": 0.36233237385749817, | |
| "learning_rate": 0.00040536167734420495, | |
| "loss": 3.2569, | |
| "step": 55800 | |
| }, | |
| { | |
| "epoch": 16.254715882626922, | |
| "grad_norm": 0.3940770626068115, | |
| "learning_rate": 0.0004051869539895166, | |
| "loss": 3.2503, | |
| "step": 55850 | |
| }, | |
| { | |
| "epoch": 16.26927107591989, | |
| "grad_norm": 0.3568994998931885, | |
| "learning_rate": 0.0004050122306348282, | |
| "loss": 3.2398, | |
| "step": 55900 | |
| }, | |
| { | |
| "epoch": 16.283826269212856, | |
| "grad_norm": 0.4108010232448578, | |
| "learning_rate": 0.00040483750728013973, | |
| "loss": 3.2505, | |
| "step": 55950 | |
| }, | |
| { | |
| "epoch": 16.298381462505823, | |
| "grad_norm": 0.38988006114959717, | |
| "learning_rate": 0.0004046627839254513, | |
| "loss": 3.2485, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 16.298381462505823, | |
| "eval_accuracy": 0.37194136656754806, | |
| "eval_loss": 3.5515103340148926, | |
| "eval_runtime": 180.5241, | |
| "eval_samples_per_second": 92.237, | |
| "eval_steps_per_second": 5.767, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 16.31293665579879, | |
| "grad_norm": 0.4487169086933136, | |
| "learning_rate": 0.0004044880605707629, | |
| "loss": 3.2605, | |
| "step": 56050 | |
| }, | |
| { | |
| "epoch": 16.327491849091757, | |
| "grad_norm": 0.36728495359420776, | |
| "learning_rate": 0.00040431333721607446, | |
| "loss": 3.2619, | |
| "step": 56100 | |
| }, | |
| { | |
| "epoch": 16.342047042384724, | |
| "grad_norm": 0.3760203719139099, | |
| "learning_rate": 0.0004041386138613861, | |
| "loss": 3.2557, | |
| "step": 56150 | |
| }, | |
| { | |
| "epoch": 16.35660223567769, | |
| "grad_norm": 0.3645574748516083, | |
| "learning_rate": 0.0004039638905066977, | |
| "loss": 3.2606, | |
| "step": 56200 | |
| }, | |
| { | |
| "epoch": 16.37115742897066, | |
| "grad_norm": 0.369003564119339, | |
| "learning_rate": 0.0004037891671520093, | |
| "loss": 3.2511, | |
| "step": 56250 | |
| }, | |
| { | |
| "epoch": 16.385712622263625, | |
| "grad_norm": 0.43754157423973083, | |
| "learning_rate": 0.0004036144437973209, | |
| "loss": 3.2484, | |
| "step": 56300 | |
| }, | |
| { | |
| "epoch": 16.40026781555659, | |
| "grad_norm": 0.3467438519001007, | |
| "learning_rate": 0.00040343972044263243, | |
| "loss": 3.2649, | |
| "step": 56350 | |
| }, | |
| { | |
| "epoch": 16.414823008849556, | |
| "grad_norm": 0.37996238470077515, | |
| "learning_rate": 0.0004032649970879441, | |
| "loss": 3.2618, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 16.429378202142523, | |
| "grad_norm": 0.37395748496055603, | |
| "learning_rate": 0.00040309027373325567, | |
| "loss": 3.2634, | |
| "step": 56450 | |
| }, | |
| { | |
| "epoch": 16.44393339543549, | |
| "grad_norm": 0.4084472060203552, | |
| "learning_rate": 0.00040291555037856727, | |
| "loss": 3.2607, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 16.458488588728457, | |
| "grad_norm": 0.4055795967578888, | |
| "learning_rate": 0.0004027408270238788, | |
| "loss": 3.2661, | |
| "step": 56550 | |
| }, | |
| { | |
| "epoch": 16.473043782021424, | |
| "grad_norm": 0.38332685828208923, | |
| "learning_rate": 0.0004025661036691904, | |
| "loss": 3.2826, | |
| "step": 56600 | |
| }, | |
| { | |
| "epoch": 16.48759897531439, | |
| "grad_norm": 0.36652758717536926, | |
| "learning_rate": 0.000402391380314502, | |
| "loss": 3.2754, | |
| "step": 56650 | |
| }, | |
| { | |
| "epoch": 16.50215416860736, | |
| "grad_norm": 0.38163354992866516, | |
| "learning_rate": 0.00040221665695981364, | |
| "loss": 3.2714, | |
| "step": 56700 | |
| }, | |
| { | |
| "epoch": 16.516709361900325, | |
| "grad_norm": 0.3861958980560303, | |
| "learning_rate": 0.0004020419336051252, | |
| "loss": 3.2711, | |
| "step": 56750 | |
| }, | |
| { | |
| "epoch": 16.531264555193292, | |
| "grad_norm": 0.37569305300712585, | |
| "learning_rate": 0.0004018672102504368, | |
| "loss": 3.2786, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 16.54581974848626, | |
| "grad_norm": 0.3796161711215973, | |
| "learning_rate": 0.00040169248689574837, | |
| "loss": 3.2842, | |
| "step": 56850 | |
| }, | |
| { | |
| "epoch": 16.560374941779227, | |
| "grad_norm": 0.3572014272212982, | |
| "learning_rate": 0.0004015177635410599, | |
| "loss": 3.2793, | |
| "step": 56900 | |
| }, | |
| { | |
| "epoch": 16.574930135072194, | |
| "grad_norm": 0.37812671065330505, | |
| "learning_rate": 0.0004013430401863715, | |
| "loss": 3.2792, | |
| "step": 56950 | |
| }, | |
| { | |
| "epoch": 16.58948532836516, | |
| "grad_norm": 0.364805668592453, | |
| "learning_rate": 0.00040116831683168315, | |
| "loss": 3.2742, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 16.58948532836516, | |
| "eval_accuracy": 0.3722474076708427, | |
| "eval_loss": 3.546841859817505, | |
| "eval_runtime": 180.5294, | |
| "eval_samples_per_second": 92.234, | |
| "eval_steps_per_second": 5.766, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 16.604040521658128, | |
| "grad_norm": 0.3842049539089203, | |
| "learning_rate": 0.00040099359347699475, | |
| "loss": 3.2678, | |
| "step": 57050 | |
| }, | |
| { | |
| "epoch": 16.618595714951095, | |
| "grad_norm": 0.34471479058265686, | |
| "learning_rate": 0.0004008188701223063, | |
| "loss": 3.2687, | |
| "step": 57100 | |
| }, | |
| { | |
| "epoch": 16.633150908244062, | |
| "grad_norm": 0.3672460615634918, | |
| "learning_rate": 0.0004006441467676179, | |
| "loss": 3.2822, | |
| "step": 57150 | |
| }, | |
| { | |
| "epoch": 16.64770610153703, | |
| "grad_norm": 0.4222690463066101, | |
| "learning_rate": 0.0004004694234129295, | |
| "loss": 3.2874, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 16.662261294829996, | |
| "grad_norm": 0.3623253405094147, | |
| "learning_rate": 0.0004002947000582411, | |
| "loss": 3.2867, | |
| "step": 57250 | |
| }, | |
| { | |
| "epoch": 16.676816488122963, | |
| "grad_norm": 0.415358304977417, | |
| "learning_rate": 0.00040011997670355267, | |
| "loss": 3.2783, | |
| "step": 57300 | |
| }, | |
| { | |
| "epoch": 16.69137168141593, | |
| "grad_norm": 0.3691982328891754, | |
| "learning_rate": 0.00039994525334886426, | |
| "loss": 3.2882, | |
| "step": 57350 | |
| }, | |
| { | |
| "epoch": 16.705926874708897, | |
| "grad_norm": 0.3871172070503235, | |
| "learning_rate": 0.00039977052999417585, | |
| "loss": 3.2929, | |
| "step": 57400 | |
| }, | |
| { | |
| "epoch": 16.720482068001864, | |
| "grad_norm": 0.3653024733066559, | |
| "learning_rate": 0.00039959580663948745, | |
| "loss": 3.283, | |
| "step": 57450 | |
| }, | |
| { | |
| "epoch": 16.73503726129483, | |
| "grad_norm": 0.35508090257644653, | |
| "learning_rate": 0.000399421083284799, | |
| "loss": 3.2825, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 16.7495924545878, | |
| "grad_norm": 0.36881718039512634, | |
| "learning_rate": 0.00039924635993011064, | |
| "loss": 3.2922, | |
| "step": 57550 | |
| }, | |
| { | |
| "epoch": 16.764147647880765, | |
| "grad_norm": 0.37382662296295166, | |
| "learning_rate": 0.00039907163657542223, | |
| "loss": 3.2958, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 16.778702841173732, | |
| "grad_norm": 0.36220356822013855, | |
| "learning_rate": 0.0003988969132207338, | |
| "loss": 3.2862, | |
| "step": 57650 | |
| }, | |
| { | |
| "epoch": 16.793258034466696, | |
| "grad_norm": 0.38938117027282715, | |
| "learning_rate": 0.00039872218986604537, | |
| "loss": 3.2864, | |
| "step": 57700 | |
| }, | |
| { | |
| "epoch": 16.807813227759663, | |
| "grad_norm": 0.44321128726005554, | |
| "learning_rate": 0.00039854746651135696, | |
| "loss": 3.2846, | |
| "step": 57750 | |
| }, | |
| { | |
| "epoch": 16.82236842105263, | |
| "grad_norm": 0.3790134787559509, | |
| "learning_rate": 0.0003983727431566686, | |
| "loss": 3.287, | |
| "step": 57800 | |
| }, | |
| { | |
| "epoch": 16.836923614345597, | |
| "grad_norm": 0.3475116491317749, | |
| "learning_rate": 0.0003981980198019802, | |
| "loss": 3.2896, | |
| "step": 57850 | |
| }, | |
| { | |
| "epoch": 16.851478807638564, | |
| "grad_norm": 0.39978528022766113, | |
| "learning_rate": 0.00039802329644729174, | |
| "loss": 3.3023, | |
| "step": 57900 | |
| }, | |
| { | |
| "epoch": 16.86603400093153, | |
| "grad_norm": 0.3973531126976013, | |
| "learning_rate": 0.00039784857309260334, | |
| "loss": 3.291, | |
| "step": 57950 | |
| }, | |
| { | |
| "epoch": 16.8805891942245, | |
| "grad_norm": 0.3604554533958435, | |
| "learning_rate": 0.00039767384973791493, | |
| "loss": 3.3017, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 16.8805891942245, | |
| "eval_accuracy": 0.37261585577331147, | |
| "eval_loss": 3.537532329559326, | |
| "eval_runtime": 180.4183, | |
| "eval_samples_per_second": 92.291, | |
| "eval_steps_per_second": 5.77, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 16.895144387517465, | |
| "grad_norm": 0.3675045073032379, | |
| "learning_rate": 0.00039749912638322647, | |
| "loss": 3.3074, | |
| "step": 58050 | |
| }, | |
| { | |
| "epoch": 16.909699580810432, | |
| "grad_norm": 0.36799120903015137, | |
| "learning_rate": 0.0003973244030285381, | |
| "loss": 3.2985, | |
| "step": 58100 | |
| }, | |
| { | |
| "epoch": 16.9242547741034, | |
| "grad_norm": 0.3711433708667755, | |
| "learning_rate": 0.0003971496796738497, | |
| "loss": 3.2991, | |
| "step": 58150 | |
| }, | |
| { | |
| "epoch": 16.938809967396367, | |
| "grad_norm": 0.3584972620010376, | |
| "learning_rate": 0.0003969749563191613, | |
| "loss": 3.3057, | |
| "step": 58200 | |
| }, | |
| { | |
| "epoch": 16.953365160689334, | |
| "grad_norm": 0.358095645904541, | |
| "learning_rate": 0.00039680023296447285, | |
| "loss": 3.2921, | |
| "step": 58250 | |
| }, | |
| { | |
| "epoch": 16.9679203539823, | |
| "grad_norm": 0.35984092950820923, | |
| "learning_rate": 0.00039662550960978444, | |
| "loss": 3.2932, | |
| "step": 58300 | |
| }, | |
| { | |
| "epoch": 16.982475547275268, | |
| "grad_norm": 0.43100354075431824, | |
| "learning_rate": 0.00039645078625509604, | |
| "loss": 3.2988, | |
| "step": 58350 | |
| }, | |
| { | |
| "epoch": 16.997030740568235, | |
| "grad_norm": 0.3718823790550232, | |
| "learning_rate": 0.0003962760629004077, | |
| "loss": 3.2887, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 17.011353050768513, | |
| "grad_norm": 0.389464408159256, | |
| "learning_rate": 0.0003961013395457193, | |
| "loss": 3.216, | |
| "step": 58450 | |
| }, | |
| { | |
| "epoch": 17.02590824406148, | |
| "grad_norm": 0.37263309955596924, | |
| "learning_rate": 0.0003959266161910308, | |
| "loss": 3.1833, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 17.040463437354447, | |
| "grad_norm": 0.38926970958709717, | |
| "learning_rate": 0.0003957518928363424, | |
| "loss": 3.1972, | |
| "step": 58550 | |
| }, | |
| { | |
| "epoch": 17.055018630647414, | |
| "grad_norm": 0.3891792595386505, | |
| "learning_rate": 0.000395577169481654, | |
| "loss": 3.1941, | |
| "step": 58600 | |
| }, | |
| { | |
| "epoch": 17.06957382394038, | |
| "grad_norm": 0.36075663566589355, | |
| "learning_rate": 0.00039540244612696566, | |
| "loss": 3.2028, | |
| "step": 58650 | |
| }, | |
| { | |
| "epoch": 17.084129017233348, | |
| "grad_norm": 0.37760135531425476, | |
| "learning_rate": 0.0003952277227722772, | |
| "loss": 3.1921, | |
| "step": 58700 | |
| }, | |
| { | |
| "epoch": 17.098684210526315, | |
| "grad_norm": 0.3842307925224304, | |
| "learning_rate": 0.0003950529994175888, | |
| "loss": 3.2103, | |
| "step": 58750 | |
| }, | |
| { | |
| "epoch": 17.113239403819282, | |
| "grad_norm": 0.361923485994339, | |
| "learning_rate": 0.0003948782760629004, | |
| "loss": 3.2093, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 17.12779459711225, | |
| "grad_norm": 0.3622080981731415, | |
| "learning_rate": 0.0003947035527082119, | |
| "loss": 3.2077, | |
| "step": 58850 | |
| }, | |
| { | |
| "epoch": 17.142349790405216, | |
| "grad_norm": 0.39810413122177124, | |
| "learning_rate": 0.0003945288293535235, | |
| "loss": 3.2135, | |
| "step": 58900 | |
| }, | |
| { | |
| "epoch": 17.156904983698183, | |
| "grad_norm": 0.375042587518692, | |
| "learning_rate": 0.00039435410599883517, | |
| "loss": 3.2273, | |
| "step": 58950 | |
| }, | |
| { | |
| "epoch": 17.17146017699115, | |
| "grad_norm": 0.3546057343482971, | |
| "learning_rate": 0.00039417938264414676, | |
| "loss": 3.2229, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 17.17146017699115, | |
| "eval_accuracy": 0.37200060032947607, | |
| "eval_loss": 3.5523841381073, | |
| "eval_runtime": 180.411, | |
| "eval_samples_per_second": 92.295, | |
| "eval_steps_per_second": 5.77, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 17.186015370284117, | |
| "grad_norm": 0.3810058534145355, | |
| "learning_rate": 0.0003940046592894583, | |
| "loss": 3.2349, | |
| "step": 59050 | |
| }, | |
| { | |
| "epoch": 17.200570563577084, | |
| "grad_norm": 0.3859725594520569, | |
| "learning_rate": 0.0003938299359347699, | |
| "loss": 3.2389, | |
| "step": 59100 | |
| }, | |
| { | |
| "epoch": 17.21512575687005, | |
| "grad_norm": 0.36585602164268494, | |
| "learning_rate": 0.0003936552125800815, | |
| "loss": 3.2191, | |
| "step": 59150 | |
| }, | |
| { | |
| "epoch": 17.22968095016302, | |
| "grad_norm": 0.38412246108055115, | |
| "learning_rate": 0.00039348048922539314, | |
| "loss": 3.2309, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 17.244236143455986, | |
| "grad_norm": 0.3637344539165497, | |
| "learning_rate": 0.0003933057658707047, | |
| "loss": 3.2203, | |
| "step": 59250 | |
| }, | |
| { | |
| "epoch": 17.258791336748953, | |
| "grad_norm": 0.3747289776802063, | |
| "learning_rate": 0.0003931310425160163, | |
| "loss": 3.2285, | |
| "step": 59300 | |
| }, | |
| { | |
| "epoch": 17.27334653004192, | |
| "grad_norm": 0.41975367069244385, | |
| "learning_rate": 0.00039295631916132787, | |
| "loss": 3.2325, | |
| "step": 59350 | |
| }, | |
| { | |
| "epoch": 17.287901723334887, | |
| "grad_norm": 0.3893274962902069, | |
| "learning_rate": 0.00039278159580663946, | |
| "loss": 3.2415, | |
| "step": 59400 | |
| }, | |
| { | |
| "epoch": 17.302456916627854, | |
| "grad_norm": 0.3909914493560791, | |
| "learning_rate": 0.000392606872451951, | |
| "loss": 3.2576, | |
| "step": 59450 | |
| }, | |
| { | |
| "epoch": 17.31701210992082, | |
| "grad_norm": 0.37397870421409607, | |
| "learning_rate": 0.00039243214909726265, | |
| "loss": 3.2356, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 17.331567303213788, | |
| "grad_norm": 0.3928914964199066, | |
| "learning_rate": 0.00039225742574257425, | |
| "loss": 3.2407, | |
| "step": 59550 | |
| }, | |
| { | |
| "epoch": 17.346122496506755, | |
| "grad_norm": 0.4055655002593994, | |
| "learning_rate": 0.00039208270238788584, | |
| "loss": 3.2525, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 17.360677689799722, | |
| "grad_norm": 0.3902006149291992, | |
| "learning_rate": 0.0003919079790331974, | |
| "loss": 3.2507, | |
| "step": 59650 | |
| }, | |
| { | |
| "epoch": 17.37523288309269, | |
| "grad_norm": 0.36882686614990234, | |
| "learning_rate": 0.000391733255678509, | |
| "loss": 3.2437, | |
| "step": 59700 | |
| }, | |
| { | |
| "epoch": 17.389788076385653, | |
| "grad_norm": 0.3743095397949219, | |
| "learning_rate": 0.00039155853232382057, | |
| "loss": 3.2466, | |
| "step": 59750 | |
| }, | |
| { | |
| "epoch": 17.40434326967862, | |
| "grad_norm": 0.39880380034446716, | |
| "learning_rate": 0.0003913838089691322, | |
| "loss": 3.2458, | |
| "step": 59800 | |
| }, | |
| { | |
| "epoch": 17.418898462971587, | |
| "grad_norm": 0.3512974977493286, | |
| "learning_rate": 0.00039120908561444376, | |
| "loss": 3.2545, | |
| "step": 59850 | |
| }, | |
| { | |
| "epoch": 17.433453656264554, | |
| "grad_norm": 0.40003323554992676, | |
| "learning_rate": 0.00039103436225975535, | |
| "loss": 3.2494, | |
| "step": 59900 | |
| }, | |
| { | |
| "epoch": 17.44800884955752, | |
| "grad_norm": 0.3613320291042328, | |
| "learning_rate": 0.00039085963890506695, | |
| "loss": 3.2428, | |
| "step": 59950 | |
| }, | |
| { | |
| "epoch": 17.462564042850488, | |
| "grad_norm": 0.36803606152534485, | |
| "learning_rate": 0.0003906849155503785, | |
| "loss": 3.2657, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 17.462564042850488, | |
| "eval_accuracy": 0.3723788031982941, | |
| "eval_loss": 3.54555606842041, | |
| "eval_runtime": 180.5263, | |
| "eval_samples_per_second": 92.236, | |
| "eval_steps_per_second": 5.766, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 17.477119236143455, | |
| "grad_norm": 0.4228874444961548, | |
| "learning_rate": 0.00039051019219569014, | |
| "loss": 3.2501, | |
| "step": 60050 | |
| }, | |
| { | |
| "epoch": 17.491674429436422, | |
| "grad_norm": 0.3745982050895691, | |
| "learning_rate": 0.00039033546884100173, | |
| "loss": 3.2524, | |
| "step": 60100 | |
| }, | |
| { | |
| "epoch": 17.50622962272939, | |
| "grad_norm": 0.40704238414764404, | |
| "learning_rate": 0.0003901607454863133, | |
| "loss": 3.2597, | |
| "step": 60150 | |
| }, | |
| { | |
| "epoch": 17.520784816022356, | |
| "grad_norm": 0.36817634105682373, | |
| "learning_rate": 0.00038998602213162486, | |
| "loss": 3.2574, | |
| "step": 60200 | |
| }, | |
| { | |
| "epoch": 17.535340009315323, | |
| "grad_norm": 0.4311186373233795, | |
| "learning_rate": 0.00038981129877693646, | |
| "loss": 3.2757, | |
| "step": 60250 | |
| }, | |
| { | |
| "epoch": 17.54989520260829, | |
| "grad_norm": 0.365153431892395, | |
| "learning_rate": 0.00038963657542224805, | |
| "loss": 3.2598, | |
| "step": 60300 | |
| }, | |
| { | |
| "epoch": 17.564450395901257, | |
| "grad_norm": 0.3981127142906189, | |
| "learning_rate": 0.0003894618520675597, | |
| "loss": 3.2806, | |
| "step": 60350 | |
| }, | |
| { | |
| "epoch": 17.579005589194225, | |
| "grad_norm": 0.3725679814815521, | |
| "learning_rate": 0.00038928712871287124, | |
| "loss": 3.2657, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 17.59356078248719, | |
| "grad_norm": 0.3683021068572998, | |
| "learning_rate": 0.00038911240535818284, | |
| "loss": 3.2575, | |
| "step": 60450 | |
| }, | |
| { | |
| "epoch": 17.60811597578016, | |
| "grad_norm": 0.3987520635128021, | |
| "learning_rate": 0.00038893768200349443, | |
| "loss": 3.2652, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 17.622671169073126, | |
| "grad_norm": 0.4107326865196228, | |
| "learning_rate": 0.000388762958648806, | |
| "loss": 3.2641, | |
| "step": 60550 | |
| }, | |
| { | |
| "epoch": 17.637226362366093, | |
| "grad_norm": 0.40142160654067993, | |
| "learning_rate": 0.0003885882352941176, | |
| "loss": 3.2562, | |
| "step": 60600 | |
| }, | |
| { | |
| "epoch": 17.65178155565906, | |
| "grad_norm": 0.3741169571876526, | |
| "learning_rate": 0.0003884135119394292, | |
| "loss": 3.27, | |
| "step": 60650 | |
| }, | |
| { | |
| "epoch": 17.666336748952027, | |
| "grad_norm": 0.3738797903060913, | |
| "learning_rate": 0.0003882387885847408, | |
| "loss": 3.2616, | |
| "step": 60700 | |
| }, | |
| { | |
| "epoch": 17.680891942244994, | |
| "grad_norm": 0.3983347713947296, | |
| "learning_rate": 0.0003880640652300524, | |
| "loss": 3.2638, | |
| "step": 60750 | |
| }, | |
| { | |
| "epoch": 17.69544713553796, | |
| "grad_norm": 0.37645477056503296, | |
| "learning_rate": 0.00038788934187536394, | |
| "loss": 3.2885, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 17.710002328830928, | |
| "grad_norm": 0.3865315020084381, | |
| "learning_rate": 0.00038771461852067554, | |
| "loss": 3.283, | |
| "step": 60850 | |
| }, | |
| { | |
| "epoch": 17.724557522123895, | |
| "grad_norm": 0.3757742941379547, | |
| "learning_rate": 0.0003875398951659872, | |
| "loss": 3.2732, | |
| "step": 60900 | |
| }, | |
| { | |
| "epoch": 17.739112715416862, | |
| "grad_norm": 0.38467729091644287, | |
| "learning_rate": 0.0003873651718112988, | |
| "loss": 3.2674, | |
| "step": 60950 | |
| }, | |
| { | |
| "epoch": 17.75366790870983, | |
| "grad_norm": 0.39061835408210754, | |
| "learning_rate": 0.0003871904484566103, | |
| "loss": 3.2926, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 17.75366790870983, | |
| "eval_accuracy": 0.3727806290555, | |
| "eval_loss": 3.5415282249450684, | |
| "eval_runtime": 180.3107, | |
| "eval_samples_per_second": 92.346, | |
| "eval_steps_per_second": 5.773, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 17.768223102002796, | |
| "grad_norm": 0.36113840341567993, | |
| "learning_rate": 0.0003870157251019219, | |
| "loss": 3.2734, | |
| "step": 61050 | |
| }, | |
| { | |
| "epoch": 17.78277829529576, | |
| "grad_norm": 0.39122429490089417, | |
| "learning_rate": 0.0003868410017472335, | |
| "loss": 3.2763, | |
| "step": 61100 | |
| }, | |
| { | |
| "epoch": 17.797333488588727, | |
| "grad_norm": 0.3742472529411316, | |
| "learning_rate": 0.00038666627839254505, | |
| "loss": 3.2633, | |
| "step": 61150 | |
| }, | |
| { | |
| "epoch": 17.811888681881694, | |
| "grad_norm": 0.3841819763183594, | |
| "learning_rate": 0.0003864915550378567, | |
| "loss": 3.2746, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 17.82644387517466, | |
| "grad_norm": 0.38105177879333496, | |
| "learning_rate": 0.0003863168316831683, | |
| "loss": 3.2821, | |
| "step": 61250 | |
| }, | |
| { | |
| "epoch": 17.840999068467628, | |
| "grad_norm": 0.37769728899002075, | |
| "learning_rate": 0.0003861421083284799, | |
| "loss": 3.2801, | |
| "step": 61300 | |
| }, | |
| { | |
| "epoch": 17.855554261760595, | |
| "grad_norm": 0.37277543544769287, | |
| "learning_rate": 0.0003859673849737914, | |
| "loss": 3.279, | |
| "step": 61350 | |
| }, | |
| { | |
| "epoch": 17.870109455053562, | |
| "grad_norm": 0.386414110660553, | |
| "learning_rate": 0.000385792661619103, | |
| "loss": 3.276, | |
| "step": 61400 | |
| }, | |
| { | |
| "epoch": 17.88466464834653, | |
| "grad_norm": 0.38384565711021423, | |
| "learning_rate": 0.00038561793826441467, | |
| "loss": 3.2809, | |
| "step": 61450 | |
| }, | |
| { | |
| "epoch": 17.899219841639496, | |
| "grad_norm": 0.38427191972732544, | |
| "learning_rate": 0.00038544321490972626, | |
| "loss": 3.3048, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 17.913775034932463, | |
| "grad_norm": 0.38702264428138733, | |
| "learning_rate": 0.0003852684915550378, | |
| "loss": 3.2855, | |
| "step": 61550 | |
| }, | |
| { | |
| "epoch": 17.92833022822543, | |
| "grad_norm": 0.3761012554168701, | |
| "learning_rate": 0.0003850937682003494, | |
| "loss": 3.2923, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 17.942885421518397, | |
| "grad_norm": 0.3897514343261719, | |
| "learning_rate": 0.000384919044845661, | |
| "loss": 3.2822, | |
| "step": 61650 | |
| }, | |
| { | |
| "epoch": 17.957440614811365, | |
| "grad_norm": 0.37470051646232605, | |
| "learning_rate": 0.0003847443214909726, | |
| "loss": 3.294, | |
| "step": 61700 | |
| }, | |
| { | |
| "epoch": 17.97199580810433, | |
| "grad_norm": 0.36596065759658813, | |
| "learning_rate": 0.00038456959813628423, | |
| "loss": 3.2923, | |
| "step": 61750 | |
| }, | |
| { | |
| "epoch": 17.9865510013973, | |
| "grad_norm": 0.3746185600757599, | |
| "learning_rate": 0.0003843948747815958, | |
| "loss": 3.28, | |
| "step": 61800 | |
| }, | |
| { | |
| "epoch": 18.000873311597577, | |
| "grad_norm": 0.40825390815734863, | |
| "learning_rate": 0.00038422015142690737, | |
| "loss": 3.2709, | |
| "step": 61850 | |
| }, | |
| { | |
| "epoch": 18.015428504890544, | |
| "grad_norm": 0.41796135902404785, | |
| "learning_rate": 0.00038404542807221896, | |
| "loss": 3.1633, | |
| "step": 61900 | |
| }, | |
| { | |
| "epoch": 18.02998369818351, | |
| "grad_norm": 0.35944128036499023, | |
| "learning_rate": 0.0003838707047175305, | |
| "loss": 3.1833, | |
| "step": 61950 | |
| }, | |
| { | |
| "epoch": 18.044538891476478, | |
| "grad_norm": 0.3860076069831848, | |
| "learning_rate": 0.00038369598136284215, | |
| "loss": 3.194, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 18.044538891476478, | |
| "eval_accuracy": 0.372189114127358, | |
| "eval_loss": 3.5531320571899414, | |
| "eval_runtime": 180.5352, | |
| "eval_samples_per_second": 92.231, | |
| "eval_steps_per_second": 5.766, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 18.059094084769445, | |
| "grad_norm": 0.35413676500320435, | |
| "learning_rate": 0.00038352125800815374, | |
| "loss": 3.1958, | |
| "step": 62050 | |
| }, | |
| { | |
| "epoch": 18.073649278062412, | |
| "grad_norm": 0.3618215024471283, | |
| "learning_rate": 0.00038334653465346534, | |
| "loss": 3.197, | |
| "step": 62100 | |
| }, | |
| { | |
| "epoch": 18.08820447135538, | |
| "grad_norm": 0.36995649337768555, | |
| "learning_rate": 0.0003831718112987769, | |
| "loss": 3.1877, | |
| "step": 62150 | |
| }, | |
| { | |
| "epoch": 18.102759664648346, | |
| "grad_norm": 0.41980400681495667, | |
| "learning_rate": 0.0003829970879440885, | |
| "loss": 3.1948, | |
| "step": 62200 | |
| }, | |
| { | |
| "epoch": 18.117314857941313, | |
| "grad_norm": 0.3778184950351715, | |
| "learning_rate": 0.00038282236458940007, | |
| "loss": 3.2029, | |
| "step": 62250 | |
| }, | |
| { | |
| "epoch": 18.13187005123428, | |
| "grad_norm": 0.3612458109855652, | |
| "learning_rate": 0.0003826476412347117, | |
| "loss": 3.1978, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 18.146425244527247, | |
| "grad_norm": 0.3796541690826416, | |
| "learning_rate": 0.00038247291788002326, | |
| "loss": 3.208, | |
| "step": 62350 | |
| }, | |
| { | |
| "epoch": 18.160980437820214, | |
| "grad_norm": 0.36615505814552307, | |
| "learning_rate": 0.00038229819452533485, | |
| "loss": 3.209, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 18.17553563111318, | |
| "grad_norm": 0.37897372245788574, | |
| "learning_rate": 0.00038212347117064644, | |
| "loss": 3.2022, | |
| "step": 62450 | |
| }, | |
| { | |
| "epoch": 18.19009082440615, | |
| "grad_norm": 0.37915629148483276, | |
| "learning_rate": 0.000381948747815958, | |
| "loss": 3.2272, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 18.204646017699115, | |
| "grad_norm": 0.3873524069786072, | |
| "learning_rate": 0.0003817740244612696, | |
| "loss": 3.214, | |
| "step": 62550 | |
| }, | |
| { | |
| "epoch": 18.219201210992082, | |
| "grad_norm": 0.3985103666782379, | |
| "learning_rate": 0.00038159930110658123, | |
| "loss": 3.2265, | |
| "step": 62600 | |
| }, | |
| { | |
| "epoch": 18.23375640428505, | |
| "grad_norm": 0.3888337314128876, | |
| "learning_rate": 0.0003814245777518928, | |
| "loss": 3.2214, | |
| "step": 62650 | |
| }, | |
| { | |
| "epoch": 18.248311597578017, | |
| "grad_norm": 0.37603312730789185, | |
| "learning_rate": 0.0003812498543972044, | |
| "loss": 3.2231, | |
| "step": 62700 | |
| }, | |
| { | |
| "epoch": 18.262866790870984, | |
| "grad_norm": 0.4270715117454529, | |
| "learning_rate": 0.00038107513104251596, | |
| "loss": 3.2239, | |
| "step": 62750 | |
| }, | |
| { | |
| "epoch": 18.27742198416395, | |
| "grad_norm": 0.3634800910949707, | |
| "learning_rate": 0.00038090040768782755, | |
| "loss": 3.2238, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 18.291977177456918, | |
| "grad_norm": 0.36877721548080444, | |
| "learning_rate": 0.0003807256843331392, | |
| "loss": 3.2307, | |
| "step": 62850 | |
| }, | |
| { | |
| "epoch": 18.306532370749885, | |
| "grad_norm": 0.39757972955703735, | |
| "learning_rate": 0.0003805509609784508, | |
| "loss": 3.2215, | |
| "step": 62900 | |
| }, | |
| { | |
| "epoch": 18.321087564042852, | |
| "grad_norm": 0.3862345814704895, | |
| "learning_rate": 0.00038037623762376233, | |
| "loss": 3.2337, | |
| "step": 62950 | |
| }, | |
| { | |
| "epoch": 18.33564275733582, | |
| "grad_norm": 0.41595593094825745, | |
| "learning_rate": 0.00038020151426907393, | |
| "loss": 3.2321, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 18.33564275733582, | |
| "eval_accuracy": 0.3725264174938924, | |
| "eval_loss": 3.5496232509613037, | |
| "eval_runtime": 180.5727, | |
| "eval_samples_per_second": 92.212, | |
| "eval_steps_per_second": 5.765, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 18.350197950628786, | |
| "grad_norm": 0.3716392517089844, | |
| "learning_rate": 0.0003800267909143855, | |
| "loss": 3.2375, | |
| "step": 63050 | |
| }, | |
| { | |
| "epoch": 18.364753143921753, | |
| "grad_norm": 0.36962705850601196, | |
| "learning_rate": 0.00037985206755969706, | |
| "loss": 3.243, | |
| "step": 63100 | |
| }, | |
| { | |
| "epoch": 18.379308337214717, | |
| "grad_norm": 0.40529200434684753, | |
| "learning_rate": 0.0003796773442050087, | |
| "loss": 3.2319, | |
| "step": 63150 | |
| }, | |
| { | |
| "epoch": 18.393863530507684, | |
| "grad_norm": 0.38823139667510986, | |
| "learning_rate": 0.0003795026208503203, | |
| "loss": 3.2433, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 18.40841872380065, | |
| "grad_norm": 0.3592684268951416, | |
| "learning_rate": 0.0003793278974956319, | |
| "loss": 3.2456, | |
| "step": 63250 | |
| }, | |
| { | |
| "epoch": 18.422973917093618, | |
| "grad_norm": 0.40624964237213135, | |
| "learning_rate": 0.00037915317414094344, | |
| "loss": 3.2503, | |
| "step": 63300 | |
| }, | |
| { | |
| "epoch": 18.437529110386585, | |
| "grad_norm": 0.37471804022789, | |
| "learning_rate": 0.00037897845078625503, | |
| "loss": 3.2545, | |
| "step": 63350 | |
| }, | |
| { | |
| "epoch": 18.452084303679552, | |
| "grad_norm": 0.36707061529159546, | |
| "learning_rate": 0.0003788037274315667, | |
| "loss": 3.2492, | |
| "step": 63400 | |
| }, | |
| { | |
| "epoch": 18.46663949697252, | |
| "grad_norm": 0.36586758494377136, | |
| "learning_rate": 0.0003786290040768783, | |
| "loss": 3.2558, | |
| "step": 63450 | |
| }, | |
| { | |
| "epoch": 18.481194690265486, | |
| "grad_norm": 0.42799147963523865, | |
| "learning_rate": 0.0003784542807221898, | |
| "loss": 3.2467, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 18.495749883558453, | |
| "grad_norm": 0.3685186505317688, | |
| "learning_rate": 0.0003782795573675014, | |
| "loss": 3.2462, | |
| "step": 63550 | |
| }, | |
| { | |
| "epoch": 18.51030507685142, | |
| "grad_norm": 0.3876000642776489, | |
| "learning_rate": 0.000378104834012813, | |
| "loss": 3.2543, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 18.524860270144387, | |
| "grad_norm": 0.36416545510292053, | |
| "learning_rate": 0.0003779301106581246, | |
| "loss": 3.2457, | |
| "step": 63650 | |
| }, | |
| { | |
| "epoch": 18.539415463437354, | |
| "grad_norm": 0.3986048698425293, | |
| "learning_rate": 0.0003777553873034362, | |
| "loss": 3.2476, | |
| "step": 63700 | |
| }, | |
| { | |
| "epoch": 18.55397065673032, | |
| "grad_norm": 0.37402477860450745, | |
| "learning_rate": 0.0003775806639487478, | |
| "loss": 3.2524, | |
| "step": 63750 | |
| }, | |
| { | |
| "epoch": 18.56852585002329, | |
| "grad_norm": 0.38148608803749084, | |
| "learning_rate": 0.0003774059405940594, | |
| "loss": 3.25, | |
| "step": 63800 | |
| }, | |
| { | |
| "epoch": 18.583081043316255, | |
| "grad_norm": 0.3836163282394409, | |
| "learning_rate": 0.000377231217239371, | |
| "loss": 3.2399, | |
| "step": 63850 | |
| }, | |
| { | |
| "epoch": 18.597636236609222, | |
| "grad_norm": 0.41393813490867615, | |
| "learning_rate": 0.0003770564938846825, | |
| "loss": 3.2553, | |
| "step": 63900 | |
| }, | |
| { | |
| "epoch": 18.61219142990219, | |
| "grad_norm": 0.3970487713813782, | |
| "learning_rate": 0.00037688177052999416, | |
| "loss": 3.2626, | |
| "step": 63950 | |
| }, | |
| { | |
| "epoch": 18.626746623195157, | |
| "grad_norm": 0.3820064067840576, | |
| "learning_rate": 0.00037670704717530576, | |
| "loss": 3.2589, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 18.626746623195157, | |
| "eval_accuracy": 0.37256578914120564, | |
| "eval_loss": 3.5414891242980957, | |
| "eval_runtime": 180.6412, | |
| "eval_samples_per_second": 92.177, | |
| "eval_steps_per_second": 5.763, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 18.641301816488124, | |
| "grad_norm": 0.36854344606399536, | |
| "learning_rate": 0.00037653232382061735, | |
| "loss": 3.2543, | |
| "step": 64050 | |
| }, | |
| { | |
| "epoch": 18.65585700978109, | |
| "grad_norm": 0.3917076587677002, | |
| "learning_rate": 0.0003763576004659289, | |
| "loss": 3.2512, | |
| "step": 64100 | |
| }, | |
| { | |
| "epoch": 18.670412203074058, | |
| "grad_norm": 0.3761797845363617, | |
| "learning_rate": 0.0003761828771112405, | |
| "loss": 3.2481, | |
| "step": 64150 | |
| }, | |
| { | |
| "epoch": 18.684967396367025, | |
| "grad_norm": 0.42972832918167114, | |
| "learning_rate": 0.0003760081537565521, | |
| "loss": 3.2551, | |
| "step": 64200 | |
| }, | |
| { | |
| "epoch": 18.699522589659992, | |
| "grad_norm": 0.3965604603290558, | |
| "learning_rate": 0.00037583343040186373, | |
| "loss": 3.2556, | |
| "step": 64250 | |
| }, | |
| { | |
| "epoch": 18.71407778295296, | |
| "grad_norm": 0.3881710171699524, | |
| "learning_rate": 0.00037565870704717527, | |
| "loss": 3.2619, | |
| "step": 64300 | |
| }, | |
| { | |
| "epoch": 18.728632976245926, | |
| "grad_norm": 0.3977341651916504, | |
| "learning_rate": 0.00037548398369248687, | |
| "loss": 3.2409, | |
| "step": 64350 | |
| }, | |
| { | |
| "epoch": 18.743188169538893, | |
| "grad_norm": 0.4114339053630829, | |
| "learning_rate": 0.00037530926033779846, | |
| "loss": 3.2703, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 18.75774336283186, | |
| "grad_norm": 0.380398690700531, | |
| "learning_rate": 0.00037513453698311, | |
| "loss": 3.2738, | |
| "step": 64450 | |
| }, | |
| { | |
| "epoch": 18.772298556124824, | |
| "grad_norm": 0.38762468099594116, | |
| "learning_rate": 0.0003749598136284216, | |
| "loss": 3.2542, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 18.78685374941779, | |
| "grad_norm": 0.36196255683898926, | |
| "learning_rate": 0.00037478509027373324, | |
| "loss": 3.2516, | |
| "step": 64550 | |
| }, | |
| { | |
| "epoch": 18.801408942710758, | |
| "grad_norm": 0.3822077810764313, | |
| "learning_rate": 0.00037461036691904484, | |
| "loss": 3.2662, | |
| "step": 64600 | |
| }, | |
| { | |
| "epoch": 18.815964136003725, | |
| "grad_norm": 0.36449962854385376, | |
| "learning_rate": 0.0003744356435643564, | |
| "loss": 3.2743, | |
| "step": 64650 | |
| }, | |
| { | |
| "epoch": 18.830519329296692, | |
| "grad_norm": 0.3605346083641052, | |
| "learning_rate": 0.00037426092020966797, | |
| "loss": 3.2658, | |
| "step": 64700 | |
| }, | |
| { | |
| "epoch": 18.84507452258966, | |
| "grad_norm": 0.413309782743454, | |
| "learning_rate": 0.00037408619685497957, | |
| "loss": 3.2666, | |
| "step": 64750 | |
| }, | |
| { | |
| "epoch": 18.859629715882626, | |
| "grad_norm": 0.3782139718532562, | |
| "learning_rate": 0.0003739114735002912, | |
| "loss": 3.2715, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 18.874184909175593, | |
| "grad_norm": 0.3591192066669464, | |
| "learning_rate": 0.0003737367501456028, | |
| "loss": 3.2702, | |
| "step": 64850 | |
| }, | |
| { | |
| "epoch": 18.88874010246856, | |
| "grad_norm": 0.36408689618110657, | |
| "learning_rate": 0.00037356202679091435, | |
| "loss": 3.2646, | |
| "step": 64900 | |
| }, | |
| { | |
| "epoch": 18.903295295761527, | |
| "grad_norm": 0.38768094778060913, | |
| "learning_rate": 0.00037338730343622594, | |
| "loss": 3.2723, | |
| "step": 64950 | |
| }, | |
| { | |
| "epoch": 18.917850489054494, | |
| "grad_norm": 0.366207480430603, | |
| "learning_rate": 0.00037321258008153754, | |
| "loss": 3.2746, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 18.917850489054494, | |
| "eval_accuracy": 0.37334252710267807, | |
| "eval_loss": 3.5348284244537354, | |
| "eval_runtime": 180.2751, | |
| "eval_samples_per_second": 92.364, | |
| "eval_steps_per_second": 5.775, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 18.93240568234746, | |
| "grad_norm": 0.37537604570388794, | |
| "learning_rate": 0.0003730378567268491, | |
| "loss": 3.2719, | |
| "step": 65050 | |
| }, | |
| { | |
| "epoch": 18.94696087564043, | |
| "grad_norm": 0.4338463842868805, | |
| "learning_rate": 0.0003728631333721607, | |
| "loss": 3.2609, | |
| "step": 65100 | |
| }, | |
| { | |
| "epoch": 18.961516068933395, | |
| "grad_norm": 0.3860550820827484, | |
| "learning_rate": 0.0003726884100174723, | |
| "loss": 3.2704, | |
| "step": 65150 | |
| }, | |
| { | |
| "epoch": 18.976071262226363, | |
| "grad_norm": 0.3855957090854645, | |
| "learning_rate": 0.0003725136866627839, | |
| "loss": 3.2737, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 18.99062645551933, | |
| "grad_norm": 0.3790414035320282, | |
| "learning_rate": 0.00037233896330809545, | |
| "loss": 3.2691, | |
| "step": 65250 | |
| }, | |
| { | |
| "epoch": 19.004948765719607, | |
| "grad_norm": 0.37143030762672424, | |
| "learning_rate": 0.00037216423995340705, | |
| "loss": 3.2415, | |
| "step": 65300 | |
| }, | |
| { | |
| "epoch": 19.019503959012575, | |
| "grad_norm": 0.36700376868247986, | |
| "learning_rate": 0.0003719895165987187, | |
| "loss": 3.1704, | |
| "step": 65350 | |
| }, | |
| { | |
| "epoch": 19.03405915230554, | |
| "grad_norm": 0.3844224512577057, | |
| "learning_rate": 0.0003718147932440303, | |
| "loss": 3.1844, | |
| "step": 65400 | |
| }, | |
| { | |
| "epoch": 19.04861434559851, | |
| "grad_norm": 0.391570508480072, | |
| "learning_rate": 0.00037164006988934183, | |
| "loss": 3.1695, | |
| "step": 65450 | |
| }, | |
| { | |
| "epoch": 19.063169538891476, | |
| "grad_norm": 0.39597800374031067, | |
| "learning_rate": 0.0003714653465346534, | |
| "loss": 3.1888, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 19.077724732184443, | |
| "grad_norm": 0.37883609533309937, | |
| "learning_rate": 0.000371290623179965, | |
| "loss": 3.1828, | |
| "step": 65550 | |
| }, | |
| { | |
| "epoch": 19.09227992547741, | |
| "grad_norm": 0.39523887634277344, | |
| "learning_rate": 0.00037111589982527656, | |
| "loss": 3.1738, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 19.106835118770377, | |
| "grad_norm": 0.3817376494407654, | |
| "learning_rate": 0.0003709411764705882, | |
| "loss": 3.181, | |
| "step": 65650 | |
| }, | |
| { | |
| "epoch": 19.121390312063344, | |
| "grad_norm": 0.41909563541412354, | |
| "learning_rate": 0.0003707664531158998, | |
| "loss": 3.1926, | |
| "step": 65700 | |
| }, | |
| { | |
| "epoch": 19.13594550535631, | |
| "grad_norm": 0.3599540889263153, | |
| "learning_rate": 0.0003705917297612114, | |
| "loss": 3.1911, | |
| "step": 65750 | |
| }, | |
| { | |
| "epoch": 19.150500698649278, | |
| "grad_norm": 0.3863822817802429, | |
| "learning_rate": 0.000370417006406523, | |
| "loss": 3.1871, | |
| "step": 65800 | |
| }, | |
| { | |
| "epoch": 19.165055891942245, | |
| "grad_norm": 0.39603495597839355, | |
| "learning_rate": 0.00037024228305183453, | |
| "loss": 3.1962, | |
| "step": 65850 | |
| }, | |
| { | |
| "epoch": 19.179611085235212, | |
| "grad_norm": 0.41880905628204346, | |
| "learning_rate": 0.0003700675596971461, | |
| "loss": 3.2061, | |
| "step": 65900 | |
| }, | |
| { | |
| "epoch": 19.19416627852818, | |
| "grad_norm": 0.3970293402671814, | |
| "learning_rate": 0.0003698928363424578, | |
| "loss": 3.2036, | |
| "step": 65950 | |
| }, | |
| { | |
| "epoch": 19.208721471821146, | |
| "grad_norm": 0.3847731351852417, | |
| "learning_rate": 0.00036971811298776937, | |
| "loss": 3.2104, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 19.208721471821146, | |
| "eval_accuracy": 0.37257848209019023, | |
| "eval_loss": 3.5511715412139893, | |
| "eval_runtime": 180.6703, | |
| "eval_samples_per_second": 92.162, | |
| "eval_steps_per_second": 5.762, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 19.223276665114113, | |
| "grad_norm": 0.40739133954048157, | |
| "learning_rate": 0.0003695433896330809, | |
| "loss": 3.2091, | |
| "step": 66050 | |
| }, | |
| { | |
| "epoch": 19.23783185840708, | |
| "grad_norm": 0.397180438041687, | |
| "learning_rate": 0.0003693686662783925, | |
| "loss": 3.2078, | |
| "step": 66100 | |
| }, | |
| { | |
| "epoch": 19.252387051700047, | |
| "grad_norm": 0.4168866276741028, | |
| "learning_rate": 0.0003691939429237041, | |
| "loss": 3.2079, | |
| "step": 66150 | |
| }, | |
| { | |
| "epoch": 19.266942244993015, | |
| "grad_norm": 0.3613574504852295, | |
| "learning_rate": 0.00036901921956901575, | |
| "loss": 3.2198, | |
| "step": 66200 | |
| }, | |
| { | |
| "epoch": 19.28149743828598, | |
| "grad_norm": 0.3952755928039551, | |
| "learning_rate": 0.0003688444962143273, | |
| "loss": 3.2073, | |
| "step": 66250 | |
| }, | |
| { | |
| "epoch": 19.29605263157895, | |
| "grad_norm": 0.4050266146659851, | |
| "learning_rate": 0.0003686697728596389, | |
| "loss": 3.2214, | |
| "step": 66300 | |
| }, | |
| { | |
| "epoch": 19.310607824871916, | |
| "grad_norm": 0.4253535568714142, | |
| "learning_rate": 0.0003684950495049505, | |
| "loss": 3.2182, | |
| "step": 66350 | |
| }, | |
| { | |
| "epoch": 19.325163018164883, | |
| "grad_norm": 0.38188880681991577, | |
| "learning_rate": 0.000368320326150262, | |
| "loss": 3.2137, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 19.33971821145785, | |
| "grad_norm": 0.3907614052295685, | |
| "learning_rate": 0.0003681456027955736, | |
| "loss": 3.2192, | |
| "step": 66450 | |
| }, | |
| { | |
| "epoch": 19.354273404750813, | |
| "grad_norm": 0.40632307529449463, | |
| "learning_rate": 0.00036797087944088526, | |
| "loss": 3.2203, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 19.36882859804378, | |
| "grad_norm": 0.36712852120399475, | |
| "learning_rate": 0.00036779615608619685, | |
| "loss": 3.2288, | |
| "step": 66550 | |
| }, | |
| { | |
| "epoch": 19.383383791336747, | |
| "grad_norm": 0.4290497303009033, | |
| "learning_rate": 0.0003676214327315084, | |
| "loss": 3.2281, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 19.397938984629715, | |
| "grad_norm": 0.3995981216430664, | |
| "learning_rate": 0.00036744670937682, | |
| "loss": 3.2251, | |
| "step": 66650 | |
| }, | |
| { | |
| "epoch": 19.41249417792268, | |
| "grad_norm": 0.3878142535686493, | |
| "learning_rate": 0.0003672719860221316, | |
| "loss": 3.2367, | |
| "step": 66700 | |
| }, | |
| { | |
| "epoch": 19.42704937121565, | |
| "grad_norm": 0.3855849802494049, | |
| "learning_rate": 0.00036709726266744323, | |
| "loss": 3.2323, | |
| "step": 66750 | |
| }, | |
| { | |
| "epoch": 19.441604564508616, | |
| "grad_norm": 0.39863401651382446, | |
| "learning_rate": 0.00036692253931275477, | |
| "loss": 3.214, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 19.456159757801583, | |
| "grad_norm": 0.4260837733745575, | |
| "learning_rate": 0.00036674781595806636, | |
| "loss": 3.2303, | |
| "step": 66850 | |
| }, | |
| { | |
| "epoch": 19.47071495109455, | |
| "grad_norm": 0.37935662269592285, | |
| "learning_rate": 0.00036657309260337796, | |
| "loss": 3.2477, | |
| "step": 66900 | |
| }, | |
| { | |
| "epoch": 19.485270144387517, | |
| "grad_norm": 0.3697074353694916, | |
| "learning_rate": 0.00036639836924868955, | |
| "loss": 3.2403, | |
| "step": 66950 | |
| }, | |
| { | |
| "epoch": 19.499825337680484, | |
| "grad_norm": 0.3846164345741272, | |
| "learning_rate": 0.0003662236458940011, | |
| "loss": 3.2527, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 19.499825337680484, | |
| "eval_accuracy": 0.37289122224989335, | |
| "eval_loss": 3.543311834335327, | |
| "eval_runtime": 180.5734, | |
| "eval_samples_per_second": 92.212, | |
| "eval_steps_per_second": 5.765, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 19.51438053097345, | |
| "grad_norm": 0.39751338958740234, | |
| "learning_rate": 0.00036604892253931274, | |
| "loss": 3.2511, | |
| "step": 67050 | |
| }, | |
| { | |
| "epoch": 19.528935724266418, | |
| "grad_norm": 0.38897642493247986, | |
| "learning_rate": 0.00036587419918462433, | |
| "loss": 3.2445, | |
| "step": 67100 | |
| }, | |
| { | |
| "epoch": 19.543490917559385, | |
| "grad_norm": 0.366291880607605, | |
| "learning_rate": 0.00036569947582993593, | |
| "loss": 3.2455, | |
| "step": 67150 | |
| }, | |
| { | |
| "epoch": 19.558046110852352, | |
| "grad_norm": 0.40584415197372437, | |
| "learning_rate": 0.00036552475247524747, | |
| "loss": 3.245, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 19.57260130414532, | |
| "grad_norm": 0.3818405866622925, | |
| "learning_rate": 0.00036535002912055906, | |
| "loss": 3.2362, | |
| "step": 67250 | |
| }, | |
| { | |
| "epoch": 19.587156497438286, | |
| "grad_norm": 0.4180438816547394, | |
| "learning_rate": 0.00036517530576587066, | |
| "loss": 3.2431, | |
| "step": 67300 | |
| }, | |
| { | |
| "epoch": 19.601711690731253, | |
| "grad_norm": 0.39405733346939087, | |
| "learning_rate": 0.0003650005824111823, | |
| "loss": 3.2482, | |
| "step": 67350 | |
| }, | |
| { | |
| "epoch": 19.61626688402422, | |
| "grad_norm": 0.372460275888443, | |
| "learning_rate": 0.00036482585905649385, | |
| "loss": 3.2496, | |
| "step": 67400 | |
| }, | |
| { | |
| "epoch": 19.630822077317188, | |
| "grad_norm": 0.3858887553215027, | |
| "learning_rate": 0.00036465113570180544, | |
| "loss": 3.2419, | |
| "step": 67450 | |
| }, | |
| { | |
| "epoch": 19.645377270610155, | |
| "grad_norm": 0.3806799352169037, | |
| "learning_rate": 0.00036447641234711703, | |
| "loss": 3.245, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 19.65993246390312, | |
| "grad_norm": 0.38931015133857727, | |
| "learning_rate": 0.0003643016889924286, | |
| "loss": 3.237, | |
| "step": 67550 | |
| }, | |
| { | |
| "epoch": 19.67448765719609, | |
| "grad_norm": 0.40190720558166504, | |
| "learning_rate": 0.0003641269656377402, | |
| "loss": 3.2539, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 19.689042850489056, | |
| "grad_norm": 0.3825318515300751, | |
| "learning_rate": 0.0003639522422830518, | |
| "loss": 3.2419, | |
| "step": 67650 | |
| }, | |
| { | |
| "epoch": 19.703598043782023, | |
| "grad_norm": 0.37726688385009766, | |
| "learning_rate": 0.0003637775189283634, | |
| "loss": 3.2529, | |
| "step": 67700 | |
| }, | |
| { | |
| "epoch": 19.71815323707499, | |
| "grad_norm": 0.3934818506240845, | |
| "learning_rate": 0.00036360279557367495, | |
| "loss": 3.2395, | |
| "step": 67750 | |
| }, | |
| { | |
| "epoch": 19.732708430367957, | |
| "grad_norm": 0.38725027441978455, | |
| "learning_rate": 0.00036342807221898655, | |
| "loss": 3.2444, | |
| "step": 67800 | |
| }, | |
| { | |
| "epoch": 19.74726362366092, | |
| "grad_norm": 0.40124940872192383, | |
| "learning_rate": 0.00036325334886429814, | |
| "loss": 3.2504, | |
| "step": 67850 | |
| }, | |
| { | |
| "epoch": 19.761818816953888, | |
| "grad_norm": 0.41127246618270874, | |
| "learning_rate": 0.0003630786255096098, | |
| "loss": 3.2467, | |
| "step": 67900 | |
| }, | |
| { | |
| "epoch": 19.776374010246855, | |
| "grad_norm": 0.3693263828754425, | |
| "learning_rate": 0.00036290390215492133, | |
| "loss": 3.2524, | |
| "step": 67950 | |
| }, | |
| { | |
| "epoch": 19.79092920353982, | |
| "grad_norm": 0.4074074923992157, | |
| "learning_rate": 0.0003627291788002329, | |
| "loss": 3.2465, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 19.79092920353982, | |
| "eval_accuracy": 0.373428792144851, | |
| "eval_loss": 3.5339646339416504, | |
| "eval_runtime": 180.6244, | |
| "eval_samples_per_second": 92.186, | |
| "eval_steps_per_second": 5.763, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 19.80548439683279, | |
| "grad_norm": 0.3627810478210449, | |
| "learning_rate": 0.0003625544554455445, | |
| "loss": 3.2512, | |
| "step": 68050 | |
| }, | |
| { | |
| "epoch": 19.820039590125756, | |
| "grad_norm": 0.39208662509918213, | |
| "learning_rate": 0.0003623797320908561, | |
| "loss": 3.2519, | |
| "step": 68100 | |
| }, | |
| { | |
| "epoch": 19.834594783418723, | |
| "grad_norm": 0.40004655718803406, | |
| "learning_rate": 0.00036220500873616776, | |
| "loss": 3.262, | |
| "step": 68150 | |
| }, | |
| { | |
| "epoch": 19.84914997671169, | |
| "grad_norm": 0.4301166534423828, | |
| "learning_rate": 0.0003620302853814793, | |
| "loss": 3.2553, | |
| "step": 68200 | |
| }, | |
| { | |
| "epoch": 19.863705170004657, | |
| "grad_norm": 0.37066367268562317, | |
| "learning_rate": 0.0003618555620267909, | |
| "loss": 3.2571, | |
| "step": 68250 | |
| }, | |
| { | |
| "epoch": 19.878260363297624, | |
| "grad_norm": 0.3695203363895416, | |
| "learning_rate": 0.0003616808386721025, | |
| "loss": 3.2485, | |
| "step": 68300 | |
| }, | |
| { | |
| "epoch": 19.89281555659059, | |
| "grad_norm": 0.39587000012397766, | |
| "learning_rate": 0.00036150611531741403, | |
| "loss": 3.2539, | |
| "step": 68350 | |
| }, | |
| { | |
| "epoch": 19.907370749883558, | |
| "grad_norm": 0.38911348581314087, | |
| "learning_rate": 0.0003613313919627256, | |
| "loss": 3.2587, | |
| "step": 68400 | |
| }, | |
| { | |
| "epoch": 19.921925943176525, | |
| "grad_norm": 0.40591132640838623, | |
| "learning_rate": 0.00036115666860803727, | |
| "loss": 3.2641, | |
| "step": 68450 | |
| }, | |
| { | |
| "epoch": 19.936481136469492, | |
| "grad_norm": 0.40508130192756653, | |
| "learning_rate": 0.00036098194525334887, | |
| "loss": 3.2675, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 19.95103632976246, | |
| "grad_norm": 0.38476642966270447, | |
| "learning_rate": 0.0003608072218986604, | |
| "loss": 3.2598, | |
| "step": 68550 | |
| }, | |
| { | |
| "epoch": 19.965591523055426, | |
| "grad_norm": 0.37651675939559937, | |
| "learning_rate": 0.000360632498543972, | |
| "loss": 3.2663, | |
| "step": 68600 | |
| }, | |
| { | |
| "epoch": 19.980146716348393, | |
| "grad_norm": 0.4098745286464691, | |
| "learning_rate": 0.0003604577751892836, | |
| "loss": 3.266, | |
| "step": 68650 | |
| }, | |
| { | |
| "epoch": 19.99470190964136, | |
| "grad_norm": 0.3791620135307312, | |
| "learning_rate": 0.00036028305183459513, | |
| "loss": 3.2638, | |
| "step": 68700 | |
| }, | |
| { | |
| "epoch": 20.00902421984164, | |
| "grad_norm": 0.4268386960029602, | |
| "learning_rate": 0.0003601083284799068, | |
| "loss": 3.1968, | |
| "step": 68750 | |
| }, | |
| { | |
| "epoch": 20.023579413134605, | |
| "grad_norm": 0.4134370982646942, | |
| "learning_rate": 0.0003599336051252184, | |
| "loss": 3.1531, | |
| "step": 68800 | |
| }, | |
| { | |
| "epoch": 20.038134606427572, | |
| "grad_norm": 0.3608231246471405, | |
| "learning_rate": 0.00035975888177052997, | |
| "loss": 3.16, | |
| "step": 68850 | |
| }, | |
| { | |
| "epoch": 20.05268979972054, | |
| "grad_norm": 0.39957451820373535, | |
| "learning_rate": 0.0003595841584158415, | |
| "loss": 3.1672, | |
| "step": 68900 | |
| }, | |
| { | |
| "epoch": 20.067244993013507, | |
| "grad_norm": 0.3998141288757324, | |
| "learning_rate": 0.0003594094350611531, | |
| "loss": 3.1773, | |
| "step": 68950 | |
| }, | |
| { | |
| "epoch": 20.081800186306474, | |
| "grad_norm": 0.43431949615478516, | |
| "learning_rate": 0.00035923471170646475, | |
| "loss": 3.1742, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 20.081800186306474, | |
| "eval_accuracy": 0.3728283451414976, | |
| "eval_loss": 3.5496437549591064, | |
| "eval_runtime": 180.5885, | |
| "eval_samples_per_second": 92.204, | |
| "eval_steps_per_second": 5.764, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 20.09635537959944, | |
| "grad_norm": 0.3777293264865875, | |
| "learning_rate": 0.00035905998835177635, | |
| "loss": 3.1662, | |
| "step": 69050 | |
| }, | |
| { | |
| "epoch": 20.110910572892408, | |
| "grad_norm": 0.4360584616661072, | |
| "learning_rate": 0.00035888526499708794, | |
| "loss": 3.1806, | |
| "step": 69100 | |
| }, | |
| { | |
| "epoch": 20.125465766185375, | |
| "grad_norm": 0.3937258720397949, | |
| "learning_rate": 0.0003587105416423995, | |
| "loss": 3.176, | |
| "step": 69150 | |
| }, | |
| { | |
| "epoch": 20.140020959478342, | |
| "grad_norm": 0.41217368841171265, | |
| "learning_rate": 0.0003585358182877111, | |
| "loss": 3.1826, | |
| "step": 69200 | |
| }, | |
| { | |
| "epoch": 20.15457615277131, | |
| "grad_norm": 0.38056591153144836, | |
| "learning_rate": 0.00035836109493302267, | |
| "loss": 3.18, | |
| "step": 69250 | |
| }, | |
| { | |
| "epoch": 20.169131346064276, | |
| "grad_norm": 0.3858278691768646, | |
| "learning_rate": 0.0003581863715783343, | |
| "loss": 3.1963, | |
| "step": 69300 | |
| }, | |
| { | |
| "epoch": 20.183686539357243, | |
| "grad_norm": 0.39919236302375793, | |
| "learning_rate": 0.00035801164822364586, | |
| "loss": 3.1843, | |
| "step": 69350 | |
| }, | |
| { | |
| "epoch": 20.19824173265021, | |
| "grad_norm": 0.4216569662094116, | |
| "learning_rate": 0.00035783692486895745, | |
| "loss": 3.1797, | |
| "step": 69400 | |
| }, | |
| { | |
| "epoch": 20.212796925943177, | |
| "grad_norm": 0.39037302136421204, | |
| "learning_rate": 0.00035766220151426905, | |
| "loss": 3.2063, | |
| "step": 69450 | |
| }, | |
| { | |
| "epoch": 20.227352119236144, | |
| "grad_norm": 0.3864704966545105, | |
| "learning_rate": 0.0003574874781595806, | |
| "loss": 3.2001, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 20.24190731252911, | |
| "grad_norm": 0.40833717584609985, | |
| "learning_rate": 0.00035731275480489224, | |
| "loss": 3.2078, | |
| "step": 69550 | |
| }, | |
| { | |
| "epoch": 20.25646250582208, | |
| "grad_norm": 0.395744264125824, | |
| "learning_rate": 0.00035713803145020383, | |
| "loss": 3.2051, | |
| "step": 69600 | |
| }, | |
| { | |
| "epoch": 20.271017699115045, | |
| "grad_norm": 0.4026401937007904, | |
| "learning_rate": 0.0003569633080955154, | |
| "loss": 3.202, | |
| "step": 69650 | |
| }, | |
| { | |
| "epoch": 20.285572892408013, | |
| "grad_norm": 0.3985231816768646, | |
| "learning_rate": 0.00035678858474082697, | |
| "loss": 3.206, | |
| "step": 69700 | |
| }, | |
| { | |
| "epoch": 20.30012808570098, | |
| "grad_norm": 0.39604729413986206, | |
| "learning_rate": 0.00035661386138613856, | |
| "loss": 3.2033, | |
| "step": 69750 | |
| }, | |
| { | |
| "epoch": 20.314683278993947, | |
| "grad_norm": 0.3661750257015228, | |
| "learning_rate": 0.00035643913803145015, | |
| "loss": 3.2097, | |
| "step": 69800 | |
| }, | |
| { | |
| "epoch": 20.32923847228691, | |
| "grad_norm": 0.3990136682987213, | |
| "learning_rate": 0.0003562644146767618, | |
| "loss": 3.2185, | |
| "step": 69850 | |
| }, | |
| { | |
| "epoch": 20.343793665579877, | |
| "grad_norm": 0.40821337699890137, | |
| "learning_rate": 0.00035608969132207334, | |
| "loss": 3.2124, | |
| "step": 69900 | |
| }, | |
| { | |
| "epoch": 20.358348858872844, | |
| "grad_norm": 0.407844603061676, | |
| "learning_rate": 0.00035591496796738494, | |
| "loss": 3.2121, | |
| "step": 69950 | |
| }, | |
| { | |
| "epoch": 20.37290405216581, | |
| "grad_norm": 0.3922916650772095, | |
| "learning_rate": 0.00035574024461269653, | |
| "loss": 3.2113, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 20.37290405216581, | |
| "eval_accuracy": 0.37301591872093626, | |
| "eval_loss": 3.5480892658233643, | |
| "eval_runtime": 180.4222, | |
| "eval_samples_per_second": 92.289, | |
| "eval_steps_per_second": 5.77, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 20.38745924545878, | |
| "grad_norm": 0.3855879008769989, | |
| "learning_rate": 0.0003555655212580081, | |
| "loss": 3.2217, | |
| "step": 70050 | |
| }, | |
| { | |
| "epoch": 20.402014438751745, | |
| "grad_norm": 0.3908325135707855, | |
| "learning_rate": 0.00035539079790331967, | |
| "loss": 3.226, | |
| "step": 70100 | |
| }, | |
| { | |
| "epoch": 20.416569632044713, | |
| "grad_norm": 0.4145655632019043, | |
| "learning_rate": 0.0003552160745486313, | |
| "loss": 3.2296, | |
| "step": 70150 | |
| }, | |
| { | |
| "epoch": 20.43112482533768, | |
| "grad_norm": 0.40664735436439514, | |
| "learning_rate": 0.0003550413511939429, | |
| "loss": 3.2261, | |
| "step": 70200 | |
| }, | |
| { | |
| "epoch": 20.445680018630647, | |
| "grad_norm": 0.40211278200149536, | |
| "learning_rate": 0.0003548666278392545, | |
| "loss": 3.2121, | |
| "step": 70250 | |
| }, | |
| { | |
| "epoch": 20.460235211923614, | |
| "grad_norm": 0.4183812737464905, | |
| "learning_rate": 0.00035469190448456604, | |
| "loss": 3.2125, | |
| "step": 70300 | |
| }, | |
| { | |
| "epoch": 20.47479040521658, | |
| "grad_norm": 0.41541483998298645, | |
| "learning_rate": 0.00035451718112987764, | |
| "loss": 3.2285, | |
| "step": 70350 | |
| }, | |
| { | |
| "epoch": 20.489345598509548, | |
| "grad_norm": 0.39988651871681213, | |
| "learning_rate": 0.0003543424577751893, | |
| "loss": 3.2212, | |
| "step": 70400 | |
| }, | |
| { | |
| "epoch": 20.503900791802515, | |
| "grad_norm": 0.3851093649864197, | |
| "learning_rate": 0.0003541677344205009, | |
| "loss": 3.228, | |
| "step": 70450 | |
| }, | |
| { | |
| "epoch": 20.518455985095482, | |
| "grad_norm": 0.387295663356781, | |
| "learning_rate": 0.0003539930110658124, | |
| "loss": 3.2216, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 20.53301117838845, | |
| "grad_norm": 0.39036664366722107, | |
| "learning_rate": 0.000353818287711124, | |
| "loss": 3.233, | |
| "step": 70550 | |
| }, | |
| { | |
| "epoch": 20.547566371681416, | |
| "grad_norm": 0.4173073470592499, | |
| "learning_rate": 0.0003536435643564356, | |
| "loss": 3.2468, | |
| "step": 70600 | |
| }, | |
| { | |
| "epoch": 20.562121564974383, | |
| "grad_norm": 0.3900355398654938, | |
| "learning_rate": 0.00035346884100174715, | |
| "loss": 3.2312, | |
| "step": 70650 | |
| }, | |
| { | |
| "epoch": 20.57667675826735, | |
| "grad_norm": 0.42239484190940857, | |
| "learning_rate": 0.0003532941176470588, | |
| "loss": 3.2336, | |
| "step": 70700 | |
| }, | |
| { | |
| "epoch": 20.591231951560317, | |
| "grad_norm": 0.39398714900016785, | |
| "learning_rate": 0.0003531193942923704, | |
| "loss": 3.2338, | |
| "step": 70750 | |
| }, | |
| { | |
| "epoch": 20.605787144853284, | |
| "grad_norm": 0.388174831867218, | |
| "learning_rate": 0.000352944670937682, | |
| "loss": 3.227, | |
| "step": 70800 | |
| }, | |
| { | |
| "epoch": 20.62034233814625, | |
| "grad_norm": 0.3739430010318756, | |
| "learning_rate": 0.0003527699475829935, | |
| "loss": 3.2401, | |
| "step": 70850 | |
| }, | |
| { | |
| "epoch": 20.63489753143922, | |
| "grad_norm": 0.3884713053703308, | |
| "learning_rate": 0.0003525952242283051, | |
| "loss": 3.2238, | |
| "step": 70900 | |
| }, | |
| { | |
| "epoch": 20.649452724732186, | |
| "grad_norm": 0.3802180886268616, | |
| "learning_rate": 0.00035242050087361677, | |
| "loss": 3.2353, | |
| "step": 70950 | |
| }, | |
| { | |
| "epoch": 20.664007918025153, | |
| "grad_norm": 0.38343745470046997, | |
| "learning_rate": 0.00035224577751892836, | |
| "loss": 3.2383, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 20.664007918025153, | |
| "eval_accuracy": 0.37359591597314784, | |
| "eval_loss": 3.5364091396331787, | |
| "eval_runtime": 180.5917, | |
| "eval_samples_per_second": 92.202, | |
| "eval_steps_per_second": 5.764, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 20.67856311131812, | |
| "grad_norm": 0.3889595866203308, | |
| "learning_rate": 0.0003520710541642399, | |
| "loss": 3.2379, | |
| "step": 71050 | |
| }, | |
| { | |
| "epoch": 20.693118304611087, | |
| "grad_norm": 0.38171514868736267, | |
| "learning_rate": 0.0003518963308095515, | |
| "loss": 3.238, | |
| "step": 71100 | |
| }, | |
| { | |
| "epoch": 20.707673497904054, | |
| "grad_norm": 0.3786235451698303, | |
| "learning_rate": 0.0003517216074548631, | |
| "loss": 3.2378, | |
| "step": 71150 | |
| }, | |
| { | |
| "epoch": 20.722228691197017, | |
| "grad_norm": 0.3755587041378021, | |
| "learning_rate": 0.0003515468841001747, | |
| "loss": 3.2217, | |
| "step": 71200 | |
| }, | |
| { | |
| "epoch": 20.736783884489984, | |
| "grad_norm": 0.4026477038860321, | |
| "learning_rate": 0.00035137216074548634, | |
| "loss": 3.2482, | |
| "step": 71250 | |
| }, | |
| { | |
| "epoch": 20.75133907778295, | |
| "grad_norm": 0.40829044580459595, | |
| "learning_rate": 0.0003511974373907979, | |
| "loss": 3.2342, | |
| "step": 71300 | |
| }, | |
| { | |
| "epoch": 20.76589427107592, | |
| "grad_norm": 0.43004748225212097, | |
| "learning_rate": 0.00035102271403610947, | |
| "loss": 3.244, | |
| "step": 71350 | |
| }, | |
| { | |
| "epoch": 20.780449464368886, | |
| "grad_norm": 0.37790223956108093, | |
| "learning_rate": 0.00035084799068142106, | |
| "loss": 3.2372, | |
| "step": 71400 | |
| }, | |
| { | |
| "epoch": 20.795004657661853, | |
| "grad_norm": 0.43924814462661743, | |
| "learning_rate": 0.0003506732673267326, | |
| "loss": 3.2354, | |
| "step": 71450 | |
| }, | |
| { | |
| "epoch": 20.80955985095482, | |
| "grad_norm": 0.3851429522037506, | |
| "learning_rate": 0.00035049854397204425, | |
| "loss": 3.2343, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 20.824115044247787, | |
| "grad_norm": 0.3749452531337738, | |
| "learning_rate": 0.00035032382061735585, | |
| "loss": 3.261, | |
| "step": 71550 | |
| }, | |
| { | |
| "epoch": 20.838670237540754, | |
| "grad_norm": 0.3865537643432617, | |
| "learning_rate": 0.00035014909726266744, | |
| "loss": 3.2444, | |
| "step": 71600 | |
| }, | |
| { | |
| "epoch": 20.85322543083372, | |
| "grad_norm": 0.38718995451927185, | |
| "learning_rate": 0.000349974373907979, | |
| "loss": 3.2539, | |
| "step": 71650 | |
| }, | |
| { | |
| "epoch": 20.867780624126688, | |
| "grad_norm": 0.40072232484817505, | |
| "learning_rate": 0.0003497996505532906, | |
| "loss": 3.2359, | |
| "step": 71700 | |
| }, | |
| { | |
| "epoch": 20.882335817419655, | |
| "grad_norm": 0.40495890378952026, | |
| "learning_rate": 0.00034962492719860217, | |
| "loss": 3.2575, | |
| "step": 71750 | |
| }, | |
| { | |
| "epoch": 20.896891010712622, | |
| "grad_norm": 0.42096802592277527, | |
| "learning_rate": 0.0003494502038439138, | |
| "loss": 3.2454, | |
| "step": 71800 | |
| }, | |
| { | |
| "epoch": 20.91144620400559, | |
| "grad_norm": 0.38979777693748474, | |
| "learning_rate": 0.00034927548048922536, | |
| "loss": 3.2555, | |
| "step": 71850 | |
| }, | |
| { | |
| "epoch": 20.926001397298556, | |
| "grad_norm": 0.38400694727897644, | |
| "learning_rate": 0.00034910075713453695, | |
| "loss": 3.2454, | |
| "step": 71900 | |
| }, | |
| { | |
| "epoch": 20.940556590591523, | |
| "grad_norm": 0.3987891674041748, | |
| "learning_rate": 0.00034892603377984855, | |
| "loss": 3.2519, | |
| "step": 71950 | |
| }, | |
| { | |
| "epoch": 20.95511178388449, | |
| "grad_norm": 0.39958956837654114, | |
| "learning_rate": 0.0003487513104251601, | |
| "loss": 3.2497, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 20.95511178388449, | |
| "eval_accuracy": 0.3738046444675608, | |
| "eval_loss": 3.534238338470459, | |
| "eval_runtime": 180.7106, | |
| "eval_samples_per_second": 92.142, | |
| "eval_steps_per_second": 5.761, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 20.969666977177457, | |
| "grad_norm": 0.40454617142677307, | |
| "learning_rate": 0.0003485765870704717, | |
| "loss": 3.2574, | |
| "step": 72050 | |
| }, | |
| { | |
| "epoch": 20.984222170470424, | |
| "grad_norm": 0.42763713002204895, | |
| "learning_rate": 0.00034840186371578333, | |
| "loss": 3.2571, | |
| "step": 72100 | |
| }, | |
| { | |
| "epoch": 20.99877736376339, | |
| "grad_norm": 0.3658269941806793, | |
| "learning_rate": 0.0003482271403610949, | |
| "loss": 3.2421, | |
| "step": 72150 | |
| }, | |
| { | |
| "epoch": 21.01309967396367, | |
| "grad_norm": 0.3858882784843445, | |
| "learning_rate": 0.0003480524170064065, | |
| "loss": 3.1648, | |
| "step": 72200 | |
| }, | |
| { | |
| "epoch": 21.027654867256636, | |
| "grad_norm": 0.3896530270576477, | |
| "learning_rate": 0.00034787769365171806, | |
| "loss": 3.1446, | |
| "step": 72250 | |
| }, | |
| { | |
| "epoch": 21.042210060549603, | |
| "grad_norm": 0.3882528245449066, | |
| "learning_rate": 0.00034770297029702965, | |
| "loss": 3.1491, | |
| "step": 72300 | |
| }, | |
| { | |
| "epoch": 21.05676525384257, | |
| "grad_norm": 0.40166833996772766, | |
| "learning_rate": 0.0003475282469423413, | |
| "loss": 3.155, | |
| "step": 72350 | |
| }, | |
| { | |
| "epoch": 21.071320447135538, | |
| "grad_norm": 0.390828013420105, | |
| "learning_rate": 0.0003473535235876529, | |
| "loss": 3.1612, | |
| "step": 72400 | |
| }, | |
| { | |
| "epoch": 21.085875640428505, | |
| "grad_norm": 0.3854805827140808, | |
| "learning_rate": 0.00034717880023296444, | |
| "loss": 3.1682, | |
| "step": 72450 | |
| }, | |
| { | |
| "epoch": 21.10043083372147, | |
| "grad_norm": 0.41260701417922974, | |
| "learning_rate": 0.00034700407687827603, | |
| "loss": 3.1714, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 21.11498602701444, | |
| "grad_norm": 0.39876508712768555, | |
| "learning_rate": 0.0003468293535235876, | |
| "loss": 3.1704, | |
| "step": 72550 | |
| }, | |
| { | |
| "epoch": 21.129541220307406, | |
| "grad_norm": 0.39325881004333496, | |
| "learning_rate": 0.00034665463016889916, | |
| "loss": 3.1756, | |
| "step": 72600 | |
| }, | |
| { | |
| "epoch": 21.144096413600373, | |
| "grad_norm": 0.4202461540699005, | |
| "learning_rate": 0.0003464799068142108, | |
| "loss": 3.1753, | |
| "step": 72650 | |
| }, | |
| { | |
| "epoch": 21.15865160689334, | |
| "grad_norm": 0.4095156788825989, | |
| "learning_rate": 0.0003463051834595224, | |
| "loss": 3.1719, | |
| "step": 72700 | |
| }, | |
| { | |
| "epoch": 21.173206800186307, | |
| "grad_norm": 0.393533319234848, | |
| "learning_rate": 0.000346130460104834, | |
| "loss": 3.1708, | |
| "step": 72750 | |
| }, | |
| { | |
| "epoch": 21.187761993479274, | |
| "grad_norm": 0.4020528793334961, | |
| "learning_rate": 0.00034595573675014554, | |
| "loss": 3.1868, | |
| "step": 72800 | |
| }, | |
| { | |
| "epoch": 21.20231718677224, | |
| "grad_norm": 0.4307057857513428, | |
| "learning_rate": 0.00034578101339545714, | |
| "loss": 3.1879, | |
| "step": 72850 | |
| }, | |
| { | |
| "epoch": 21.216872380065208, | |
| "grad_norm": 0.40629395842552185, | |
| "learning_rate": 0.0003456062900407688, | |
| "loss": 3.1862, | |
| "step": 72900 | |
| }, | |
| { | |
| "epoch": 21.231427573358175, | |
| "grad_norm": 0.40840399265289307, | |
| "learning_rate": 0.0003454315666860804, | |
| "loss": 3.199, | |
| "step": 72950 | |
| }, | |
| { | |
| "epoch": 21.245982766651142, | |
| "grad_norm": 0.3967541754245758, | |
| "learning_rate": 0.0003452568433313919, | |
| "loss": 3.1931, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 21.245982766651142, | |
| "eval_accuracy": 0.3730340179259698, | |
| "eval_loss": 3.548424243927002, | |
| "eval_runtime": 180.401, | |
| "eval_samples_per_second": 92.3, | |
| "eval_steps_per_second": 5.77, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 21.26053795994411, | |
| "grad_norm": 0.38777831196784973, | |
| "learning_rate": 0.0003450821199767035, | |
| "loss": 3.1929, | |
| "step": 73050 | |
| }, | |
| { | |
| "epoch": 21.275093153237076, | |
| "grad_norm": 0.37465155124664307, | |
| "learning_rate": 0.0003449073966220151, | |
| "loss": 3.2056, | |
| "step": 73100 | |
| }, | |
| { | |
| "epoch": 21.289648346530043, | |
| "grad_norm": 0.4003761410713196, | |
| "learning_rate": 0.0003447326732673267, | |
| "loss": 3.1847, | |
| "step": 73150 | |
| }, | |
| { | |
| "epoch": 21.30420353982301, | |
| "grad_norm": 0.41474276781082153, | |
| "learning_rate": 0.0003445579499126383, | |
| "loss": 3.2056, | |
| "step": 73200 | |
| }, | |
| { | |
| "epoch": 21.318758733115978, | |
| "grad_norm": 0.3801861107349396, | |
| "learning_rate": 0.0003443832265579499, | |
| "loss": 3.1966, | |
| "step": 73250 | |
| }, | |
| { | |
| "epoch": 21.33331392640894, | |
| "grad_norm": 0.38901805877685547, | |
| "learning_rate": 0.0003442085032032615, | |
| "loss": 3.2078, | |
| "step": 73300 | |
| }, | |
| { | |
| "epoch": 21.347869119701908, | |
| "grad_norm": 0.39941632747650146, | |
| "learning_rate": 0.0003440337798485731, | |
| "loss": 3.202, | |
| "step": 73350 | |
| }, | |
| { | |
| "epoch": 21.362424312994875, | |
| "grad_norm": 0.3970851004123688, | |
| "learning_rate": 0.0003438590564938846, | |
| "loss": 3.1993, | |
| "step": 73400 | |
| }, | |
| { | |
| "epoch": 21.376979506287842, | |
| "grad_norm": 0.37403079867362976, | |
| "learning_rate": 0.0003436843331391962, | |
| "loss": 3.1938, | |
| "step": 73450 | |
| }, | |
| { | |
| "epoch": 21.39153469958081, | |
| "grad_norm": 0.38682398200035095, | |
| "learning_rate": 0.00034350960978450786, | |
| "loss": 3.1999, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 21.406089892873776, | |
| "grad_norm": 0.38072529435157776, | |
| "learning_rate": 0.00034333488642981946, | |
| "loss": 3.1982, | |
| "step": 73550 | |
| }, | |
| { | |
| "epoch": 21.420645086166743, | |
| "grad_norm": 0.4030841588973999, | |
| "learning_rate": 0.000343160163075131, | |
| "loss": 3.2127, | |
| "step": 73600 | |
| }, | |
| { | |
| "epoch": 21.43520027945971, | |
| "grad_norm": 0.4120728373527527, | |
| "learning_rate": 0.0003429854397204426, | |
| "loss": 3.2228, | |
| "step": 73650 | |
| }, | |
| { | |
| "epoch": 21.449755472752678, | |
| "grad_norm": 0.39254021644592285, | |
| "learning_rate": 0.0003428107163657542, | |
| "loss": 3.2094, | |
| "step": 73700 | |
| }, | |
| { | |
| "epoch": 21.464310666045645, | |
| "grad_norm": 0.44818630814552307, | |
| "learning_rate": 0.00034263599301106583, | |
| "loss": 3.2024, | |
| "step": 73750 | |
| }, | |
| { | |
| "epoch": 21.47886585933861, | |
| "grad_norm": 0.38482633233070374, | |
| "learning_rate": 0.0003424612696563774, | |
| "loss": 3.2083, | |
| "step": 73800 | |
| }, | |
| { | |
| "epoch": 21.49342105263158, | |
| "grad_norm": 0.4166334271430969, | |
| "learning_rate": 0.00034228654630168897, | |
| "loss": 3.2168, | |
| "step": 73850 | |
| }, | |
| { | |
| "epoch": 21.507976245924546, | |
| "grad_norm": 0.3959280848503113, | |
| "learning_rate": 0.00034211182294700056, | |
| "loss": 3.2098, | |
| "step": 73900 | |
| }, | |
| { | |
| "epoch": 21.522531439217513, | |
| "grad_norm": 0.3966386616230011, | |
| "learning_rate": 0.0003419370995923121, | |
| "loss": 3.221, | |
| "step": 73950 | |
| }, | |
| { | |
| "epoch": 21.53708663251048, | |
| "grad_norm": 0.44773370027542114, | |
| "learning_rate": 0.0003417623762376237, | |
| "loss": 3.2141, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 21.53708663251048, | |
| "eval_accuracy": 0.3735255171172056, | |
| "eval_loss": 3.542484760284424, | |
| "eval_runtime": 181.9396, | |
| "eval_samples_per_second": 91.519, | |
| "eval_steps_per_second": 5.722, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 21.551641825803447, | |
| "grad_norm": 0.39274561405181885, | |
| "learning_rate": 0.00034158765288293534, | |
| "loss": 3.2218, | |
| "step": 74050 | |
| }, | |
| { | |
| "epoch": 21.566197019096414, | |
| "grad_norm": 0.3738594055175781, | |
| "learning_rate": 0.00034141292952824694, | |
| "loss": 3.2229, | |
| "step": 74100 | |
| }, | |
| { | |
| "epoch": 21.58075221238938, | |
| "grad_norm": 0.3919222056865692, | |
| "learning_rate": 0.0003412382061735585, | |
| "loss": 3.232, | |
| "step": 74150 | |
| }, | |
| { | |
| "epoch": 21.595307405682348, | |
| "grad_norm": 0.39072421193122864, | |
| "learning_rate": 0.0003410634828188701, | |
| "loss": 3.2218, | |
| "step": 74200 | |
| }, | |
| { | |
| "epoch": 21.609862598975315, | |
| "grad_norm": 0.3992181122303009, | |
| "learning_rate": 0.00034088875946418167, | |
| "loss": 3.2217, | |
| "step": 74250 | |
| }, | |
| { | |
| "epoch": 21.624417792268282, | |
| "grad_norm": 0.37571993470191956, | |
| "learning_rate": 0.0003407140361094933, | |
| "loss": 3.2311, | |
| "step": 74300 | |
| }, | |
| { | |
| "epoch": 21.63897298556125, | |
| "grad_norm": 0.40937113761901855, | |
| "learning_rate": 0.00034053931275480486, | |
| "loss": 3.2164, | |
| "step": 74350 | |
| }, | |
| { | |
| "epoch": 21.653528178854216, | |
| "grad_norm": 0.4119691550731659, | |
| "learning_rate": 0.00034036458940011645, | |
| "loss": 3.2291, | |
| "step": 74400 | |
| }, | |
| { | |
| "epoch": 21.668083372147183, | |
| "grad_norm": 0.41038092970848083, | |
| "learning_rate": 0.00034018986604542804, | |
| "loss": 3.2191, | |
| "step": 74450 | |
| }, | |
| { | |
| "epoch": 21.68263856544015, | |
| "grad_norm": 0.4318416714668274, | |
| "learning_rate": 0.00034001514269073964, | |
| "loss": 3.2298, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 21.697193758733118, | |
| "grad_norm": 0.42463475465774536, | |
| "learning_rate": 0.0003398404193360512, | |
| "loss": 3.2328, | |
| "step": 74550 | |
| }, | |
| { | |
| "epoch": 21.711748952026085, | |
| "grad_norm": 0.37979745864868164, | |
| "learning_rate": 0.00033966569598136283, | |
| "loss": 3.2325, | |
| "step": 74600 | |
| }, | |
| { | |
| "epoch": 21.726304145319048, | |
| "grad_norm": 0.41880518198013306, | |
| "learning_rate": 0.0003394909726266744, | |
| "loss": 3.2324, | |
| "step": 74650 | |
| }, | |
| { | |
| "epoch": 21.740859338612015, | |
| "grad_norm": 0.40106236934661865, | |
| "learning_rate": 0.000339316249271986, | |
| "loss": 3.2279, | |
| "step": 74700 | |
| }, | |
| { | |
| "epoch": 21.755414531904982, | |
| "grad_norm": 0.39838361740112305, | |
| "learning_rate": 0.00033914152591729756, | |
| "loss": 3.2313, | |
| "step": 74750 | |
| }, | |
| { | |
| "epoch": 21.76996972519795, | |
| "grad_norm": 0.39547356963157654, | |
| "learning_rate": 0.00033896680256260915, | |
| "loss": 3.2329, | |
| "step": 74800 | |
| }, | |
| { | |
| "epoch": 21.784524918490916, | |
| "grad_norm": 0.3893059492111206, | |
| "learning_rate": 0.00033879207920792074, | |
| "loss": 3.231, | |
| "step": 74850 | |
| }, | |
| { | |
| "epoch": 21.799080111783883, | |
| "grad_norm": 0.38267838954925537, | |
| "learning_rate": 0.0003386173558532324, | |
| "loss": 3.2332, | |
| "step": 74900 | |
| }, | |
| { | |
| "epoch": 21.81363530507685, | |
| "grad_norm": 0.4035026431083679, | |
| "learning_rate": 0.00033844263249854393, | |
| "loss": 3.2316, | |
| "step": 74950 | |
| }, | |
| { | |
| "epoch": 21.828190498369818, | |
| "grad_norm": 0.4062095880508423, | |
| "learning_rate": 0.00033826790914385553, | |
| "loss": 3.2294, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 21.828190498369818, | |
| "eval_accuracy": 0.3740121976889196, | |
| "eval_loss": 3.5341384410858154, | |
| "eval_runtime": 180.659, | |
| "eval_samples_per_second": 92.168, | |
| "eval_steps_per_second": 5.762, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 21.842745691662785, | |
| "grad_norm": 0.39287662506103516, | |
| "learning_rate": 0.0003380931857891671, | |
| "loss": 3.236, | |
| "step": 75050 | |
| }, | |
| { | |
| "epoch": 21.85730088495575, | |
| "grad_norm": 0.4002121686935425, | |
| "learning_rate": 0.00033791846243447866, | |
| "loss": 3.2412, | |
| "step": 75100 | |
| }, | |
| { | |
| "epoch": 21.87185607824872, | |
| "grad_norm": 0.4140980541706085, | |
| "learning_rate": 0.0003377437390797903, | |
| "loss": 3.2475, | |
| "step": 75150 | |
| }, | |
| { | |
| "epoch": 21.886411271541686, | |
| "grad_norm": 0.4264191687107086, | |
| "learning_rate": 0.0003375690157251019, | |
| "loss": 3.2248, | |
| "step": 75200 | |
| }, | |
| { | |
| "epoch": 21.900966464834653, | |
| "grad_norm": 0.42887499928474426, | |
| "learning_rate": 0.0003373942923704135, | |
| "loss": 3.2426, | |
| "step": 75250 | |
| }, | |
| { | |
| "epoch": 21.91552165812762, | |
| "grad_norm": 0.40436336398124695, | |
| "learning_rate": 0.00033721956901572504, | |
| "loss": 3.2428, | |
| "step": 75300 | |
| }, | |
| { | |
| "epoch": 21.930076851420587, | |
| "grad_norm": 0.4055461287498474, | |
| "learning_rate": 0.00033704484566103663, | |
| "loss": 3.2371, | |
| "step": 75350 | |
| }, | |
| { | |
| "epoch": 21.944632044713554, | |
| "grad_norm": 0.4044981300830841, | |
| "learning_rate": 0.00033687012230634823, | |
| "loss": 3.2515, | |
| "step": 75400 | |
| }, | |
| { | |
| "epoch": 21.95918723800652, | |
| "grad_norm": 0.3932948708534241, | |
| "learning_rate": 0.0003366953989516599, | |
| "loss": 3.2352, | |
| "step": 75450 | |
| }, | |
| { | |
| "epoch": 21.97374243129949, | |
| "grad_norm": 0.38733401894569397, | |
| "learning_rate": 0.00033652067559697147, | |
| "loss": 3.2346, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 21.988297624592455, | |
| "grad_norm": 0.4006105363368988, | |
| "learning_rate": 0.000336345952242283, | |
| "loss": 3.2381, | |
| "step": 75550 | |
| }, | |
| { | |
| "epoch": 22.002619934792733, | |
| "grad_norm": 0.43284958600997925, | |
| "learning_rate": 0.0003361712288875946, | |
| "loss": 3.2174, | |
| "step": 75600 | |
| }, | |
| { | |
| "epoch": 22.0171751280857, | |
| "grad_norm": 0.385699063539505, | |
| "learning_rate": 0.0003359965055329062, | |
| "loss": 3.134, | |
| "step": 75650 | |
| }, | |
| { | |
| "epoch": 22.031730321378667, | |
| "grad_norm": 0.39526697993278503, | |
| "learning_rate": 0.00033582178217821785, | |
| "loss": 3.1461, | |
| "step": 75700 | |
| }, | |
| { | |
| "epoch": 22.046285514671634, | |
| "grad_norm": 0.4182550609111786, | |
| "learning_rate": 0.0003356470588235294, | |
| "loss": 3.1481, | |
| "step": 75750 | |
| }, | |
| { | |
| "epoch": 22.0608407079646, | |
| "grad_norm": 0.41826847195625305, | |
| "learning_rate": 0.000335472335468841, | |
| "loss": 3.1555, | |
| "step": 75800 | |
| }, | |
| { | |
| "epoch": 22.07539590125757, | |
| "grad_norm": 0.40757420659065247, | |
| "learning_rate": 0.0003352976121141526, | |
| "loss": 3.1546, | |
| "step": 75850 | |
| }, | |
| { | |
| "epoch": 22.089951094550536, | |
| "grad_norm": 0.40740177035331726, | |
| "learning_rate": 0.0003351228887594641, | |
| "loss": 3.1575, | |
| "step": 75900 | |
| }, | |
| { | |
| "epoch": 22.104506287843503, | |
| "grad_norm": 0.44522273540496826, | |
| "learning_rate": 0.0003349481654047757, | |
| "loss": 3.1598, | |
| "step": 75950 | |
| }, | |
| { | |
| "epoch": 22.11906148113647, | |
| "grad_norm": 0.3875965476036072, | |
| "learning_rate": 0.00033477344205008736, | |
| "loss": 3.1546, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 22.11906148113647, | |
| "eval_accuracy": 0.3735621856364944, | |
| "eval_loss": 3.55169415473938, | |
| "eval_runtime": 182.759, | |
| "eval_samples_per_second": 91.109, | |
| "eval_steps_per_second": 5.696, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 22.133616674429437, | |
| "grad_norm": 0.39561259746551514, | |
| "learning_rate": 0.00033459871869539895, | |
| "loss": 3.1664, | |
| "step": 76050 | |
| }, | |
| { | |
| "epoch": 22.148171867722404, | |
| "grad_norm": 0.403856486082077, | |
| "learning_rate": 0.0003344239953407105, | |
| "loss": 3.1733, | |
| "step": 76100 | |
| }, | |
| { | |
| "epoch": 22.16272706101537, | |
| "grad_norm": 0.38269567489624023, | |
| "learning_rate": 0.0003342492719860221, | |
| "loss": 3.1668, | |
| "step": 76150 | |
| }, | |
| { | |
| "epoch": 22.177282254308338, | |
| "grad_norm": 0.39551034569740295, | |
| "learning_rate": 0.0003340745486313337, | |
| "loss": 3.1746, | |
| "step": 76200 | |
| }, | |
| { | |
| "epoch": 22.191837447601305, | |
| "grad_norm": 0.42570576071739197, | |
| "learning_rate": 0.0003338998252766453, | |
| "loss": 3.1872, | |
| "step": 76250 | |
| }, | |
| { | |
| "epoch": 22.206392640894272, | |
| "grad_norm": 0.3884979784488678, | |
| "learning_rate": 0.00033372510192195687, | |
| "loss": 3.1668, | |
| "step": 76300 | |
| }, | |
| { | |
| "epoch": 22.22094783418724, | |
| "grad_norm": 0.43206584453582764, | |
| "learning_rate": 0.00033355037856726847, | |
| "loss": 3.1718, | |
| "step": 76350 | |
| }, | |
| { | |
| "epoch": 22.235503027480206, | |
| "grad_norm": 0.4337749183177948, | |
| "learning_rate": 0.00033337565521258006, | |
| "loss": 3.1789, | |
| "step": 76400 | |
| }, | |
| { | |
| "epoch": 22.250058220773173, | |
| "grad_norm": 0.4393354058265686, | |
| "learning_rate": 0.00033320093185789165, | |
| "loss": 3.1816, | |
| "step": 76450 | |
| }, | |
| { | |
| "epoch": 22.26461341406614, | |
| "grad_norm": 0.39173170924186707, | |
| "learning_rate": 0.0003330262085032032, | |
| "loss": 3.1806, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 22.279168607359107, | |
| "grad_norm": 0.4066053330898285, | |
| "learning_rate": 0.00033285148514851484, | |
| "loss": 3.1814, | |
| "step": 76550 | |
| }, | |
| { | |
| "epoch": 22.293723800652074, | |
| "grad_norm": 0.38287800550460815, | |
| "learning_rate": 0.00033267676179382644, | |
| "loss": 3.1909, | |
| "step": 76600 | |
| }, | |
| { | |
| "epoch": 22.308278993945038, | |
| "grad_norm": 0.41691353917121887, | |
| "learning_rate": 0.00033250203843913803, | |
| "loss": 3.1885, | |
| "step": 76650 | |
| }, | |
| { | |
| "epoch": 22.322834187238005, | |
| "grad_norm": 0.41341423988342285, | |
| "learning_rate": 0.00033232731508444957, | |
| "loss": 3.1938, | |
| "step": 76700 | |
| }, | |
| { | |
| "epoch": 22.337389380530972, | |
| "grad_norm": 0.38151562213897705, | |
| "learning_rate": 0.00033215259172976117, | |
| "loss": 3.1888, | |
| "step": 76750 | |
| }, | |
| { | |
| "epoch": 22.35194457382394, | |
| "grad_norm": 0.3901282250881195, | |
| "learning_rate": 0.00033197786837507276, | |
| "loss": 3.1905, | |
| "step": 76800 | |
| }, | |
| { | |
| "epoch": 22.366499767116906, | |
| "grad_norm": 0.37372684478759766, | |
| "learning_rate": 0.0003318031450203844, | |
| "loss": 3.1933, | |
| "step": 76850 | |
| }, | |
| { | |
| "epoch": 22.381054960409873, | |
| "grad_norm": 0.39925166964530945, | |
| "learning_rate": 0.00033162842166569595, | |
| "loss": 3.186, | |
| "step": 76900 | |
| }, | |
| { | |
| "epoch": 22.39561015370284, | |
| "grad_norm": 0.41825148463249207, | |
| "learning_rate": 0.00033145369831100754, | |
| "loss": 3.1982, | |
| "step": 76950 | |
| }, | |
| { | |
| "epoch": 22.410165346995807, | |
| "grad_norm": 0.40504124760627747, | |
| "learning_rate": 0.00033127897495631914, | |
| "loss": 3.1986, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 22.410165346995807, | |
| "eval_accuracy": 0.3735411482488255, | |
| "eval_loss": 3.547119140625, | |
| "eval_runtime": 180.534, | |
| "eval_samples_per_second": 92.232, | |
| "eval_steps_per_second": 5.766, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 22.424720540288774, | |
| "grad_norm": 0.4165102243423462, | |
| "learning_rate": 0.0003311042516016307, | |
| "loss": 3.2004, | |
| "step": 77050 | |
| }, | |
| { | |
| "epoch": 22.43927573358174, | |
| "grad_norm": 0.4132382869720459, | |
| "learning_rate": 0.0003309295282469423, | |
| "loss": 3.1964, | |
| "step": 77100 | |
| }, | |
| { | |
| "epoch": 22.45383092687471, | |
| "grad_norm": 0.39456766843795776, | |
| "learning_rate": 0.0003307548048922539, | |
| "loss": 3.1999, | |
| "step": 77150 | |
| }, | |
| { | |
| "epoch": 22.468386120167676, | |
| "grad_norm": 0.42880240082740784, | |
| "learning_rate": 0.0003305800815375655, | |
| "loss": 3.2033, | |
| "step": 77200 | |
| }, | |
| { | |
| "epoch": 22.482941313460643, | |
| "grad_norm": 0.4196506440639496, | |
| "learning_rate": 0.00033040535818287705, | |
| "loss": 3.1991, | |
| "step": 77250 | |
| }, | |
| { | |
| "epoch": 22.49749650675361, | |
| "grad_norm": 0.41333723068237305, | |
| "learning_rate": 0.00033023063482818865, | |
| "loss": 3.203, | |
| "step": 77300 | |
| }, | |
| { | |
| "epoch": 22.512051700046577, | |
| "grad_norm": 0.4085758626461029, | |
| "learning_rate": 0.00033005591147350024, | |
| "loss": 3.1992, | |
| "step": 77350 | |
| }, | |
| { | |
| "epoch": 22.526606893339544, | |
| "grad_norm": 0.4137350618839264, | |
| "learning_rate": 0.0003298811881188119, | |
| "loss": 3.2143, | |
| "step": 77400 | |
| }, | |
| { | |
| "epoch": 22.54116208663251, | |
| "grad_norm": 0.3937785029411316, | |
| "learning_rate": 0.00032970646476412343, | |
| "loss": 3.195, | |
| "step": 77450 | |
| }, | |
| { | |
| "epoch": 22.555717279925478, | |
| "grad_norm": 0.4458615183830261, | |
| "learning_rate": 0.000329531741409435, | |
| "loss": 3.2136, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 22.570272473218445, | |
| "grad_norm": 0.39188769459724426, | |
| "learning_rate": 0.0003293570180547466, | |
| "loss": 3.2025, | |
| "step": 77550 | |
| }, | |
| { | |
| "epoch": 22.584827666511412, | |
| "grad_norm": 0.3993958532810211, | |
| "learning_rate": 0.0003291822947000582, | |
| "loss": 3.2054, | |
| "step": 77600 | |
| }, | |
| { | |
| "epoch": 22.59938285980438, | |
| "grad_norm": 0.5013609528541565, | |
| "learning_rate": 0.00032900757134536975, | |
| "loss": 3.2134, | |
| "step": 77650 | |
| }, | |
| { | |
| "epoch": 22.613938053097346, | |
| "grad_norm": 0.42360374331474304, | |
| "learning_rate": 0.0003288328479906814, | |
| "loss": 3.2158, | |
| "step": 77700 | |
| }, | |
| { | |
| "epoch": 22.628493246390313, | |
| "grad_norm": 0.42878004908561707, | |
| "learning_rate": 0.000328658124635993, | |
| "loss": 3.2135, | |
| "step": 77750 | |
| }, | |
| { | |
| "epoch": 22.64304843968328, | |
| "grad_norm": 0.421800434589386, | |
| "learning_rate": 0.0003284834012813046, | |
| "loss": 3.2062, | |
| "step": 77800 | |
| }, | |
| { | |
| "epoch": 22.657603632976247, | |
| "grad_norm": 0.40251171588897705, | |
| "learning_rate": 0.00032830867792661613, | |
| "loss": 3.2149, | |
| "step": 77850 | |
| }, | |
| { | |
| "epoch": 22.672158826269214, | |
| "grad_norm": 0.4186382591724396, | |
| "learning_rate": 0.0003281339545719277, | |
| "loss": 3.2222, | |
| "step": 77900 | |
| }, | |
| { | |
| "epoch": 22.68671401956218, | |
| "grad_norm": 0.42149704694747925, | |
| "learning_rate": 0.0003279592312172394, | |
| "loss": 3.2174, | |
| "step": 77950 | |
| }, | |
| { | |
| "epoch": 22.701269212855145, | |
| "grad_norm": 0.3991159498691559, | |
| "learning_rate": 0.00032778450786255097, | |
| "loss": 3.2226, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 22.701269212855145, | |
| "eval_accuracy": 0.37383179327511107, | |
| "eval_loss": 3.543539524078369, | |
| "eval_runtime": 182.2435, | |
| "eval_samples_per_second": 91.367, | |
| "eval_steps_per_second": 5.712, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 22.715824406148112, | |
| "grad_norm": 0.4073682725429535, | |
| "learning_rate": 0.0003276097845078625, | |
| "loss": 3.2042, | |
| "step": 78050 | |
| }, | |
| { | |
| "epoch": 22.73037959944108, | |
| "grad_norm": 0.4109225571155548, | |
| "learning_rate": 0.0003274350611531741, | |
| "loss": 3.2271, | |
| "step": 78100 | |
| }, | |
| { | |
| "epoch": 22.744934792734046, | |
| "grad_norm": 0.3942478597164154, | |
| "learning_rate": 0.0003272603377984857, | |
| "loss": 3.2182, | |
| "step": 78150 | |
| }, | |
| { | |
| "epoch": 22.759489986027013, | |
| "grad_norm": 0.4061560034751892, | |
| "learning_rate": 0.00032708561444379724, | |
| "loss": 3.2163, | |
| "step": 78200 | |
| }, | |
| { | |
| "epoch": 22.77404517931998, | |
| "grad_norm": 0.4090252220630646, | |
| "learning_rate": 0.0003269108910891089, | |
| "loss": 3.2243, | |
| "step": 78250 | |
| }, | |
| { | |
| "epoch": 22.788600372612947, | |
| "grad_norm": 0.37358608841896057, | |
| "learning_rate": 0.0003267361677344205, | |
| "loss": 3.2224, | |
| "step": 78300 | |
| }, | |
| { | |
| "epoch": 22.803155565905914, | |
| "grad_norm": 0.40171942114830017, | |
| "learning_rate": 0.0003265614443797321, | |
| "loss": 3.2283, | |
| "step": 78350 | |
| }, | |
| { | |
| "epoch": 22.81771075919888, | |
| "grad_norm": 0.38627341389656067, | |
| "learning_rate": 0.0003263867210250436, | |
| "loss": 3.2227, | |
| "step": 78400 | |
| }, | |
| { | |
| "epoch": 22.83226595249185, | |
| "grad_norm": 0.377933531999588, | |
| "learning_rate": 0.0003262119976703552, | |
| "loss": 3.2251, | |
| "step": 78450 | |
| }, | |
| { | |
| "epoch": 22.846821145784816, | |
| "grad_norm": 0.39408254623413086, | |
| "learning_rate": 0.00032603727431566686, | |
| "loss": 3.2303, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 22.861376339077783, | |
| "grad_norm": 0.4206821918487549, | |
| "learning_rate": 0.00032586255096097845, | |
| "loss": 3.2393, | |
| "step": 78550 | |
| }, | |
| { | |
| "epoch": 22.87593153237075, | |
| "grad_norm": 0.3831664025783539, | |
| "learning_rate": 0.00032568782760629005, | |
| "loss": 3.2295, | |
| "step": 78600 | |
| }, | |
| { | |
| "epoch": 22.890486725663717, | |
| "grad_norm": 0.402316689491272, | |
| "learning_rate": 0.0003255131042516016, | |
| "loss": 3.2384, | |
| "step": 78650 | |
| }, | |
| { | |
| "epoch": 22.905041918956684, | |
| "grad_norm": 0.3817090094089508, | |
| "learning_rate": 0.0003253383808969132, | |
| "loss": 3.2233, | |
| "step": 78700 | |
| }, | |
| { | |
| "epoch": 22.91959711224965, | |
| "grad_norm": 0.37257474660873413, | |
| "learning_rate": 0.0003251636575422248, | |
| "loss": 3.2347, | |
| "step": 78750 | |
| }, | |
| { | |
| "epoch": 22.934152305542618, | |
| "grad_norm": 0.4055401682853699, | |
| "learning_rate": 0.0003249889341875364, | |
| "loss": 3.2215, | |
| "step": 78800 | |
| }, | |
| { | |
| "epoch": 22.948707498835585, | |
| "grad_norm": 0.3962166905403137, | |
| "learning_rate": 0.00032481421083284796, | |
| "loss": 3.2327, | |
| "step": 78850 | |
| }, | |
| { | |
| "epoch": 22.963262692128552, | |
| "grad_norm": 0.396388441324234, | |
| "learning_rate": 0.00032463948747815956, | |
| "loss": 3.245, | |
| "step": 78900 | |
| }, | |
| { | |
| "epoch": 22.97781788542152, | |
| "grad_norm": 0.4103637635707855, | |
| "learning_rate": 0.00032446476412347115, | |
| "loss": 3.2283, | |
| "step": 78950 | |
| }, | |
| { | |
| "epoch": 22.992373078714486, | |
| "grad_norm": 0.3708791434764862, | |
| "learning_rate": 0.0003242900407687827, | |
| "loss": 3.2436, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 22.992373078714486, | |
| "eval_accuracy": 0.3745538810395666, | |
| "eval_loss": 3.5276875495910645, | |
| "eval_runtime": 181.1042, | |
| "eval_samples_per_second": 91.942, | |
| "eval_steps_per_second": 5.748, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 23.006695388914764, | |
| "grad_norm": 0.4256347715854645, | |
| "learning_rate": 0.00032411531741409434, | |
| "loss": 3.1873, | |
| "step": 79050 | |
| }, | |
| { | |
| "epoch": 23.02125058220773, | |
| "grad_norm": 0.4103791415691376, | |
| "learning_rate": 0.00032394059405940593, | |
| "loss": 3.1258, | |
| "step": 79100 | |
| }, | |
| { | |
| "epoch": 23.035805775500698, | |
| "grad_norm": 0.41765856742858887, | |
| "learning_rate": 0.00032376587070471753, | |
| "loss": 3.1259, | |
| "step": 79150 | |
| }, | |
| { | |
| "epoch": 23.050360968793665, | |
| "grad_norm": 0.4011674225330353, | |
| "learning_rate": 0.00032359114735002907, | |
| "loss": 3.1324, | |
| "step": 79200 | |
| }, | |
| { | |
| "epoch": 23.064916162086632, | |
| "grad_norm": 0.4066724181175232, | |
| "learning_rate": 0.00032341642399534066, | |
| "loss": 3.1481, | |
| "step": 79250 | |
| }, | |
| { | |
| "epoch": 23.0794713553796, | |
| "grad_norm": 0.4127407968044281, | |
| "learning_rate": 0.00032324170064065226, | |
| "loss": 3.1275, | |
| "step": 79300 | |
| }, | |
| { | |
| "epoch": 23.094026548672566, | |
| "grad_norm": 0.420987993478775, | |
| "learning_rate": 0.0003230669772859639, | |
| "loss": 3.1594, | |
| "step": 79350 | |
| }, | |
| { | |
| "epoch": 23.108581741965533, | |
| "grad_norm": 0.39907628297805786, | |
| "learning_rate": 0.00032289225393127545, | |
| "loss": 3.1477, | |
| "step": 79400 | |
| }, | |
| { | |
| "epoch": 23.1231369352585, | |
| "grad_norm": 0.3801015317440033, | |
| "learning_rate": 0.00032271753057658704, | |
| "loss": 3.1457, | |
| "step": 79450 | |
| }, | |
| { | |
| "epoch": 23.137692128551468, | |
| "grad_norm": 0.4148379862308502, | |
| "learning_rate": 0.00032254280722189863, | |
| "loss": 3.1618, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 23.152247321844435, | |
| "grad_norm": 0.4523087739944458, | |
| "learning_rate": 0.00032236808386721023, | |
| "loss": 3.1482, | |
| "step": 79550 | |
| }, | |
| { | |
| "epoch": 23.1668025151374, | |
| "grad_norm": 0.40067973732948303, | |
| "learning_rate": 0.00032219336051252177, | |
| "loss": 3.1531, | |
| "step": 79600 | |
| }, | |
| { | |
| "epoch": 23.18135770843037, | |
| "grad_norm": 0.4054492712020874, | |
| "learning_rate": 0.0003220186371578334, | |
| "loss": 3.1559, | |
| "step": 79650 | |
| }, | |
| { | |
| "epoch": 23.195912901723336, | |
| "grad_norm": 0.39074888825416565, | |
| "learning_rate": 0.000321843913803145, | |
| "loss": 3.1735, | |
| "step": 79700 | |
| }, | |
| { | |
| "epoch": 23.210468095016303, | |
| "grad_norm": 0.4145815074443817, | |
| "learning_rate": 0.0003216691904484566, | |
| "loss": 3.1657, | |
| "step": 79750 | |
| }, | |
| { | |
| "epoch": 23.22502328830927, | |
| "grad_norm": 0.3951996862888336, | |
| "learning_rate": 0.00032149446709376815, | |
| "loss": 3.1714, | |
| "step": 79800 | |
| }, | |
| { | |
| "epoch": 23.239578481602237, | |
| "grad_norm": 0.39838355779647827, | |
| "learning_rate": 0.00032131974373907974, | |
| "loss": 3.1725, | |
| "step": 79850 | |
| }, | |
| { | |
| "epoch": 23.254133674895204, | |
| "grad_norm": 0.4332260191440582, | |
| "learning_rate": 0.0003211450203843914, | |
| "loss": 3.1792, | |
| "step": 79900 | |
| }, | |
| { | |
| "epoch": 23.26868886818817, | |
| "grad_norm": 0.41784408688545227, | |
| "learning_rate": 0.000320970297029703, | |
| "loss": 3.1752, | |
| "step": 79950 | |
| }, | |
| { | |
| "epoch": 23.283244061481135, | |
| "grad_norm": 0.40836164355278015, | |
| "learning_rate": 0.0003207955736750145, | |
| "loss": 3.1827, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 23.283244061481135, | |
| "eval_accuracy": 0.37335815823429797, | |
| "eval_loss": 3.551513910293579, | |
| "eval_runtime": 180.9912, | |
| "eval_samples_per_second": 91.999, | |
| "eval_steps_per_second": 5.752, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 23.2977992547741, | |
| "grad_norm": 0.4139251112937927, | |
| "learning_rate": 0.0003206208503203261, | |
| "loss": 3.1796, | |
| "step": 80050 | |
| }, | |
| { | |
| "epoch": 23.31235444806707, | |
| "grad_norm": 0.4029051959514618, | |
| "learning_rate": 0.0003204461269656377, | |
| "loss": 3.174, | |
| "step": 80100 | |
| }, | |
| { | |
| "epoch": 23.326909641360036, | |
| "grad_norm": 0.4269234240055084, | |
| "learning_rate": 0.00032027140361094925, | |
| "loss": 3.1902, | |
| "step": 80150 | |
| }, | |
| { | |
| "epoch": 23.341464834653003, | |
| "grad_norm": 0.44048935174942017, | |
| "learning_rate": 0.0003200966802562609, | |
| "loss": 3.1894, | |
| "step": 80200 | |
| }, | |
| { | |
| "epoch": 23.35602002794597, | |
| "grad_norm": 0.40298840403556824, | |
| "learning_rate": 0.0003199219569015725, | |
| "loss": 3.1872, | |
| "step": 80250 | |
| }, | |
| { | |
| "epoch": 23.370575221238937, | |
| "grad_norm": 0.4126374125480652, | |
| "learning_rate": 0.0003197472335468841, | |
| "loss": 3.179, | |
| "step": 80300 | |
| }, | |
| { | |
| "epoch": 23.385130414531904, | |
| "grad_norm": 0.4273306429386139, | |
| "learning_rate": 0.00031957251019219563, | |
| "loss": 3.1886, | |
| "step": 80350 | |
| }, | |
| { | |
| "epoch": 23.39968560782487, | |
| "grad_norm": 0.4023396670818329, | |
| "learning_rate": 0.0003193977868375072, | |
| "loss": 3.1851, | |
| "step": 80400 | |
| }, | |
| { | |
| "epoch": 23.41424080111784, | |
| "grad_norm": 0.4175347685813904, | |
| "learning_rate": 0.00031922306348281887, | |
| "loss": 3.1866, | |
| "step": 80450 | |
| }, | |
| { | |
| "epoch": 23.428795994410805, | |
| "grad_norm": 0.40811020135879517, | |
| "learning_rate": 0.00031904834012813047, | |
| "loss": 3.1829, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 23.443351187703772, | |
| "grad_norm": 0.39960628747940063, | |
| "learning_rate": 0.000318873616773442, | |
| "loss": 3.1989, | |
| "step": 80550 | |
| }, | |
| { | |
| "epoch": 23.45790638099674, | |
| "grad_norm": 0.42026636004447937, | |
| "learning_rate": 0.0003186988934187536, | |
| "loss": 3.1962, | |
| "step": 80600 | |
| }, | |
| { | |
| "epoch": 23.472461574289706, | |
| "grad_norm": 0.37581321597099304, | |
| "learning_rate": 0.0003185241700640652, | |
| "loss": 3.1957, | |
| "step": 80650 | |
| }, | |
| { | |
| "epoch": 23.487016767582674, | |
| "grad_norm": 0.4311843514442444, | |
| "learning_rate": 0.0003183494467093768, | |
| "loss": 3.2022, | |
| "step": 80700 | |
| }, | |
| { | |
| "epoch": 23.50157196087564, | |
| "grad_norm": 0.39548468589782715, | |
| "learning_rate": 0.0003181747233546884, | |
| "loss": 3.2, | |
| "step": 80750 | |
| }, | |
| { | |
| "epoch": 23.516127154168608, | |
| "grad_norm": 0.4133346974849701, | |
| "learning_rate": 0.000318, | |
| "loss": 3.2, | |
| "step": 80800 | |
| }, | |
| { | |
| "epoch": 23.530682347461575, | |
| "grad_norm": 0.3871370553970337, | |
| "learning_rate": 0.00031782527664531157, | |
| "loss": 3.1965, | |
| "step": 80850 | |
| }, | |
| { | |
| "epoch": 23.545237540754542, | |
| "grad_norm": 0.4371592402458191, | |
| "learning_rate": 0.00031765055329062317, | |
| "loss": 3.2013, | |
| "step": 80900 | |
| }, | |
| { | |
| "epoch": 23.55979273404751, | |
| "grad_norm": 0.4149767756462097, | |
| "learning_rate": 0.0003174758299359347, | |
| "loss": 3.2007, | |
| "step": 80950 | |
| }, | |
| { | |
| "epoch": 23.574347927340476, | |
| "grad_norm": 0.4131397306919098, | |
| "learning_rate": 0.0003173011065812463, | |
| "loss": 3.1975, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 23.574347927340476, | |
| "eval_accuracy": 0.3738587070280506, | |
| "eval_loss": 3.5425162315368652, | |
| "eval_runtime": 180.9213, | |
| "eval_samples_per_second": 92.035, | |
| "eval_steps_per_second": 5.754, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 23.588903120633443, | |
| "grad_norm": 0.4062992334365845, | |
| "learning_rate": 0.00031712638322655795, | |
| "loss": 3.1874, | |
| "step": 81050 | |
| }, | |
| { | |
| "epoch": 23.60345831392641, | |
| "grad_norm": 0.39633697271347046, | |
| "learning_rate": 0.00031695165987186954, | |
| "loss": 3.1983, | |
| "step": 81100 | |
| }, | |
| { | |
| "epoch": 23.618013507219377, | |
| "grad_norm": 0.408441960811615, | |
| "learning_rate": 0.0003167769365171811, | |
| "loss": 3.2016, | |
| "step": 81150 | |
| }, | |
| { | |
| "epoch": 23.632568700512344, | |
| "grad_norm": 0.3951725661754608, | |
| "learning_rate": 0.0003166022131624927, | |
| "loss": 3.2034, | |
| "step": 81200 | |
| }, | |
| { | |
| "epoch": 23.64712389380531, | |
| "grad_norm": 0.4006396234035492, | |
| "learning_rate": 0.00031642748980780427, | |
| "loss": 3.1994, | |
| "step": 81250 | |
| }, | |
| { | |
| "epoch": 23.66167908709828, | |
| "grad_norm": 0.39524927735328674, | |
| "learning_rate": 0.0003162527664531159, | |
| "loss": 3.2041, | |
| "step": 81300 | |
| }, | |
| { | |
| "epoch": 23.676234280391242, | |
| "grad_norm": 0.383435994386673, | |
| "learning_rate": 0.00031607804309842746, | |
| "loss": 3.1966, | |
| "step": 81350 | |
| }, | |
| { | |
| "epoch": 23.69078947368421, | |
| "grad_norm": 0.4194474518299103, | |
| "learning_rate": 0.00031590331974373905, | |
| "loss": 3.2113, | |
| "step": 81400 | |
| }, | |
| { | |
| "epoch": 23.705344666977176, | |
| "grad_norm": 0.4209073781967163, | |
| "learning_rate": 0.00031572859638905065, | |
| "loss": 3.2018, | |
| "step": 81450 | |
| }, | |
| { | |
| "epoch": 23.719899860270143, | |
| "grad_norm": 0.4205525815486908, | |
| "learning_rate": 0.0003155538730343622, | |
| "loss": 3.2128, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 23.73445505356311, | |
| "grad_norm": 0.44765710830688477, | |
| "learning_rate": 0.0003153791496796738, | |
| "loss": 3.2231, | |
| "step": 81550 | |
| }, | |
| { | |
| "epoch": 23.749010246856077, | |
| "grad_norm": 0.42908868193626404, | |
| "learning_rate": 0.00031520442632498543, | |
| "loss": 3.2146, | |
| "step": 81600 | |
| }, | |
| { | |
| "epoch": 23.763565440149044, | |
| "grad_norm": 0.40154361724853516, | |
| "learning_rate": 0.000315029702970297, | |
| "loss": 3.2074, | |
| "step": 81650 | |
| }, | |
| { | |
| "epoch": 23.77812063344201, | |
| "grad_norm": 0.4053952991962433, | |
| "learning_rate": 0.0003148549796156086, | |
| "loss": 3.2054, | |
| "step": 81700 | |
| }, | |
| { | |
| "epoch": 23.79267582673498, | |
| "grad_norm": 0.41705214977264404, | |
| "learning_rate": 0.00031468025626092016, | |
| "loss": 3.2097, | |
| "step": 81750 | |
| }, | |
| { | |
| "epoch": 23.807231020027945, | |
| "grad_norm": 0.41086921095848083, | |
| "learning_rate": 0.00031450553290623175, | |
| "loss": 3.2144, | |
| "step": 81800 | |
| }, | |
| { | |
| "epoch": 23.821786213320912, | |
| "grad_norm": 0.4101288616657257, | |
| "learning_rate": 0.0003143308095515434, | |
| "loss": 3.2174, | |
| "step": 81850 | |
| }, | |
| { | |
| "epoch": 23.83634140661388, | |
| "grad_norm": 0.3978751599788666, | |
| "learning_rate": 0.000314156086196855, | |
| "loss": 3.2183, | |
| "step": 81900 | |
| }, | |
| { | |
| "epoch": 23.850896599906847, | |
| "grad_norm": 0.4297041594982147, | |
| "learning_rate": 0.00031398136284216654, | |
| "loss": 3.2178, | |
| "step": 81950 | |
| }, | |
| { | |
| "epoch": 23.865451793199814, | |
| "grad_norm": 0.4131315052509308, | |
| "learning_rate": 0.00031380663948747813, | |
| "loss": 3.2275, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 23.865451793199814, | |
| "eval_accuracy": 0.3741180897910964, | |
| "eval_loss": 3.5338618755340576, | |
| "eval_runtime": 180.7885, | |
| "eval_samples_per_second": 92.102, | |
| "eval_steps_per_second": 5.758, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 23.88000698649278, | |
| "grad_norm": 0.4159573018550873, | |
| "learning_rate": 0.0003136319161327897, | |
| "loss": 3.2187, | |
| "step": 82050 | |
| }, | |
| { | |
| "epoch": 23.894562179785748, | |
| "grad_norm": 0.5051674246788025, | |
| "learning_rate": 0.00031345719277810127, | |
| "loss": 3.2304, | |
| "step": 82100 | |
| }, | |
| { | |
| "epoch": 23.909117373078715, | |
| "grad_norm": 0.42157235741615295, | |
| "learning_rate": 0.0003132824694234129, | |
| "loss": 3.2268, | |
| "step": 82150 | |
| }, | |
| { | |
| "epoch": 23.923672566371682, | |
| "grad_norm": 0.3888903260231018, | |
| "learning_rate": 0.0003131077460687245, | |
| "loss": 3.2228, | |
| "step": 82200 | |
| }, | |
| { | |
| "epoch": 23.93822775966465, | |
| "grad_norm": 0.42773908376693726, | |
| "learning_rate": 0.0003129330227140361, | |
| "loss": 3.2184, | |
| "step": 82250 | |
| }, | |
| { | |
| "epoch": 23.952782952957616, | |
| "grad_norm": 0.40163886547088623, | |
| "learning_rate": 0.00031275829935934764, | |
| "loss": 3.2203, | |
| "step": 82300 | |
| }, | |
| { | |
| "epoch": 23.967338146250583, | |
| "grad_norm": 0.41257938742637634, | |
| "learning_rate": 0.00031258357600465924, | |
| "loss": 3.2191, | |
| "step": 82350 | |
| }, | |
| { | |
| "epoch": 23.98189333954355, | |
| "grad_norm": 0.416121244430542, | |
| "learning_rate": 0.00031240885264997083, | |
| "loss": 3.2343, | |
| "step": 82400 | |
| }, | |
| { | |
| "epoch": 23.996448532836517, | |
| "grad_norm": 0.4214395582675934, | |
| "learning_rate": 0.0003122341292952825, | |
| "loss": 3.2195, | |
| "step": 82450 | |
| }, | |
| { | |
| "epoch": 24.010770843036795, | |
| "grad_norm": 0.3829551041126251, | |
| "learning_rate": 0.000312059405940594, | |
| "loss": 3.1487, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 24.025326036329762, | |
| "grad_norm": 0.4037528932094574, | |
| "learning_rate": 0.0003118846825859056, | |
| "loss": 3.126, | |
| "step": 82550 | |
| }, | |
| { | |
| "epoch": 24.03988122962273, | |
| "grad_norm": 0.4099786579608917, | |
| "learning_rate": 0.0003117099592312172, | |
| "loss": 3.1269, | |
| "step": 82600 | |
| }, | |
| { | |
| "epoch": 24.054436422915696, | |
| "grad_norm": 0.3996747136116028, | |
| "learning_rate": 0.0003115352358765288, | |
| "loss": 3.1255, | |
| "step": 82650 | |
| }, | |
| { | |
| "epoch": 24.068991616208663, | |
| "grad_norm": 0.4156681001186371, | |
| "learning_rate": 0.0003113605125218404, | |
| "loss": 3.1419, | |
| "step": 82700 | |
| }, | |
| { | |
| "epoch": 24.08354680950163, | |
| "grad_norm": 0.4593733847141266, | |
| "learning_rate": 0.000311185789167152, | |
| "loss": 3.1397, | |
| "step": 82750 | |
| }, | |
| { | |
| "epoch": 24.098102002794597, | |
| "grad_norm": 0.4231386184692383, | |
| "learning_rate": 0.0003110110658124636, | |
| "loss": 3.1453, | |
| "step": 82800 | |
| }, | |
| { | |
| "epoch": 24.112657196087564, | |
| "grad_norm": 0.42956456542015076, | |
| "learning_rate": 0.0003108363424577752, | |
| "loss": 3.1492, | |
| "step": 82850 | |
| }, | |
| { | |
| "epoch": 24.12721238938053, | |
| "grad_norm": 0.40355461835861206, | |
| "learning_rate": 0.0003106616191030867, | |
| "loss": 3.1409, | |
| "step": 82900 | |
| }, | |
| { | |
| "epoch": 24.1417675826735, | |
| "grad_norm": 0.4016251862049103, | |
| "learning_rate": 0.0003104868957483983, | |
| "loss": 3.1399, | |
| "step": 82950 | |
| }, | |
| { | |
| "epoch": 24.156322775966466, | |
| "grad_norm": 0.41739484667778015, | |
| "learning_rate": 0.00031031217239370996, | |
| "loss": 3.1521, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 24.156322775966466, | |
| "eval_accuracy": 0.373625297799501, | |
| "eval_loss": 3.551887273788452, | |
| "eval_runtime": 180.7411, | |
| "eval_samples_per_second": 92.126, | |
| "eval_steps_per_second": 5.76, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 24.170877969259433, | |
| "grad_norm": 0.4393619894981384, | |
| "learning_rate": 0.00031013744903902156, | |
| "loss": 3.1527, | |
| "step": 83050 | |
| }, | |
| { | |
| "epoch": 24.1854331625524, | |
| "grad_norm": 0.41159191727638245, | |
| "learning_rate": 0.0003099627256843331, | |
| "loss": 3.1609, | |
| "step": 83100 | |
| }, | |
| { | |
| "epoch": 24.199988355845367, | |
| "grad_norm": 0.40605202317237854, | |
| "learning_rate": 0.0003097880023296447, | |
| "loss": 3.15, | |
| "step": 83150 | |
| }, | |
| { | |
| "epoch": 24.214543549138334, | |
| "grad_norm": 0.4293530285358429, | |
| "learning_rate": 0.0003096132789749563, | |
| "loss": 3.1509, | |
| "step": 83200 | |
| }, | |
| { | |
| "epoch": 24.2290987424313, | |
| "grad_norm": 0.4325450360774994, | |
| "learning_rate": 0.00030943855562026794, | |
| "loss": 3.1526, | |
| "step": 83250 | |
| }, | |
| { | |
| "epoch": 24.243653935724268, | |
| "grad_norm": 0.42839089035987854, | |
| "learning_rate": 0.0003092638322655795, | |
| "loss": 3.1653, | |
| "step": 83300 | |
| }, | |
| { | |
| "epoch": 24.258209129017235, | |
| "grad_norm": 0.4248741567134857, | |
| "learning_rate": 0.00030908910891089107, | |
| "loss": 3.161, | |
| "step": 83350 | |
| }, | |
| { | |
| "epoch": 24.2727643223102, | |
| "grad_norm": 0.39277422428131104, | |
| "learning_rate": 0.00030891438555620266, | |
| "loss": 3.161, | |
| "step": 83400 | |
| }, | |
| { | |
| "epoch": 24.287319515603166, | |
| "grad_norm": 0.41305747628211975, | |
| "learning_rate": 0.0003087396622015142, | |
| "loss": 3.1732, | |
| "step": 83450 | |
| }, | |
| { | |
| "epoch": 24.301874708896133, | |
| "grad_norm": 0.415712833404541, | |
| "learning_rate": 0.0003085649388468258, | |
| "loss": 3.1718, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 24.3164299021891, | |
| "grad_norm": 0.38852572441101074, | |
| "learning_rate": 0.00030839021549213745, | |
| "loss": 3.1697, | |
| "step": 83550 | |
| }, | |
| { | |
| "epoch": 24.330985095482067, | |
| "grad_norm": 0.38443219661712646, | |
| "learning_rate": 0.00030821549213744904, | |
| "loss": 3.1646, | |
| "step": 83600 | |
| }, | |
| { | |
| "epoch": 24.345540288775034, | |
| "grad_norm": 0.39225396513938904, | |
| "learning_rate": 0.0003080407687827606, | |
| "loss": 3.1779, | |
| "step": 83650 | |
| }, | |
| { | |
| "epoch": 24.360095482068, | |
| "grad_norm": 0.40809082984924316, | |
| "learning_rate": 0.0003078660454280722, | |
| "loss": 3.1789, | |
| "step": 83700 | |
| }, | |
| { | |
| "epoch": 24.374650675360968, | |
| "grad_norm": 0.42657771706581116, | |
| "learning_rate": 0.00030769132207338377, | |
| "loss": 3.1788, | |
| "step": 83750 | |
| }, | |
| { | |
| "epoch": 24.389205868653935, | |
| "grad_norm": 0.42962631583213806, | |
| "learning_rate": 0.00030751659871869536, | |
| "loss": 3.1898, | |
| "step": 83800 | |
| }, | |
| { | |
| "epoch": 24.403761061946902, | |
| "grad_norm": 0.4245082139968872, | |
| "learning_rate": 0.00030734187536400696, | |
| "loss": 3.1715, | |
| "step": 83850 | |
| }, | |
| { | |
| "epoch": 24.41831625523987, | |
| "grad_norm": 0.4384056031703949, | |
| "learning_rate": 0.00030716715200931855, | |
| "loss": 3.1727, | |
| "step": 83900 | |
| }, | |
| { | |
| "epoch": 24.432871448532836, | |
| "grad_norm": 0.3749312460422516, | |
| "learning_rate": 0.00030699242865463015, | |
| "loss": 3.188, | |
| "step": 83950 | |
| }, | |
| { | |
| "epoch": 24.447426641825803, | |
| "grad_norm": 0.4019298851490021, | |
| "learning_rate": 0.00030681770529994174, | |
| "loss": 3.1829, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 24.447426641825803, | |
| "eval_accuracy": 0.3735271624994814, | |
| "eval_loss": 3.5468599796295166, | |
| "eval_runtime": 180.7582, | |
| "eval_samples_per_second": 92.118, | |
| "eval_steps_per_second": 5.759, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 24.46198183511877, | |
| "grad_norm": 0.4251369833946228, | |
| "learning_rate": 0.0003066429819452533, | |
| "loss": 3.1804, | |
| "step": 84050 | |
| }, | |
| { | |
| "epoch": 24.476537028411737, | |
| "grad_norm": 0.4531020522117615, | |
| "learning_rate": 0.00030646825859056493, | |
| "loss": 3.1735, | |
| "step": 84100 | |
| }, | |
| { | |
| "epoch": 24.491092221704704, | |
| "grad_norm": 0.4240332245826721, | |
| "learning_rate": 0.0003062935352358765, | |
| "loss": 3.187, | |
| "step": 84150 | |
| }, | |
| { | |
| "epoch": 24.50564741499767, | |
| "grad_norm": 0.44458672404289246, | |
| "learning_rate": 0.0003061188118811881, | |
| "loss": 3.1838, | |
| "step": 84200 | |
| }, | |
| { | |
| "epoch": 24.52020260829064, | |
| "grad_norm": 0.4538998305797577, | |
| "learning_rate": 0.00030594408852649966, | |
| "loss": 3.1897, | |
| "step": 84250 | |
| }, | |
| { | |
| "epoch": 24.534757801583606, | |
| "grad_norm": 0.4091043472290039, | |
| "learning_rate": 0.00030576936517181125, | |
| "loss": 3.1878, | |
| "step": 84300 | |
| }, | |
| { | |
| "epoch": 24.549312994876573, | |
| "grad_norm": 0.4159854054450989, | |
| "learning_rate": 0.00030559464181712285, | |
| "loss": 3.1918, | |
| "step": 84350 | |
| }, | |
| { | |
| "epoch": 24.56386818816954, | |
| "grad_norm": 0.4282359182834625, | |
| "learning_rate": 0.0003054199184624345, | |
| "loss": 3.1977, | |
| "step": 84400 | |
| }, | |
| { | |
| "epoch": 24.578423381462507, | |
| "grad_norm": 0.4006085693836212, | |
| "learning_rate": 0.00030524519510774604, | |
| "loss": 3.1818, | |
| "step": 84450 | |
| }, | |
| { | |
| "epoch": 24.592978574755474, | |
| "grad_norm": 0.411472350358963, | |
| "learning_rate": 0.00030507047175305763, | |
| "loss": 3.1931, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 24.60753376804844, | |
| "grad_norm": 0.4276401698589325, | |
| "learning_rate": 0.0003048957483983692, | |
| "loss": 3.1905, | |
| "step": 84550 | |
| }, | |
| { | |
| "epoch": 24.622088961341408, | |
| "grad_norm": 0.44235286116600037, | |
| "learning_rate": 0.00030472102504368076, | |
| "loss": 3.2027, | |
| "step": 84600 | |
| }, | |
| { | |
| "epoch": 24.636644154634375, | |
| "grad_norm": 0.3925072252750397, | |
| "learning_rate": 0.0003045463016889924, | |
| "loss": 3.1939, | |
| "step": 84650 | |
| }, | |
| { | |
| "epoch": 24.651199347927342, | |
| "grad_norm": 0.4455093741416931, | |
| "learning_rate": 0.000304371578334304, | |
| "loss": 3.2002, | |
| "step": 84700 | |
| }, | |
| { | |
| "epoch": 24.66575454122031, | |
| "grad_norm": 0.4702233672142029, | |
| "learning_rate": 0.0003041968549796156, | |
| "loss": 3.1949, | |
| "step": 84750 | |
| }, | |
| { | |
| "epoch": 24.680309734513273, | |
| "grad_norm": 0.4084894061088562, | |
| "learning_rate": 0.00030402213162492714, | |
| "loss": 3.1966, | |
| "step": 84800 | |
| }, | |
| { | |
| "epoch": 24.69486492780624, | |
| "grad_norm": 0.4031795561313629, | |
| "learning_rate": 0.00030384740827023874, | |
| "loss": 3.1998, | |
| "step": 84850 | |
| }, | |
| { | |
| "epoch": 24.709420121099207, | |
| "grad_norm": 0.4553413689136505, | |
| "learning_rate": 0.00030367268491555033, | |
| "loss": 3.1836, | |
| "step": 84900 | |
| }, | |
| { | |
| "epoch": 24.723975314392174, | |
| "grad_norm": 0.40718361735343933, | |
| "learning_rate": 0.000303497961560862, | |
| "loss": 3.1981, | |
| "step": 84950 | |
| }, | |
| { | |
| "epoch": 24.73853050768514, | |
| "grad_norm": 0.41987234354019165, | |
| "learning_rate": 0.00030332323820617357, | |
| "loss": 3.2103, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 24.73853050768514, | |
| "eval_accuracy": 0.3742721680884924, | |
| "eval_loss": 3.538865327835083, | |
| "eval_runtime": 180.5782, | |
| "eval_samples_per_second": 92.209, | |
| "eval_steps_per_second": 5.765, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 24.753085700978108, | |
| "grad_norm": 0.6392399072647095, | |
| "learning_rate": 0.0003031485148514851, | |
| "loss": 3.2034, | |
| "step": 85050 | |
| }, | |
| { | |
| "epoch": 24.767640894271075, | |
| "grad_norm": 0.39197757840156555, | |
| "learning_rate": 0.0003029737914967967, | |
| "loss": 3.2148, | |
| "step": 85100 | |
| }, | |
| { | |
| "epoch": 24.782196087564042, | |
| "grad_norm": 0.39907306432724, | |
| "learning_rate": 0.0003027990681421083, | |
| "loss": 3.2014, | |
| "step": 85150 | |
| }, | |
| { | |
| "epoch": 24.79675128085701, | |
| "grad_norm": 0.42008307576179504, | |
| "learning_rate": 0.00030262434478741984, | |
| "loss": 3.2067, | |
| "step": 85200 | |
| }, | |
| { | |
| "epoch": 24.811306474149976, | |
| "grad_norm": 0.42736494541168213, | |
| "learning_rate": 0.0003024496214327315, | |
| "loss": 3.2044, | |
| "step": 85250 | |
| }, | |
| { | |
| "epoch": 24.825861667442943, | |
| "grad_norm": 0.4229523837566376, | |
| "learning_rate": 0.0003022748980780431, | |
| "loss": 3.1811, | |
| "step": 85300 | |
| }, | |
| { | |
| "epoch": 24.84041686073591, | |
| "grad_norm": 0.41390445828437805, | |
| "learning_rate": 0.0003021001747233547, | |
| "loss": 3.22, | |
| "step": 85350 | |
| }, | |
| { | |
| "epoch": 24.854972054028877, | |
| "grad_norm": 0.45541396737098694, | |
| "learning_rate": 0.0003019254513686662, | |
| "loss": 3.2049, | |
| "step": 85400 | |
| }, | |
| { | |
| "epoch": 24.869527247321844, | |
| "grad_norm": 0.41808629035949707, | |
| "learning_rate": 0.0003017507280139778, | |
| "loss": 3.218, | |
| "step": 85450 | |
| }, | |
| { | |
| "epoch": 24.88408244061481, | |
| "grad_norm": 0.4269886314868927, | |
| "learning_rate": 0.00030157600465928946, | |
| "loss": 3.2089, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 24.89863763390778, | |
| "grad_norm": 0.40179815888404846, | |
| "learning_rate": 0.00030140128130460106, | |
| "loss": 3.2163, | |
| "step": 85550 | |
| }, | |
| { | |
| "epoch": 24.913192827200746, | |
| "grad_norm": 0.40155360102653503, | |
| "learning_rate": 0.0003012265579499126, | |
| "loss": 3.2088, | |
| "step": 85600 | |
| }, | |
| { | |
| "epoch": 24.927748020493713, | |
| "grad_norm": 0.4079338610172272, | |
| "learning_rate": 0.0003010518345952242, | |
| "loss": 3.2131, | |
| "step": 85650 | |
| }, | |
| { | |
| "epoch": 24.94230321378668, | |
| "grad_norm": 0.40120014548301697, | |
| "learning_rate": 0.0003008771112405358, | |
| "loss": 3.2158, | |
| "step": 85700 | |
| }, | |
| { | |
| "epoch": 24.956858407079647, | |
| "grad_norm": 0.40522441267967224, | |
| "learning_rate": 0.0003007023878858473, | |
| "loss": 3.2156, | |
| "step": 85750 | |
| }, | |
| { | |
| "epoch": 24.971413600372614, | |
| "grad_norm": 0.4275481402873993, | |
| "learning_rate": 0.000300527664531159, | |
| "loss": 3.2055, | |
| "step": 85800 | |
| }, | |
| { | |
| "epoch": 24.98596879366558, | |
| "grad_norm": 0.39670512080192566, | |
| "learning_rate": 0.00030035294117647057, | |
| "loss": 3.2123, | |
| "step": 85850 | |
| }, | |
| { | |
| "epoch": 25.00029110386586, | |
| "grad_norm": 0.40994325280189514, | |
| "learning_rate": 0.00030017821782178216, | |
| "loss": 3.2163, | |
| "step": 85900 | |
| }, | |
| { | |
| "epoch": 25.014846297158826, | |
| "grad_norm": 0.4169817864894867, | |
| "learning_rate": 0.00030000349446709376, | |
| "loss": 3.1135, | |
| "step": 85950 | |
| }, | |
| { | |
| "epoch": 25.029401490451793, | |
| "grad_norm": 0.4049444794654846, | |
| "learning_rate": 0.00029982877111240535, | |
| "loss": 3.1168, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 25.029401490451793, | |
| "eval_accuracy": 0.3737857225713893, | |
| "eval_loss": 3.5512163639068604, | |
| "eval_runtime": 180.8132, | |
| "eval_samples_per_second": 92.09, | |
| "eval_steps_per_second": 5.757, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 25.04395668374476, | |
| "grad_norm": 0.4463197886943817, | |
| "learning_rate": 0.00029965404775771694, | |
| "loss": 3.1191, | |
| "step": 86050 | |
| }, | |
| { | |
| "epoch": 25.058511877037727, | |
| "grad_norm": 0.4090753495693207, | |
| "learning_rate": 0.0002994793244030285, | |
| "loss": 3.1277, | |
| "step": 86100 | |
| }, | |
| { | |
| "epoch": 25.073067070330694, | |
| "grad_norm": 0.41755521297454834, | |
| "learning_rate": 0.00029930460104834013, | |
| "loss": 3.1256, | |
| "step": 86150 | |
| }, | |
| { | |
| "epoch": 25.08762226362366, | |
| "grad_norm": 0.4182721972465515, | |
| "learning_rate": 0.0002991298776936517, | |
| "loss": 3.1272, | |
| "step": 86200 | |
| }, | |
| { | |
| "epoch": 25.10217745691663, | |
| "grad_norm": 0.4424816370010376, | |
| "learning_rate": 0.00029895515433896327, | |
| "loss": 3.1396, | |
| "step": 86250 | |
| }, | |
| { | |
| "epoch": 25.116732650209595, | |
| "grad_norm": 0.39702218770980835, | |
| "learning_rate": 0.00029878043098427486, | |
| "loss": 3.1338, | |
| "step": 86300 | |
| }, | |
| { | |
| "epoch": 25.131287843502562, | |
| "grad_norm": 0.4153575003147125, | |
| "learning_rate": 0.00029860570762958646, | |
| "loss": 3.1438, | |
| "step": 86350 | |
| }, | |
| { | |
| "epoch": 25.14584303679553, | |
| "grad_norm": 0.39811867475509644, | |
| "learning_rate": 0.00029843098427489805, | |
| "loss": 3.142, | |
| "step": 86400 | |
| }, | |
| { | |
| "epoch": 25.160398230088497, | |
| "grad_norm": 0.4301377236843109, | |
| "learning_rate": 0.00029825626092020964, | |
| "loss": 3.1376, | |
| "step": 86450 | |
| }, | |
| { | |
| "epoch": 25.174953423381464, | |
| "grad_norm": 0.418992817401886, | |
| "learning_rate": 0.00029808153756552124, | |
| "loss": 3.1361, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 25.18950861667443, | |
| "grad_norm": 0.40978747606277466, | |
| "learning_rate": 0.00029790681421083283, | |
| "loss": 3.1537, | |
| "step": 86550 | |
| }, | |
| { | |
| "epoch": 25.204063809967398, | |
| "grad_norm": 0.42810678482055664, | |
| "learning_rate": 0.00029773209085614443, | |
| "loss": 3.1431, | |
| "step": 86600 | |
| }, | |
| { | |
| "epoch": 25.218619003260365, | |
| "grad_norm": 0.40550005435943604, | |
| "learning_rate": 0.00029755736750145597, | |
| "loss": 3.1456, | |
| "step": 86650 | |
| }, | |
| { | |
| "epoch": 25.233174196553332, | |
| "grad_norm": 0.4268396198749542, | |
| "learning_rate": 0.0002973826441467676, | |
| "loss": 3.1409, | |
| "step": 86700 | |
| }, | |
| { | |
| "epoch": 25.2477293898463, | |
| "grad_norm": 0.3889879584312439, | |
| "learning_rate": 0.00029720792079207916, | |
| "loss": 3.154, | |
| "step": 86750 | |
| }, | |
| { | |
| "epoch": 25.262284583139262, | |
| "grad_norm": 0.39797431230545044, | |
| "learning_rate": 0.00029703319743739075, | |
| "loss": 3.154, | |
| "step": 86800 | |
| }, | |
| { | |
| "epoch": 25.27683977643223, | |
| "grad_norm": 0.4599686563014984, | |
| "learning_rate": 0.00029685847408270234, | |
| "loss": 3.1648, | |
| "step": 86850 | |
| }, | |
| { | |
| "epoch": 25.291394969725197, | |
| "grad_norm": 0.4047425389289856, | |
| "learning_rate": 0.00029668375072801394, | |
| "loss": 3.1577, | |
| "step": 86900 | |
| }, | |
| { | |
| "epoch": 25.305950163018164, | |
| "grad_norm": 0.45010191202163696, | |
| "learning_rate": 0.00029650902737332553, | |
| "loss": 3.1556, | |
| "step": 86950 | |
| }, | |
| { | |
| "epoch": 25.32050535631113, | |
| "grad_norm": 0.4050070345401764, | |
| "learning_rate": 0.00029633430401863713, | |
| "loss": 3.1637, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 25.32050535631113, | |
| "eval_accuracy": 0.3736241225264469, | |
| "eval_loss": 3.550231695175171, | |
| "eval_runtime": 180.1874, | |
| "eval_samples_per_second": 92.409, | |
| "eval_steps_per_second": 5.777, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 25.335060549604098, | |
| "grad_norm": 0.42178481817245483, | |
| "learning_rate": 0.0002961595806639487, | |
| "loss": 3.1491, | |
| "step": 87050 | |
| }, | |
| { | |
| "epoch": 25.349615742897065, | |
| "grad_norm": 0.4090774357318878, | |
| "learning_rate": 0.0002959848573092603, | |
| "loss": 3.1661, | |
| "step": 87100 | |
| }, | |
| { | |
| "epoch": 25.364170936190032, | |
| "grad_norm": 0.4134995937347412, | |
| "learning_rate": 0.0002958101339545719, | |
| "loss": 3.1838, | |
| "step": 87150 | |
| }, | |
| { | |
| "epoch": 25.378726129483, | |
| "grad_norm": 0.40914326906204224, | |
| "learning_rate": 0.0002956354105998835, | |
| "loss": 3.1702, | |
| "step": 87200 | |
| }, | |
| { | |
| "epoch": 25.393281322775966, | |
| "grad_norm": 0.4460836946964264, | |
| "learning_rate": 0.0002954606872451951, | |
| "loss": 3.1649, | |
| "step": 87250 | |
| }, | |
| { | |
| "epoch": 25.407836516068933, | |
| "grad_norm": 0.41393399238586426, | |
| "learning_rate": 0.0002952859638905067, | |
| "loss": 3.1674, | |
| "step": 87300 | |
| }, | |
| { | |
| "epoch": 25.4223917093619, | |
| "grad_norm": 0.44275742769241333, | |
| "learning_rate": 0.00029511124053581823, | |
| "loss": 3.1677, | |
| "step": 87350 | |
| }, | |
| { | |
| "epoch": 25.436946902654867, | |
| "grad_norm": 0.41643190383911133, | |
| "learning_rate": 0.0002949365171811299, | |
| "loss": 3.1698, | |
| "step": 87400 | |
| }, | |
| { | |
| "epoch": 25.451502095947834, | |
| "grad_norm": 0.4415244460105896, | |
| "learning_rate": 0.0002947617938264414, | |
| "loss": 3.1663, | |
| "step": 87450 | |
| }, | |
| { | |
| "epoch": 25.4660572892408, | |
| "grad_norm": 0.39513441920280457, | |
| "learning_rate": 0.000294587070471753, | |
| "loss": 3.1708, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 25.48061248253377, | |
| "grad_norm": 0.4174979627132416, | |
| "learning_rate": 0.0002944123471170646, | |
| "loss": 3.1841, | |
| "step": 87550 | |
| }, | |
| { | |
| "epoch": 25.495167675826735, | |
| "grad_norm": 0.4493371248245239, | |
| "learning_rate": 0.0002942376237623762, | |
| "loss": 3.197, | |
| "step": 87600 | |
| }, | |
| { | |
| "epoch": 25.509722869119702, | |
| "grad_norm": 0.4385649561882019, | |
| "learning_rate": 0.0002940629004076878, | |
| "loss": 3.18, | |
| "step": 87650 | |
| }, | |
| { | |
| "epoch": 25.52427806241267, | |
| "grad_norm": 0.38533443212509155, | |
| "learning_rate": 0.0002938881770529994, | |
| "loss": 3.1794, | |
| "step": 87700 | |
| }, | |
| { | |
| "epoch": 25.538833255705637, | |
| "grad_norm": 0.4166770279407501, | |
| "learning_rate": 0.000293713453698311, | |
| "loss": 3.1835, | |
| "step": 87750 | |
| }, | |
| { | |
| "epoch": 25.553388448998604, | |
| "grad_norm": 0.42698341608047485, | |
| "learning_rate": 0.0002935387303436226, | |
| "loss": 3.1773, | |
| "step": 87800 | |
| }, | |
| { | |
| "epoch": 25.56794364229157, | |
| "grad_norm": 0.45010149478912354, | |
| "learning_rate": 0.0002933640069889342, | |
| "loss": 3.1838, | |
| "step": 87850 | |
| }, | |
| { | |
| "epoch": 25.582498835584538, | |
| "grad_norm": 0.39543336629867554, | |
| "learning_rate": 0.0002931892836342457, | |
| "loss": 3.1869, | |
| "step": 87900 | |
| }, | |
| { | |
| "epoch": 25.597054028877505, | |
| "grad_norm": 0.42403191328048706, | |
| "learning_rate": 0.00029301456027955736, | |
| "loss": 3.1854, | |
| "step": 87950 | |
| }, | |
| { | |
| "epoch": 25.611609222170472, | |
| "grad_norm": 0.42595258355140686, | |
| "learning_rate": 0.0002928398369248689, | |
| "loss": 3.1768, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 25.611609222170472, | |
| "eval_accuracy": 0.37398316844448265, | |
| "eval_loss": 3.545428991317749, | |
| "eval_runtime": 180.2806, | |
| "eval_samples_per_second": 92.362, | |
| "eval_steps_per_second": 5.774, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 25.62616441546344, | |
| "grad_norm": 0.43794485926628113, | |
| "learning_rate": 0.0002926651135701805, | |
| "loss": 3.186, | |
| "step": 88050 | |
| }, | |
| { | |
| "epoch": 25.640719608756406, | |
| "grad_norm": 0.42607247829437256, | |
| "learning_rate": 0.00029249039021549215, | |
| "loss": 3.186, | |
| "step": 88100 | |
| }, | |
| { | |
| "epoch": 25.65527480204937, | |
| "grad_norm": 0.44838985800743103, | |
| "learning_rate": 0.0002923156668608037, | |
| "loss": 3.1879, | |
| "step": 88150 | |
| }, | |
| { | |
| "epoch": 25.669829995342337, | |
| "grad_norm": 0.40192946791648865, | |
| "learning_rate": 0.0002921409435061153, | |
| "loss": 3.185, | |
| "step": 88200 | |
| }, | |
| { | |
| "epoch": 25.684385188635304, | |
| "grad_norm": 0.43323275446891785, | |
| "learning_rate": 0.0002919662201514269, | |
| "loss": 3.183, | |
| "step": 88250 | |
| }, | |
| { | |
| "epoch": 25.69894038192827, | |
| "grad_norm": 0.4673517644405365, | |
| "learning_rate": 0.00029179149679673847, | |
| "loss": 3.1824, | |
| "step": 88300 | |
| }, | |
| { | |
| "epoch": 25.713495575221238, | |
| "grad_norm": 0.4231990873813629, | |
| "learning_rate": 0.00029161677344205007, | |
| "loss": 3.1878, | |
| "step": 88350 | |
| }, | |
| { | |
| "epoch": 25.728050768514205, | |
| "grad_norm": 0.4697956442832947, | |
| "learning_rate": 0.00029144205008736166, | |
| "loss": 3.1989, | |
| "step": 88400 | |
| }, | |
| { | |
| "epoch": 25.742605961807172, | |
| "grad_norm": 0.4267004430294037, | |
| "learning_rate": 0.00029126732673267325, | |
| "loss": 3.1991, | |
| "step": 88450 | |
| }, | |
| { | |
| "epoch": 25.75716115510014, | |
| "grad_norm": 0.3971269726753235, | |
| "learning_rate": 0.00029109260337798485, | |
| "loss": 3.1952, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 25.771716348393106, | |
| "grad_norm": 0.47079986333847046, | |
| "learning_rate": 0.00029091788002329644, | |
| "loss": 3.1889, | |
| "step": 88550 | |
| }, | |
| { | |
| "epoch": 25.786271541686073, | |
| "grad_norm": 0.4244484007358551, | |
| "learning_rate": 0.000290743156668608, | |
| "loss": 3.1896, | |
| "step": 88600 | |
| }, | |
| { | |
| "epoch": 25.80082673497904, | |
| "grad_norm": 0.4454701244831085, | |
| "learning_rate": 0.00029056843331391963, | |
| "loss": 3.2038, | |
| "step": 88650 | |
| }, | |
| { | |
| "epoch": 25.815381928272007, | |
| "grad_norm": 0.43634405732154846, | |
| "learning_rate": 0.00029039370995923117, | |
| "loss": 3.1937, | |
| "step": 88700 | |
| }, | |
| { | |
| "epoch": 25.829937121564974, | |
| "grad_norm": 0.41437003016471863, | |
| "learning_rate": 0.00029021898660454277, | |
| "loss": 3.1998, | |
| "step": 88750 | |
| }, | |
| { | |
| "epoch": 25.84449231485794, | |
| "grad_norm": 0.4281659424304962, | |
| "learning_rate": 0.00029004426324985436, | |
| "loss": 3.1997, | |
| "step": 88800 | |
| }, | |
| { | |
| "epoch": 25.85904750815091, | |
| "grad_norm": 0.4199349582195282, | |
| "learning_rate": 0.00028986953989516595, | |
| "loss": 3.1967, | |
| "step": 88850 | |
| }, | |
| { | |
| "epoch": 25.873602701443875, | |
| "grad_norm": 0.41134020686149597, | |
| "learning_rate": 0.00028969481654047755, | |
| "loss": 3.2022, | |
| "step": 88900 | |
| }, | |
| { | |
| "epoch": 25.888157894736842, | |
| "grad_norm": 0.4369158148765564, | |
| "learning_rate": 0.00028952009318578914, | |
| "loss": 3.2089, | |
| "step": 88950 | |
| }, | |
| { | |
| "epoch": 25.90271308802981, | |
| "grad_norm": 0.39390864968299866, | |
| "learning_rate": 0.00028934536983110074, | |
| "loss": 3.2057, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 25.90271308802981, | |
| "eval_accuracy": 0.3748214907139913, | |
| "eval_loss": 3.5337438583374023, | |
| "eval_runtime": 180.3407, | |
| "eval_samples_per_second": 92.331, | |
| "eval_steps_per_second": 5.772, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 25.917268281322777, | |
| "grad_norm": 0.4118199944496155, | |
| "learning_rate": 0.00028917064647641233, | |
| "loss": 3.2137, | |
| "step": 89050 | |
| }, | |
| { | |
| "epoch": 25.931823474615744, | |
| "grad_norm": 0.4112432599067688, | |
| "learning_rate": 0.0002889959231217239, | |
| "loss": 3.2085, | |
| "step": 89100 | |
| }, | |
| { | |
| "epoch": 25.94637866790871, | |
| "grad_norm": 0.42336785793304443, | |
| "learning_rate": 0.0002888211997670355, | |
| "loss": 3.1903, | |
| "step": 89150 | |
| }, | |
| { | |
| "epoch": 25.960933861201678, | |
| "grad_norm": 0.4463735818862915, | |
| "learning_rate": 0.0002886464764123471, | |
| "loss": 3.2037, | |
| "step": 89200 | |
| }, | |
| { | |
| "epoch": 25.975489054494645, | |
| "grad_norm": 0.4266931414604187, | |
| "learning_rate": 0.0002884717530576587, | |
| "loss": 3.2025, | |
| "step": 89250 | |
| }, | |
| { | |
| "epoch": 25.990044247787612, | |
| "grad_norm": 0.39456993341445923, | |
| "learning_rate": 0.00028829702970297025, | |
| "loss": 3.1916, | |
| "step": 89300 | |
| }, | |
| { | |
| "epoch": 26.00436655798789, | |
| "grad_norm": 0.4564741551876068, | |
| "learning_rate": 0.0002881223063482819, | |
| "loss": 3.1658, | |
| "step": 89350 | |
| }, | |
| { | |
| "epoch": 26.018921751280857, | |
| "grad_norm": 0.4292503595352173, | |
| "learning_rate": 0.00028794758299359344, | |
| "loss": 3.1029, | |
| "step": 89400 | |
| }, | |
| { | |
| "epoch": 26.033476944573824, | |
| "grad_norm": 0.45016735792160034, | |
| "learning_rate": 0.00028777285963890503, | |
| "loss": 3.1069, | |
| "step": 89450 | |
| }, | |
| { | |
| "epoch": 26.04803213786679, | |
| "grad_norm": 0.4236689805984497, | |
| "learning_rate": 0.0002875981362842166, | |
| "loss": 3.1239, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 26.062587331159758, | |
| "grad_norm": 0.42502501606941223, | |
| "learning_rate": 0.0002874234129295282, | |
| "loss": 3.1208, | |
| "step": 89550 | |
| }, | |
| { | |
| "epoch": 26.077142524452725, | |
| "grad_norm": 0.4013234078884125, | |
| "learning_rate": 0.0002872486895748398, | |
| "loss": 3.1292, | |
| "step": 89600 | |
| }, | |
| { | |
| "epoch": 26.091697717745692, | |
| "grad_norm": 0.4063028395175934, | |
| "learning_rate": 0.0002870739662201514, | |
| "loss": 3.1281, | |
| "step": 89650 | |
| }, | |
| { | |
| "epoch": 26.10625291103866, | |
| "grad_norm": 0.43187108635902405, | |
| "learning_rate": 0.000286899242865463, | |
| "loss": 3.118, | |
| "step": 89700 | |
| }, | |
| { | |
| "epoch": 26.120808104331626, | |
| "grad_norm": 0.4590144157409668, | |
| "learning_rate": 0.0002867245195107746, | |
| "loss": 3.1182, | |
| "step": 89750 | |
| }, | |
| { | |
| "epoch": 26.135363297624593, | |
| "grad_norm": 0.40971580147743225, | |
| "learning_rate": 0.0002865497961560862, | |
| "loss": 3.1248, | |
| "step": 89800 | |
| }, | |
| { | |
| "epoch": 26.14991849091756, | |
| "grad_norm": 0.4597384035587311, | |
| "learning_rate": 0.00028637507280139773, | |
| "loss": 3.133, | |
| "step": 89850 | |
| }, | |
| { | |
| "epoch": 26.164473684210527, | |
| "grad_norm": 0.44787418842315674, | |
| "learning_rate": 0.0002862003494467094, | |
| "loss": 3.1333, | |
| "step": 89900 | |
| }, | |
| { | |
| "epoch": 26.179028877503494, | |
| "grad_norm": 0.39661404490470886, | |
| "learning_rate": 0.0002860256260920209, | |
| "loss": 3.1389, | |
| "step": 89950 | |
| }, | |
| { | |
| "epoch": 26.19358407079646, | |
| "grad_norm": 0.413208931684494, | |
| "learning_rate": 0.0002858509027373325, | |
| "loss": 3.1333, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 26.19358407079646, | |
| "eval_accuracy": 0.37358134258727665, | |
| "eval_loss": 3.5513510704040527, | |
| "eval_runtime": 180.2953, | |
| "eval_samples_per_second": 92.354, | |
| "eval_steps_per_second": 5.774, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 26.207848160223566, | |
| "grad_norm": 0.421714723110199, | |
| "learning_rate": 0.0002856761793826441, | |
| "loss": 3.1254, | |
| "step": 90050 | |
| }, | |
| { | |
| "epoch": 26.222403353516533, | |
| "grad_norm": 0.4438458979129791, | |
| "learning_rate": 0.0002855014560279557, | |
| "loss": 3.1234, | |
| "step": 90100 | |
| }, | |
| { | |
| "epoch": 26.2369585468095, | |
| "grad_norm": 0.4270126223564148, | |
| "learning_rate": 0.0002853267326732673, | |
| "loss": 3.1245, | |
| "step": 90150 | |
| }, | |
| { | |
| "epoch": 26.251513740102467, | |
| "grad_norm": 0.4316069185733795, | |
| "learning_rate": 0.0002851520093185789, | |
| "loss": 3.122, | |
| "step": 90200 | |
| }, | |
| { | |
| "epoch": 26.266068933395434, | |
| "grad_norm": 0.41992348432540894, | |
| "learning_rate": 0.0002849772859638905, | |
| "loss": 3.1187, | |
| "step": 90250 | |
| }, | |
| { | |
| "epoch": 26.2806241266884, | |
| "grad_norm": 0.45664697885513306, | |
| "learning_rate": 0.0002848025626092021, | |
| "loss": 3.1121, | |
| "step": 90300 | |
| }, | |
| { | |
| "epoch": 26.29517931998137, | |
| "grad_norm": 0.44784870743751526, | |
| "learning_rate": 0.0002846278392545137, | |
| "loss": 3.1212, | |
| "step": 90350 | |
| }, | |
| { | |
| "epoch": 26.309734513274336, | |
| "grad_norm": 0.43676409125328064, | |
| "learning_rate": 0.00028445311589982527, | |
| "loss": 3.1268, | |
| "step": 90400 | |
| }, | |
| { | |
| "epoch": 26.324289706567303, | |
| "grad_norm": 0.4639459252357483, | |
| "learning_rate": 0.00028427839254513686, | |
| "loss": 3.1347, | |
| "step": 90450 | |
| }, | |
| { | |
| "epoch": 26.33884489986027, | |
| "grad_norm": 0.4368378520011902, | |
| "learning_rate": 0.00028410366919044846, | |
| "loss": 3.1326, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 26.353400093153237, | |
| "grad_norm": 0.45159977674484253, | |
| "learning_rate": 0.00028392894583576, | |
| "loss": 3.1369, | |
| "step": 90550 | |
| }, | |
| { | |
| "epoch": 26.367955286446204, | |
| "grad_norm": 0.43553614616394043, | |
| "learning_rate": 0.00028375422248107165, | |
| "loss": 3.1472, | |
| "step": 90600 | |
| }, | |
| { | |
| "epoch": 26.38251047973917, | |
| "grad_norm": 0.45836883783340454, | |
| "learning_rate": 0.0002835794991263832, | |
| "loss": 3.146, | |
| "step": 90650 | |
| }, | |
| { | |
| "epoch": 26.397065673032138, | |
| "grad_norm": 0.4412062466144562, | |
| "learning_rate": 0.0002834047757716948, | |
| "loss": 3.1467, | |
| "step": 90700 | |
| }, | |
| { | |
| "epoch": 26.411620866325105, | |
| "grad_norm": 0.43280988931655884, | |
| "learning_rate": 0.0002832300524170064, | |
| "loss": 3.1418, | |
| "step": 90750 | |
| }, | |
| { | |
| "epoch": 26.426176059618072, | |
| "grad_norm": 0.473898321390152, | |
| "learning_rate": 0.00028305532906231797, | |
| "loss": 3.1415, | |
| "step": 90800 | |
| }, | |
| { | |
| "epoch": 26.44073125291104, | |
| "grad_norm": 0.4060930907726288, | |
| "learning_rate": 0.00028288060570762956, | |
| "loss": 3.1558, | |
| "step": 90850 | |
| }, | |
| { | |
| "epoch": 26.455286446204006, | |
| "grad_norm": 0.4253024160861969, | |
| "learning_rate": 0.00028270588235294116, | |
| "loss": 3.1568, | |
| "step": 90900 | |
| }, | |
| { | |
| "epoch": 26.469841639496973, | |
| "grad_norm": 0.41087841987609863, | |
| "learning_rate": 0.00028253115899825275, | |
| "loss": 3.1432, | |
| "step": 90950 | |
| }, | |
| { | |
| "epoch": 26.48439683278994, | |
| "grad_norm": 0.43201327323913574, | |
| "learning_rate": 0.0002823564356435643, | |
| "loss": 3.1644, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 26.48439683278994, | |
| "eval_accuracy": 0.37376562540216374, | |
| "eval_loss": 3.553745746612549, | |
| "eval_runtime": 81.26, | |
| "eval_samples_per_second": 204.91, | |
| "eval_steps_per_second": 12.811, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 26.498952026082907, | |
| "grad_norm": 0.4287349283695221, | |
| "learning_rate": 0.00028218171228887594, | |
| "loss": 3.1433, | |
| "step": 91050 | |
| }, | |
| { | |
| "epoch": 26.513507219375875, | |
| "grad_norm": 0.4375492036342621, | |
| "learning_rate": 0.0002820069889341875, | |
| "loss": 3.1571, | |
| "step": 91100 | |
| }, | |
| { | |
| "epoch": 26.52806241266884, | |
| "grad_norm": 0.4249202311038971, | |
| "learning_rate": 0.00028183226557949913, | |
| "loss": 3.1583, | |
| "step": 91150 | |
| }, | |
| { | |
| "epoch": 26.54261760596181, | |
| "grad_norm": 0.4327187240123749, | |
| "learning_rate": 0.00028165754222481067, | |
| "loss": 3.1651, | |
| "step": 91200 | |
| }, | |
| { | |
| "epoch": 26.557172799254776, | |
| "grad_norm": 0.42157667875289917, | |
| "learning_rate": 0.00028148281887012226, | |
| "loss": 3.1636, | |
| "step": 91250 | |
| }, | |
| { | |
| "epoch": 26.571727992547743, | |
| "grad_norm": 0.42917701601982117, | |
| "learning_rate": 0.0002813080955154339, | |
| "loss": 3.1557, | |
| "step": 91300 | |
| }, | |
| { | |
| "epoch": 26.586283185840706, | |
| "grad_norm": 0.4451695680618286, | |
| "learning_rate": 0.00028113337216074545, | |
| "loss": 3.158, | |
| "step": 91350 | |
| }, | |
| { | |
| "epoch": 26.600838379133673, | |
| "grad_norm": 0.4248709976673126, | |
| "learning_rate": 0.00028095864880605705, | |
| "loss": 3.1539, | |
| "step": 91400 | |
| }, | |
| { | |
| "epoch": 26.61539357242664, | |
| "grad_norm": 0.41473838686943054, | |
| "learning_rate": 0.00028078392545136864, | |
| "loss": 3.1576, | |
| "step": 91450 | |
| }, | |
| { | |
| "epoch": 26.629948765719607, | |
| "grad_norm": 0.48401448130607605, | |
| "learning_rate": 0.00028060920209668023, | |
| "loss": 3.1612, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 26.644503959012575, | |
| "grad_norm": 0.4365958571434021, | |
| "learning_rate": 0.00028043447874199183, | |
| "loss": 3.1664, | |
| "step": 91550 | |
| }, | |
| { | |
| "epoch": 26.65905915230554, | |
| "grad_norm": 0.4604295790195465, | |
| "learning_rate": 0.0002802597553873034, | |
| "loss": 3.1596, | |
| "step": 91600 | |
| }, | |
| { | |
| "epoch": 26.67361434559851, | |
| "grad_norm": 0.4202839434146881, | |
| "learning_rate": 0.000280085032032615, | |
| "loss": 3.1755, | |
| "step": 91650 | |
| }, | |
| { | |
| "epoch": 26.688169538891476, | |
| "grad_norm": 0.4243042767047882, | |
| "learning_rate": 0.00027991030867792656, | |
| "loss": 3.181, | |
| "step": 91700 | |
| }, | |
| { | |
| "epoch": 26.702724732184443, | |
| "grad_norm": 0.4041393995285034, | |
| "learning_rate": 0.0002797355853232382, | |
| "loss": 3.1721, | |
| "step": 91750 | |
| }, | |
| { | |
| "epoch": 26.71727992547741, | |
| "grad_norm": 0.46612367033958435, | |
| "learning_rate": 0.00027956086196854975, | |
| "loss": 3.1681, | |
| "step": 91800 | |
| }, | |
| { | |
| "epoch": 26.731835118770377, | |
| "grad_norm": 0.4544692635536194, | |
| "learning_rate": 0.0002793861386138614, | |
| "loss": 3.1757, | |
| "step": 91850 | |
| }, | |
| { | |
| "epoch": 26.746390312063344, | |
| "grad_norm": 0.4264598488807678, | |
| "learning_rate": 0.00027921141525917293, | |
| "loss": 3.1713, | |
| "step": 91900 | |
| }, | |
| { | |
| "epoch": 26.76094550535631, | |
| "grad_norm": 0.4370327293872833, | |
| "learning_rate": 0.00027903669190448453, | |
| "loss": 3.1705, | |
| "step": 91950 | |
| }, | |
| { | |
| "epoch": 26.775500698649278, | |
| "grad_norm": 0.4360485374927521, | |
| "learning_rate": 0.0002788619685497961, | |
| "loss": 3.1785, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 26.775500698649278, | |
| "eval_accuracy": 0.37416862653242383, | |
| "eval_loss": 3.5467350482940674, | |
| "eval_runtime": 81.7111, | |
| "eval_samples_per_second": 203.779, | |
| "eval_steps_per_second": 12.74, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 26.790055891942245, | |
| "grad_norm": 0.4283793568611145, | |
| "learning_rate": 0.0002786872451951077, | |
| "loss": 3.1697, | |
| "step": 92050 | |
| }, | |
| { | |
| "epoch": 26.804611085235212, | |
| "grad_norm": 0.4487936496734619, | |
| "learning_rate": 0.0002785125218404193, | |
| "loss": 3.1753, | |
| "step": 92100 | |
| }, | |
| { | |
| "epoch": 26.81916627852818, | |
| "grad_norm": 0.42697852849960327, | |
| "learning_rate": 0.0002783377984857309, | |
| "loss": 3.1743, | |
| "step": 92150 | |
| }, | |
| { | |
| "epoch": 26.833721471821146, | |
| "grad_norm": 0.41927722096443176, | |
| "learning_rate": 0.0002781630751310425, | |
| "loss": 3.1751, | |
| "step": 92200 | |
| }, | |
| { | |
| "epoch": 26.848276665114113, | |
| "grad_norm": 0.42508047819137573, | |
| "learning_rate": 0.0002779883517763541, | |
| "loss": 3.1762, | |
| "step": 92250 | |
| }, | |
| { | |
| "epoch": 26.86283185840708, | |
| "grad_norm": 0.4437408745288849, | |
| "learning_rate": 0.0002778136284216657, | |
| "loss": 3.1743, | |
| "step": 92300 | |
| }, | |
| { | |
| "epoch": 26.877387051700047, | |
| "grad_norm": 0.43265363574028015, | |
| "learning_rate": 0.0002776389050669773, | |
| "loss": 3.1777, | |
| "step": 92350 | |
| }, | |
| { | |
| "epoch": 26.891942244993015, | |
| "grad_norm": 0.4531176686286926, | |
| "learning_rate": 0.0002774641817122888, | |
| "loss": 3.1938, | |
| "step": 92400 | |
| }, | |
| { | |
| "epoch": 26.90649743828598, | |
| "grad_norm": 0.4262772500514984, | |
| "learning_rate": 0.00027728945835760047, | |
| "loss": 3.179, | |
| "step": 92450 | |
| }, | |
| { | |
| "epoch": 26.92105263157895, | |
| "grad_norm": 0.46178603172302246, | |
| "learning_rate": 0.000277114735002912, | |
| "loss": 3.1719, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 26.935607824871916, | |
| "grad_norm": 0.4419558346271515, | |
| "learning_rate": 0.00027694001164822366, | |
| "loss": 3.1784, | |
| "step": 92550 | |
| }, | |
| { | |
| "epoch": 26.950163018164883, | |
| "grad_norm": NaN, | |
| "learning_rate": 0.0002767652882935352, | |
| "loss": 3.176, | |
| "step": 92600 | |
| }, | |
| { | |
| "epoch": 26.96471821145785, | |
| "grad_norm": 0.4367315173149109, | |
| "learning_rate": 0.0002765905649388468, | |
| "loss": 3.1657, | |
| "step": 92650 | |
| }, | |
| { | |
| "epoch": 26.979273404750813, | |
| "grad_norm": 0.44332101941108704, | |
| "learning_rate": 0.0002764158415841584, | |
| "loss": 3.1807, | |
| "step": 92700 | |
| }, | |
| { | |
| "epoch": 26.99382859804378, | |
| "grad_norm": 0.4203725755214691, | |
| "learning_rate": 0.00027624111822947, | |
| "loss": 3.176, | |
| "step": 92750 | |
| }, | |
| { | |
| "epoch": 27.00844201210992, | |
| "grad_norm": 0.4206697344779968, | |
| "learning_rate": 0.0002760663948747816, | |
| "loss": 3.1898, | |
| "step": 92800 | |
| }, | |
| { | |
| "epoch": 27.022997205402888, | |
| "grad_norm": 0.47763592004776, | |
| "learning_rate": 0.00027589167152009317, | |
| "loss": 3.1048, | |
| "step": 92850 | |
| }, | |
| { | |
| "epoch": 27.037552398695855, | |
| "grad_norm": 0.45985543727874756, | |
| "learning_rate": 0.00027571694816540477, | |
| "loss": 3.1077, | |
| "step": 92900 | |
| }, | |
| { | |
| "epoch": 27.052107591988822, | |
| "grad_norm": 0.4220231771469116, | |
| "learning_rate": 0.0002755422248107163, | |
| "loss": 3.1145, | |
| "step": 92950 | |
| }, | |
| { | |
| "epoch": 27.06666278528179, | |
| "grad_norm": 0.4286889433860779, | |
| "learning_rate": 0.00027536750145602795, | |
| "loss": 3.115, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 27.06666278528179, | |
| "eval_accuracy": 0.3737308373197616, | |
| "eval_loss": 3.5535054206848145, | |
| "eval_runtime": 81.6541, | |
| "eval_samples_per_second": 203.921, | |
| "eval_steps_per_second": 12.749, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 27.081217978574756, | |
| "grad_norm": 0.42607933282852173, | |
| "learning_rate": 0.0002751927781013395, | |
| "loss": 3.1153, | |
| "step": 93050 | |
| }, | |
| { | |
| "epoch": 27.095773171867723, | |
| "grad_norm": 0.44815680384635925, | |
| "learning_rate": 0.0002750180547466511, | |
| "loss": 3.1197, | |
| "step": 93100 | |
| }, | |
| { | |
| "epoch": 27.11032836516069, | |
| "grad_norm": 0.42774924635887146, | |
| "learning_rate": 0.0002748433313919627, | |
| "loss": 3.1195, | |
| "step": 93150 | |
| }, | |
| { | |
| "epoch": 27.124883558453657, | |
| "grad_norm": 0.45016756653785706, | |
| "learning_rate": 0.0002746686080372743, | |
| "loss": 3.1217, | |
| "step": 93200 | |
| }, | |
| { | |
| "epoch": 27.139438751746624, | |
| "grad_norm": 0.40932929515838623, | |
| "learning_rate": 0.00027449388468258587, | |
| "loss": 3.136, | |
| "step": 93250 | |
| }, | |
| { | |
| "epoch": 27.15399394503959, | |
| "grad_norm": 0.4122571647167206, | |
| "learning_rate": 0.00027431916132789747, | |
| "loss": 3.1237, | |
| "step": 93300 | |
| }, | |
| { | |
| "epoch": 27.16854913833256, | |
| "grad_norm": 0.4471433460712433, | |
| "learning_rate": 0.00027414443797320906, | |
| "loss": 3.1328, | |
| "step": 93350 | |
| }, | |
| { | |
| "epoch": 27.183104331625525, | |
| "grad_norm": 0.44877904653549194, | |
| "learning_rate": 0.00027396971461852065, | |
| "loss": 3.1287, | |
| "step": 93400 | |
| }, | |
| { | |
| "epoch": 27.197659524918492, | |
| "grad_norm": 0.4053255021572113, | |
| "learning_rate": 0.00027379499126383225, | |
| "loss": 3.1378, | |
| "step": 93450 | |
| }, | |
| { | |
| "epoch": 27.21221471821146, | |
| "grad_norm": 0.44145819544792175, | |
| "learning_rate": 0.00027362026790914384, | |
| "loss": 3.1293, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 27.226769911504423, | |
| "grad_norm": 0.4564794600009918, | |
| "learning_rate": 0.00027344554455445544, | |
| "loss": 3.1394, | |
| "step": 93550 | |
| }, | |
| { | |
| "epoch": 27.24132510479739, | |
| "grad_norm": 0.4337918162345886, | |
| "learning_rate": 0.00027327082119976703, | |
| "loss": 3.1363, | |
| "step": 93600 | |
| }, | |
| { | |
| "epoch": 27.255880298090357, | |
| "grad_norm": 0.44088059663772583, | |
| "learning_rate": 0.00027309609784507857, | |
| "loss": 3.1445, | |
| "step": 93650 | |
| }, | |
| { | |
| "epoch": 27.270435491383324, | |
| "grad_norm": 0.41944295167922974, | |
| "learning_rate": 0.0002729213744903902, | |
| "loss": 3.1421, | |
| "step": 93700 | |
| }, | |
| { | |
| "epoch": 27.28499068467629, | |
| "grad_norm": 0.4538804888725281, | |
| "learning_rate": 0.00027274665113570176, | |
| "loss": 3.1464, | |
| "step": 93750 | |
| }, | |
| { | |
| "epoch": 27.29954587796926, | |
| "grad_norm": 0.4835473895072937, | |
| "learning_rate": 0.00027257192778101335, | |
| "loss": 3.1526, | |
| "step": 93800 | |
| }, | |
| { | |
| "epoch": 27.314101071262225, | |
| "grad_norm": 0.4130382835865021, | |
| "learning_rate": 0.00027239720442632495, | |
| "loss": 3.1374, | |
| "step": 93850 | |
| }, | |
| { | |
| "epoch": 27.328656264555192, | |
| "grad_norm": 0.44483402371406555, | |
| "learning_rate": 0.00027222248107163654, | |
| "loss": 3.1463, | |
| "step": 93900 | |
| }, | |
| { | |
| "epoch": 27.34321145784816, | |
| "grad_norm": 0.4645542800426483, | |
| "learning_rate": 0.00027204775771694814, | |
| "loss": 3.138, | |
| "step": 93950 | |
| }, | |
| { | |
| "epoch": 27.357766651141127, | |
| "grad_norm": 0.4354679584503174, | |
| "learning_rate": 0.00027187303436225973, | |
| "loss": 3.1541, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 27.357766651141127, | |
| "eval_accuracy": 0.3742118765808157, | |
| "eval_loss": 3.550565242767334, | |
| "eval_runtime": 81.6889, | |
| "eval_samples_per_second": 203.834, | |
| "eval_steps_per_second": 12.743, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 27.372321844434094, | |
| "grad_norm": 0.4329032599925995, | |
| "learning_rate": 0.0002716983110075713, | |
| "loss": 3.1454, | |
| "step": 94050 | |
| }, | |
| { | |
| "epoch": 27.38687703772706, | |
| "grad_norm": 0.42032626271247864, | |
| "learning_rate": 0.0002715235876528829, | |
| "loss": 3.1547, | |
| "step": 94100 | |
| }, | |
| { | |
| "epoch": 27.401432231020028, | |
| "grad_norm": 0.4581587612628937, | |
| "learning_rate": 0.0002713488642981945, | |
| "loss": 3.151, | |
| "step": 94150 | |
| }, | |
| { | |
| "epoch": 27.415987424312995, | |
| "grad_norm": 0.4240913391113281, | |
| "learning_rate": 0.00027117414094350606, | |
| "loss": 3.1622, | |
| "step": 94200 | |
| }, | |
| { | |
| "epoch": 27.430542617605962, | |
| "grad_norm": 0.41683876514434814, | |
| "learning_rate": 0.0002709994175888177, | |
| "loss": 3.1622, | |
| "step": 94250 | |
| }, | |
| { | |
| "epoch": 27.44509781089893, | |
| "grad_norm": 0.47243040800094604, | |
| "learning_rate": 0.00027082469423412924, | |
| "loss": 3.1604, | |
| "step": 94300 | |
| }, | |
| { | |
| "epoch": 27.459653004191896, | |
| "grad_norm": 0.4212307333946228, | |
| "learning_rate": 0.00027064997087944084, | |
| "loss": 3.1723, | |
| "step": 94350 | |
| }, | |
| { | |
| "epoch": 27.474208197484863, | |
| "grad_norm": 0.41983917355537415, | |
| "learning_rate": 0.00027047524752475243, | |
| "loss": 3.159, | |
| "step": 94400 | |
| }, | |
| { | |
| "epoch": 27.48876339077783, | |
| "grad_norm": 0.41763192415237427, | |
| "learning_rate": 0.000270300524170064, | |
| "loss": 3.1679, | |
| "step": 94450 | |
| }, | |
| { | |
| "epoch": 27.503318584070797, | |
| "grad_norm": 0.4923311769962311, | |
| "learning_rate": 0.0002701258008153757, | |
| "loss": 3.1661, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 27.517873777363764, | |
| "grad_norm": 0.4193151593208313, | |
| "learning_rate": 0.0002699510774606872, | |
| "loss": 3.158, | |
| "step": 94550 | |
| }, | |
| { | |
| "epoch": 27.53242897065673, | |
| "grad_norm": 0.43846607208251953, | |
| "learning_rate": 0.0002697763541059988, | |
| "loss": 3.1763, | |
| "step": 94600 | |
| }, | |
| { | |
| "epoch": 27.5469841639497, | |
| "grad_norm": 0.4201177656650543, | |
| "learning_rate": 0.0002696016307513104, | |
| "loss": 3.1747, | |
| "step": 94650 | |
| }, | |
| { | |
| "epoch": 27.561539357242665, | |
| "grad_norm": 0.4298728406429291, | |
| "learning_rate": 0.000269426907396622, | |
| "loss": 3.1639, | |
| "step": 94700 | |
| }, | |
| { | |
| "epoch": 27.576094550535633, | |
| "grad_norm": 0.4554992616176605, | |
| "learning_rate": 0.0002692521840419336, | |
| "loss": 3.1686, | |
| "step": 94750 | |
| }, | |
| { | |
| "epoch": 27.5906497438286, | |
| "grad_norm": 0.41391247510910034, | |
| "learning_rate": 0.0002690774606872452, | |
| "loss": 3.1576, | |
| "step": 94800 | |
| }, | |
| { | |
| "epoch": 27.605204937121567, | |
| "grad_norm": 0.4202953577041626, | |
| "learning_rate": 0.0002689027373325568, | |
| "loss": 3.1709, | |
| "step": 94850 | |
| }, | |
| { | |
| "epoch": 27.619760130414534, | |
| "grad_norm": 0.4237096905708313, | |
| "learning_rate": 0.0002687280139778683, | |
| "loss": 3.1641, | |
| "step": 94900 | |
| }, | |
| { | |
| "epoch": 27.634315323707497, | |
| "grad_norm": 0.4351206123828888, | |
| "learning_rate": 0.00026855329062317997, | |
| "loss": 3.1707, | |
| "step": 94950 | |
| }, | |
| { | |
| "epoch": 27.648870517000464, | |
| "grad_norm": 0.4442284405231476, | |
| "learning_rate": 0.0002683785672684915, | |
| "loss": 3.1689, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 27.648870517000464, | |
| "eval_accuracy": 0.3744737274172752, | |
| "eval_loss": 3.5419938564300537, | |
| "eval_runtime": 81.5578, | |
| "eval_samples_per_second": 204.162, | |
| "eval_steps_per_second": 12.764, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 27.66342571029343, | |
| "grad_norm": 0.4352906048297882, | |
| "learning_rate": 0.0002682038439138031, | |
| "loss": 3.1722, | |
| "step": 95050 | |
| }, | |
| { | |
| "epoch": 27.6779809035864, | |
| "grad_norm": 0.47953805327415466, | |
| "learning_rate": 0.0002680291205591147, | |
| "loss": 3.1691, | |
| "step": 95100 | |
| }, | |
| { | |
| "epoch": 27.692536096879365, | |
| "grad_norm": 0.46263787150382996, | |
| "learning_rate": 0.0002678543972044263, | |
| "loss": 3.175, | |
| "step": 95150 | |
| }, | |
| { | |
| "epoch": 27.707091290172333, | |
| "grad_norm": 0.4320184290409088, | |
| "learning_rate": 0.0002676796738497379, | |
| "loss": 3.1767, | |
| "step": 95200 | |
| }, | |
| { | |
| "epoch": 27.7216464834653, | |
| "grad_norm": 0.4423382580280304, | |
| "learning_rate": 0.0002675049504950495, | |
| "loss": 3.1803, | |
| "step": 95250 | |
| }, | |
| { | |
| "epoch": 27.736201676758267, | |
| "grad_norm": 0.4242289066314697, | |
| "learning_rate": 0.0002673302271403611, | |
| "loss": 3.1755, | |
| "step": 95300 | |
| }, | |
| { | |
| "epoch": 27.750756870051234, | |
| "grad_norm": 0.46094515919685364, | |
| "learning_rate": 0.00026715550378567267, | |
| "loss": 3.1867, | |
| "step": 95350 | |
| }, | |
| { | |
| "epoch": 27.7653120633442, | |
| "grad_norm": 0.43266111612319946, | |
| "learning_rate": 0.00026698078043098426, | |
| "loss": 3.1627, | |
| "step": 95400 | |
| }, | |
| { | |
| "epoch": 27.779867256637168, | |
| "grad_norm": 0.4551432728767395, | |
| "learning_rate": 0.00026680605707629586, | |
| "loss": 3.1708, | |
| "step": 95450 | |
| }, | |
| { | |
| "epoch": 27.794422449930135, | |
| "grad_norm": 0.4403327405452728, | |
| "learning_rate": 0.00026663133372160745, | |
| "loss": 3.1835, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 27.808977643223102, | |
| "grad_norm": 0.4427354037761688, | |
| "learning_rate": 0.00026645661036691905, | |
| "loss": 3.1786, | |
| "step": 95550 | |
| }, | |
| { | |
| "epoch": 27.82353283651607, | |
| "grad_norm": 0.42573073506355286, | |
| "learning_rate": 0.0002662818870122306, | |
| "loss": 3.1736, | |
| "step": 95600 | |
| }, | |
| { | |
| "epoch": 27.838088029809036, | |
| "grad_norm": 0.4395178556442261, | |
| "learning_rate": 0.00026610716365754224, | |
| "loss": 3.1696, | |
| "step": 95650 | |
| }, | |
| { | |
| "epoch": 27.852643223102003, | |
| "grad_norm": 0.4574327766895294, | |
| "learning_rate": 0.0002659324403028538, | |
| "loss": 3.1822, | |
| "step": 95700 | |
| }, | |
| { | |
| "epoch": 27.86719841639497, | |
| "grad_norm": 0.43819448351860046, | |
| "learning_rate": 0.00026575771694816537, | |
| "loss": 3.188, | |
| "step": 95750 | |
| }, | |
| { | |
| "epoch": 27.881753609687937, | |
| "grad_norm": 0.44623273611068726, | |
| "learning_rate": 0.00026558299359347696, | |
| "loss": 3.1815, | |
| "step": 95800 | |
| }, | |
| { | |
| "epoch": 27.896308802980904, | |
| "grad_norm": 0.4393879771232605, | |
| "learning_rate": 0.00026540827023878856, | |
| "loss": 3.187, | |
| "step": 95850 | |
| }, | |
| { | |
| "epoch": 27.91086399627387, | |
| "grad_norm": 0.4445323348045349, | |
| "learning_rate": 0.00026523354688410015, | |
| "loss": 3.1809, | |
| "step": 95900 | |
| }, | |
| { | |
| "epoch": 27.92541918956684, | |
| "grad_norm": 0.44179466366767883, | |
| "learning_rate": 0.00026505882352941175, | |
| "loss": 3.2004, | |
| "step": 95950 | |
| }, | |
| { | |
| "epoch": 27.939974382859806, | |
| "grad_norm": 0.45991572737693787, | |
| "learning_rate": 0.00026488410017472334, | |
| "loss": 3.1867, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 27.939974382859806, | |
| "eval_accuracy": 0.3751155440321338, | |
| "eval_loss": 3.5365254878997803, | |
| "eval_runtime": 81.5956, | |
| "eval_samples_per_second": 204.067, | |
| "eval_steps_per_second": 12.758, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 27.954529576152773, | |
| "grad_norm": 0.43493005633354187, | |
| "learning_rate": 0.00026470937682003494, | |
| "loss": 3.1798, | |
| "step": 96050 | |
| }, | |
| { | |
| "epoch": 27.96908476944574, | |
| "grad_norm": 0.45600172877311707, | |
| "learning_rate": 0.00026453465346534653, | |
| "loss": 3.1887, | |
| "step": 96100 | |
| }, | |
| { | |
| "epoch": 27.983639962738707, | |
| "grad_norm": 0.4553404152393341, | |
| "learning_rate": 0.00026435993011065807, | |
| "loss": 3.1842, | |
| "step": 96150 | |
| }, | |
| { | |
| "epoch": 27.998195156031674, | |
| "grad_norm": 0.4468027353286743, | |
| "learning_rate": 0.0002641852067559697, | |
| "loss": 3.1827, | |
| "step": 96200 | |
| }, | |
| { | |
| "epoch": 28.01251746623195, | |
| "grad_norm": 0.4575220048427582, | |
| "learning_rate": 0.00026401048340128126, | |
| "loss": 3.0836, | |
| "step": 96250 | |
| }, | |
| { | |
| "epoch": 28.02707265952492, | |
| "grad_norm": 0.4800165593624115, | |
| "learning_rate": 0.00026383576004659285, | |
| "loss": 3.0947, | |
| "step": 96300 | |
| }, | |
| { | |
| "epoch": 28.041627852817886, | |
| "grad_norm": 0.4504547417163849, | |
| "learning_rate": 0.00026366103669190445, | |
| "loss": 3.0946, | |
| "step": 96350 | |
| }, | |
| { | |
| "epoch": 28.056183046110853, | |
| "grad_norm": 0.4591192901134491, | |
| "learning_rate": 0.00026348631333721604, | |
| "loss": 3.1136, | |
| "step": 96400 | |
| }, | |
| { | |
| "epoch": 28.07073823940382, | |
| "grad_norm": 0.4446643888950348, | |
| "learning_rate": 0.00026331158998252764, | |
| "loss": 3.1018, | |
| "step": 96450 | |
| }, | |
| { | |
| "epoch": 28.085293432696787, | |
| "grad_norm": 0.44344717264175415, | |
| "learning_rate": 0.00026313686662783923, | |
| "loss": 3.0978, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 28.099848625989754, | |
| "grad_norm": 0.4162672460079193, | |
| "learning_rate": 0.0002629621432731508, | |
| "loss": 3.0999, | |
| "step": 96550 | |
| }, | |
| { | |
| "epoch": 28.11440381928272, | |
| "grad_norm": 0.43002283573150635, | |
| "learning_rate": 0.0002627874199184624, | |
| "loss": 3.1151, | |
| "step": 96600 | |
| }, | |
| { | |
| "epoch": 28.128959012575688, | |
| "grad_norm": 0.4493495225906372, | |
| "learning_rate": 0.000262612696563774, | |
| "loss": 3.1035, | |
| "step": 96650 | |
| }, | |
| { | |
| "epoch": 28.143514205868655, | |
| "grad_norm": 0.4252546727657318, | |
| "learning_rate": 0.0002624379732090856, | |
| "loss": 3.1191, | |
| "step": 96700 | |
| }, | |
| { | |
| "epoch": 28.158069399161622, | |
| "grad_norm": 0.43416526913642883, | |
| "learning_rate": 0.0002622632498543972, | |
| "loss": 3.1106, | |
| "step": 96750 | |
| }, | |
| { | |
| "epoch": 28.17262459245459, | |
| "grad_norm": 0.44047465920448303, | |
| "learning_rate": 0.0002620885264997088, | |
| "loss": 3.1209, | |
| "step": 96800 | |
| }, | |
| { | |
| "epoch": 28.187179785747556, | |
| "grad_norm": 0.4314543306827545, | |
| "learning_rate": 0.00026191380314502034, | |
| "loss": 3.129, | |
| "step": 96850 | |
| }, | |
| { | |
| "epoch": 28.201734979040523, | |
| "grad_norm": 0.4365466237068176, | |
| "learning_rate": 0.000261739079790332, | |
| "loss": 3.1011, | |
| "step": 96900 | |
| }, | |
| { | |
| "epoch": 28.216290172333487, | |
| "grad_norm": 0.42054036259651184, | |
| "learning_rate": 0.0002615643564356435, | |
| "loss": 3.1282, | |
| "step": 96950 | |
| }, | |
| { | |
| "epoch": 28.230845365626454, | |
| "grad_norm": 0.4819340705871582, | |
| "learning_rate": 0.0002613896330809551, | |
| "loss": 3.1251, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 28.230845365626454, | |
| "eval_accuracy": 0.3739499082170508, | |
| "eval_loss": 3.55663800239563, | |
| "eval_runtime": 81.5889, | |
| "eval_samples_per_second": 204.084, | |
| "eval_steps_per_second": 12.759, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 28.24540055891942, | |
| "grad_norm": 0.4251644015312195, | |
| "learning_rate": 0.0002612149097262667, | |
| "loss": 3.1314, | |
| "step": 97050 | |
| }, | |
| { | |
| "epoch": 28.259955752212388, | |
| "grad_norm": 0.4289312958717346, | |
| "learning_rate": 0.0002610401863715783, | |
| "loss": 3.1191, | |
| "step": 97100 | |
| }, | |
| { | |
| "epoch": 28.274510945505355, | |
| "grad_norm": 0.4611528217792511, | |
| "learning_rate": 0.0002608654630168899, | |
| "loss": 3.1274, | |
| "step": 97150 | |
| }, | |
| { | |
| "epoch": 28.289066138798322, | |
| "grad_norm": 0.4506722092628479, | |
| "learning_rate": 0.0002606907396622015, | |
| "loss": 3.1391, | |
| "step": 97200 | |
| }, | |
| { | |
| "epoch": 28.30362133209129, | |
| "grad_norm": 0.46781811118125916, | |
| "learning_rate": 0.0002605160163075131, | |
| "loss": 3.1232, | |
| "step": 97250 | |
| }, | |
| { | |
| "epoch": 28.318176525384256, | |
| "grad_norm": 0.4538547694683075, | |
| "learning_rate": 0.0002603412929528247, | |
| "loss": 3.1436, | |
| "step": 97300 | |
| }, | |
| { | |
| "epoch": 28.332731718677223, | |
| "grad_norm": 0.4640878438949585, | |
| "learning_rate": 0.0002601665695981363, | |
| "loss": 3.1404, | |
| "step": 97350 | |
| }, | |
| { | |
| "epoch": 28.34728691197019, | |
| "grad_norm": 0.4476887285709381, | |
| "learning_rate": 0.0002599918462434478, | |
| "loss": 3.1413, | |
| "step": 97400 | |
| }, | |
| { | |
| "epoch": 28.361842105263158, | |
| "grad_norm": 0.41474345326423645, | |
| "learning_rate": 0.00025981712288875947, | |
| "loss": 3.1379, | |
| "step": 97450 | |
| }, | |
| { | |
| "epoch": 28.376397298556125, | |
| "grad_norm": 0.43970558047294617, | |
| "learning_rate": 0.000259642399534071, | |
| "loss": 3.1509, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 28.39095249184909, | |
| "grad_norm": 0.46579426527023315, | |
| "learning_rate": 0.0002594676761793826, | |
| "loss": 3.1454, | |
| "step": 97550 | |
| }, | |
| { | |
| "epoch": 28.40550768514206, | |
| "grad_norm": 0.4330880343914032, | |
| "learning_rate": 0.0002592929528246942, | |
| "loss": 3.1493, | |
| "step": 97600 | |
| }, | |
| { | |
| "epoch": 28.420062878435026, | |
| "grad_norm": 0.44026318192481995, | |
| "learning_rate": 0.0002591182294700058, | |
| "loss": 3.147, | |
| "step": 97650 | |
| }, | |
| { | |
| "epoch": 28.434618071727993, | |
| "grad_norm": 0.44170844554901123, | |
| "learning_rate": 0.0002589435061153174, | |
| "loss": 3.1515, | |
| "step": 97700 | |
| }, | |
| { | |
| "epoch": 28.44917326502096, | |
| "grad_norm": 0.4635496437549591, | |
| "learning_rate": 0.000258768782760629, | |
| "loss": 3.1474, | |
| "step": 97750 | |
| }, | |
| { | |
| "epoch": 28.463728458313927, | |
| "grad_norm": 0.4308694005012512, | |
| "learning_rate": 0.0002585940594059406, | |
| "loss": 3.1495, | |
| "step": 97800 | |
| }, | |
| { | |
| "epoch": 28.478283651606894, | |
| "grad_norm": 0.44511526823043823, | |
| "learning_rate": 0.00025841933605125217, | |
| "loss": 3.1522, | |
| "step": 97850 | |
| }, | |
| { | |
| "epoch": 28.49283884489986, | |
| "grad_norm": 0.4325944781303406, | |
| "learning_rate": 0.00025824461269656376, | |
| "loss": 3.1614, | |
| "step": 97900 | |
| }, | |
| { | |
| "epoch": 28.507394038192828, | |
| "grad_norm": 0.4429301619529724, | |
| "learning_rate": 0.00025806988934187536, | |
| "loss": 3.1466, | |
| "step": 97950 | |
| }, | |
| { | |
| "epoch": 28.521949231485795, | |
| "grad_norm": 0.4527985155582428, | |
| "learning_rate": 0.00025789516598718695, | |
| "loss": 3.1487, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 28.521949231485795, | |
| "eval_accuracy": 0.37472652865121786, | |
| "eval_loss": 3.545165777206421, | |
| "eval_runtime": 81.5549, | |
| "eval_samples_per_second": 204.169, | |
| "eval_steps_per_second": 12.764, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 28.536504424778762, | |
| "grad_norm": 0.4763733744621277, | |
| "learning_rate": 0.00025772044263249854, | |
| "loss": 3.1601, | |
| "step": 98050 | |
| }, | |
| { | |
| "epoch": 28.55105961807173, | |
| "grad_norm": 0.46908220648765564, | |
| "learning_rate": 0.0002575457192778101, | |
| "loss": 3.1629, | |
| "step": 98100 | |
| }, | |
| { | |
| "epoch": 28.565614811364696, | |
| "grad_norm": 0.4391682744026184, | |
| "learning_rate": 0.00025737099592312173, | |
| "loss": 3.1441, | |
| "step": 98150 | |
| }, | |
| { | |
| "epoch": 28.580170004657663, | |
| "grad_norm": 0.42764508724212646, | |
| "learning_rate": 0.0002571962725684333, | |
| "loss": 3.1582, | |
| "step": 98200 | |
| }, | |
| { | |
| "epoch": 28.59472519795063, | |
| "grad_norm": 0.42899560928344727, | |
| "learning_rate": 0.00025702154921374487, | |
| "loss": 3.157, | |
| "step": 98250 | |
| }, | |
| { | |
| "epoch": 28.609280391243594, | |
| "grad_norm": 0.4499003291130066, | |
| "learning_rate": 0.00025684682585905646, | |
| "loss": 3.1595, | |
| "step": 98300 | |
| }, | |
| { | |
| "epoch": 28.62383558453656, | |
| "grad_norm": 0.41827598214149475, | |
| "learning_rate": 0.00025667210250436806, | |
| "loss": 3.1713, | |
| "step": 98350 | |
| }, | |
| { | |
| "epoch": 28.638390777829528, | |
| "grad_norm": 0.45985516905784607, | |
| "learning_rate": 0.00025649737914967965, | |
| "loss": 3.1682, | |
| "step": 98400 | |
| }, | |
| { | |
| "epoch": 28.652945971122495, | |
| "grad_norm": 0.45067837834358215, | |
| "learning_rate": 0.00025632265579499124, | |
| "loss": 3.1731, | |
| "step": 98450 | |
| }, | |
| { | |
| "epoch": 28.667501164415462, | |
| "grad_norm": 0.4321034550666809, | |
| "learning_rate": 0.00025614793244030284, | |
| "loss": 3.1601, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 28.68205635770843, | |
| "grad_norm": 0.43246740102767944, | |
| "learning_rate": 0.0002559732090856144, | |
| "loss": 3.158, | |
| "step": 98550 | |
| }, | |
| { | |
| "epoch": 28.696611551001396, | |
| "grad_norm": 0.423515260219574, | |
| "learning_rate": 0.00025579848573092603, | |
| "loss": 3.158, | |
| "step": 98600 | |
| }, | |
| { | |
| "epoch": 28.711166744294363, | |
| "grad_norm": 0.4541126489639282, | |
| "learning_rate": 0.0002556237623762376, | |
| "loss": 3.179, | |
| "step": 98650 | |
| }, | |
| { | |
| "epoch": 28.72572193758733, | |
| "grad_norm": 0.4211547374725342, | |
| "learning_rate": 0.0002554490390215492, | |
| "loss": 3.1743, | |
| "step": 98700 | |
| }, | |
| { | |
| "epoch": 28.740277130880298, | |
| "grad_norm": 0.42562857270240784, | |
| "learning_rate": 0.0002552743156668608, | |
| "loss": 3.1762, | |
| "step": 98750 | |
| }, | |
| { | |
| "epoch": 28.754832324173265, | |
| "grad_norm": 0.46214157342910767, | |
| "learning_rate": 0.00025509959231217235, | |
| "loss": 3.1699, | |
| "step": 98800 | |
| }, | |
| { | |
| "epoch": 28.76938751746623, | |
| "grad_norm": 0.4245946705341339, | |
| "learning_rate": 0.000254924868957484, | |
| "loss": 3.1673, | |
| "step": 98850 | |
| }, | |
| { | |
| "epoch": 28.7839427107592, | |
| "grad_norm": 0.429810494184494, | |
| "learning_rate": 0.00025475014560279554, | |
| "loss": 3.1817, | |
| "step": 98900 | |
| }, | |
| { | |
| "epoch": 28.798497904052166, | |
| "grad_norm": 0.4301431179046631, | |
| "learning_rate": 0.00025457542224810713, | |
| "loss": 3.1632, | |
| "step": 98950 | |
| }, | |
| { | |
| "epoch": 28.813053097345133, | |
| "grad_norm": 0.42899125814437866, | |
| "learning_rate": 0.00025440069889341873, | |
| "loss": 3.1661, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 28.813053097345133, | |
| "eval_accuracy": 0.37503492030062074, | |
| "eval_loss": 3.5378646850585938, | |
| "eval_runtime": 81.7286, | |
| "eval_samples_per_second": 203.735, | |
| "eval_steps_per_second": 12.737, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 28.8276082906381, | |
| "grad_norm": 0.45702245831489563, | |
| "learning_rate": 0.0002542259755387303, | |
| "loss": 3.1632, | |
| "step": 99050 | |
| }, | |
| { | |
| "epoch": 28.842163483931067, | |
| "grad_norm": 0.4632762670516968, | |
| "learning_rate": 0.0002540512521840419, | |
| "loss": 3.1783, | |
| "step": 99100 | |
| }, | |
| { | |
| "epoch": 28.856718677224034, | |
| "grad_norm": 0.45204028487205505, | |
| "learning_rate": 0.0002538765288293535, | |
| "loss": 3.169, | |
| "step": 99150 | |
| }, | |
| { | |
| "epoch": 28.871273870517, | |
| "grad_norm": 0.4582211375236511, | |
| "learning_rate": 0.0002537018054746651, | |
| "loss": 3.1668, | |
| "step": 99200 | |
| }, | |
| { | |
| "epoch": 28.885829063809968, | |
| "grad_norm": 0.4202673137187958, | |
| "learning_rate": 0.00025352708211997664, | |
| "loss": 3.1715, | |
| "step": 99250 | |
| }, | |
| { | |
| "epoch": 28.900384257102935, | |
| "grad_norm": 0.4296882152557373, | |
| "learning_rate": 0.0002533523587652883, | |
| "loss": 3.1707, | |
| "step": 99300 | |
| }, | |
| { | |
| "epoch": 28.914939450395902, | |
| "grad_norm": 0.44095855951309204, | |
| "learning_rate": 0.00025317763541059983, | |
| "loss": 3.1864, | |
| "step": 99350 | |
| }, | |
| { | |
| "epoch": 28.92949464368887, | |
| "grad_norm": 0.4560483694076538, | |
| "learning_rate": 0.0002530029120559115, | |
| "loss": 3.175, | |
| "step": 99400 | |
| }, | |
| { | |
| "epoch": 28.944049836981836, | |
| "grad_norm": 0.4313257038593292, | |
| "learning_rate": 0.000252828188701223, | |
| "loss": 3.1819, | |
| "step": 99450 | |
| }, | |
| { | |
| "epoch": 28.958605030274803, | |
| "grad_norm": 0.4492696523666382, | |
| "learning_rate": 0.0002526534653465346, | |
| "loss": 3.1806, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 28.97316022356777, | |
| "grad_norm": 0.44620639085769653, | |
| "learning_rate": 0.0002524787419918462, | |
| "loss": 3.1759, | |
| "step": 99550 | |
| }, | |
| { | |
| "epoch": 28.987715416860738, | |
| "grad_norm": 0.40607306361198425, | |
| "learning_rate": 0.0002523040186371578, | |
| "loss": 3.1788, | |
| "step": 99600 | |
| }, | |
| { | |
| "epoch": 29.002037727061015, | |
| "grad_norm": 0.44455665349960327, | |
| "learning_rate": 0.0002521292952824694, | |
| "loss": 3.1783, | |
| "step": 99650 | |
| }, | |
| { | |
| "epoch": 29.016592920353983, | |
| "grad_norm": 0.4591967761516571, | |
| "learning_rate": 0.000251954571927781, | |
| "loss": 3.0755, | |
| "step": 99700 | |
| }, | |
| { | |
| "epoch": 29.03114811364695, | |
| "grad_norm": 0.4482179284095764, | |
| "learning_rate": 0.0002517798485730926, | |
| "loss": 3.0835, | |
| "step": 99750 | |
| }, | |
| { | |
| "epoch": 29.045703306939917, | |
| "grad_norm": 0.4787764549255371, | |
| "learning_rate": 0.0002516051252184042, | |
| "loss": 3.0802, | |
| "step": 99800 | |
| }, | |
| { | |
| "epoch": 29.060258500232884, | |
| "grad_norm": 0.43147143721580505, | |
| "learning_rate": 0.0002514304018637158, | |
| "loss": 3.0878, | |
| "step": 99850 | |
| }, | |
| { | |
| "epoch": 29.07481369352585, | |
| "grad_norm": 0.43477749824523926, | |
| "learning_rate": 0.00025125567850902737, | |
| "loss": 3.0869, | |
| "step": 99900 | |
| }, | |
| { | |
| "epoch": 29.089368886818818, | |
| "grad_norm": 0.4682404696941376, | |
| "learning_rate": 0.0002510809551543389, | |
| "loss": 3.0867, | |
| "step": 99950 | |
| }, | |
| { | |
| "epoch": 29.103924080111785, | |
| "grad_norm": 0.4298090636730194, | |
| "learning_rate": 0.00025090623179965056, | |
| "loss": 3.1053, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 29.103924080111785, | |
| "eval_accuracy": 0.3737838421345027, | |
| "eval_loss": 3.555643320083618, | |
| "eval_runtime": 81.5216, | |
| "eval_samples_per_second": 204.253, | |
| "eval_steps_per_second": 12.77, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 29.118479273404752, | |
| "grad_norm": 0.4298948049545288, | |
| "learning_rate": 0.0002507315084449621, | |
| "loss": 3.1118, | |
| "step": 100050 | |
| }, | |
| { | |
| "epoch": 29.13303446669772, | |
| "grad_norm": 0.43792274594306946, | |
| "learning_rate": 0.00025055678509027375, | |
| "loss": 3.0926, | |
| "step": 100100 | |
| }, | |
| { | |
| "epoch": 29.147589659990686, | |
| "grad_norm": 0.4618018865585327, | |
| "learning_rate": 0.0002503820617355853, | |
| "loss": 3.1208, | |
| "step": 100150 | |
| }, | |
| { | |
| "epoch": 29.162144853283653, | |
| "grad_norm": 0.45855310559272766, | |
| "learning_rate": 0.0002502073383808969, | |
| "loss": 3.1215, | |
| "step": 100200 | |
| }, | |
| { | |
| "epoch": 29.17670004657662, | |
| "grad_norm": 0.45707497000694275, | |
| "learning_rate": 0.0002500326150262085, | |
| "loss": 3.1202, | |
| "step": 100250 | |
| }, | |
| { | |
| "epoch": 29.191255239869584, | |
| "grad_norm": 0.4611913561820984, | |
| "learning_rate": 0.00024985789167152007, | |
| "loss": 3.1161, | |
| "step": 100300 | |
| }, | |
| { | |
| "epoch": 29.20581043316255, | |
| "grad_norm": 0.4508360028266907, | |
| "learning_rate": 0.00024968316831683167, | |
| "loss": 3.1297, | |
| "step": 100350 | |
| }, | |
| { | |
| "epoch": 29.220365626455518, | |
| "grad_norm": 0.4615615904331207, | |
| "learning_rate": 0.00024950844496214326, | |
| "loss": 3.117, | |
| "step": 100400 | |
| }, | |
| { | |
| "epoch": 29.234920819748485, | |
| "grad_norm": 0.47552332282066345, | |
| "learning_rate": 0.00024933372160745485, | |
| "loss": 3.1142, | |
| "step": 100450 | |
| }, | |
| { | |
| "epoch": 29.249476013041452, | |
| "grad_norm": 0.43124398589134216, | |
| "learning_rate": 0.0002491589982527664, | |
| "loss": 3.124, | |
| "step": 100500 | |
| }, | |
| { | |
| "epoch": 29.26403120633442, | |
| "grad_norm": 0.41188037395477295, | |
| "learning_rate": 0.00024898427489807804, | |
| "loss": 3.1103, | |
| "step": 100550 | |
| }, | |
| { | |
| "epoch": 29.278586399627386, | |
| "grad_norm": 0.4601183235645294, | |
| "learning_rate": 0.0002488095515433896, | |
| "loss": 3.1042, | |
| "step": 100600 | |
| }, | |
| { | |
| "epoch": 29.293141592920353, | |
| "grad_norm": 0.47006529569625854, | |
| "learning_rate": 0.0002486348281887012, | |
| "loss": 3.132, | |
| "step": 100650 | |
| }, | |
| { | |
| "epoch": 29.30769678621332, | |
| "grad_norm": 0.45706549286842346, | |
| "learning_rate": 0.00024846010483401277, | |
| "loss": 3.1281, | |
| "step": 100700 | |
| }, | |
| { | |
| "epoch": 29.322251979506287, | |
| "grad_norm": 0.42421749234199524, | |
| "learning_rate": 0.00024828538147932437, | |
| "loss": 3.1226, | |
| "step": 100750 | |
| }, | |
| { | |
| "epoch": 29.336807172799254, | |
| "grad_norm": 0.4357367753982544, | |
| "learning_rate": 0.00024811065812463596, | |
| "loss": 3.1252, | |
| "step": 100800 | |
| }, | |
| { | |
| "epoch": 29.35136236609222, | |
| "grad_norm": 0.4616818130016327, | |
| "learning_rate": 0.00024793593476994755, | |
| "loss": 3.1296, | |
| "step": 100850 | |
| }, | |
| { | |
| "epoch": 29.36591755938519, | |
| "grad_norm": 0.4600532352924347, | |
| "learning_rate": 0.00024776121141525915, | |
| "loss": 3.1211, | |
| "step": 100900 | |
| }, | |
| { | |
| "epoch": 29.380472752678156, | |
| "grad_norm": 0.45205041766166687, | |
| "learning_rate": 0.00024758648806057074, | |
| "loss": 3.1414, | |
| "step": 100950 | |
| }, | |
| { | |
| "epoch": 29.395027945971123, | |
| "grad_norm": 0.4724768102169037, | |
| "learning_rate": 0.00024741176470588234, | |
| "loss": 3.1238, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 29.395027945971123, | |
| "eval_accuracy": 0.37448853585775715, | |
| "eval_loss": 3.5502748489379883, | |
| "eval_runtime": 81.5899, | |
| "eval_samples_per_second": 204.082, | |
| "eval_steps_per_second": 12.759, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 29.40958313926409, | |
| "grad_norm": 0.4232248067855835, | |
| "learning_rate": 0.00024723704135119393, | |
| "loss": 3.1325, | |
| "step": 101050 | |
| }, | |
| { | |
| "epoch": 29.424138332557057, | |
| "grad_norm": 0.45912835001945496, | |
| "learning_rate": 0.0002470623179965055, | |
| "loss": 3.1362, | |
| "step": 101100 | |
| }, | |
| { | |
| "epoch": 29.438693525850024, | |
| "grad_norm": 0.43694475293159485, | |
| "learning_rate": 0.0002468875946418171, | |
| "loss": 3.1298, | |
| "step": 101150 | |
| }, | |
| { | |
| "epoch": 29.45324871914299, | |
| "grad_norm": 0.44252192974090576, | |
| "learning_rate": 0.00024671287128712866, | |
| "loss": 3.1515, | |
| "step": 101200 | |
| }, | |
| { | |
| "epoch": 29.467803912435958, | |
| "grad_norm": 0.4595278203487396, | |
| "learning_rate": 0.0002465381479324403, | |
| "loss": 3.1419, | |
| "step": 101250 | |
| }, | |
| { | |
| "epoch": 29.482359105728925, | |
| "grad_norm": 0.4526241719722748, | |
| "learning_rate": 0.00024636342457775185, | |
| "loss": 3.1564, | |
| "step": 101300 | |
| }, | |
| { | |
| "epoch": 29.496914299021892, | |
| "grad_norm": 0.43796080350875854, | |
| "learning_rate": 0.00024618870122306344, | |
| "loss": 3.1508, | |
| "step": 101350 | |
| }, | |
| { | |
| "epoch": 29.51146949231486, | |
| "grad_norm": 0.4640996754169464, | |
| "learning_rate": 0.00024601397786837504, | |
| "loss": 3.1406, | |
| "step": 101400 | |
| }, | |
| { | |
| "epoch": 29.526024685607826, | |
| "grad_norm": 0.4471496343612671, | |
| "learning_rate": 0.00024583925451368663, | |
| "loss": 3.1341, | |
| "step": 101450 | |
| }, | |
| { | |
| "epoch": 29.540579878900793, | |
| "grad_norm": 0.442502498626709, | |
| "learning_rate": 0.0002456645311589982, | |
| "loss": 3.1405, | |
| "step": 101500 | |
| }, | |
| { | |
| "epoch": 29.55513507219376, | |
| "grad_norm": 0.46430104970932007, | |
| "learning_rate": 0.0002454898078043098, | |
| "loss": 3.1546, | |
| "step": 101550 | |
| }, | |
| { | |
| "epoch": 29.569690265486727, | |
| "grad_norm": 0.44241204857826233, | |
| "learning_rate": 0.0002453150844496214, | |
| "loss": 3.1402, | |
| "step": 101600 | |
| }, | |
| { | |
| "epoch": 29.58424545877969, | |
| "grad_norm": 0.45652905106544495, | |
| "learning_rate": 0.000245140361094933, | |
| "loss": 3.1479, | |
| "step": 101650 | |
| }, | |
| { | |
| "epoch": 29.598800652072658, | |
| "grad_norm": 0.4611835181713104, | |
| "learning_rate": 0.0002449656377402446, | |
| "loss": 3.1542, | |
| "step": 101700 | |
| }, | |
| { | |
| "epoch": 29.613355845365625, | |
| "grad_norm": 0.46368882060050964, | |
| "learning_rate": 0.00024479091438555614, | |
| "loss": 3.1598, | |
| "step": 101750 | |
| }, | |
| { | |
| "epoch": 29.627911038658592, | |
| "grad_norm": 0.4293883144855499, | |
| "learning_rate": 0.0002446161910308678, | |
| "loss": 3.1693, | |
| "step": 101800 | |
| }, | |
| { | |
| "epoch": 29.64246623195156, | |
| "grad_norm": 0.43483561277389526, | |
| "learning_rate": 0.0002444414676761794, | |
| "loss": 3.1582, | |
| "step": 101850 | |
| }, | |
| { | |
| "epoch": 29.657021425244526, | |
| "grad_norm": 0.4586428105831146, | |
| "learning_rate": 0.0002442667443214909, | |
| "loss": 3.171, | |
| "step": 101900 | |
| }, | |
| { | |
| "epoch": 29.671576618537493, | |
| "grad_norm": 0.43442341685295105, | |
| "learning_rate": 0.00024409202096680255, | |
| "loss": 3.1591, | |
| "step": 101950 | |
| }, | |
| { | |
| "epoch": 29.68613181183046, | |
| "grad_norm": 0.4427039921283722, | |
| "learning_rate": 0.00024391729761211411, | |
| "loss": 3.1671, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 29.68613181183046, | |
| "eval_accuracy": 0.3747529722949357, | |
| "eval_loss": 3.543421745300293, | |
| "eval_runtime": 81.485, | |
| "eval_samples_per_second": 204.344, | |
| "eval_steps_per_second": 12.775, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 29.700687005123427, | |
| "grad_norm": 0.4447053074836731, | |
| "learning_rate": 0.00024374257425742574, | |
| "loss": 3.1574, | |
| "step": 102050 | |
| }, | |
| { | |
| "epoch": 29.715242198416394, | |
| "grad_norm": 0.44969141483306885, | |
| "learning_rate": 0.0002435678509027373, | |
| "loss": 3.1506, | |
| "step": 102100 | |
| }, | |
| { | |
| "epoch": 29.72979739170936, | |
| "grad_norm": 0.46557024121284485, | |
| "learning_rate": 0.0002433931275480489, | |
| "loss": 3.1626, | |
| "step": 102150 | |
| }, | |
| { | |
| "epoch": 29.74435258500233, | |
| "grad_norm": 0.4674363136291504, | |
| "learning_rate": 0.00024321840419336052, | |
| "loss": 3.1525, | |
| "step": 102200 | |
| }, | |
| { | |
| "epoch": 29.758907778295296, | |
| "grad_norm": 0.4326673150062561, | |
| "learning_rate": 0.00024304368083867209, | |
| "loss": 3.1677, | |
| "step": 102250 | |
| }, | |
| { | |
| "epoch": 29.773462971588263, | |
| "grad_norm": 0.47652554512023926, | |
| "learning_rate": 0.00024286895748398365, | |
| "loss": 3.1509, | |
| "step": 102300 | |
| }, | |
| { | |
| "epoch": 29.78801816488123, | |
| "grad_norm": 0.43937867879867554, | |
| "learning_rate": 0.00024269423412929527, | |
| "loss": 3.1553, | |
| "step": 102350 | |
| }, | |
| { | |
| "epoch": 29.802573358174197, | |
| "grad_norm": 0.4371858835220337, | |
| "learning_rate": 0.00024251951077460684, | |
| "loss": 3.1589, | |
| "step": 102400 | |
| }, | |
| { | |
| "epoch": 29.817128551467164, | |
| "grad_norm": 0.4592929184436798, | |
| "learning_rate": 0.00024234478741991844, | |
| "loss": 3.1725, | |
| "step": 102450 | |
| }, | |
| { | |
| "epoch": 29.83168374476013, | |
| "grad_norm": 0.43899455666542053, | |
| "learning_rate": 0.00024217006406523003, | |
| "loss": 3.1625, | |
| "step": 102500 | |
| }, | |
| { | |
| "epoch": 29.846238938053098, | |
| "grad_norm": 0.44234412908554077, | |
| "learning_rate": 0.00024199534071054162, | |
| "loss": 3.1742, | |
| "step": 102550 | |
| }, | |
| { | |
| "epoch": 29.860794131346065, | |
| "grad_norm": 0.4332352876663208, | |
| "learning_rate": 0.0002418206173558532, | |
| "loss": 3.1661, | |
| "step": 102600 | |
| }, | |
| { | |
| "epoch": 29.875349324639032, | |
| "grad_norm": 0.44105732440948486, | |
| "learning_rate": 0.0002416458940011648, | |
| "loss": 3.1717, | |
| "step": 102650 | |
| }, | |
| { | |
| "epoch": 29.889904517932, | |
| "grad_norm": 0.4699614346027374, | |
| "learning_rate": 0.00024147117064647638, | |
| "loss": 3.1643, | |
| "step": 102700 | |
| }, | |
| { | |
| "epoch": 29.904459711224966, | |
| "grad_norm": 0.46386492252349854, | |
| "learning_rate": 0.000241296447291788, | |
| "loss": 3.1673, | |
| "step": 102750 | |
| }, | |
| { | |
| "epoch": 29.919014904517933, | |
| "grad_norm": 0.44478514790534973, | |
| "learning_rate": 0.00024112172393709957, | |
| "loss": 3.1713, | |
| "step": 102800 | |
| }, | |
| { | |
| "epoch": 29.9335700978109, | |
| "grad_norm": 0.4461037814617157, | |
| "learning_rate": 0.00024094700058241116, | |
| "loss": 3.1651, | |
| "step": 102850 | |
| }, | |
| { | |
| "epoch": 29.948125291103867, | |
| "grad_norm": 0.458215594291687, | |
| "learning_rate": 0.00024077227722772276, | |
| "loss": 3.163, | |
| "step": 102900 | |
| }, | |
| { | |
| "epoch": 29.962680484396834, | |
| "grad_norm": 0.4382062256336212, | |
| "learning_rate": 0.00024059755387303435, | |
| "loss": 3.1814, | |
| "step": 102950 | |
| }, | |
| { | |
| "epoch": 29.977235677689798, | |
| "grad_norm": 0.43666553497314453, | |
| "learning_rate": 0.00024042283051834592, | |
| "loss": 3.1642, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 29.977235677689798, | |
| "eval_accuracy": 0.37509638708135157, | |
| "eval_loss": 3.53694748878479, | |
| "eval_runtime": 81.6577, | |
| "eval_samples_per_second": 203.912, | |
| "eval_steps_per_second": 12.748, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 29.991790870982765, | |
| "grad_norm": 0.4342905879020691, | |
| "learning_rate": 0.00024024810716365754, | |
| "loss": 3.1691, | |
| "step": 103050 | |
| }, | |
| { | |
| "epoch": 30.006113181183046, | |
| "grad_norm": 0.4480896592140198, | |
| "learning_rate": 0.0002400733838089691, | |
| "loss": 3.1271, | |
| "step": 103100 | |
| }, | |
| { | |
| "epoch": 30.020668374476013, | |
| "grad_norm": 0.4539491534233093, | |
| "learning_rate": 0.0002398986604542807, | |
| "loss": 3.0764, | |
| "step": 103150 | |
| }, | |
| { | |
| "epoch": 30.03522356776898, | |
| "grad_norm": 0.4655728042125702, | |
| "learning_rate": 0.0002397239370995923, | |
| "loss": 3.0877, | |
| "step": 103200 | |
| }, | |
| { | |
| "epoch": 30.049778761061948, | |
| "grad_norm": 0.4541957974433899, | |
| "learning_rate": 0.0002395492137449039, | |
| "loss": 3.0907, | |
| "step": 103250 | |
| }, | |
| { | |
| "epoch": 30.064333954354915, | |
| "grad_norm": 0.4566064178943634, | |
| "learning_rate": 0.00023937449039021546, | |
| "loss": 3.0852, | |
| "step": 103300 | |
| }, | |
| { | |
| "epoch": 30.07888914764788, | |
| "grad_norm": 0.46173039078712463, | |
| "learning_rate": 0.00023919976703552708, | |
| "loss": 3.0809, | |
| "step": 103350 | |
| }, | |
| { | |
| "epoch": 30.09344434094085, | |
| "grad_norm": 0.45617493987083435, | |
| "learning_rate": 0.00023902504368083865, | |
| "loss": 3.0982, | |
| "step": 103400 | |
| }, | |
| { | |
| "epoch": 30.107999534233816, | |
| "grad_norm": 0.4568575620651245, | |
| "learning_rate": 0.00023885032032615027, | |
| "loss": 3.1005, | |
| "step": 103450 | |
| }, | |
| { | |
| "epoch": 30.122554727526783, | |
| "grad_norm": 0.4696102440357208, | |
| "learning_rate": 0.00023867559697146183, | |
| "loss": 3.1025, | |
| "step": 103500 | |
| }, | |
| { | |
| "epoch": 30.13710992081975, | |
| "grad_norm": 0.43672025203704834, | |
| "learning_rate": 0.0002385008736167734, | |
| "loss": 3.0938, | |
| "step": 103550 | |
| }, | |
| { | |
| "epoch": 30.151665114112717, | |
| "grad_norm": 0.4579235017299652, | |
| "learning_rate": 0.00023832615026208502, | |
| "loss": 3.0999, | |
| "step": 103600 | |
| }, | |
| { | |
| "epoch": 30.166220307405684, | |
| "grad_norm": 0.4703800678253174, | |
| "learning_rate": 0.0002381514269073966, | |
| "loss": 3.0959, | |
| "step": 103650 | |
| }, | |
| { | |
| "epoch": 30.180775500698648, | |
| "grad_norm": 0.46524420380592346, | |
| "learning_rate": 0.00023797670355270818, | |
| "loss": 3.1049, | |
| "step": 103700 | |
| }, | |
| { | |
| "epoch": 30.195330693991615, | |
| "grad_norm": 0.46136435866355896, | |
| "learning_rate": 0.0002378019801980198, | |
| "loss": 3.1244, | |
| "step": 103750 | |
| }, | |
| { | |
| "epoch": 30.20988588728458, | |
| "grad_norm": 0.44990143179893494, | |
| "learning_rate": 0.00023762725684333137, | |
| "loss": 3.1093, | |
| "step": 103800 | |
| }, | |
| { | |
| "epoch": 30.22444108057755, | |
| "grad_norm": 0.4674222469329834, | |
| "learning_rate": 0.00023745253348864294, | |
| "loss": 3.0998, | |
| "step": 103850 | |
| }, | |
| { | |
| "epoch": 30.238996273870516, | |
| "grad_norm": 0.451393187046051, | |
| "learning_rate": 0.00023727781013395456, | |
| "loss": 3.1138, | |
| "step": 103900 | |
| }, | |
| { | |
| "epoch": 30.253551467163483, | |
| "grad_norm": 0.44018563628196716, | |
| "learning_rate": 0.00023710308677926613, | |
| "loss": 3.1129, | |
| "step": 103950 | |
| }, | |
| { | |
| "epoch": 30.26810666045645, | |
| "grad_norm": 0.45843884348869324, | |
| "learning_rate": 0.00023692836342457772, | |
| "loss": 3.1046, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 30.26810666045645, | |
| "eval_accuracy": 0.37426488139555686, | |
| "eval_loss": 3.5546748638153076, | |
| "eval_runtime": 81.7838, | |
| "eval_samples_per_second": 203.598, | |
| "eval_steps_per_second": 12.729, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 30.282661853749417, | |
| "grad_norm": 0.4758078157901764, | |
| "learning_rate": 0.00023675364006988932, | |
| "loss": 3.1092, | |
| "step": 104050 | |
| }, | |
| { | |
| "epoch": 30.297217047042384, | |
| "grad_norm": 0.459271639585495, | |
| "learning_rate": 0.0002365789167152009, | |
| "loss": 3.1214, | |
| "step": 104100 | |
| }, | |
| { | |
| "epoch": 30.31177224033535, | |
| "grad_norm": 0.47780340909957886, | |
| "learning_rate": 0.0002364041933605125, | |
| "loss": 3.1234, | |
| "step": 104150 | |
| }, | |
| { | |
| "epoch": 30.326327433628318, | |
| "grad_norm": 0.47596970200538635, | |
| "learning_rate": 0.0002362294700058241, | |
| "loss": 3.135, | |
| "step": 104200 | |
| }, | |
| { | |
| "epoch": 30.340882626921285, | |
| "grad_norm": 0.44845911860466003, | |
| "learning_rate": 0.00023605474665113567, | |
| "loss": 3.1231, | |
| "step": 104250 | |
| }, | |
| { | |
| "epoch": 30.355437820214252, | |
| "grad_norm": 0.4445011615753174, | |
| "learning_rate": 0.0002358800232964473, | |
| "loss": 3.1179, | |
| "step": 104300 | |
| }, | |
| { | |
| "epoch": 30.36999301350722, | |
| "grad_norm": 0.48343926668167114, | |
| "learning_rate": 0.00023570529994175886, | |
| "loss": 3.1196, | |
| "step": 104350 | |
| }, | |
| { | |
| "epoch": 30.384548206800186, | |
| "grad_norm": 0.4556646943092346, | |
| "learning_rate": 0.00023553057658707045, | |
| "loss": 3.1267, | |
| "step": 104400 | |
| }, | |
| { | |
| "epoch": 30.399103400093153, | |
| "grad_norm": 0.47624683380126953, | |
| "learning_rate": 0.00023535585323238204, | |
| "loss": 3.1292, | |
| "step": 104450 | |
| }, | |
| { | |
| "epoch": 30.41365859338612, | |
| "grad_norm": 0.4488056004047394, | |
| "learning_rate": 0.00023518112987769364, | |
| "loss": 3.1143, | |
| "step": 104500 | |
| }, | |
| { | |
| "epoch": 30.428213786679088, | |
| "grad_norm": 0.4296557307243347, | |
| "learning_rate": 0.0002350064065230052, | |
| "loss": 3.1394, | |
| "step": 104550 | |
| }, | |
| { | |
| "epoch": 30.442768979972055, | |
| "grad_norm": 0.4520358145236969, | |
| "learning_rate": 0.00023483168316831683, | |
| "loss": 3.13, | |
| "step": 104600 | |
| }, | |
| { | |
| "epoch": 30.45732417326502, | |
| "grad_norm": 0.45875284075737, | |
| "learning_rate": 0.0002346569598136284, | |
| "loss": 3.1369, | |
| "step": 104650 | |
| }, | |
| { | |
| "epoch": 30.47187936655799, | |
| "grad_norm": 0.442231684923172, | |
| "learning_rate": 0.00023448223645894, | |
| "loss": 3.1449, | |
| "step": 104700 | |
| }, | |
| { | |
| "epoch": 30.486434559850956, | |
| "grad_norm": 0.4277413487434387, | |
| "learning_rate": 0.00023430751310425158, | |
| "loss": 3.1186, | |
| "step": 104750 | |
| }, | |
| { | |
| "epoch": 30.500989753143923, | |
| "grad_norm": 0.4438856244087219, | |
| "learning_rate": 0.00023413278974956318, | |
| "loss": 3.1496, | |
| "step": 104800 | |
| }, | |
| { | |
| "epoch": 30.51554494643689, | |
| "grad_norm": 0.45021602511405945, | |
| "learning_rate": 0.00023395806639487477, | |
| "loss": 3.1289, | |
| "step": 104850 | |
| }, | |
| { | |
| "epoch": 30.530100139729857, | |
| "grad_norm": 0.48635202646255493, | |
| "learning_rate": 0.00023378334304018637, | |
| "loss": 3.1387, | |
| "step": 104900 | |
| }, | |
| { | |
| "epoch": 30.544655333022824, | |
| "grad_norm": 0.45521771907806396, | |
| "learning_rate": 0.00023360861968549793, | |
| "loss": 3.1369, | |
| "step": 104950 | |
| }, | |
| { | |
| "epoch": 30.55921052631579, | |
| "grad_norm": 0.4433465301990509, | |
| "learning_rate": 0.00023343389633080955, | |
| "loss": 3.1332, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 30.55921052631579, | |
| "eval_accuracy": 0.37484017755555193, | |
| "eval_loss": 3.5463550090789795, | |
| "eval_runtime": 81.6842, | |
| "eval_samples_per_second": 203.846, | |
| "eval_steps_per_second": 12.744, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 30.573765719608758, | |
| "grad_norm": 0.4477085471153259, | |
| "learning_rate": 0.00023325917297612112, | |
| "loss": 3.138, | |
| "step": 105050 | |
| }, | |
| { | |
| "epoch": 30.58832091290172, | |
| "grad_norm": 0.4495393931865692, | |
| "learning_rate": 0.0002330844496214327, | |
| "loss": 3.1374, | |
| "step": 105100 | |
| }, | |
| { | |
| "epoch": 30.60287610619469, | |
| "grad_norm": 0.47526881098747253, | |
| "learning_rate": 0.0002329097262667443, | |
| "loss": 3.1477, | |
| "step": 105150 | |
| }, | |
| { | |
| "epoch": 30.617431299487656, | |
| "grad_norm": 0.4314660429954529, | |
| "learning_rate": 0.00023273500291205588, | |
| "loss": 3.144, | |
| "step": 105200 | |
| }, | |
| { | |
| "epoch": 30.631986492780623, | |
| "grad_norm": 0.4396653175354004, | |
| "learning_rate": 0.00023256027955736747, | |
| "loss": 3.1503, | |
| "step": 105250 | |
| }, | |
| { | |
| "epoch": 30.64654168607359, | |
| "grad_norm": 0.4902075231075287, | |
| "learning_rate": 0.00023238555620267907, | |
| "loss": 3.1399, | |
| "step": 105300 | |
| }, | |
| { | |
| "epoch": 30.661096879366557, | |
| "grad_norm": 0.4467863142490387, | |
| "learning_rate": 0.00023221083284799066, | |
| "loss": 3.1501, | |
| "step": 105350 | |
| }, | |
| { | |
| "epoch": 30.675652072659524, | |
| "grad_norm": 0.4563294053077698, | |
| "learning_rate": 0.00023203610949330223, | |
| "loss": 3.1421, | |
| "step": 105400 | |
| }, | |
| { | |
| "epoch": 30.69020726595249, | |
| "grad_norm": 0.4771472215652466, | |
| "learning_rate": 0.00023186138613861385, | |
| "loss": 3.1562, | |
| "step": 105450 | |
| }, | |
| { | |
| "epoch": 30.704762459245458, | |
| "grad_norm": 0.478089839220047, | |
| "learning_rate": 0.00023168666278392542, | |
| "loss": 3.1473, | |
| "step": 105500 | |
| }, | |
| { | |
| "epoch": 30.719317652538425, | |
| "grad_norm": 0.4389056861400604, | |
| "learning_rate": 0.00023151193942923704, | |
| "loss": 3.1512, | |
| "step": 105550 | |
| }, | |
| { | |
| "epoch": 30.733872845831392, | |
| "grad_norm": 0.43841278553009033, | |
| "learning_rate": 0.0002313372160745486, | |
| "loss": 3.1386, | |
| "step": 105600 | |
| }, | |
| { | |
| "epoch": 30.74842803912436, | |
| "grad_norm": 0.4402447044849396, | |
| "learning_rate": 0.0002311624927198602, | |
| "loss": 3.1474, | |
| "step": 105650 | |
| }, | |
| { | |
| "epoch": 30.762983232417326, | |
| "grad_norm": 0.45434558391571045, | |
| "learning_rate": 0.0002309877693651718, | |
| "loss": 3.1532, | |
| "step": 105700 | |
| }, | |
| { | |
| "epoch": 30.777538425710294, | |
| "grad_norm": 0.47730764746665955, | |
| "learning_rate": 0.0002308130460104834, | |
| "loss": 3.1443, | |
| "step": 105750 | |
| }, | |
| { | |
| "epoch": 30.79209361900326, | |
| "grad_norm": 0.47131097316741943, | |
| "learning_rate": 0.00023063832265579496, | |
| "loss": 3.1497, | |
| "step": 105800 | |
| }, | |
| { | |
| "epoch": 30.806648812296228, | |
| "grad_norm": 0.4748658835887909, | |
| "learning_rate": 0.00023046359930110658, | |
| "loss": 3.1577, | |
| "step": 105850 | |
| }, | |
| { | |
| "epoch": 30.821204005589195, | |
| "grad_norm": 0.4538467228412628, | |
| "learning_rate": 0.00023028887594641814, | |
| "loss": 3.1504, | |
| "step": 105900 | |
| }, | |
| { | |
| "epoch": 30.83575919888216, | |
| "grad_norm": 0.46276625990867615, | |
| "learning_rate": 0.00023011415259172974, | |
| "loss": 3.1542, | |
| "step": 105950 | |
| }, | |
| { | |
| "epoch": 30.85031439217513, | |
| "grad_norm": 0.4294084310531616, | |
| "learning_rate": 0.00022993942923704133, | |
| "loss": 3.1583, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 30.85031439217513, | |
| "eval_accuracy": 0.37522790013610835, | |
| "eval_loss": 3.5395941734313965, | |
| "eval_runtime": 81.638, | |
| "eval_samples_per_second": 203.961, | |
| "eval_steps_per_second": 12.751, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 30.864869585468096, | |
| "grad_norm": 0.5004943609237671, | |
| "learning_rate": 0.00022976470588235293, | |
| "loss": 3.1542, | |
| "step": 106050 | |
| }, | |
| { | |
| "epoch": 30.879424778761063, | |
| "grad_norm": 0.45164182782173157, | |
| "learning_rate": 0.0002295899825276645, | |
| "loss": 3.1566, | |
| "step": 106100 | |
| }, | |
| { | |
| "epoch": 30.89397997205403, | |
| "grad_norm": 0.45464181900024414, | |
| "learning_rate": 0.00022941525917297612, | |
| "loss": 3.1618, | |
| "step": 106150 | |
| }, | |
| { | |
| "epoch": 30.908535165346997, | |
| "grad_norm": 0.47931236028671265, | |
| "learning_rate": 0.00022924053581828768, | |
| "loss": 3.1507, | |
| "step": 106200 | |
| }, | |
| { | |
| "epoch": 30.923090358639964, | |
| "grad_norm": 0.4295690655708313, | |
| "learning_rate": 0.0002290658124635993, | |
| "loss": 3.1571, | |
| "step": 106250 | |
| }, | |
| { | |
| "epoch": 30.93764555193293, | |
| "grad_norm": 0.469592809677124, | |
| "learning_rate": 0.00022889108910891087, | |
| "loss": 3.1658, | |
| "step": 106300 | |
| }, | |
| { | |
| "epoch": 30.9522007452259, | |
| "grad_norm": 0.4959361255168915, | |
| "learning_rate": 0.00022871636575422247, | |
| "loss": 3.1524, | |
| "step": 106350 | |
| }, | |
| { | |
| "epoch": 30.966755938518865, | |
| "grad_norm": 0.46808984875679016, | |
| "learning_rate": 0.00022854164239953406, | |
| "loss": 3.1593, | |
| "step": 106400 | |
| }, | |
| { | |
| "epoch": 30.98131113181183, | |
| "grad_norm": 0.4391014873981476, | |
| "learning_rate": 0.00022836691904484565, | |
| "loss": 3.1729, | |
| "step": 106450 | |
| }, | |
| { | |
| "epoch": 30.995866325104796, | |
| "grad_norm": 0.4613760709762573, | |
| "learning_rate": 0.00022819219569015722, | |
| "loss": 3.158, | |
| "step": 106500 | |
| }, | |
| { | |
| "epoch": 31.010188635305077, | |
| "grad_norm": 0.4337015748023987, | |
| "learning_rate": 0.00022801747233546884, | |
| "loss": 3.1105, | |
| "step": 106550 | |
| }, | |
| { | |
| "epoch": 31.024743828598044, | |
| "grad_norm": 0.4768725037574768, | |
| "learning_rate": 0.0002278427489807804, | |
| "loss": 3.0626, | |
| "step": 106600 | |
| }, | |
| { | |
| "epoch": 31.03929902189101, | |
| "grad_norm": 0.49389609694480896, | |
| "learning_rate": 0.00022766802562609198, | |
| "loss": 3.0834, | |
| "step": 106650 | |
| }, | |
| { | |
| "epoch": 31.05385421518398, | |
| "grad_norm": 0.45176437497138977, | |
| "learning_rate": 0.0002274933022714036, | |
| "loss": 3.086, | |
| "step": 106700 | |
| }, | |
| { | |
| "epoch": 31.068409408476946, | |
| "grad_norm": 0.43654248118400574, | |
| "learning_rate": 0.00022731857891671517, | |
| "loss": 3.0609, | |
| "step": 106750 | |
| }, | |
| { | |
| "epoch": 31.082964601769913, | |
| "grad_norm": 0.4483693838119507, | |
| "learning_rate": 0.00022714385556202676, | |
| "loss": 3.0879, | |
| "step": 106800 | |
| }, | |
| { | |
| "epoch": 31.09751979506288, | |
| "grad_norm": 0.46999308466911316, | |
| "learning_rate": 0.00022696913220733835, | |
| "loss": 3.088, | |
| "step": 106850 | |
| }, | |
| { | |
| "epoch": 31.112074988355847, | |
| "grad_norm": 0.4499324560165405, | |
| "learning_rate": 0.00022679440885264995, | |
| "loss": 3.0804, | |
| "step": 106900 | |
| }, | |
| { | |
| "epoch": 31.126630181648814, | |
| "grad_norm": 0.4599437415599823, | |
| "learning_rate": 0.00022661968549796157, | |
| "loss": 3.0932, | |
| "step": 106950 | |
| }, | |
| { | |
| "epoch": 31.14118537494178, | |
| "grad_norm": 0.4603266716003418, | |
| "learning_rate": 0.00022644496214327314, | |
| "loss": 3.0952, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 31.14118537494178, | |
| "eval_accuracy": 0.37409540702115174, | |
| "eval_loss": 3.556943416595459, | |
| "eval_runtime": 81.5929, | |
| "eval_samples_per_second": 204.074, | |
| "eval_steps_per_second": 12.758, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 31.155740568234748, | |
| "grad_norm": 0.448766827583313, | |
| "learning_rate": 0.0002262702387885847, | |
| "loss": 3.094, | |
| "step": 107050 | |
| }, | |
| { | |
| "epoch": 31.17029576152771, | |
| "grad_norm": 0.4654287099838257, | |
| "learning_rate": 0.00022609551543389633, | |
| "loss": 3.0898, | |
| "step": 107100 | |
| }, | |
| { | |
| "epoch": 31.18485095482068, | |
| "grad_norm": 0.492904931306839, | |
| "learning_rate": 0.0002259207920792079, | |
| "loss": 3.0932, | |
| "step": 107150 | |
| }, | |
| { | |
| "epoch": 31.199406148113646, | |
| "grad_norm": 0.4475638270378113, | |
| "learning_rate": 0.0002257460687245195, | |
| "loss": 3.0997, | |
| "step": 107200 | |
| }, | |
| { | |
| "epoch": 31.213961341406613, | |
| "grad_norm": 0.4529367983341217, | |
| "learning_rate": 0.00022557134536983108, | |
| "loss": 3.0922, | |
| "step": 107250 | |
| }, | |
| { | |
| "epoch": 31.22851653469958, | |
| "grad_norm": 0.44643649458885193, | |
| "learning_rate": 0.00022539662201514268, | |
| "loss": 3.1033, | |
| "step": 107300 | |
| }, | |
| { | |
| "epoch": 31.243071727992547, | |
| "grad_norm": 0.474950909614563, | |
| "learning_rate": 0.00022522189866045424, | |
| "loss": 3.1064, | |
| "step": 107350 | |
| }, | |
| { | |
| "epoch": 31.257626921285514, | |
| "grad_norm": 0.4473762512207031, | |
| "learning_rate": 0.00022504717530576586, | |
| "loss": 3.0891, | |
| "step": 107400 | |
| }, | |
| { | |
| "epoch": 31.27218211457848, | |
| "grad_norm": 0.4661721885204315, | |
| "learning_rate": 0.00022487245195107743, | |
| "loss": 3.1068, | |
| "step": 107450 | |
| }, | |
| { | |
| "epoch": 31.286737307871448, | |
| "grad_norm": 0.4367949962615967, | |
| "learning_rate": 0.00022469772859638903, | |
| "loss": 3.1093, | |
| "step": 107500 | |
| }, | |
| { | |
| "epoch": 31.301292501164415, | |
| "grad_norm": 0.49752283096313477, | |
| "learning_rate": 0.00022452300524170062, | |
| "loss": 3.12, | |
| "step": 107550 | |
| }, | |
| { | |
| "epoch": 31.315847694457382, | |
| "grad_norm": 0.4435296356678009, | |
| "learning_rate": 0.00022434828188701221, | |
| "loss": 3.1041, | |
| "step": 107600 | |
| }, | |
| { | |
| "epoch": 31.33040288775035, | |
| "grad_norm": 0.4968740940093994, | |
| "learning_rate": 0.0002241735585323238, | |
| "loss": 3.1175, | |
| "step": 107650 | |
| }, | |
| { | |
| "epoch": 31.344958081043316, | |
| "grad_norm": 0.4529845118522644, | |
| "learning_rate": 0.0002239988351776354, | |
| "loss": 3.1196, | |
| "step": 107700 | |
| }, | |
| { | |
| "epoch": 31.359513274336283, | |
| "grad_norm": 0.48634758591651917, | |
| "learning_rate": 0.00022382411182294697, | |
| "loss": 3.1199, | |
| "step": 107750 | |
| }, | |
| { | |
| "epoch": 31.37406846762925, | |
| "grad_norm": 0.4886033236980438, | |
| "learning_rate": 0.0002236493884682586, | |
| "loss": 3.1155, | |
| "step": 107800 | |
| }, | |
| { | |
| "epoch": 31.388623660922217, | |
| "grad_norm": 0.4572789669036865, | |
| "learning_rate": 0.00022347466511357016, | |
| "loss": 3.1125, | |
| "step": 107850 | |
| }, | |
| { | |
| "epoch": 31.403178854215184, | |
| "grad_norm": 0.47334814071655273, | |
| "learning_rate": 0.00022329994175888175, | |
| "loss": 3.1107, | |
| "step": 107900 | |
| }, | |
| { | |
| "epoch": 31.41773404750815, | |
| "grad_norm": 0.43518730998039246, | |
| "learning_rate": 0.00022312521840419335, | |
| "loss": 3.1207, | |
| "step": 107950 | |
| }, | |
| { | |
| "epoch": 31.43228924080112, | |
| "grad_norm": 0.43658027052879333, | |
| "learning_rate": 0.00022295049504950494, | |
| "loss": 3.1241, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 31.43228924080112, | |
| "eval_accuracy": 0.374796339870633, | |
| "eval_loss": 3.55124568939209, | |
| "eval_runtime": 81.6469, | |
| "eval_samples_per_second": 203.939, | |
| "eval_steps_per_second": 12.75, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 31.446844434094086, | |
| "grad_norm": 0.4897937774658203, | |
| "learning_rate": 0.0002227757716948165, | |
| "loss": 3.1247, | |
| "step": 108050 | |
| }, | |
| { | |
| "epoch": 31.461399627387053, | |
| "grad_norm": 0.4503650963306427, | |
| "learning_rate": 0.00022260104834012813, | |
| "loss": 3.135, | |
| "step": 108100 | |
| }, | |
| { | |
| "epoch": 31.47595482068002, | |
| "grad_norm": 0.5468674302101135, | |
| "learning_rate": 0.0002224263249854397, | |
| "loss": 3.1292, | |
| "step": 108150 | |
| }, | |
| { | |
| "epoch": 31.490510013972987, | |
| "grad_norm": 0.45566606521606445, | |
| "learning_rate": 0.00022225160163075126, | |
| "loss": 3.1356, | |
| "step": 108200 | |
| }, | |
| { | |
| "epoch": 31.505065207265954, | |
| "grad_norm": 0.44275963306427, | |
| "learning_rate": 0.00022207687827606289, | |
| "loss": 3.124, | |
| "step": 108250 | |
| }, | |
| { | |
| "epoch": 31.51962040055892, | |
| "grad_norm": 0.46963468194007874, | |
| "learning_rate": 0.00022190215492137445, | |
| "loss": 3.135, | |
| "step": 108300 | |
| }, | |
| { | |
| "epoch": 31.534175593851888, | |
| "grad_norm": 0.4874171316623688, | |
| "learning_rate": 0.00022172743156668607, | |
| "loss": 3.1253, | |
| "step": 108350 | |
| }, | |
| { | |
| "epoch": 31.548730787144855, | |
| "grad_norm": 0.4586457908153534, | |
| "learning_rate": 0.00022155270821199764, | |
| "loss": 3.1352, | |
| "step": 108400 | |
| }, | |
| { | |
| "epoch": 31.56328598043782, | |
| "grad_norm": 0.45111775398254395, | |
| "learning_rate": 0.00022137798485730924, | |
| "loss": 3.1377, | |
| "step": 108450 | |
| }, | |
| { | |
| "epoch": 31.577841173730786, | |
| "grad_norm": 0.44890204071998596, | |
| "learning_rate": 0.00022120326150262083, | |
| "loss": 3.1317, | |
| "step": 108500 | |
| }, | |
| { | |
| "epoch": 31.592396367023753, | |
| "grad_norm": 0.4884273409843445, | |
| "learning_rate": 0.00022102853814793242, | |
| "loss": 3.1198, | |
| "step": 108550 | |
| }, | |
| { | |
| "epoch": 31.60695156031672, | |
| "grad_norm": 0.4430197477340698, | |
| "learning_rate": 0.000220853814793244, | |
| "loss": 3.1232, | |
| "step": 108600 | |
| }, | |
| { | |
| "epoch": 31.621506753609687, | |
| "grad_norm": 0.4811874032020569, | |
| "learning_rate": 0.0002206790914385556, | |
| "loss": 3.1285, | |
| "step": 108650 | |
| }, | |
| { | |
| "epoch": 31.636061946902654, | |
| "grad_norm": 0.4888242185115814, | |
| "learning_rate": 0.00022050436808386718, | |
| "loss": 3.1445, | |
| "step": 108700 | |
| }, | |
| { | |
| "epoch": 31.65061714019562, | |
| "grad_norm": 0.42643800377845764, | |
| "learning_rate": 0.00022032964472917877, | |
| "loss": 3.1463, | |
| "step": 108750 | |
| }, | |
| { | |
| "epoch": 31.665172333488588, | |
| "grad_norm": 0.4505205750465393, | |
| "learning_rate": 0.00022015492137449037, | |
| "loss": 3.1412, | |
| "step": 108800 | |
| }, | |
| { | |
| "epoch": 31.679727526781555, | |
| "grad_norm": 0.4725150465965271, | |
| "learning_rate": 0.00021998019801980196, | |
| "loss": 3.1432, | |
| "step": 108850 | |
| }, | |
| { | |
| "epoch": 31.694282720074522, | |
| "grad_norm": 0.4727577567100525, | |
| "learning_rate": 0.00021980547466511353, | |
| "loss": 3.1375, | |
| "step": 108900 | |
| }, | |
| { | |
| "epoch": 31.70883791336749, | |
| "grad_norm": 0.4706392288208008, | |
| "learning_rate": 0.00021963075131042515, | |
| "loss": 3.1455, | |
| "step": 108950 | |
| }, | |
| { | |
| "epoch": 31.723393106660456, | |
| "grad_norm": 0.43995681405067444, | |
| "learning_rate": 0.00021945602795573672, | |
| "loss": 3.1344, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 31.723393106660456, | |
| "eval_accuracy": 0.37524905505108264, | |
| "eval_loss": 3.541571617126465, | |
| "eval_runtime": 81.7192, | |
| "eval_samples_per_second": 203.759, | |
| "eval_steps_per_second": 12.739, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 31.737948299953423, | |
| "grad_norm": 0.46493199467658997, | |
| "learning_rate": 0.00021928130460104834, | |
| "loss": 3.1363, | |
| "step": 109050 | |
| }, | |
| { | |
| "epoch": 31.75250349324639, | |
| "grad_norm": 0.4539068937301636, | |
| "learning_rate": 0.0002191065812463599, | |
| "loss": 3.1413, | |
| "step": 109100 | |
| }, | |
| { | |
| "epoch": 31.767058686539357, | |
| "grad_norm": 0.4341435730457306, | |
| "learning_rate": 0.0002189318578916715, | |
| "loss": 3.1563, | |
| "step": 109150 | |
| }, | |
| { | |
| "epoch": 31.781613879832324, | |
| "grad_norm": 0.4788074791431427, | |
| "learning_rate": 0.0002187571345369831, | |
| "loss": 3.1546, | |
| "step": 109200 | |
| }, | |
| { | |
| "epoch": 31.79616907312529, | |
| "grad_norm": 0.4583766758441925, | |
| "learning_rate": 0.0002185824111822947, | |
| "loss": 3.1504, | |
| "step": 109250 | |
| }, | |
| { | |
| "epoch": 31.81072426641826, | |
| "grad_norm": 0.4509258270263672, | |
| "learning_rate": 0.00021840768782760626, | |
| "loss": 3.1357, | |
| "step": 109300 | |
| }, | |
| { | |
| "epoch": 31.825279459711226, | |
| "grad_norm": 0.4848930537700653, | |
| "learning_rate": 0.00021823296447291788, | |
| "loss": 3.141, | |
| "step": 109350 | |
| }, | |
| { | |
| "epoch": 31.839834653004193, | |
| "grad_norm": 0.44145888090133667, | |
| "learning_rate": 0.00021805824111822945, | |
| "loss": 3.1454, | |
| "step": 109400 | |
| }, | |
| { | |
| "epoch": 31.85438984629716, | |
| "grad_norm": 0.47195175290107727, | |
| "learning_rate": 0.00021788351776354104, | |
| "loss": 3.1521, | |
| "step": 109450 | |
| }, | |
| { | |
| "epoch": 31.868945039590127, | |
| "grad_norm": 0.5092898011207581, | |
| "learning_rate": 0.00021770879440885263, | |
| "loss": 3.1484, | |
| "step": 109500 | |
| }, | |
| { | |
| "epoch": 31.883500232883094, | |
| "grad_norm": 0.4609210193157196, | |
| "learning_rate": 0.00021753407105416423, | |
| "loss": 3.1526, | |
| "step": 109550 | |
| }, | |
| { | |
| "epoch": 31.89805542617606, | |
| "grad_norm": 0.4842016398906708, | |
| "learning_rate": 0.0002173593476994758, | |
| "loss": 3.149, | |
| "step": 109600 | |
| }, | |
| { | |
| "epoch": 31.912610619469028, | |
| "grad_norm": 0.4648073613643646, | |
| "learning_rate": 0.00021718462434478742, | |
| "loss": 3.1494, | |
| "step": 109650 | |
| }, | |
| { | |
| "epoch": 31.927165812761995, | |
| "grad_norm": 0.456338107585907, | |
| "learning_rate": 0.00021700990099009898, | |
| "loss": 3.1554, | |
| "step": 109700 | |
| }, | |
| { | |
| "epoch": 31.941721006054962, | |
| "grad_norm": 0.44563332200050354, | |
| "learning_rate": 0.0002168351776354106, | |
| "loss": 3.1592, | |
| "step": 109750 | |
| }, | |
| { | |
| "epoch": 31.956276199347926, | |
| "grad_norm": 0.4612378776073456, | |
| "learning_rate": 0.00021666045428072217, | |
| "loss": 3.1384, | |
| "step": 109800 | |
| }, | |
| { | |
| "epoch": 31.970831392640893, | |
| "grad_norm": 0.4844173491001129, | |
| "learning_rate": 0.00021648573092603374, | |
| "loss": 3.1527, | |
| "step": 109850 | |
| }, | |
| { | |
| "epoch": 31.98538658593386, | |
| "grad_norm": 0.5019930601119995, | |
| "learning_rate": 0.00021631100757134536, | |
| "loss": 3.1458, | |
| "step": 109900 | |
| }, | |
| { | |
| "epoch": 31.999941779226827, | |
| "grad_norm": 0.4411730170249939, | |
| "learning_rate": 0.00021613628421665693, | |
| "loss": 3.1476, | |
| "step": 109950 | |
| }, | |
| { | |
| "epoch": 32.01426408942711, | |
| "grad_norm": 0.46677476167678833, | |
| "learning_rate": 0.00021596156086196852, | |
| "loss": 3.0589, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 32.01426408942711, | |
| "eval_accuracy": 0.37481067820189334, | |
| "eval_loss": 3.5506389141082764, | |
| "eval_runtime": 81.6564, | |
| "eval_samples_per_second": 203.916, | |
| "eval_steps_per_second": 12.749, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 32.01426408942711, | |
| "step": 110000, | |
| "total_flos": 2.298801310138368e+18, | |
| "train_loss": 0.5712694714632901, | |
| "train_runtime": 14745.3781, | |
| "train_samples_per_second": 931.841, | |
| "train_steps_per_second": 11.651 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 171800, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 10000, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 20, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 20 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.298801310138368e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |