{
  "best_global_step": 89000,
  "best_metric": 3.5308263301849365,
  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/swap_require_to_drop_40817/checkpoint-40000",
  "epoch": 31.723102002794597,
  "eval_steps": 1000,
  "global_step": 109000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014555193292966931,
      "grad_norm": 0.5271727442741394,
      "learning_rate": 0.000294,
      "loss": 8.4624,
      "step": 50
    },
    {
      "epoch": 0.029110386585933862,
      "grad_norm": 0.46208518743515015,
      "learning_rate": 0.0005939999999999999,
      "loss": 6.7697,
      "step": 100
    },
    {
      "epoch": 0.04366557987890079,
      "grad_norm": 0.5063037276268005,
      "learning_rate": 0.0005998287711124053,
      "loss": 6.3743,
      "step": 150
    },
    {
      "epoch": 0.058220773171867725,
      "grad_norm": 0.5487610101699829,
      "learning_rate": 0.000599654047757717,
      "loss": 6.1606,
      "step": 200
    },
    {
      "epoch": 0.07277596646483465,
      "grad_norm": 0.4579280614852905,
      "learning_rate": 0.0005994793244030285,
      "loss": 6.0051,
      "step": 250
    },
    {
      "epoch": 0.08733115975780158,
      "grad_norm": 0.4190051555633545,
      "learning_rate": 0.00059930460104834,
      "loss": 5.8856,
      "step": 300
    },
    {
      "epoch": 0.10188635305076851,
      "grad_norm": 0.4702790379524231,
      "learning_rate": 0.0005991298776936517,
      "loss": 5.7623,
      "step": 350
    },
    {
      "epoch": 0.11644154634373545,
      "grad_norm": 0.44624361395835876,
      "learning_rate": 0.0005989551543389632,
      "loss": 5.6496,
      "step": 400
    },
    {
      "epoch": 0.1309967396367024,
      "grad_norm": 0.4926813542842865,
      "learning_rate": 0.0005987804309842748,
      "loss": 5.522,
      "step": 450
    },
    {
      "epoch": 0.1455519329296693,
      "grad_norm": 0.3893285095691681,
      "learning_rate": 0.0005986057076295864,
      "loss": 5.4316,
      "step": 500
    },
    {
      "epoch": 0.16010712622263623,
      "grad_norm": 0.4649236500263214,
      "learning_rate": 0.0005984309842748981,
      "loss": 5.3476,
      "step": 550
    },
    {
      "epoch": 0.17466231951560315,
      "grad_norm": 0.47005704045295715,
      "learning_rate": 0.0005982562609202096,
      "loss": 5.2786,
      "step": 600
    },
    {
      "epoch": 0.1892175128085701,
      "grad_norm": 0.4326360523700714,
      "learning_rate": 0.0005980815375655212,
      "loss": 5.2111,
      "step": 650
    },
    {
      "epoch": 0.20377270610153703,
      "grad_norm": 0.40753060579299927,
      "learning_rate": 0.0005979068142108328,
      "loss": 5.1503,
      "step": 700
    },
    {
      "epoch": 0.21832789939450395,
      "grad_norm": 0.4554615914821625,
      "learning_rate": 0.0005977320908561445,
      "loss": 5.0891,
      "step": 750
    },
    {
      "epoch": 0.2328830926874709,
      "grad_norm": 0.43539178371429443,
      "learning_rate": 0.000597557367501456,
      "loss": 5.0279,
      "step": 800
    },
    {
      "epoch": 0.24743828598043782,
      "grad_norm": 0.4908430874347687,
      "learning_rate": 0.0005973826441467675,
      "loss": 4.9855,
      "step": 850
    },
    {
      "epoch": 0.2619934792734048,
      "grad_norm": 0.44112998247146606,
      "learning_rate": 0.0005972079207920792,
      "loss": 4.9386,
      "step": 900
    },
    {
      "epoch": 0.27654867256637167,
      "grad_norm": 0.5209882855415344,
      "learning_rate": 0.0005970331974373907,
      "loss": 4.8873,
      "step": 950
    },
    {
      "epoch": 0.2911038658593386,
      "grad_norm": 0.5419289469718933,
      "learning_rate": 0.0005968584740827023,
      "loss": 4.8466,
      "step": 1000
    },
    {
      "epoch": 0.2911038658593386,
      "eval_accuracy": 0.2529622463511004,
      "eval_loss": 4.771843433380127,
      "eval_runtime": 80.9007,
      "eval_samples_per_second": 205.82,
      "eval_steps_per_second": 12.868,
      "step": 1000
    },
| { | |
| "epoch": 0.30565905915230557, | |
| "grad_norm": 0.4487905502319336, | |
| "learning_rate": 0.0005966837507280139, | |
| "loss": 4.7922, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.32021425244527246, | |
| "grad_norm": 0.45877334475517273, | |
| "learning_rate": 0.0005965090273733256, | |
| "loss": 4.7502, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.3347694457382394, | |
| "grad_norm": 0.4339123070240021, | |
| "learning_rate": 0.0005963343040186371, | |
| "loss": 4.6976, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.3493246390312063, | |
| "grad_norm": 0.4831567406654358, | |
| "learning_rate": 0.0005961595806639486, | |
| "loss": 4.6597, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.36387983232417326, | |
| "grad_norm": 0.45950332283973694, | |
| "learning_rate": 0.0005959848573092603, | |
| "loss": 4.6478, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.3784350256171402, | |
| "grad_norm": 0.4822908043861389, | |
| "learning_rate": 0.0005958101339545718, | |
| "loss": 4.6104, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.3929902189101071, | |
| "grad_norm": 0.44366389513015747, | |
| "learning_rate": 0.0005956354105998835, | |
| "loss": 4.5627, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.40754541220307405, | |
| "grad_norm": 0.4284035265445709, | |
| "learning_rate": 0.000595460687245195, | |
| "loss": 4.5605, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.422100605496041, | |
| "grad_norm": 0.449191153049469, | |
| "learning_rate": 0.0005952859638905067, | |
| "loss": 4.5194, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.4366557987890079, | |
| "grad_norm": 0.42922577261924744, | |
| "learning_rate": 0.0005951112405358182, | |
| "loss": 4.5117, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.45121099208197485, | |
| "grad_norm": 0.4628215730190277, | |
| "learning_rate": 0.0005949365171811299, | |
| "loss": 4.4934, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.4657661853749418, | |
| "grad_norm": 0.42728379368782043, | |
| "learning_rate": 0.0005947617938264414, | |
| "loss": 4.4644, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.4803213786679087, | |
| "grad_norm": 0.39588406682014465, | |
| "learning_rate": 0.000594587070471753, | |
| "loss": 4.4561, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.49487657196087564, | |
| "grad_norm": 0.44425565004348755, | |
| "learning_rate": 0.0005944123471170646, | |
| "loss": 4.4442, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5094317652538426, | |
| "grad_norm": 0.415975958108902, | |
| "learning_rate": 0.0005942376237623762, | |
| "loss": 4.4127, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5239869585468095, | |
| "grad_norm": 0.527612566947937, | |
| "learning_rate": 0.0005940629004076878, | |
| "loss": 4.398, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5385421518397764, | |
| "grad_norm": 0.40263131260871887, | |
| "learning_rate": 0.0005938881770529993, | |
| "loss": 4.3808, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.5530973451327433, | |
| "grad_norm": 0.4493564963340759, | |
| "learning_rate": 0.000593713453698311, | |
| "loss": 4.3644, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.5676525384257103, | |
| "grad_norm": 0.40246161818504333, | |
| "learning_rate": 0.0005935387303436226, | |
| "loss": 4.3614, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.5822077317186772, | |
| "grad_norm": 0.4083396792411804, | |
| "learning_rate": 0.0005933640069889342, | |
| "loss": 4.3445, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5822077317186772, | |
| "eval_accuracy": 0.2988631231165515, | |
| "eval_loss": 4.286505222320557, | |
| "eval_runtime": 80.2953, | |
| "eval_samples_per_second": 207.372, | |
| "eval_steps_per_second": 12.965, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5967629250116442, | |
| "grad_norm": 0.4143264889717102, | |
| "learning_rate": 0.0005931892836342457, | |
| "loss": 4.3257, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6113181183046111, | |
| "grad_norm": 0.40621867775917053, | |
| "learning_rate": 0.0005930145602795573, | |
| "loss": 4.3254, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.625873311597578, | |
| "grad_norm": 0.40006792545318604, | |
| "learning_rate": 0.000592839836924869, | |
| "loss": 4.3015, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.6404285048905449, | |
| "grad_norm": 0.39471518993377686, | |
| "learning_rate": 0.0005926651135701805, | |
| "loss": 4.2986, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.6549836981835119, | |
| "grad_norm": 0.36508554220199585, | |
| "learning_rate": 0.0005924903902154921, | |
| "loss": 4.2749, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.6695388914764788, | |
| "grad_norm": 0.3801308274269104, | |
| "learning_rate": 0.0005923156668608037, | |
| "loss": 4.2689, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.6840940847694458, | |
| "grad_norm": 0.4366742968559265, | |
| "learning_rate": 0.0005921409435061153, | |
| "loss": 4.2567, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.6986492780624126, | |
| "grad_norm": 0.3730611205101013, | |
| "learning_rate": 0.0005919662201514268, | |
| "loss": 4.2593, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7132044713553796, | |
| "grad_norm": 0.39359498023986816, | |
| "learning_rate": 0.0005917914967967384, | |
| "loss": 4.2405, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.7277596646483465, | |
| "grad_norm": 0.414410799741745, | |
| "learning_rate": 0.0005916167734420501, | |
| "loss": 4.2211, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.7423148579413135, | |
| "grad_norm": 0.39767029881477356, | |
| "learning_rate": 0.0005914420500873616, | |
| "loss": 4.2296, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.7568700512342804, | |
| "grad_norm": 0.3539912700653076, | |
| "learning_rate": 0.0005912673267326732, | |
| "loss": 4.2209, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.7714252445272474, | |
| "grad_norm": 0.3714052736759186, | |
| "learning_rate": 0.0005910926033779848, | |
| "loss": 4.1953, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.7859804378202142, | |
| "grad_norm": 0.39169925451278687, | |
| "learning_rate": 0.0005909178800232964, | |
| "loss": 4.1914, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8005356311131812, | |
| "grad_norm": 0.36949044466018677, | |
| "learning_rate": 0.000590743156668608, | |
| "loss": 4.1904, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.8150908244061481, | |
| "grad_norm": 0.3829376995563507, | |
| "learning_rate": 0.0005905684333139196, | |
| "loss": 4.1869, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.8296460176991151, | |
| "grad_norm": 0.33983802795410156, | |
| "learning_rate": 0.0005903937099592312, | |
| "loss": 4.1689, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.844201210992082, | |
| "grad_norm": 0.3911590874195099, | |
| "learning_rate": 0.0005902189866045427, | |
| "loss": 4.1694, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.858756404285049, | |
| "grad_norm": 0.36505550146102905, | |
| "learning_rate": 0.0005900442632498543, | |
| "loss": 4.1604, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.8733115975780158, | |
| "grad_norm": 0.36020687222480774, | |
| "learning_rate": 0.0005898695398951659, | |
| "loss": 4.1487, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8733115975780158, | |
| "eval_accuracy": 0.31504933619990266, | |
| "eval_loss": 4.098790168762207, | |
| "eval_runtime": 80.5249, | |
| "eval_samples_per_second": 206.781, | |
| "eval_steps_per_second": 12.928, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8878667908709827, | |
| "grad_norm": 0.3863276541233063, | |
| "learning_rate": 0.0005896948165404776, | |
| "loss": 4.1312, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.9024219841639497, | |
| "grad_norm": 0.36030322313308716, | |
| "learning_rate": 0.0005895200931857891, | |
| "loss": 4.1416, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.9169771774569166, | |
| "grad_norm": 0.3481094241142273, | |
| "learning_rate": 0.0005893453698311007, | |
| "loss": 4.1166, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.9315323707498836, | |
| "grad_norm": 0.35702067613601685, | |
| "learning_rate": 0.0005891706464764123, | |
| "loss": 4.1255, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.9460875640428504, | |
| "grad_norm": 0.34004005789756775, | |
| "learning_rate": 0.0005889959231217238, | |
| "loss": 4.1196, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.9606427573358174, | |
| "grad_norm": 0.3363751769065857, | |
| "learning_rate": 0.0005888211997670355, | |
| "loss": 4.0987, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.9751979506287843, | |
| "grad_norm": 0.3608480393886566, | |
| "learning_rate": 0.000588646476412347, | |
| "loss": 4.098, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.9897531439217513, | |
| "grad_norm": 0.3625710904598236, | |
| "learning_rate": 0.0005884717530576587, | |
| "loss": 4.1064, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.0040754541220307, | |
| "grad_norm": 0.4029366374015808, | |
| "learning_rate": 0.0005882970297029702, | |
| "loss": 4.0877, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.0186306474149978, | |
| "grad_norm": 0.3653298318386078, | |
| "learning_rate": 0.0005881223063482818, | |
| "loss": 4.0118, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.0331858407079646, | |
| "grad_norm": 0.36948803067207336, | |
| "learning_rate": 0.0005879475829935934, | |
| "loss": 4.0161, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.0477410340009314, | |
| "grad_norm": 0.35129520297050476, | |
| "learning_rate": 0.0005877728596389051, | |
| "loss": 4.0104, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.0622962272938985, | |
| "grad_norm": 0.3525533974170685, | |
| "learning_rate": 0.0005875981362842166, | |
| "loss": 4.0106, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.0768514205868653, | |
| "grad_norm": 0.3563438653945923, | |
| "learning_rate": 0.0005874234129295281, | |
| "loss": 4.0232, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.0914066138798324, | |
| "grad_norm": 0.364452600479126, | |
| "learning_rate": 0.0005872486895748398, | |
| "loss": 4.0061, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.1059618071727992, | |
| "grad_norm": 0.3372463881969452, | |
| "learning_rate": 0.0005870739662201513, | |
| "loss": 4.0092, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.120517000465766, | |
| "grad_norm": 0.3655630946159363, | |
| "learning_rate": 0.000586899242865463, | |
| "loss": 3.9973, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.1350721937587331, | |
| "grad_norm": 0.33341357111930847, | |
| "learning_rate": 0.0005867245195107746, | |
| "loss": 4.0002, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.1496273870517, | |
| "grad_norm": 0.355985552072525, | |
| "learning_rate": 0.0005865497961560862, | |
| "loss": 3.986, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.164182580344667, | |
| "grad_norm": 0.34362098574638367, | |
| "learning_rate": 0.0005863750728013977, | |
| "loss": 4.0036, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.164182580344667, | |
| "eval_accuracy": 0.325240834016069, | |
| "eval_loss": 3.990701198577881, | |
| "eval_runtime": 80.3899, | |
| "eval_samples_per_second": 207.128, | |
| "eval_steps_per_second": 12.949, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.1787377736376339, | |
| "grad_norm": 0.3871329426765442, | |
| "learning_rate": 0.0005862003494467094, | |
| "loss": 3.9877, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.193292966930601, | |
| "grad_norm": 0.34389519691467285, | |
| "learning_rate": 0.0005860256260920209, | |
| "loss": 3.983, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.2078481602235678, | |
| "grad_norm": 0.33484119176864624, | |
| "learning_rate": 0.0005858509027373325, | |
| "loss": 3.9851, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.2224033535165346, | |
| "grad_norm": 0.3400310277938843, | |
| "learning_rate": 0.0005856761793826441, | |
| "loss": 3.9848, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.2369585468095017, | |
| "grad_norm": 0.3445764183998108, | |
| "learning_rate": 0.0005855014560279557, | |
| "loss": 3.977, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.2515137401024685, | |
| "grad_norm": 0.3449366092681885, | |
| "learning_rate": 0.0005853267326732673, | |
| "loss": 3.9685, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.2660689333954354, | |
| "grad_norm": 0.3173910975456238, | |
| "learning_rate": 0.0005851520093185788, | |
| "loss": 3.9727, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.2806241266884024, | |
| "grad_norm": 0.3384472727775574, | |
| "learning_rate": 0.0005849772859638905, | |
| "loss": 3.9813, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.2951793199813695, | |
| "grad_norm": 0.3449079692363739, | |
| "learning_rate": 0.0005848025626092021, | |
| "loss": 3.9561, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.3097345132743363, | |
| "grad_norm": 0.35612615942955017, | |
| "learning_rate": 0.0005846278392545136, | |
| "loss": 3.9731, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.3242897065673032, | |
| "grad_norm": 0.3560566008090973, | |
| "learning_rate": 0.0005844531158998252, | |
| "loss": 3.9631, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.3388448998602702, | |
| "grad_norm": 0.32671722769737244, | |
| "learning_rate": 0.0005842783925451368, | |
| "loss": 3.9667, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.353400093153237, | |
| "grad_norm": 0.3312210738658905, | |
| "learning_rate": 0.0005841036691904484, | |
| "loss": 3.9507, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.367955286446204, | |
| "grad_norm": 0.3450145125389099, | |
| "learning_rate": 0.00058392894583576, | |
| "loss": 3.9449, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.382510479739171, | |
| "grad_norm": 0.3478182852268219, | |
| "learning_rate": 0.0005837542224810716, | |
| "loss": 3.9502, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.3970656730321378, | |
| "grad_norm": 0.3191460072994232, | |
| "learning_rate": 0.0005835794991263832, | |
| "loss": 3.9534, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.4116208663251049, | |
| "grad_norm": 0.3257412016391754, | |
| "learning_rate": 0.0005834047757716948, | |
| "loss": 3.9394, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.4261760596180717, | |
| "grad_norm": 0.3550835847854614, | |
| "learning_rate": 0.0005832300524170063, | |
| "loss": 3.9343, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.4407312529110388, | |
| "grad_norm": 0.3301994800567627, | |
| "learning_rate": 0.0005830553290623179, | |
| "loss": 3.93, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.4552864462040056, | |
| "grad_norm": 0.32117563486099243, | |
| "learning_rate": 0.0005828806057076296, | |
| "loss": 3.9375, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.4552864462040056, | |
| "eval_accuracy": 0.33205906311228056, | |
| "eval_loss": 3.9138526916503906, | |
| "eval_runtime": 80.6696, | |
| "eval_samples_per_second": 206.41, | |
| "eval_steps_per_second": 12.904, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.4698416394969724, | |
| "grad_norm": 0.3202812373638153, | |
| "learning_rate": 0.0005827058823529411, | |
| "loss": 3.9215, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.4843968327899395, | |
| "grad_norm": 0.3350994884967804, | |
| "learning_rate": 0.0005825311589982527, | |
| "loss": 3.9341, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.4989520260829063, | |
| "grad_norm": 0.3419645130634308, | |
| "learning_rate": 0.0005823564356435643, | |
| "loss": 3.918, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.5135072193758732, | |
| "grad_norm": 0.3223453462123871, | |
| "learning_rate": 0.0005821817122888759, | |
| "loss": 3.9242, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.5280624126688402, | |
| "grad_norm": 0.3518142104148865, | |
| "learning_rate": 0.0005820069889341875, | |
| "loss": 3.9083, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.5426176059618073, | |
| "grad_norm": 0.3310922682285309, | |
| "learning_rate": 0.000581832265579499, | |
| "loss": 3.9189, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.5571727992547741, | |
| "grad_norm": 0.3117953836917877, | |
| "learning_rate": 0.0005816575422248107, | |
| "loss": 3.9066, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.571727992547741, | |
| "grad_norm": 0.32134130597114563, | |
| "learning_rate": 0.0005814828188701222, | |
| "loss": 3.9138, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.586283185840708, | |
| "grad_norm": 0.32665690779685974, | |
| "learning_rate": 0.0005813080955154338, | |
| "loss": 3.9059, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.6008383791336749, | |
| "grad_norm": 0.3477399945259094, | |
| "learning_rate": 0.0005811333721607454, | |
| "loss": 3.8957, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.6153935724266417, | |
| "grad_norm": 0.3333335816860199, | |
| "learning_rate": 0.0005809586488060571, | |
| "loss": 3.9143, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.6299487657196088, | |
| "grad_norm": 0.3129744827747345, | |
| "learning_rate": 0.0005807839254513686, | |
| "loss": 3.916, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.6445039590125758, | |
| "grad_norm": 0.31310853362083435, | |
| "learning_rate": 0.0005806092020966802, | |
| "loss": 3.8919, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.6590591523055425, | |
| "grad_norm": 0.31589412689208984, | |
| "learning_rate": 0.0005804344787419918, | |
| "loss": 3.8996, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.6736143455985095, | |
| "grad_norm": 0.33164557814598083, | |
| "learning_rate": 0.0005802597553873033, | |
| "loss": 3.8899, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.6881695388914766, | |
| "grad_norm": 0.3197880685329437, | |
| "learning_rate": 0.000580085032032615, | |
| "loss": 3.8835, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.7027247321844434, | |
| "grad_norm": 0.3329980969429016, | |
| "learning_rate": 0.0005799103086779265, | |
| "loss": 3.8885, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.7172799254774103, | |
| "grad_norm": 0.3256678581237793, | |
| "learning_rate": 0.0005797355853232382, | |
| "loss": 3.8773, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.7318351187703773, | |
| "grad_norm": 0.34026142954826355, | |
| "learning_rate": 0.0005795608619685497, | |
| "loss": 3.8827, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.7463903120633442, | |
| "grad_norm": 0.3289831876754761, | |
| "learning_rate": 0.0005793861386138614, | |
| "loss": 3.8856, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.7463903120633442, | |
| "eval_accuracy": 0.33695748367457584, | |
| "eval_loss": 3.8560938835144043, | |
| "eval_runtime": 81.3648, | |
| "eval_samples_per_second": 204.646, | |
| "eval_steps_per_second": 12.794, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.760945505356311, | |
| "grad_norm": 0.34663325548171997, | |
| "learning_rate": 0.0005792114152591729, | |
| "loss": 3.8884, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.775500698649278, | |
| "grad_norm": 0.3132448196411133, | |
| "learning_rate": 0.0005790366919044846, | |
| "loss": 3.8643, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.7900558919422451, | |
| "grad_norm": 0.3242737352848053, | |
| "learning_rate": 0.0005788619685497961, | |
| "loss": 3.8858, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.804611085235212, | |
| "grad_norm": 0.33676642179489136, | |
| "learning_rate": 0.0005786872451951077, | |
| "loss": 3.8658, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.8191662785281788, | |
| "grad_norm": 0.3253450393676758, | |
| "learning_rate": 0.0005785125218404193, | |
| "loss": 3.8803, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.8337214718211459, | |
| "grad_norm": 0.3216457664966583, | |
| "learning_rate": 0.0005783377984857308, | |
| "loss": 3.859, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.8482766651141127, | |
| "grad_norm": 0.3245769143104553, | |
| "learning_rate": 0.0005781630751310425, | |
| "loss": 3.8622, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.8628318584070795, | |
| "grad_norm": 0.3190279006958008, | |
| "learning_rate": 0.0005779883517763541, | |
| "loss": 3.878, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.8773870517000466, | |
| "grad_norm": 0.3186055123806, | |
| "learning_rate": 0.0005778136284216657, | |
| "loss": 3.8598, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.8919422449930137, | |
| "grad_norm": 0.3255595564842224, | |
| "learning_rate": 0.0005776389050669772, | |
| "loss": 3.8548, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.9064974382859803, | |
| "grad_norm": 0.3079172372817993, | |
| "learning_rate": 0.0005774641817122889, | |
| "loss": 3.8466, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.9210526315789473, | |
| "grad_norm": 0.3154686689376831, | |
| "learning_rate": 0.0005772894583576004, | |
| "loss": 3.8557, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.9356078248719144, | |
| "grad_norm": 0.31125935912132263, | |
| "learning_rate": 0.000577114735002912, | |
| "loss": 3.8543, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.9501630181648812, | |
| "grad_norm": 0.3469392955303192, | |
| "learning_rate": 0.0005769400116482236, | |
| "loss": 3.8464, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.964718211457848, | |
| "grad_norm": 0.35761559009552, | |
| "learning_rate": 0.0005767652882935352, | |
| "loss": 3.8407, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.9792734047508151, | |
| "grad_norm": 0.32018837332725525, | |
| "learning_rate": 0.0005765905649388468, | |
| "loss": 3.8385, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.993828598043782, | |
| "grad_norm": 0.30845850706100464, | |
| "learning_rate": 0.0005764158415841583, | |
| "loss": 3.8416, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.0081509082440614, | |
| "grad_norm": 0.3330078423023224, | |
| "learning_rate": 0.00057624111822947, | |
| "loss": 3.8009, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.0227061015370285, | |
| "grad_norm": 0.30205458402633667, | |
| "learning_rate": 0.0005760663948747816, | |
| "loss": 3.7382, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.0372612948299955, | |
| "grad_norm": 0.32405152916908264, | |
| "learning_rate": 0.0005758916715200931, | |
| "loss": 3.7575, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.0372612948299955, | |
| "eval_accuracy": 0.3415351722203999, | |
| "eval_loss": 3.8129448890686035, | |
| "eval_runtime": 81.5495, | |
| "eval_samples_per_second": 204.183, | |
| "eval_steps_per_second": 12.765, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.051816488122962, | |
| "grad_norm": 0.32713183760643005, | |
| "learning_rate": 0.0005757169481654047, | |
| "loss": 3.757, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.066371681415929, | |
| "grad_norm": 0.32786786556243896, | |
| "learning_rate": 0.0005755422248107163, | |
| "loss": 3.7451, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.0809268747088963, | |
| "grad_norm": 0.3092855215072632, | |
| "learning_rate": 0.0005753675014560279, | |
| "loss": 3.7557, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.095482068001863, | |
| "grad_norm": 0.3192426562309265, | |
| "learning_rate": 0.0005751927781013395, | |
| "loss": 3.7484, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.11003726129483, | |
| "grad_norm": 0.31631532311439514, | |
| "learning_rate": 0.0005750180547466511, | |
| "loss": 3.7548, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.124592454587797, | |
| "grad_norm": 0.31545156240463257, | |
| "learning_rate": 0.0005748433313919627, | |
| "loss": 3.7617, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.139147647880764, | |
| "grad_norm": 0.31154194474220276, | |
| "learning_rate": 0.0005746686080372743, | |
| "loss": 3.7647, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.1537028411737307, | |
| "grad_norm": 0.33813735842704773, | |
| "learning_rate": 0.0005744938846825858, | |
| "loss": 3.7584, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.1682580344666977, | |
| "grad_norm": 0.31195804476737976, | |
| "learning_rate": 0.0005743191613278974, | |
| "loss": 3.7542, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.182813227759665, | |
| "grad_norm": 0.34031805396080017, | |
| "learning_rate": 0.0005741444379732091, | |
| "loss": 3.7532, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.1973684210526314, | |
| "grad_norm": 0.33518555760383606, | |
| "learning_rate": 0.0005739697146185206, | |
| "loss": 3.7609, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.2119236143455985, | |
| "grad_norm": 0.31790536642074585, | |
| "learning_rate": 0.0005737949912638322, | |
| "loss": 3.7541, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.2264788076385655, | |
| "grad_norm": 0.3077254593372345, | |
| "learning_rate": 0.0005736202679091438, | |
| "loss": 3.7621, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.241034000931532, | |
| "grad_norm": 0.3513492941856384, | |
| "learning_rate": 0.0005734455445544554, | |
| "loss": 3.7469, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.255589194224499, | |
| "grad_norm": 0.3367244601249695, | |
| "learning_rate": 0.000573270821199767, | |
| "loss": 3.7501, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.2701443875174663, | |
| "grad_norm": 0.327831506729126, | |
| "learning_rate": 0.0005730960978450785, | |
| "loss": 3.7591, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.2846995808104333, | |
| "grad_norm": 0.30504941940307617, | |
| "learning_rate": 0.0005729213744903902, | |
| "loss": 3.7592, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.2992547741034, | |
| "grad_norm": 0.32957780361175537, | |
| "learning_rate": 0.0005727466511357017, | |
| "loss": 3.7417, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.313809967396367, | |
| "grad_norm": 0.3062915503978729, | |
| "learning_rate": 0.0005725719277810134, | |
| "loss": 3.7573, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.328365160689334, | |
| "grad_norm": 0.3116219937801361, | |
| "learning_rate": 0.0005723972044263249, | |
| "loss": 3.7454, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.328365160689334, | |
| "eval_accuracy": 0.34439049810540107, | |
| "eval_loss": 3.783144235610962, | |
| "eval_runtime": 81.4614, | |
| "eval_samples_per_second": 204.404, | |
| "eval_steps_per_second": 12.779, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.3429203539823007, | |
| "grad_norm": 0.3212279975414276, | |
| "learning_rate": 0.0005722224810716366, | |
| "loss": 3.7626, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.3574755472752678, | |
| "grad_norm": 0.32338324189186096, | |
| "learning_rate": 0.0005720477577169481, | |
| "loss": 3.7545, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.372030740568235, | |
| "grad_norm": 0.31945735216140747, | |
| "learning_rate": 0.0005718730343622598, | |
| "loss": 3.7534, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.386585933861202, | |
| "grad_norm": 0.3039323389530182, | |
| "learning_rate": 0.0005716983110075713, | |
| "loss": 3.761, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.4011411271541685, | |
| "grad_norm": 0.311315655708313, | |
| "learning_rate": 0.0005715235876528828, | |
| "loss": 3.7538, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.4156963204471356, | |
| "grad_norm": 0.3373345732688904, | |
| "learning_rate": 0.0005713488642981945, | |
| "loss": 3.7437, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.4302515137401026, | |
| "grad_norm": 0.30638036131858826, | |
| "learning_rate": 0.0005711741409435061, | |
| "loss": 3.7486, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.4448067070330692, | |
| "grad_norm": 0.325650691986084, | |
| "learning_rate": 0.0005709994175888177, | |
| "loss": 3.7574, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.4593619003260363, | |
| "grad_norm": 0.31678134202957153, | |
| "learning_rate": 0.0005708246942341292, | |
| "loss": 3.742, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 2.4739170936190034, | |
| "grad_norm": 0.31129980087280273, | |
| "learning_rate": 0.0005706499708794409, | |
| "loss": 3.7411, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.4884722869119704, | |
| "grad_norm": 0.3063598573207855, | |
| "learning_rate": 0.0005704752475247524, | |
| "loss": 3.7373, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 2.503027480204937, | |
| "grad_norm": 0.3293297290802002, | |
| "learning_rate": 0.0005703005241700641, | |
| "loss": 3.738, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.517582673497904, | |
| "grad_norm": 0.33821359276771545, | |
| "learning_rate": 0.0005701258008153756, | |
| "loss": 3.7361, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 2.5321378667908707, | |
| "grad_norm": 0.32988160848617554, | |
| "learning_rate": 0.0005699510774606872, | |
| "loss": 3.745, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.546693060083838, | |
| "grad_norm": 0.31691664457321167, | |
| "learning_rate": 0.0005697763541059988, | |
| "loss": 3.7483, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 2.561248253376805, | |
| "grad_norm": 0.29600226879119873, | |
| "learning_rate": 0.0005696016307513103, | |
| "loss": 3.7479, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.575803446669772, | |
| "grad_norm": 0.3323105573654175, | |
| "learning_rate": 0.000569426907396622, | |
| "loss": 3.7536, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 2.590358639962739, | |
| "grad_norm": 0.30689480900764465, | |
| "learning_rate": 0.0005692521840419336, | |
| "loss": 3.7481, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 2.6049138332557056, | |
| "grad_norm": 0.31229686737060547, | |
| "learning_rate": 0.0005690774606872452, | |
| "loss": 3.7326, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 2.6194690265486726, | |
| "grad_norm": 0.31316253542900085, | |
| "learning_rate": 0.0005689027373325567, | |
| "loss": 3.7444, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.6194690265486726, | |
| "eval_accuracy": 0.34723207329566896, | |
| "eval_loss": 3.753814220428467, | |
| "eval_runtime": 81.2604, | |
| "eval_samples_per_second": 204.909, | |
| "eval_steps_per_second": 12.811, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.6340242198416393, | |
| "grad_norm": 0.31683412194252014, | |
| "learning_rate": 0.0005687280139778683, | |
| "loss": 3.7356, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 2.6485794131346063, | |
| "grad_norm": 0.29618194699287415, | |
| "learning_rate": 0.0005685532906231799, | |
| "loss": 3.7281, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 2.6631346064275734, | |
| "grad_norm": 0.31509730219841003, | |
| "learning_rate": 0.0005683785672684915, | |
| "loss": 3.7329, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 2.6776897997205404, | |
| "grad_norm": 0.3411220908164978, | |
| "learning_rate": 0.0005682038439138031, | |
| "loss": 3.7245, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 2.692244993013507, | |
| "grad_norm": 0.32798290252685547, | |
| "learning_rate": 0.0005680291205591147, | |
| "loss": 3.7352, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 2.706800186306474, | |
| "grad_norm": 0.30504173040390015, | |
| "learning_rate": 0.0005678543972044263, | |
| "loss": 3.736, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 2.721355379599441, | |
| "grad_norm": 0.31030216813087463, | |
| "learning_rate": 0.0005676796738497378, | |
| "loss": 3.7414, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 2.735910572892408, | |
| "grad_norm": 0.3225027322769165, | |
| "learning_rate": 0.0005675049504950495, | |
| "loss": 3.7316, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 2.750465766185375, | |
| "grad_norm": 0.3140605092048645, | |
| "learning_rate": 0.0005673302271403611, | |
| "loss": 3.7354, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 2.765020959478342, | |
| "grad_norm": 0.31096795201301575, | |
| "learning_rate": 0.0005671555037856726, | |
| "loss": 3.7344, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.779576152771309, | |
| "grad_norm": 0.3346410095691681, | |
| "learning_rate": 0.0005669807804309842, | |
| "loss": 3.7297, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 2.7941313460642756, | |
| "grad_norm": 0.3011225461959839, | |
| "learning_rate": 0.0005668060570762958, | |
| "loss": 3.7323, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 2.8086865393572427, | |
| "grad_norm": 0.31489118933677673, | |
| "learning_rate": 0.0005666313337216074, | |
| "loss": 3.7267, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 2.8232417326502097, | |
| "grad_norm": 0.29963552951812744, | |
| "learning_rate": 0.000566456610366919, | |
| "loss": 3.7301, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 2.8377969259431763, | |
| "grad_norm": 0.3229866623878479, | |
| "learning_rate": 0.0005662818870122306, | |
| "loss": 3.7254, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 2.8523521192361434, | |
| "grad_norm": 0.3318677246570587, | |
| "learning_rate": 0.0005661071636575422, | |
| "loss": 3.7236, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 2.8669073125291105, | |
| "grad_norm": 0.3175937235355377, | |
| "learning_rate": 0.0005659324403028537, | |
| "loss": 3.7138, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 2.8814625058220775, | |
| "grad_norm": 0.3084236979484558, | |
| "learning_rate": 0.0005657577169481653, | |
| "loss": 3.7193, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 2.896017699115044, | |
| "grad_norm": 0.3252362906932831, | |
| "learning_rate": 0.0005655829935934769, | |
| "loss": 3.7102, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 2.910572892408011, | |
| "grad_norm": 0.3192616403102875, | |
| "learning_rate": 0.0005654082702387886, | |
| "loss": 3.7281, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.910572892408011, | |
| "eval_accuracy": 0.34959448966176937, | |
| "eval_loss": 3.729318618774414, | |
| "eval_runtime": 82.2821, | |
| "eval_samples_per_second": 202.365, | |
| "eval_steps_per_second": 12.652, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.9251280857009783, | |
| "grad_norm": 0.2914718687534332, | |
| "learning_rate": 0.0005652335468841001, | |
| "loss": 3.7269, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 2.939683278993945, | |
| "grad_norm": 0.31015467643737793, | |
| "learning_rate": 0.0005650588235294117, | |
| "loss": 3.7292, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 2.954238472286912, | |
| "grad_norm": 0.31583884358406067, | |
| "learning_rate": 0.0005648841001747233, | |
| "loss": 3.7204, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 2.968793665579879, | |
| "grad_norm": 0.3031386435031891, | |
| "learning_rate": 0.0005647093768200349, | |
| "loss": 3.7136, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 2.983348858872846, | |
| "grad_norm": 0.3371553421020508, | |
| "learning_rate": 0.0005645346534653465, | |
| "loss": 3.7201, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 2.9979040521658127, | |
| "grad_norm": 0.3117963671684265, | |
| "learning_rate": 0.0005643599301106582, | |
| "loss": 3.7177, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.0122263623660923, | |
| "grad_norm": 0.33362844586372375, | |
| "learning_rate": 0.0005641852067559697, | |
| "loss": 3.6358, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 3.026781555659059, | |
| "grad_norm": 0.3290381133556366, | |
| "learning_rate": 0.0005640104834012812, | |
| "loss": 3.6109, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.041336748952026, | |
| "grad_norm": 0.3276137709617615, | |
| "learning_rate": 0.0005638357600465929, | |
| "loss": 3.6123, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 3.055891942244993, | |
| "grad_norm": 0.3159635365009308, | |
| "learning_rate": 0.0005636610366919044, | |
| "loss": 3.6236, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.07044713553796, | |
| "grad_norm": 0.33656877279281616, | |
| "learning_rate": 0.0005634863133372161, | |
| "loss": 3.6188, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 3.0850023288309267, | |
| "grad_norm": 0.3062834143638611, | |
| "learning_rate": 0.0005633115899825276, | |
| "loss": 3.6178, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 3.099557522123894, | |
| "grad_norm": 0.34431013464927673, | |
| "learning_rate": 0.0005631368666278393, | |
| "loss": 3.6213, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 3.114112715416861, | |
| "grad_norm": 0.2998802363872528, | |
| "learning_rate": 0.0005629621432731508, | |
| "loss": 3.616, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 3.1286679087098275, | |
| "grad_norm": 0.3096281886100769, | |
| "learning_rate": 0.0005627874199184623, | |
| "loss": 3.6333, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 3.1432231020027945, | |
| "grad_norm": 0.30659937858581543, | |
| "learning_rate": 0.000562612696563774, | |
| "loss": 3.6305, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.1577782952957616, | |
| "grad_norm": 0.32195740938186646, | |
| "learning_rate": 0.0005624379732090856, | |
| "loss": 3.6273, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 3.1723334885887287, | |
| "grad_norm": 0.33462584018707275, | |
| "learning_rate": 0.0005622632498543972, | |
| "loss": 3.6307, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.1868886818816953, | |
| "grad_norm": 0.3127260208129883, | |
| "learning_rate": 0.0005620885264997087, | |
| "loss": 3.6206, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 3.2014438751746623, | |
| "grad_norm": 0.32775092124938965, | |
| "learning_rate": 0.0005619138031450204, | |
| "loss": 3.6378, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.2014438751746623, | |
| "eval_accuracy": 0.3514352023191428, | |
| "eval_loss": 3.7163853645324707, | |
| "eval_runtime": 82.1016, | |
| "eval_samples_per_second": 202.81, | |
| "eval_steps_per_second": 12.679, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.2159990684676294, | |
| "grad_norm": 0.31226086616516113, | |
| "learning_rate": 0.0005617390797903319, | |
| "loss": 3.653, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 3.230554261760596, | |
| "grad_norm": 0.3057551383972168, | |
| "learning_rate": 0.0005615643564356436, | |
| "loss": 3.6371, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.245109455053563, | |
| "grad_norm": 0.3345123529434204, | |
| "learning_rate": 0.0005613896330809551, | |
| "loss": 3.6401, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 3.25966464834653, | |
| "grad_norm": 0.31475716829299927, | |
| "learning_rate": 0.0005612149097262667, | |
| "loss": 3.6396, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.274219841639497, | |
| "grad_norm": 0.3113819360733032, | |
| "learning_rate": 0.0005610401863715783, | |
| "loss": 3.6435, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 3.288775034932464, | |
| "grad_norm": 0.31860965490341187, | |
| "learning_rate": 0.0005608654630168898, | |
| "loss": 3.6335, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.303330228225431, | |
| "grad_norm": 0.3172319233417511, | |
| "learning_rate": 0.0005606907396622015, | |
| "loss": 3.6539, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 3.317885421518398, | |
| "grad_norm": 0.318330854177475, | |
| "learning_rate": 0.0005605160163075131, | |
| "loss": 3.6478, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.3324406148113646, | |
| "grad_norm": 0.30692028999328613, | |
| "learning_rate": 0.0005603412929528247, | |
| "loss": 3.6458, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 3.3469958081043316, | |
| "grad_norm": 0.31615644693374634, | |
| "learning_rate": 0.0005601665695981362, | |
| "loss": 3.6416, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.3615510013972987, | |
| "grad_norm": 0.3097667098045349, | |
| "learning_rate": 0.0005599918462434478, | |
| "loss": 3.6429, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 3.3761061946902653, | |
| "grad_norm": 0.3427872061729431, | |
| "learning_rate": 0.0005598171228887594, | |
| "loss": 3.6499, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 3.3906613879832324, | |
| "grad_norm": 0.3071345090866089, | |
| "learning_rate": 0.0005596423995340709, | |
| "loss": 3.6422, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 3.4052165812761994, | |
| "grad_norm": 0.31189608573913574, | |
| "learning_rate": 0.0005594676761793826, | |
| "loss": 3.6491, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 3.419771774569166, | |
| "grad_norm": 0.32518497109413147, | |
| "learning_rate": 0.0005592929528246942, | |
| "loss": 3.6414, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 3.434326967862133, | |
| "grad_norm": 0.3021961748600006, | |
| "learning_rate": 0.0005591182294700058, | |
| "loss": 3.6358, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 3.4488821611551, | |
| "grad_norm": 0.3302323818206787, | |
| "learning_rate": 0.0005589435061153173, | |
| "loss": 3.6416, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 3.463437354448067, | |
| "grad_norm": 0.33426007628440857, | |
| "learning_rate": 0.000558768782760629, | |
| "loss": 3.6423, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 3.477992547741034, | |
| "grad_norm": 0.31119561195373535, | |
| "learning_rate": 0.0005585940594059406, | |
| "loss": 3.6385, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 3.492547741034001, | |
| "grad_norm": 0.2987367510795593, | |
| "learning_rate": 0.0005584193360512521, | |
| "loss": 3.6481, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.492547741034001, | |
| "eval_accuracy": 0.35323113707315407, | |
| "eval_loss": 3.6966543197631836, | |
| "eval_runtime": 82.601, | |
| "eval_samples_per_second": 201.583, | |
| "eval_steps_per_second": 12.603, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.507102934326968, | |
| "grad_norm": 0.3311735987663269, | |
| "learning_rate": 0.0005582446126965637, | |
| "loss": 3.6443, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 3.5216581276199346, | |
| "grad_norm": 0.30673640966415405, | |
| "learning_rate": 0.0005580698893418753, | |
| "loss": 3.6447, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 3.5362133209129016, | |
| "grad_norm": 0.3128121793270111, | |
| "learning_rate": 0.0005578951659871869, | |
| "loss": 3.6391, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 3.5507685142058687, | |
| "grad_norm": 0.3103184401988983, | |
| "learning_rate": 0.0005577204426324985, | |
| "loss": 3.6433, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 3.5653237074988358, | |
| "grad_norm": 0.31347033381462097, | |
| "learning_rate": 0.0005575457192778101, | |
| "loss": 3.6507, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 3.5798789007918024, | |
| "grad_norm": 0.30016088485717773, | |
| "learning_rate": 0.0005573709959231217, | |
| "loss": 3.6426, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 3.5944340940847694, | |
| "grad_norm": 0.3018815815448761, | |
| "learning_rate": 0.0005571962725684332, | |
| "loss": 3.6371, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 3.6089892873777365, | |
| "grad_norm": 0.3217933773994446, | |
| "learning_rate": 0.0005570215492137449, | |
| "loss": 3.6405, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 3.623544480670703, | |
| "grad_norm": 0.30570515990257263, | |
| "learning_rate": 0.0005568468258590564, | |
| "loss": 3.646, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 3.63809967396367, | |
| "grad_norm": 0.3075626492500305, | |
| "learning_rate": 0.0005566721025043681, | |
| "loss": 3.6542, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 3.6526548672566372, | |
| "grad_norm": 0.3189581036567688, | |
| "learning_rate": 0.0005564973791496796, | |
| "loss": 3.6351, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 3.6672100605496043, | |
| "grad_norm": 0.30742666125297546, | |
| "learning_rate": 0.0005563226557949913, | |
| "loss": 3.638, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 3.681765253842571, | |
| "grad_norm": 0.31452637910842896, | |
| "learning_rate": 0.0005561479324403028, | |
| "loss": 3.6445, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 3.696320447135538, | |
| "grad_norm": 0.33098313212394714, | |
| "learning_rate": 0.0005559732090856144, | |
| "loss": 3.6454, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 3.710875640428505, | |
| "grad_norm": 0.2981927990913391, | |
| "learning_rate": 0.000555798485730926, | |
| "loss": 3.6585, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 3.7254308337214717, | |
| "grad_norm": 0.34049972891807556, | |
| "learning_rate": 0.0005556237623762376, | |
| "loss": 3.6372, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 3.7399860270144387, | |
| "grad_norm": 0.3138774037361145, | |
| "learning_rate": 0.0005554490390215492, | |
| "loss": 3.6418, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 3.754541220307406, | |
| "grad_norm": 0.3035743832588196, | |
| "learning_rate": 0.0005552743156668607, | |
| "loss": 3.6364, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 3.769096413600373, | |
| "grad_norm": 0.32107847929000854, | |
| "learning_rate": 0.0005550995923121724, | |
| "loss": 3.6322, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 3.7836516068933395, | |
| "grad_norm": 0.31542128324508667, | |
| "learning_rate": 0.0005549248689574839, | |
| "loss": 3.6473, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.7836516068933395, | |
| "eval_accuracy": 0.3548796925861778, | |
| "eval_loss": 3.678546667098999, | |
| "eval_runtime": 82.1512, | |
| "eval_samples_per_second": 202.687, | |
| "eval_steps_per_second": 12.672, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.7982068001863065, | |
| "grad_norm": 0.32424482703208923, | |
| "learning_rate": 0.0005547501456027955, | |
| "loss": 3.64, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 3.812761993479273, | |
| "grad_norm": 0.30467554926872253, | |
| "learning_rate": 0.0005545754222481071, | |
| "loss": 3.6298, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 3.82731718677224, | |
| "grad_norm": 0.30049797892570496, | |
| "learning_rate": 0.0005544006988934188, | |
| "loss": 3.6264, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 3.8418723800652073, | |
| "grad_norm": 0.31272950768470764, | |
| "learning_rate": 0.0005542259755387303, | |
| "loss": 3.6403, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 3.8564275733581743, | |
| "grad_norm": 0.313121497631073, | |
| "learning_rate": 0.0005540512521840418, | |
| "loss": 3.6378, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 3.8709827666511414, | |
| "grad_norm": 0.3041077256202698, | |
| "learning_rate": 0.0005538765288293535, | |
| "loss": 3.6331, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 3.885537959944108, | |
| "grad_norm": 0.31648609042167664, | |
| "learning_rate": 0.0005537018054746651, | |
| "loss": 3.6603, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 3.900093153237075, | |
| "grad_norm": 0.31049904227256775, | |
| "learning_rate": 0.0005535270821199767, | |
| "loss": 3.6502, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 3.9146483465300417, | |
| "grad_norm": 0.3157772421836853, | |
| "learning_rate": 0.0005533523587652882, | |
| "loss": 3.6337, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 3.9292035398230087, | |
| "grad_norm": 0.2977442145347595, | |
| "learning_rate": 0.0005531776354105999, | |
| "loss": 3.6494, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.943758733115976, | |
| "grad_norm": 0.3233408033847809, | |
| "learning_rate": 0.0005530029120559114, | |
| "loss": 3.6369, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 3.958313926408943, | |
| "grad_norm": 0.3033396303653717, | |
| "learning_rate": 0.0005528281887012229, | |
| "loss": 3.6307, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 3.9728691197019095, | |
| "grad_norm": 0.2945745587348938, | |
| "learning_rate": 0.0005526534653465346, | |
| "loss": 3.6394, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 3.9874243129948765, | |
| "grad_norm": 0.31680789589881897, | |
| "learning_rate": 0.0005524787419918462, | |
| "loss": 3.6266, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 4.001746623195156, | |
| "grad_norm": 0.31889063119888306, | |
| "learning_rate": 0.0005523040186371578, | |
| "loss": 3.6118, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 4.016301816488123, | |
| "grad_norm": 0.3044288158416748, | |
| "learning_rate": 0.0005521292952824693, | |
| "loss": 3.531, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 4.03085700978109, | |
| "grad_norm": 0.32427647709846497, | |
| "learning_rate": 0.000551954571927781, | |
| "loss": 3.528, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 4.045412203074057, | |
| "grad_norm": 0.3020067811012268, | |
| "learning_rate": 0.0005517798485730926, | |
| "loss": 3.5463, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 4.059967396367024, | |
| "grad_norm": 0.30971652269363403, | |
| "learning_rate": 0.0005516051252184042, | |
| "loss": 3.5374, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 4.074522589659991, | |
| "grad_norm": 0.32447558641433716, | |
| "learning_rate": 0.0005514304018637157, | |
| "loss": 3.5417, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.074522589659991, | |
| "eval_accuracy": 0.3560623698605456, | |
| "eval_loss": 3.673107624053955, | |
| "eval_runtime": 82.0776, | |
| "eval_samples_per_second": 202.869, | |
| "eval_steps_per_second": 12.683, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.089077782952957, | |
| "grad_norm": 0.29753831028938293, | |
| "learning_rate": 0.0005512556785090273, | |
| "loss": 3.5402, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 4.103632976245924, | |
| "grad_norm": 0.30887463688850403, | |
| "learning_rate": 0.0005510809551543389, | |
| "loss": 3.5434, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 4.118188169538891, | |
| "grad_norm": 0.3261681795120239, | |
| "learning_rate": 0.0005509062317996504, | |
| "loss": 3.5429, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 4.132743362831858, | |
| "grad_norm": 0.31773021817207336, | |
| "learning_rate": 0.0005507315084449621, | |
| "loss": 3.5483, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 4.1472985561248255, | |
| "grad_norm": 0.31176310777664185, | |
| "learning_rate": 0.0005505567850902737, | |
| "loss": 3.5435, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 4.1618537494177925, | |
| "grad_norm": 0.32373055815696716, | |
| "learning_rate": 0.0005503820617355853, | |
| "loss": 3.5525, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 4.17640894271076, | |
| "grad_norm": 0.3068099915981293, | |
| "learning_rate": 0.0005502073383808969, | |
| "loss": 3.5542, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 4.190964136003726, | |
| "grad_norm": 0.3189917802810669, | |
| "learning_rate": 0.0005500326150262085, | |
| "loss": 3.5494, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 4.205519329296693, | |
| "grad_norm": 0.3026528060436249, | |
| "learning_rate": 0.00054985789167152, | |
| "loss": 3.556, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 4.22007452258966, | |
| "grad_norm": 0.31138288974761963, | |
| "learning_rate": 0.0005496831683168316, | |
| "loss": 3.5592, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.234629715882627, | |
| "grad_norm": 0.30152401328086853, | |
| "learning_rate": 0.0005495084449621433, | |
| "loss": 3.5705, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 4.249184909175594, | |
| "grad_norm": 0.2989727556705475, | |
| "learning_rate": 0.0005493337216074548, | |
| "loss": 3.5681, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 4.263740102468561, | |
| "grad_norm": 0.31015369296073914, | |
| "learning_rate": 0.0005491589982527664, | |
| "loss": 3.5747, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 4.278295295761528, | |
| "grad_norm": 0.32896533608436584, | |
| "learning_rate": 0.000548984274898078, | |
| "loss": 3.5679, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 4.292850489054494, | |
| "grad_norm": 0.3171634376049042, | |
| "learning_rate": 0.0005488095515433897, | |
| "loss": 3.5641, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 4.307405682347461, | |
| "grad_norm": 0.32699674367904663, | |
| "learning_rate": 0.0005486348281887012, | |
| "loss": 3.5675, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 4.321960875640428, | |
| "grad_norm": 0.3310847580432892, | |
| "learning_rate": 0.0005484601048340127, | |
| "loss": 3.5568, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 4.3365160689333955, | |
| "grad_norm": 0.3076675236225128, | |
| "learning_rate": 0.0005482853814793244, | |
| "loss": 3.5522, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 4.3510712622263625, | |
| "grad_norm": 0.314236044883728, | |
| "learning_rate": 0.0005481106581246359, | |
| "loss": 3.573, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 4.36562645551933, | |
| "grad_norm": 0.31613799929618835, | |
| "learning_rate": 0.0005479359347699475, | |
| "loss": 3.5539, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.36562645551933, | |
| "eval_accuracy": 0.3570901461463796, | |
| "eval_loss": 3.6625263690948486, | |
| "eval_runtime": 82.0045, | |
| "eval_samples_per_second": 203.05, | |
| "eval_steps_per_second": 12.694, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.380181648812297, | |
| "grad_norm": 0.3096142113208771, | |
| "learning_rate": 0.0005477612114152591, | |
| "loss": 3.5617, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 4.394736842105263, | |
| "grad_norm": 0.299726665019989, | |
| "learning_rate": 0.0005475864880605708, | |
| "loss": 3.5649, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 4.40929203539823, | |
| "grad_norm": 0.31255221366882324, | |
| "learning_rate": 0.0005474117647058823, | |
| "loss": 3.5673, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 4.423847228691197, | |
| "grad_norm": 0.33720284700393677, | |
| "learning_rate": 0.0005472370413511939, | |
| "loss": 3.5649, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 4.438402421984164, | |
| "grad_norm": 0.3025956153869629, | |
| "learning_rate": 0.0005470623179965055, | |
| "loss": 3.5715, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 4.452957615277131, | |
| "grad_norm": 0.33253517746925354, | |
| "learning_rate": 0.0005468875946418171, | |
| "loss": 3.577, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 4.467512808570098, | |
| "grad_norm": 0.31943124532699585, | |
| "learning_rate": 0.0005467128712871287, | |
| "loss": 3.5801, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 4.482068001863064, | |
| "grad_norm": 0.3012823164463043, | |
| "learning_rate": 0.0005465381479324402, | |
| "loss": 3.5756, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 4.496623195156031, | |
| "grad_norm": 0.32098299264907837, | |
| "learning_rate": 0.0005463634245777519, | |
| "loss": 3.5647, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 4.511178388448998, | |
| "grad_norm": 0.31534451246261597, | |
| "learning_rate": 0.0005461887012230634, | |
| "loss": 3.5806, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 4.5257335817419655, | |
| "grad_norm": 0.31664130091667175, | |
| "learning_rate": 0.000546013977868375, | |
| "loss": 3.5737, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 4.5402887750349326, | |
| "grad_norm": 0.2983573377132416, | |
| "learning_rate": 0.0005458392545136866, | |
| "loss": 3.5703, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 4.5548439683279, | |
| "grad_norm": 0.3048228919506073, | |
| "learning_rate": 0.0005456645311589983, | |
| "loss": 3.5696, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 4.569399161620867, | |
| "grad_norm": 0.3136667013168335, | |
| "learning_rate": 0.0005454898078043098, | |
| "loss": 3.5636, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 4.583954354913834, | |
| "grad_norm": 0.3322896361351013, | |
| "learning_rate": 0.0005453150844496213, | |
| "loss": 3.5747, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 4.5985095482068, | |
| "grad_norm": 0.326160192489624, | |
| "learning_rate": 0.000545140361094933, | |
| "loss": 3.5791, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 4.613064741499767, | |
| "grad_norm": 0.3007518947124481, | |
| "learning_rate": 0.0005449656377402445, | |
| "loss": 3.5794, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 4.627619934792734, | |
| "grad_norm": 0.330252081155777, | |
| "learning_rate": 0.0005447909143855562, | |
| "loss": 3.5762, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 4.642175128085701, | |
| "grad_norm": 0.32782429456710815, | |
| "learning_rate": 0.0005446161910308677, | |
| "loss": 3.576, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 4.656730321378668, | |
| "grad_norm": 0.30817028880119324, | |
| "learning_rate": 0.0005444414676761794, | |
| "loss": 3.5818, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.656730321378668, | |
| "eval_accuracy": 0.358599079220573, | |
| "eval_loss": 3.647080183029175, | |
| "eval_runtime": 82.0669, | |
| "eval_samples_per_second": 202.895, | |
| "eval_steps_per_second": 12.685, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.671285514671635, | |
| "grad_norm": 0.2996269762516022, | |
| "learning_rate": 0.0005442667443214909, | |
| "loss": 3.5833, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 4.685840707964601, | |
| "grad_norm": 0.30686891078948975, | |
| "learning_rate": 0.0005440920209668024, | |
| "loss": 3.5731, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 4.7003959012575685, | |
| "grad_norm": 0.31228163838386536, | |
| "learning_rate": 0.0005439172976121141, | |
| "loss": 3.5842, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 4.7149510945505355, | |
| "grad_norm": 0.326372355222702, | |
| "learning_rate": 0.0005437425742574257, | |
| "loss": 3.575, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 4.729506287843503, | |
| "grad_norm": 0.33600667119026184, | |
| "learning_rate": 0.0005435678509027373, | |
| "loss": 3.5846, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 4.74406148113647, | |
| "grad_norm": 0.31084907054901123, | |
| "learning_rate": 0.0005433931275480488, | |
| "loss": 3.5636, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 4.758616674429437, | |
| "grad_norm": 0.30431193113327026, | |
| "learning_rate": 0.0005432184041933605, | |
| "loss": 3.5846, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 4.773171867722404, | |
| "grad_norm": 0.30759352445602417, | |
| "learning_rate": 0.000543043680838672, | |
| "loss": 3.5722, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 4.78772706101537, | |
| "grad_norm": 0.3144029974937439, | |
| "learning_rate": 0.0005428689574839837, | |
| "loss": 3.5672, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 4.802282254308337, | |
| "grad_norm": 0.3099290728569031, | |
| "learning_rate": 0.0005426942341292952, | |
| "loss": 3.5707, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 4.816837447601304, | |
| "grad_norm": 0.30486756563186646, | |
| "learning_rate": 0.0005425195107746068, | |
| "loss": 3.5766, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 4.831392640894271, | |
| "grad_norm": 0.324359655380249, | |
| "learning_rate": 0.0005423447874199184, | |
| "loss": 3.5828, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 4.845947834187238, | |
| "grad_norm": 0.3312830924987793, | |
| "learning_rate": 0.00054217006406523, | |
| "loss": 3.5823, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 4.860503027480205, | |
| "grad_norm": 0.30742231011390686, | |
| "learning_rate": 0.0005419953407105417, | |
| "loss": 3.5909, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 4.875058220773171, | |
| "grad_norm": 0.3221106231212616, | |
| "learning_rate": 0.0005418206173558532, | |
| "loss": 3.5603, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 4.8896134140661385, | |
| "grad_norm": 0.30627861618995667, | |
| "learning_rate": 0.0005416458940011648, | |
| "loss": 3.5894, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 4.9041686073591055, | |
| "grad_norm": 0.3032422959804535, | |
| "learning_rate": 0.0005414711706464764, | |
| "loss": 3.5951, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 4.918723800652073, | |
| "grad_norm": 0.30214816331863403, | |
| "learning_rate": 0.000541296447291788, | |
| "loss": 3.5822, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 4.93327899394504, | |
| "grad_norm": 0.2999882698059082, | |
| "learning_rate": 0.0005411217239370995, | |
| "loss": 3.5606, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 4.947834187238007, | |
| "grad_norm": 0.29804569482803345, | |
| "learning_rate": 0.0005409470005824111, | |
| "loss": 3.5794, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.947834187238007, | |
| "eval_accuracy": 0.3596189811769443, | |
| "eval_loss": 3.6342313289642334, | |
| "eval_runtime": 82.2138, | |
| "eval_samples_per_second": 202.533, | |
| "eval_steps_per_second": 12.662, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.962389380530974, | |
| "grad_norm": 0.30328428745269775, | |
| "learning_rate": 0.0005407722772277228, | |
| "loss": 3.5833, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 4.976944573823941, | |
| "grad_norm": 0.31700846552848816, | |
| "learning_rate": 0.0005405975538730343, | |
| "loss": 3.5764, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 4.991499767116907, | |
| "grad_norm": 0.3086136281490326, | |
| "learning_rate": 0.0005404228305183459, | |
| "loss": 3.5699, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 5.005822077317187, | |
| "grad_norm": 0.3159460127353668, | |
| "learning_rate": 0.0005402481071636575, | |
| "loss": 3.5434, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 5.020377270610154, | |
| "grad_norm": 0.33103612065315247, | |
| "learning_rate": 0.0005400733838089692, | |
| "loss": 3.4745, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 5.034932463903121, | |
| "grad_norm": 0.30555590987205505, | |
| "learning_rate": 0.0005398986604542807, | |
| "loss": 3.4711, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 5.049487657196088, | |
| "grad_norm": 0.3130439817905426, | |
| "learning_rate": 0.0005397239370995922, | |
| "loss": 3.4664, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 5.064042850489055, | |
| "grad_norm": 0.3214947283267975, | |
| "learning_rate": 0.0005395492137449039, | |
| "loss": 3.4723, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 5.078598043782021, | |
| "grad_norm": 0.3237079381942749, | |
| "learning_rate": 0.0005393744903902154, | |
| "loss": 3.4938, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 5.093153237074988, | |
| "grad_norm": 0.30716630816459656, | |
| "learning_rate": 0.000539199767035527, | |
| "loss": 3.4889, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 5.107708430367955, | |
| "grad_norm": 0.3121359944343567, | |
| "learning_rate": 0.0005390250436808386, | |
| "loss": 3.49, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 5.122263623660922, | |
| "grad_norm": 0.3188546299934387, | |
| "learning_rate": 0.0005388503203261503, | |
| "loss": 3.4931, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 5.136818816953889, | |
| "grad_norm": 0.309587687253952, | |
| "learning_rate": 0.0005386755969714618, | |
| "loss": 3.4827, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 5.151374010246856, | |
| "grad_norm": 0.31933507323265076, | |
| "learning_rate": 0.0005385008736167733, | |
| "loss": 3.5022, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 5.165929203539823, | |
| "grad_norm": 0.3354766368865967, | |
| "learning_rate": 0.000538326150262085, | |
| "loss": 3.5086, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 5.18048439683279, | |
| "grad_norm": 0.3540804088115692, | |
| "learning_rate": 0.0005381514269073965, | |
| "loss": 3.4984, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 5.195039590125757, | |
| "grad_norm": 0.323789119720459, | |
| "learning_rate": 0.0005379767035527082, | |
| "loss": 3.5074, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 5.209594783418724, | |
| "grad_norm": 0.31788966059684753, | |
| "learning_rate": 0.0005378019801980197, | |
| "loss": 3.4993, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 5.224149976711691, | |
| "grad_norm": 0.3073675036430359, | |
| "learning_rate": 0.0005376272568433314, | |
| "loss": 3.489, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 5.238705170004658, | |
| "grad_norm": 0.3163228929042816, | |
| "learning_rate": 0.0005374525334886429, | |
| "loss": 3.4993, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.238705170004658, | |
| "eval_accuracy": 0.3596782149388723, | |
| "eval_loss": 3.6365137100219727, | |
| "eval_runtime": 82.0984, | |
| "eval_samples_per_second": 202.818, | |
| "eval_steps_per_second": 12.68, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.253260363297625, | |
| "grad_norm": 0.31673556566238403, | |
| "learning_rate": 0.0005372778101339545, | |
| "loss": 3.5069, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 5.267815556590591, | |
| "grad_norm": 0.33246222138404846, | |
| "learning_rate": 0.0005371030867792661, | |
| "loss": 3.5065, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 5.282370749883558, | |
| "grad_norm": 0.3224247395992279, | |
| "learning_rate": 0.0005369283634245778, | |
| "loss": 3.5157, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 5.296925943176525, | |
| "grad_norm": 0.32036086916923523, | |
| "learning_rate": 0.0005367536400698893, | |
| "loss": 3.5048, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 5.311481136469492, | |
| "grad_norm": 0.3066246509552002, | |
| "learning_rate": 0.0005365789167152008, | |
| "loss": 3.5118, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 5.326036329762459, | |
| "grad_norm": 0.3067922592163086, | |
| "learning_rate": 0.0005364041933605125, | |
| "loss": 3.506, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 5.340591523055426, | |
| "grad_norm": 0.3043152987957001, | |
| "learning_rate": 0.000536229470005824, | |
| "loss": 3.5055, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 5.3551467163483935, | |
| "grad_norm": 0.31479376554489136, | |
| "learning_rate": 0.0005360547466511357, | |
| "loss": 3.5106, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 5.36970190964136, | |
| "grad_norm": 0.31457990407943726, | |
| "learning_rate": 0.0005358800232964472, | |
| "loss": 3.5212, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 5.384257102934327, | |
| "grad_norm": 0.33203020691871643, | |
| "learning_rate": 0.0005357052999417589, | |
| "loss": 3.526, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 5.398812296227294, | |
| "grad_norm": 0.3407822251319885, | |
| "learning_rate": 0.0005355305765870704, | |
| "loss": 3.5158, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 5.413367489520261, | |
| "grad_norm": 0.359107106924057, | |
| "learning_rate": 0.000535355853232382, | |
| "loss": 3.5224, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 5.427922682813228, | |
| "grad_norm": 0.3023736774921417, | |
| "learning_rate": 0.0005351811298776936, | |
| "loss": 3.5188, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 5.442477876106195, | |
| "grad_norm": 0.3270534574985504, | |
| "learning_rate": 0.0005350064065230052, | |
| "loss": 3.5136, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 5.457033069399162, | |
| "grad_norm": 0.3224993348121643, | |
| "learning_rate": 0.0005348316831683168, | |
| "loss": 3.5153, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 5.471588262692128, | |
| "grad_norm": 0.3359697759151459, | |
| "learning_rate": 0.0005346569598136284, | |
| "loss": 3.5144, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 5.486143455985095, | |
| "grad_norm": 0.30153319239616394, | |
| "learning_rate": 0.00053448223645894, | |
| "loss": 3.5175, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 5.500698649278062, | |
| "grad_norm": 0.3074522018432617, | |
| "learning_rate": 0.0005343075131042515, | |
| "loss": 3.5175, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 5.515253842571029, | |
| "grad_norm": 0.3245329260826111, | |
| "learning_rate": 0.0005341327897495632, | |
| "loss": 3.5101, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 5.529809035863996, | |
| "grad_norm": 0.3213854730129242, | |
| "learning_rate": 0.0005339580663948748, | |
| "loss": 3.5089, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.529809035863996, | |
| "eval_accuracy": 0.36083515373335473, | |
| "eval_loss": 3.62589955329895, | |
| "eval_runtime": 82.6328, | |
| "eval_samples_per_second": 201.506, | |
| "eval_steps_per_second": 12.598, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.5443642291569635, | |
| "grad_norm": 0.3325282335281372, | |
| "learning_rate": 0.0005337833430401863, | |
| "loss": 3.5316, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 5.5589194224499305, | |
| "grad_norm": 0.3386688530445099, | |
| "learning_rate": 0.0005336086196854979, | |
| "loss": 3.5323, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 5.573474615742897, | |
| "grad_norm": 0.32731765508651733, | |
| "learning_rate": 0.0005334338963308095, | |
| "loss": 3.5228, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 5.588029809035864, | |
| "grad_norm": 0.3102168142795563, | |
| "learning_rate": 0.0005332591729761211, | |
| "loss": 3.5253, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 5.602585002328831, | |
| "grad_norm": 0.3269800841808319, | |
| "learning_rate": 0.0005330844496214327, | |
| "loss": 3.5301, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 5.617140195621798, | |
| "grad_norm": 0.3206944167613983, | |
| "learning_rate": 0.0005329097262667443, | |
| "loss": 3.5339, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 5.631695388914765, | |
| "grad_norm": 0.34097373485565186, | |
| "learning_rate": 0.0005327350029120559, | |
| "loss": 3.5198, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 5.646250582207732, | |
| "grad_norm": 0.32035309076309204, | |
| "learning_rate": 0.0005325602795573674, | |
| "loss": 3.5097, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 5.660805775500698, | |
| "grad_norm": 0.2980031967163086, | |
| "learning_rate": 0.000532385556202679, | |
| "loss": 3.5306, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 5.675360968793665, | |
| "grad_norm": 0.3016957938671112, | |
| "learning_rate": 0.0005322108328479906, | |
| "loss": 3.5215, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 5.689916162086632, | |
| "grad_norm": 0.33236822485923767, | |
| "learning_rate": 0.0005320361094933023, | |
| "loss": 3.5248, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 5.704471355379599, | |
| "grad_norm": 0.3233235776424408, | |
| "learning_rate": 0.0005318613861386138, | |
| "loss": 3.5154, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 5.719026548672566, | |
| "grad_norm": 0.2925451099872589, | |
| "learning_rate": 0.0005316866627839254, | |
| "loss": 3.5366, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 5.7335817419655335, | |
| "grad_norm": 0.3301728069782257, | |
| "learning_rate": 0.000531511939429237, | |
| "loss": 3.5184, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 5.748136935258501, | |
| "grad_norm": 0.31181296706199646, | |
| "learning_rate": 0.0005313372160745486, | |
| "loss": 3.5358, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 5.762692128551468, | |
| "grad_norm": 0.32187023758888245, | |
| "learning_rate": 0.0005311624927198602, | |
| "loss": 3.5241, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 5.777247321844434, | |
| "grad_norm": 0.3225005567073822, | |
| "learning_rate": 0.0005309877693651717, | |
| "loss": 3.5353, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 5.791802515137401, | |
| "grad_norm": 0.311990350484848, | |
| "learning_rate": 0.0005308130460104834, | |
| "loss": 3.5328, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 5.806357708430368, | |
| "grad_norm": 0.29590773582458496, | |
| "learning_rate": 0.0005306383226557949, | |
| "loss": 3.5329, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 5.820912901723335, | |
| "grad_norm": 0.36562469601631165, | |
| "learning_rate": 0.0005304635993011065, | |
| "loss": 3.5332, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.820912901723335, | |
| "eval_accuracy": 0.3619520157166915, | |
| "eval_loss": 3.6142091751098633, | |
| "eval_runtime": 82.0503, | |
| "eval_samples_per_second": 202.936, | |
| "eval_steps_per_second": 12.687, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.835468095016302, | |
| "grad_norm": 0.3292725086212158, | |
| "learning_rate": 0.0005302888759464181, | |
| "loss": 3.5292, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 5.850023288309269, | |
| "grad_norm": 0.2994527220726013, | |
| "learning_rate": 0.0005301141525917298, | |
| "loss": 3.5189, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 5.864578481602235, | |
| "grad_norm": 0.325177401304245, | |
| "learning_rate": 0.0005299394292370413, | |
| "loss": 3.5284, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 5.879133674895202, | |
| "grad_norm": 0.32849201560020447, | |
| "learning_rate": 0.0005297647058823528, | |
| "loss": 3.5254, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 5.893688868188169, | |
| "grad_norm": 0.31668519973754883, | |
| "learning_rate": 0.0005295899825276645, | |
| "loss": 3.5337, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 5.9082440614811365, | |
| "grad_norm": 0.344332218170166, | |
| "learning_rate": 0.000529415259172976, | |
| "loss": 3.5314, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 5.9227992547741035, | |
| "grad_norm": 0.2968040406703949, | |
| "learning_rate": 0.0005292405358182877, | |
| "loss": 3.5296, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 5.937354448067071, | |
| "grad_norm": 0.3091900050640106, | |
| "learning_rate": 0.0005290658124635992, | |
| "loss": 3.5235, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 5.951909641360038, | |
| "grad_norm": 0.3146153688430786, | |
| "learning_rate": 0.0005288910891089109, | |
| "loss": 3.5213, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 5.966464834653004, | |
| "grad_norm": 0.3385096490383148, | |
| "learning_rate": 0.0005287163657542224, | |
| "loss": 3.546, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 5.981020027945971, | |
| "grad_norm": 0.3091694116592407, | |
| "learning_rate": 0.000528541642399534, | |
| "loss": 3.5272, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 5.995575221238938, | |
| "grad_norm": 0.31724226474761963, | |
| "learning_rate": 0.0005283669190448456, | |
| "loss": 3.5378, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 6.009897531439218, | |
| "grad_norm": 0.3062690496444702, | |
| "learning_rate": 0.0005281921956901572, | |
| "loss": 3.4534, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 6.024452724732185, | |
| "grad_norm": 0.325293630361557, | |
| "learning_rate": 0.0005280174723354688, | |
| "loss": 3.4145, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 6.039007918025152, | |
| "grad_norm": 0.32353833317756653, | |
| "learning_rate": 0.0005278427489807804, | |
| "loss": 3.4264, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 6.053563111318118, | |
| "grad_norm": 0.332753449678421, | |
| "learning_rate": 0.000527668025626092, | |
| "loss": 3.4307, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 6.068118304611085, | |
| "grad_norm": 0.3216187059879303, | |
| "learning_rate": 0.0005274933022714035, | |
| "loss": 3.4272, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 6.082673497904052, | |
| "grad_norm": 0.31146958470344543, | |
| "learning_rate": 0.0005273185789167152, | |
| "loss": 3.4321, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 6.097228691197019, | |
| "grad_norm": 0.34451186656951904, | |
| "learning_rate": 0.0005271438555620268, | |
| "loss": 3.4401, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 6.111783884489986, | |
| "grad_norm": 0.32595980167388916, | |
| "learning_rate": 0.0005269691322073384, | |
| "loss": 3.4391, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.111783884489986, | |
| "eval_accuracy": 0.3624525645104441, | |
| "eval_loss": 3.616140365600586, | |
| "eval_runtime": 82.0511, | |
| "eval_samples_per_second": 202.935, | |
| "eval_steps_per_second": 12.687, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.126339077782953, | |
| "grad_norm": 0.34566399455070496, | |
| "learning_rate": 0.0005267944088526499, | |
| "loss": 3.4403, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 6.14089427107592, | |
| "grad_norm": 0.31366002559661865, | |
| "learning_rate": 0.0005266196854979615, | |
| "loss": 3.4449, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 6.155449464368886, | |
| "grad_norm": 0.3134048581123352, | |
| "learning_rate": 0.0005264449621432731, | |
| "loss": 3.4455, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 6.1700046576618535, | |
| "grad_norm": 0.3285142183303833, | |
| "learning_rate": 0.0005262702387885847, | |
| "loss": 3.4422, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 6.1845598509548205, | |
| "grad_norm": 0.33511438965797424, | |
| "learning_rate": 0.0005260955154338963, | |
| "loss": 3.4503, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 6.199115044247788, | |
| "grad_norm": 0.35477590560913086, | |
| "learning_rate": 0.0005259207920792079, | |
| "loss": 3.4448, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 6.213670237540755, | |
| "grad_norm": 0.3017437160015106, | |
| "learning_rate": 0.0005257460687245195, | |
| "loss": 3.4694, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 6.228225430833722, | |
| "grad_norm": 0.32487615942955017, | |
| "learning_rate": 0.000525571345369831, | |
| "loss": 3.4696, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 6.242780624126689, | |
| "grad_norm": 0.33012089133262634, | |
| "learning_rate": 0.0005253966220151426, | |
| "loss": 3.4586, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 6.257335817419655, | |
| "grad_norm": 0.3077701926231384, | |
| "learning_rate": 0.0005252218986604543, | |
| "loss": 3.4737, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 6.271891010712622, | |
| "grad_norm": 0.3275109827518463, | |
| "learning_rate": 0.0005250471753057658, | |
| "loss": 3.4622, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 6.286446204005589, | |
| "grad_norm": 0.33438587188720703, | |
| "learning_rate": 0.0005248724519510774, | |
| "loss": 3.4553, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 6.301001397298556, | |
| "grad_norm": 0.3386994004249573, | |
| "learning_rate": 0.000524697728596389, | |
| "loss": 3.4516, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 6.315556590591523, | |
| "grad_norm": 0.3206411600112915, | |
| "learning_rate": 0.0005245230052417006, | |
| "loss": 3.4649, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 6.33011178388449, | |
| "grad_norm": 0.31453728675842285, | |
| "learning_rate": 0.0005243482818870122, | |
| "loss": 3.4812, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 6.344666977177457, | |
| "grad_norm": 0.31217673420906067, | |
| "learning_rate": 0.0005241735585323238, | |
| "loss": 3.468, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 6.3592221704704235, | |
| "grad_norm": 0.33042481541633606, | |
| "learning_rate": 0.0005239988351776354, | |
| "loss": 3.479, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 6.3737773637633905, | |
| "grad_norm": 0.3223484754562378, | |
| "learning_rate": 0.0005238241118229469, | |
| "loss": 3.4638, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 6.388332557056358, | |
| "grad_norm": 0.3440265953540802, | |
| "learning_rate": 0.0005236493884682585, | |
| "loss": 3.4663, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 6.402887750349325, | |
| "grad_norm": 0.36614903807640076, | |
| "learning_rate": 0.0005234746651135701, | |
| "loss": 3.4754, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.402887750349325, | |
| "eval_accuracy": 0.36324951716844756, | |
| "eval_loss": 3.6095733642578125, | |
| "eval_runtime": 82.2643, | |
| "eval_samples_per_second": 202.408, | |
| "eval_steps_per_second": 12.654, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.417442943642292, | |
| "grad_norm": 0.3100200593471527, | |
| "learning_rate": 0.0005232999417588818, | |
| "loss": 3.4784, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 6.431998136935259, | |
| "grad_norm": 0.3248816728591919, | |
| "learning_rate": 0.0005231252184041933, | |
| "loss": 3.4871, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 6.446553330228225, | |
| "grad_norm": 0.3334600031375885, | |
| "learning_rate": 0.0005229504950495049, | |
| "loss": 3.4709, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 6.461108523521192, | |
| "grad_norm": 0.3175130784511566, | |
| "learning_rate": 0.0005227757716948165, | |
| "loss": 3.4712, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 6.475663716814159, | |
| "grad_norm": 0.30484238266944885, | |
| "learning_rate": 0.000522601048340128, | |
| "loss": 3.4694, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 6.490218910107126, | |
| "grad_norm": 0.31331515312194824, | |
| "learning_rate": 0.0005224263249854397, | |
| "loss": 3.4758, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 6.504774103400093, | |
| "grad_norm": 0.30935755372047424, | |
| "learning_rate": 0.0005222516016307512, | |
| "loss": 3.483, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 6.51932929669306, | |
| "grad_norm": 0.32532238960266113, | |
| "learning_rate": 0.0005220768782760629, | |
| "loss": 3.4779, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 6.533884489986027, | |
| "grad_norm": 0.3082233965396881, | |
| "learning_rate": 0.0005219021549213744, | |
| "loss": 3.4797, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 6.548439683278994, | |
| "grad_norm": 0.3147376477718353, | |
| "learning_rate": 0.000521727431566686, | |
| "loss": 3.4789, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 6.562994876571961, | |
| "grad_norm": 0.34049519896507263, | |
| "learning_rate": 0.0005215527082119976, | |
| "loss": 3.4823, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 6.577550069864928, | |
| "grad_norm": 0.33218836784362793, | |
| "learning_rate": 0.0005213779848573093, | |
| "loss": 3.4951, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 6.592105263157895, | |
| "grad_norm": 0.3302823007106781, | |
| "learning_rate": 0.0005212032615026208, | |
| "loss": 3.4838, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 6.606660456450862, | |
| "grad_norm": 0.3235093057155609, | |
| "learning_rate": 0.0005210285381479323, | |
| "loss": 3.4866, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 6.621215649743829, | |
| "grad_norm": 0.31711602210998535, | |
| "learning_rate": 0.000520853814793244, | |
| "loss": 3.4811, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 6.635770843036796, | |
| "grad_norm": 0.3168444335460663, | |
| "learning_rate": 0.0005206790914385555, | |
| "loss": 3.4842, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 6.650326036329762, | |
| "grad_norm": 0.33062317967414856, | |
| "learning_rate": 0.0005205043680838672, | |
| "loss": 3.4835, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 6.664881229622729, | |
| "grad_norm": 0.3226763606071472, | |
| "learning_rate": 0.0005203296447291787, | |
| "loss": 3.493, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 6.679436422915696, | |
| "grad_norm": 0.3275633752346039, | |
| "learning_rate": 0.0005201549213744904, | |
| "loss": 3.4856, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 6.693991616208663, | |
| "grad_norm": 0.30816447734832764, | |
| "learning_rate": 0.0005199801980198019, | |
| "loss": 3.4919, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.693991616208663, | |
| "eval_accuracy": 0.3638809913804299, | |
| "eval_loss": 3.600884437561035, | |
| "eval_runtime": 82.1269, | |
| "eval_samples_per_second": 202.747, | |
| "eval_steps_per_second": 12.676, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.70854680950163, | |
| "grad_norm": 0.3241273760795593, | |
| "learning_rate": 0.0005198054746651136, | |
| "loss": 3.4959, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 6.723102002794597, | |
| "grad_norm": 0.30980348587036133, | |
| "learning_rate": 0.0005196307513104251, | |
| "loss": 3.4824, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 6.737657196087564, | |
| "grad_norm": 0.32089006900787354, | |
| "learning_rate": 0.0005194560279557367, | |
| "loss": 3.48, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 6.752212389380531, | |
| "grad_norm": 0.30582284927368164, | |
| "learning_rate": 0.0005192813046010483, | |
| "loss": 3.4961, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 6.766767582673498, | |
| "grad_norm": 0.30927014350891113, | |
| "learning_rate": 0.0005191065812463599, | |
| "loss": 3.4827, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 6.781322775966465, | |
| "grad_norm": 0.3146321177482605, | |
| "learning_rate": 0.0005189318578916715, | |
| "loss": 3.5003, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 6.795877969259432, | |
| "grad_norm": 0.314066082239151, | |
| "learning_rate": 0.000518757134536983, | |
| "loss": 3.4864, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 6.810433162552399, | |
| "grad_norm": 0.31872811913490295, | |
| "learning_rate": 0.0005185824111822947, | |
| "loss": 3.4935, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 6.824988355845366, | |
| "grad_norm": 0.32462644577026367, | |
| "learning_rate": 0.0005184076878276063, | |
| "loss": 3.4816, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 6.839543549138332, | |
| "grad_norm": 0.33121970295906067, | |
| "learning_rate": 0.0005182329644729179, | |
| "loss": 3.502, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 6.854098742431299, | |
| "grad_norm": 0.32338279485702515, | |
| "learning_rate": 0.0005180582411182294, | |
| "loss": 3.4978, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 6.868653935724266, | |
| "grad_norm": 0.325857937335968, | |
| "learning_rate": 0.000517883517763541, | |
| "loss": 3.4942, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 6.883209129017233, | |
| "grad_norm": 0.31738531589508057, | |
| "learning_rate": 0.0005177087944088526, | |
| "loss": 3.4896, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 6.8977643223102, | |
| "grad_norm": 0.33013615012168884, | |
| "learning_rate": 0.0005175340710541642, | |
| "loss": 3.4956, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 6.912319515603167, | |
| "grad_norm": 0.31880733370780945, | |
| "learning_rate": 0.0005173593476994758, | |
| "loss": 3.4905, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 6.926874708896134, | |
| "grad_norm": 0.3099628984928131, | |
| "learning_rate": 0.0005171846243447874, | |
| "loss": 3.4882, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 6.9414299021891015, | |
| "grad_norm": 0.3184796869754791, | |
| "learning_rate": 0.000517009900990099, | |
| "loss": 3.4902, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 6.955985095482068, | |
| "grad_norm": 0.32240450382232666, | |
| "learning_rate": 0.0005168351776354105, | |
| "loss": 3.4919, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 6.970540288775035, | |
| "grad_norm": 0.329256534576416, | |
| "learning_rate": 0.0005166604542807221, | |
| "loss": 3.491, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 6.985095482068002, | |
| "grad_norm": 0.3288686275482178, | |
| "learning_rate": 0.0005164857309260338, | |
| "loss": 3.4905, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 6.985095482068002, | |
| "eval_accuracy": 0.36473482725425305, | |
| "eval_loss": 3.591662883758545, | |
| "eval_runtime": 82.1633, | |
| "eval_samples_per_second": 202.657, | |
| "eval_steps_per_second": 12.67, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 6.999650675360969, | |
| "grad_norm": 0.3294258117675781, | |
| "learning_rate": 0.0005163110075713453, | |
| "loss": 3.5073, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 7.0139729855612485, | |
| "grad_norm": 0.348611444234848, | |
| "learning_rate": 0.0005161362842166569, | |
| "loss": 3.4025, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 7.0285281788542155, | |
| "grad_norm": 0.31022247672080994, | |
| "learning_rate": 0.0005159615608619685, | |
| "loss": 3.3809, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 7.043083372147182, | |
| "grad_norm": 0.31374993920326233, | |
| "learning_rate": 0.0005157868375072801, | |
| "loss": 3.4001, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 7.057638565440149, | |
| "grad_norm": 0.34166577458381653, | |
| "learning_rate": 0.0005156121141525917, | |
| "loss": 3.39, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 7.072193758733116, | |
| "grad_norm": 0.34696874022483826, | |
| "learning_rate": 0.0005154373907979033, | |
| "loss": 3.3987, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 7.086748952026083, | |
| "grad_norm": 0.31268155574798584, | |
| "learning_rate": 0.0005152626674432149, | |
| "loss": 3.4064, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 7.10130414531905, | |
| "grad_norm": 0.3202010989189148, | |
| "learning_rate": 0.0005150879440885264, | |
| "loss": 3.404, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 7.115859338612017, | |
| "grad_norm": 0.34412631392478943, | |
| "learning_rate": 0.000514913220733838, | |
| "loss": 3.4127, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 7.130414531904984, | |
| "grad_norm": 0.33206769824028015, | |
| "learning_rate": 0.0005147384973791496, | |
| "loss": 3.4102, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 7.14496972519795, | |
| "grad_norm": 0.3362234830856323, | |
| "learning_rate": 0.0005145637740244613, | |
| "loss": 3.3956, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 7.159524918490917, | |
| "grad_norm": 0.34696847200393677, | |
| "learning_rate": 0.0005143890506697728, | |
| "loss": 3.4118, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 7.174080111783884, | |
| "grad_norm": 0.3048447370529175, | |
| "learning_rate": 0.0005142143273150844, | |
| "loss": 3.4247, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 7.1886353050768514, | |
| "grad_norm": 0.34809452295303345, | |
| "learning_rate": 0.000514039603960396, | |
| "loss": 3.4142, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 7.2031904983698185, | |
| "grad_norm": 0.3247174620628357, | |
| "learning_rate": 0.0005138648806057075, | |
| "loss": 3.4102, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 7.217745691662786, | |
| "grad_norm": 0.31421977281570435, | |
| "learning_rate": 0.0005136901572510192, | |
| "loss": 3.4188, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 7.232300884955753, | |
| "grad_norm": 0.33095836639404297, | |
| "learning_rate": 0.0005135154338963307, | |
| "loss": 3.4093, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 7.246856078248719, | |
| "grad_norm": 0.31490859389305115, | |
| "learning_rate": 0.0005133407105416424, | |
| "loss": 3.433, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 7.261411271541686, | |
| "grad_norm": 0.3200567662715912, | |
| "learning_rate": 0.0005131659871869539, | |
| "loss": 3.4254, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 7.275966464834653, | |
| "grad_norm": 0.32920682430267334, | |
| "learning_rate": 0.0005129912638322656, | |
| "loss": 3.4381, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 7.275966464834653, | |
| "eval_accuracy": 0.3647159053580816, | |
| "eval_loss": 3.599224090576172, | |
| "eval_runtime": 82.091, | |
| "eval_samples_per_second": 202.836, | |
| "eval_steps_per_second": 12.681, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 7.29052165812762, | |
| "grad_norm": 0.3556191027164459, | |
| "learning_rate": 0.0005128165404775771, | |
| "loss": 3.4261, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 7.305076851420587, | |
| "grad_norm": 0.3362888991832733, | |
| "learning_rate": 0.0005126418171228888, | |
| "loss": 3.4253, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 7.319632044713554, | |
| "grad_norm": 0.32096198201179504, | |
| "learning_rate": 0.0005124670937682003, | |
| "loss": 3.4288, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 7.334187238006521, | |
| "grad_norm": 0.34141862392425537, | |
| "learning_rate": 0.000512292370413512, | |
| "loss": 3.4274, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 7.348742431299487, | |
| "grad_norm": 0.3296661972999573, | |
| "learning_rate": 0.0005121176470588235, | |
| "loss": 3.4314, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 7.363297624592454, | |
| "grad_norm": 0.3159695565700531, | |
| "learning_rate": 0.000511942923704135, | |
| "loss": 3.4332, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 7.3778528178854215, | |
| "grad_norm": 0.31936854124069214, | |
| "learning_rate": 0.0005117682003494467, | |
| "loss": 3.4285, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 7.3924080111783885, | |
| "grad_norm": 0.3414633274078369, | |
| "learning_rate": 0.0005115934769947583, | |
| "loss": 3.4435, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 7.406963204471356, | |
| "grad_norm": 0.31653451919555664, | |
| "learning_rate": 0.0005114187536400699, | |
| "loss": 3.4441, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 7.421518397764323, | |
| "grad_norm": 0.35194024443626404, | |
| "learning_rate": 0.0005112440302853814, | |
| "loss": 3.428, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 7.436073591057289, | |
| "grad_norm": 0.3239954710006714, | |
| "learning_rate": 0.0005110693069306931, | |
| "loss": 3.44, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 7.450628784350256, | |
| "grad_norm": 0.3426033854484558, | |
| "learning_rate": 0.0005108945835760046, | |
| "loss": 3.4449, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 7.465183977643223, | |
| "grad_norm": 0.3274969458580017, | |
| "learning_rate": 0.0005107198602213162, | |
| "loss": 3.4332, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 7.47973917093619, | |
| "grad_norm": 0.3782486319541931, | |
| "learning_rate": 0.0005105451368666278, | |
| "loss": 3.4434, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 7.494294364229157, | |
| "grad_norm": 0.33219799399375916, | |
| "learning_rate": 0.0005103704135119394, | |
| "loss": 3.4469, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 7.508849557522124, | |
| "grad_norm": 0.32420504093170166, | |
| "learning_rate": 0.000510195690157251, | |
| "loss": 3.4549, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 7.523404750815091, | |
| "grad_norm": 0.3298400640487671, | |
| "learning_rate": 0.0005100209668025625, | |
| "loss": 3.4335, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 7.537959944108058, | |
| "grad_norm": 0.3362780809402466, | |
| "learning_rate": 0.0005098462434478742, | |
| "loss": 3.4504, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 7.552515137401024, | |
| "grad_norm": 0.34972232580184937, | |
| "learning_rate": 0.0005096715200931858, | |
| "loss": 3.4557, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 7.5670703306939915, | |
| "grad_norm": 0.3209405839443207, | |
| "learning_rate": 0.0005094967967384974, | |
| "loss": 3.4452, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.5670703306939915, | |
| "eval_accuracy": 0.3653472620427585, | |
| "eval_loss": 3.592453718185425, | |
| "eval_runtime": 82.12, | |
| "eval_samples_per_second": 202.764, | |
| "eval_steps_per_second": 12.677, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.5816255239869585, | |
| "grad_norm": 0.32272616028785706, | |
| "learning_rate": 0.0005093220733838089, | |
| "loss": 3.445, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 7.596180717279926, | |
| "grad_norm": 0.30670806765556335, | |
| "learning_rate": 0.0005091473500291205, | |
| "loss": 3.4596, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 7.610735910572893, | |
| "grad_norm": 0.3245435953140259, | |
| "learning_rate": 0.0005089726266744321, | |
| "loss": 3.4578, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 7.625291103865859, | |
| "grad_norm": 0.3339212238788605, | |
| "learning_rate": 0.0005087979033197437, | |
| "loss": 3.4617, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 7.639846297158826, | |
| "grad_norm": 0.31464093923568726, | |
| "learning_rate": 0.0005086231799650553, | |
| "loss": 3.4506, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 7.654401490451793, | |
| "grad_norm": 0.3026569187641144, | |
| "learning_rate": 0.0005084484566103669, | |
| "loss": 3.4549, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 7.66895668374476, | |
| "grad_norm": 0.3302854299545288, | |
| "learning_rate": 0.0005082737332556785, | |
| "loss": 3.451, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 7.683511877037727, | |
| "grad_norm": 0.33177828788757324, | |
| "learning_rate": 0.00050809900990099, | |
| "loss": 3.4434, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 7.698067070330694, | |
| "grad_norm": 0.3142891526222229, | |
| "learning_rate": 0.0005079242865463016, | |
| "loss": 3.4609, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 7.712622263623661, | |
| "grad_norm": 0.33711951971054077, | |
| "learning_rate": 0.0005077495631916133, | |
| "loss": 3.4506, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 7.727177456916628, | |
| "grad_norm": 0.3496488630771637, | |
| "learning_rate": 0.0005075748398369248, | |
| "loss": 3.4559, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 7.7417326502095944, | |
| "grad_norm": 0.32643625140190125, | |
| "learning_rate": 0.0005074001164822364, | |
| "loss": 3.4585, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 7.7562878435025615, | |
| "grad_norm": 0.35252007842063904, | |
| "learning_rate": 0.000507225393127548, | |
| "loss": 3.4578, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 7.770843036795529, | |
| "grad_norm": 0.3173462152481079, | |
| "learning_rate": 0.0005070506697728596, | |
| "loss": 3.4576, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 7.785398230088496, | |
| "grad_norm": 0.32679131627082825, | |
| "learning_rate": 0.0005068759464181711, | |
| "loss": 3.4617, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 7.799953423381463, | |
| "grad_norm": 0.3220043182373047, | |
| "learning_rate": 0.0005067012230634828, | |
| "loss": 3.461, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 7.81450861667443, | |
| "grad_norm": 0.3321243226528168, | |
| "learning_rate": 0.0005065264997087944, | |
| "loss": 3.4533, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 7.829063809967396, | |
| "grad_norm": 0.3184472918510437, | |
| "learning_rate": 0.0005063517763541059, | |
| "loss": 3.4589, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 7.843619003260363, | |
| "grad_norm": 0.30641835927963257, | |
| "learning_rate": 0.0005061770529994175, | |
| "loss": 3.4634, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 7.85817419655333, | |
| "grad_norm": 0.3150516152381897, | |
| "learning_rate": 0.0005060023296447291, | |
| "loss": 3.4527, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.85817419655333, | |
| "eval_accuracy": 0.3658183114828526, | |
| "eval_loss": 3.5826029777526855, | |
| "eval_runtime": 81.8134, | |
| "eval_samples_per_second": 203.524, | |
| "eval_steps_per_second": 12.724, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.872729389846297, | |
| "grad_norm": 0.3462480306625366, | |
| "learning_rate": 0.0005058276062900408, | |
| "loss": 3.4463, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 7.887284583139264, | |
| "grad_norm": 0.3430558741092682, | |
| "learning_rate": 0.0005056528829353523, | |
| "loss": 3.453, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 7.901839776432231, | |
| "grad_norm": 0.3238520324230194, | |
| "learning_rate": 0.000505478159580664, | |
| "loss": 3.454, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 7.916394969725198, | |
| "grad_norm": 0.33356767892837524, | |
| "learning_rate": 0.0005053034362259755, | |
| "loss": 3.4622, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 7.930950163018165, | |
| "grad_norm": 0.3406793475151062, | |
| "learning_rate": 0.000505128712871287, | |
| "loss": 3.4768, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 7.9455053563111315, | |
| "grad_norm": 0.3559691905975342, | |
| "learning_rate": 0.0005049539895165987, | |
| "loss": 3.4633, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 7.960060549604099, | |
| "grad_norm": 0.3063855767250061, | |
| "learning_rate": 0.0005047792661619103, | |
| "loss": 3.4613, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 7.974615742897066, | |
| "grad_norm": 0.32381609082221985, | |
| "learning_rate": 0.0005046045428072219, | |
| "loss": 3.4667, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 7.989170936190033, | |
| "grad_norm": 0.3121221661567688, | |
| "learning_rate": 0.0005044298194525334, | |
| "loss": 3.4587, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 8.003493246390311, | |
| "grad_norm": 0.3363412618637085, | |
| "learning_rate": 0.0005042550960978451, | |
| "loss": 3.4454, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 8.018048439683279, | |
| "grad_norm": 0.3512321710586548, | |
| "learning_rate": 0.0005040803727431566, | |
| "loss": 3.3513, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 8.032603632976246, | |
| "grad_norm": 0.3447287678718567, | |
| "learning_rate": 0.0005039056493884683, | |
| "loss": 3.3463, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 8.047158826269213, | |
| "grad_norm": 0.32655876874923706, | |
| "learning_rate": 0.0005037309260337798, | |
| "loss": 3.3499, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 8.06171401956218, | |
| "grad_norm": 0.3093357980251312, | |
| "learning_rate": 0.0005035562026790914, | |
| "loss": 3.3581, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 8.076269212855147, | |
| "grad_norm": 0.332110196352005, | |
| "learning_rate": 0.000503381479324403, | |
| "loss": 3.3593, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 8.090824406148114, | |
| "grad_norm": 0.33787426352500916, | |
| "learning_rate": 0.0005032067559697145, | |
| "loss": 3.3657, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 8.10537959944108, | |
| "grad_norm": 0.31102797389030457, | |
| "learning_rate": 0.0005030320326150262, | |
| "loss": 3.3654, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 8.119934792734048, | |
| "grad_norm": 0.3368318974971771, | |
| "learning_rate": 0.0005028573092603378, | |
| "loss": 3.3785, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 8.134489986027015, | |
| "grad_norm": 0.3380694091320038, | |
| "learning_rate": 0.0005026825859056494, | |
| "loss": 3.383, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 8.149045179319982, | |
| "grad_norm": 0.3217196464538574, | |
| "learning_rate": 0.0005025078625509609, | |
| "loss": 3.3971, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 8.149045179319982, | |
| "eval_accuracy": 0.36589305884909507, | |
| "eval_loss": 3.587799549102783, | |
| "eval_runtime": 82.1071, | |
| "eval_samples_per_second": 202.796, | |
| "eval_steps_per_second": 12.679, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 8.16360037261295, | |
| "grad_norm": 0.3217926621437073, | |
| "learning_rate": 0.0005023331391962726, | |
| "loss": 3.3885, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 8.178155565905914, | |
| "grad_norm": 0.306947261095047, | |
| "learning_rate": 0.0005021584158415841, | |
| "loss": 3.3801, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 8.192710759198881, | |
| "grad_norm": 0.3642211854457855, | |
| "learning_rate": 0.0005019836924868956, | |
| "loss": 3.3901, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 8.207265952491849, | |
| "grad_norm": 0.3352174758911133, | |
| "learning_rate": 0.0005018089691322073, | |
| "loss": 3.4066, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 8.221821145784816, | |
| "grad_norm": 0.3425366282463074, | |
| "learning_rate": 0.0005016342457775189, | |
| "loss": 3.404, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 8.236376339077783, | |
| "grad_norm": 0.35951751470565796, | |
| "learning_rate": 0.0005014595224228305, | |
| "loss": 3.4033, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 8.25093153237075, | |
| "grad_norm": 0.36366644501686096, | |
| "learning_rate": 0.000501284799068142, | |
| "loss": 3.3906, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 8.265486725663717, | |
| "grad_norm": 0.3793318569660187, | |
| "learning_rate": 0.0005011100757134537, | |
| "loss": 3.409, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 8.280041918956684, | |
| "grad_norm": 0.32547420263290405, | |
| "learning_rate": 0.0005009353523587653, | |
| "loss": 3.4002, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 8.294597112249651, | |
| "grad_norm": 0.31935733556747437, | |
| "learning_rate": 0.0005007606290040768, | |
| "loss": 3.3934, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 8.309152305542618, | |
| "grad_norm": 0.3431851863861084, | |
| "learning_rate": 0.0005005859056493884, | |
| "loss": 3.3893, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 8.323707498835585, | |
| "grad_norm": 0.35621240735054016, | |
| "learning_rate": 0.0005004111822947, | |
| "loss": 3.398, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 8.338262692128552, | |
| "grad_norm": 0.324577271938324, | |
| "learning_rate": 0.0005002364589400116, | |
| "loss": 3.3985, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 8.35281788542152, | |
| "grad_norm": 0.345625638961792, | |
| "learning_rate": 0.0005000617355853231, | |
| "loss": 3.4093, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 8.367373078714486, | |
| "grad_norm": 0.33227813243865967, | |
| "learning_rate": 0.0004998870122306348, | |
| "loss": 3.4129, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 8.381928272007451, | |
| "grad_norm": 0.33062544465065, | |
| "learning_rate": 0.0004997122888759464, | |
| "loss": 3.4099, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 8.396483465300419, | |
| "grad_norm": 0.31836047768592834, | |
| "learning_rate": 0.000499537565521258, | |
| "loss": 3.3991, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 8.411038658593386, | |
| "grad_norm": 0.3148673176765442, | |
| "learning_rate": 0.0004993628421665695, | |
| "loss": 3.4142, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 8.425593851886353, | |
| "grad_norm": 0.3396204710006714, | |
| "learning_rate": 0.0004991881188118811, | |
| "loss": 3.4085, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 8.44014904517932, | |
| "grad_norm": 0.35046735405921936, | |
| "learning_rate": 0.0004990133954571928, | |
| "loss": 3.3992, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.44014904517932, | |
| "eval_accuracy": 0.36625092949407667, | |
| "eval_loss": 3.582655429840088, | |
| "eval_runtime": 81.9547, | |
| "eval_samples_per_second": 203.173, | |
| "eval_steps_per_second": 12.702, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.454704238472287, | |
| "grad_norm": 0.33257773518562317, | |
| "learning_rate": 0.0004988386721025043, | |
| "loss": 3.4105, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 8.469259431765254, | |
| "grad_norm": 0.3485126197338104, | |
| "learning_rate": 0.0004986639487478159, | |
| "loss": 3.4052, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 8.483814625058221, | |
| "grad_norm": 0.3245488107204437, | |
| "learning_rate": 0.0004984892253931275, | |
| "loss": 3.4039, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 8.498369818351188, | |
| "grad_norm": 0.34247222542762756, | |
| "learning_rate": 0.0004983145020384391, | |
| "loss": 3.4078, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 8.512925011644155, | |
| "grad_norm": 0.3228091299533844, | |
| "learning_rate": 0.0004981397786837507, | |
| "loss": 3.41, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 8.527480204937122, | |
| "grad_norm": 0.3263584077358246, | |
| "learning_rate": 0.0004979650553290622, | |
| "loss": 3.4211, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 8.54203539823009, | |
| "grad_norm": 0.32324716448783875, | |
| "learning_rate": 0.0004977903319743739, | |
| "loss": 3.4323, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 8.556590591523056, | |
| "grad_norm": 0.3339226543903351, | |
| "learning_rate": 0.0004976156086196854, | |
| "loss": 3.419, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 8.571145784816022, | |
| "grad_norm": 0.33744218945503235, | |
| "learning_rate": 0.0004974408852649971, | |
| "loss": 3.4296, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 8.585700978108989, | |
| "grad_norm": 0.31706488132476807, | |
| "learning_rate": 0.0004972661619103086, | |
| "loss": 3.4172, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 8.600256171401956, | |
| "grad_norm": 0.32904085516929626, | |
| "learning_rate": 0.0004970914385556202, | |
| "loss": 3.4146, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 8.614811364694923, | |
| "grad_norm": 0.32602569460868835, | |
| "learning_rate": 0.0004969167152009318, | |
| "loss": 3.4187, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 8.62936655798789, | |
| "grad_norm": 0.33230870962142944, | |
| "learning_rate": 0.0004967419918462435, | |
| "loss": 3.4349, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 8.643921751280857, | |
| "grad_norm": 0.32816842198371887, | |
| "learning_rate": 0.000496567268491555, | |
| "loss": 3.428, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 8.658476944573824, | |
| "grad_norm": 0.33668944239616394, | |
| "learning_rate": 0.0004963925451368665, | |
| "loss": 3.4247, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 8.673032137866791, | |
| "grad_norm": 0.34307777881622314, | |
| "learning_rate": 0.0004962178217821782, | |
| "loss": 3.4277, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 8.687587331159758, | |
| "grad_norm": 0.3121023178100586, | |
| "learning_rate": 0.0004960430984274898, | |
| "loss": 3.428, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 8.702142524452725, | |
| "grad_norm": 0.338593065738678, | |
| "learning_rate": 0.0004958683750728014, | |
| "loss": 3.4189, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 8.716697717745692, | |
| "grad_norm": 0.33794039487838745, | |
| "learning_rate": 0.0004956936517181129, | |
| "loss": 3.4363, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 8.73125291103866, | |
| "grad_norm": 0.3358210027217865, | |
| "learning_rate": 0.0004955189283634246, | |
| "loss": 3.4274, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.73125291103866, | |
| "eval_accuracy": 0.36690590916714155, | |
| "eval_loss": 3.576470136642456, | |
| "eval_runtime": 82.0698, | |
| "eval_samples_per_second": 202.888, | |
| "eval_steps_per_second": 12.684, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.745808104331626, | |
| "grad_norm": 0.32459598779678345, | |
| "learning_rate": 0.0004953442050087361, | |
| "loss": 3.4198, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 8.760363297624593, | |
| "grad_norm": 0.342616468667984, | |
| "learning_rate": 0.0004951694816540476, | |
| "loss": 3.4191, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 8.774918490917559, | |
| "grad_norm": 0.31193557381629944, | |
| "learning_rate": 0.0004949947582993593, | |
| "loss": 3.4189, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 8.789473684210526, | |
| "grad_norm": 0.32262343168258667, | |
| "learning_rate": 0.0004948200349446709, | |
| "loss": 3.4277, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 8.804028877503493, | |
| "grad_norm": 0.3497733175754547, | |
| "learning_rate": 0.0004946453115899825, | |
| "loss": 3.4353, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 8.81858407079646, | |
| "grad_norm": 0.3206048011779785, | |
| "learning_rate": 0.000494470588235294, | |
| "loss": 3.4199, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 8.833139264089427, | |
| "grad_norm": 0.3430269658565521, | |
| "learning_rate": 0.0004942958648806057, | |
| "loss": 3.437, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 8.847694457382394, | |
| "grad_norm": 0.3212587833404541, | |
| "learning_rate": 0.0004941211415259173, | |
| "loss": 3.4332, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 8.862249650675361, | |
| "grad_norm": 0.32273635268211365, | |
| "learning_rate": 0.0004939464181712289, | |
| "loss": 3.4365, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 8.876804843968328, | |
| "grad_norm": 0.3025802969932556, | |
| "learning_rate": 0.0004937716948165404, | |
| "loss": 3.4226, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 8.891360037261295, | |
| "grad_norm": 0.32775530219078064, | |
| "learning_rate": 0.000493596971461852, | |
| "loss": 3.4479, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 8.905915230554262, | |
| "grad_norm": 0.3203197717666626, | |
| "learning_rate": 0.0004934222481071636, | |
| "loss": 3.4395, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 8.92047042384723, | |
| "grad_norm": 0.3446383476257324, | |
| "learning_rate": 0.0004932475247524751, | |
| "loss": 3.4261, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 8.935025617140196, | |
| "grad_norm": 0.3415515422821045, | |
| "learning_rate": 0.0004930728013977868, | |
| "loss": 3.4381, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 8.949580810433163, | |
| "grad_norm": 0.320813924074173, | |
| "learning_rate": 0.0004928980780430984, | |
| "loss": 3.4509, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 8.964136003726129, | |
| "grad_norm": 0.3143281638622284, | |
| "learning_rate": 0.00049272335468841, | |
| "loss": 3.4312, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 8.978691197019096, | |
| "grad_norm": 0.3112315535545349, | |
| "learning_rate": 0.0004925486313337215, | |
| "loss": 3.4395, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 8.993246390312063, | |
| "grad_norm": 0.32467585802078247, | |
| "learning_rate": 0.0004923739079790332, | |
| "loss": 3.4332, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 9.007568700512342, | |
| "grad_norm": 0.32676589488983154, | |
| "learning_rate": 0.0004921991846243447, | |
| "loss": 3.3757, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 9.02212389380531, | |
| "grad_norm": 0.3479295074939728, | |
| "learning_rate": 0.0004920244612696563, | |
| "loss": 3.3172, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 9.02212389380531, | |
| "eval_accuracy": 0.3669981681018905, | |
| "eval_loss": 3.581702709197998, | |
| "eval_runtime": 81.9193, | |
| "eval_samples_per_second": 203.261, | |
| "eval_steps_per_second": 12.708, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 9.036679087098276, | |
| "grad_norm": 0.3291727602481842, | |
| "learning_rate": 0.0004918497379149679, | |
| "loss": 3.3251, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 9.051234280391244, | |
| "grad_norm": 0.34830352663993835, | |
| "learning_rate": 0.0004916750145602795, | |
| "loss": 3.325, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 9.06578947368421, | |
| "grad_norm": 0.373354971408844, | |
| "learning_rate": 0.0004915002912055911, | |
| "loss": 3.3305, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 9.080344666977178, | |
| "grad_norm": 0.3475451171398163, | |
| "learning_rate": 0.0004913255678509026, | |
| "loss": 3.3425, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 9.094899860270145, | |
| "grad_norm": 0.3289613425731659, | |
| "learning_rate": 0.0004911508444962143, | |
| "loss": 3.3407, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 9.109455053563112, | |
| "grad_norm": 0.3524779975414276, | |
| "learning_rate": 0.0004909761211415259, | |
| "loss": 3.3467, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 9.124010246856079, | |
| "grad_norm": 0.3463732600212097, | |
| "learning_rate": 0.0004908013977868375, | |
| "loss": 3.3478, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 9.138565440149046, | |
| "grad_norm": 0.32002779841423035, | |
| "learning_rate": 0.0004906266744321491, | |
| "loss": 3.3472, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 9.153120633442011, | |
| "grad_norm": 0.3417486846446991, | |
| "learning_rate": 0.0004904519510774606, | |
| "loss": 3.3621, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 9.167675826734978, | |
| "grad_norm": 0.3425775468349457, | |
| "learning_rate": 0.0004902772277227722, | |
| "loss": 3.3593, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 9.182231020027945, | |
| "grad_norm": 0.3462997376918793, | |
| "learning_rate": 0.0004901025043680838, | |
| "loss": 3.3494, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 9.196786213320912, | |
| "grad_norm": 0.3444385230541229, | |
| "learning_rate": 0.0004899277810133955, | |
| "loss": 3.3521, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 9.21134140661388, | |
| "grad_norm": 0.3377320468425751, | |
| "learning_rate": 0.000489753057658707, | |
| "loss": 3.3595, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 9.225896599906847, | |
| "grad_norm": 0.379800945520401, | |
| "learning_rate": 0.0004895783343040186, | |
| "loss": 3.3602, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 9.240451793199814, | |
| "grad_norm": 0.3381807804107666, | |
| "learning_rate": 0.0004894036109493302, | |
| "loss": 3.3637, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 9.25500698649278, | |
| "grad_norm": 0.33349454402923584, | |
| "learning_rate": 0.0004892288875946419, | |
| "loss": 3.3518, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 9.269562179785748, | |
| "grad_norm": 0.3585297465324402, | |
| "learning_rate": 0.0004890541642399534, | |
| "loss": 3.3681, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 9.284117373078715, | |
| "grad_norm": 0.3441237509250641, | |
| "learning_rate": 0.0004888794408852649, | |
| "loss": 3.3759, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 9.298672566371682, | |
| "grad_norm": 0.35841336846351624, | |
| "learning_rate": 0.0004887047175305766, | |
| "loss": 3.3628, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 9.313227759664649, | |
| "grad_norm": 0.35150107741355896, | |
| "learning_rate": 0.0004885299941758881, | |
| "loss": 3.3913, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 9.313227759664649, | |
| "eval_accuracy": 0.3674273778212577, | |
| "eval_loss": 3.5788700580596924, | |
| "eval_runtime": 82.0761, | |
| "eval_samples_per_second": 202.873, | |
| "eval_steps_per_second": 12.683, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 9.327782952957616, | |
| "grad_norm": 0.3445305824279785, | |
| "learning_rate": 0.0004883552708211997, | |
| "loss": 3.3777, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 9.342338146250583, | |
| "grad_norm": 0.36344340443611145, | |
| "learning_rate": 0.00048818054746651137, | |
| "loss": 3.3759, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 9.356893339543548, | |
| "grad_norm": 0.35172000527381897, | |
| "learning_rate": 0.0004880058241118229, | |
| "loss": 3.3886, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 9.371448532836515, | |
| "grad_norm": 0.35098564624786377, | |
| "learning_rate": 0.0004878311007571345, | |
| "loss": 3.3817, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 9.386003726129482, | |
| "grad_norm": 0.3312366008758545, | |
| "learning_rate": 0.0004876563774024461, | |
| "loss": 3.3775, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 9.40055891942245, | |
| "grad_norm": 0.33731335401535034, | |
| "learning_rate": 0.00048748165404775763, | |
| "loss": 3.3849, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 9.415114112715417, | |
| "grad_norm": 0.32839637994766235, | |
| "learning_rate": 0.0004873069306930693, | |
| "loss": 3.3739, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 9.429669306008384, | |
| "grad_norm": 0.34677696228027344, | |
| "learning_rate": 0.0004871322073383809, | |
| "loss": 3.3975, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 9.44422449930135, | |
| "grad_norm": 0.32968953251838684, | |
| "learning_rate": 0.00048695748398369247, | |
| "loss": 3.3888, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 9.458779692594318, | |
| "grad_norm": 0.3250788152217865, | |
| "learning_rate": 0.000486782760629004, | |
| "loss": 3.3856, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 9.473334885887285, | |
| "grad_norm": 0.322723388671875, | |
| "learning_rate": 0.0004866080372743156, | |
| "loss": 3.3933, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 9.487890079180252, | |
| "grad_norm": 0.3373723030090332, | |
| "learning_rate": 0.0004864333139196272, | |
| "loss": 3.3887, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 9.502445272473219, | |
| "grad_norm": 0.34784844517707825, | |
| "learning_rate": 0.00048625859056493885, | |
| "loss": 3.4084, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 9.517000465766186, | |
| "grad_norm": 0.33688467741012573, | |
| "learning_rate": 0.0004860838672102504, | |
| "loss": 3.3863, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 9.531555659059153, | |
| "grad_norm": 0.32421359419822693, | |
| "learning_rate": 0.000485909143855562, | |
| "loss": 3.4006, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 9.546110852352118, | |
| "grad_norm": 0.3409993052482605, | |
| "learning_rate": 0.0004857344205008736, | |
| "loss": 3.3868, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 9.560666045645085, | |
| "grad_norm": 0.3301107585430145, | |
| "learning_rate": 0.00048555969714618517, | |
| "loss": 3.4046, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 9.575221238938052, | |
| "grad_norm": 0.33261144161224365, | |
| "learning_rate": 0.0004853849737914967, | |
| "loss": 3.4027, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 9.58977643223102, | |
| "grad_norm": 0.33643490076065063, | |
| "learning_rate": 0.00048521025043680836, | |
| "loss": 3.4043, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 9.604331625523987, | |
| "grad_norm": 0.3488525450229645, | |
| "learning_rate": 0.00048503552708211995, | |
| "loss": 3.3959, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.604331625523987, | |
| "eval_accuracy": 0.3679326277072268, | |
| "eval_loss": 3.572619915008545, | |
| "eval_runtime": 82.0172, | |
| "eval_samples_per_second": 203.018, | |
| "eval_steps_per_second": 12.692, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.618886818816954, | |
| "grad_norm": 0.3238380551338196, | |
| "learning_rate": 0.00048486080372743155, | |
| "loss": 3.4031, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 9.63344201210992, | |
| "grad_norm": 0.3409719467163086, | |
| "learning_rate": 0.0004846860803727431, | |
| "loss": 3.3999, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 9.647997205402888, | |
| "grad_norm": 0.3525969684123993, | |
| "learning_rate": 0.0004845113570180547, | |
| "loss": 3.4005, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 9.662552398695855, | |
| "grad_norm": 0.36587682366371155, | |
| "learning_rate": 0.00048433663366336633, | |
| "loss": 3.3995, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 9.677107591988822, | |
| "grad_norm": 0.3345111012458801, | |
| "learning_rate": 0.0004841619103086779, | |
| "loss": 3.4006, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 9.691662785281789, | |
| "grad_norm": 0.326628178358078, | |
| "learning_rate": 0.00048398718695398947, | |
| "loss": 3.399, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 9.706217978574756, | |
| "grad_norm": 0.3712170720100403, | |
| "learning_rate": 0.00048381246359930106, | |
| "loss": 3.4052, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 9.720773171867723, | |
| "grad_norm": 0.3188508152961731, | |
| "learning_rate": 0.00048363774024461265, | |
| "loss": 3.3954, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 9.73532836516069, | |
| "grad_norm": 0.3431711196899414, | |
| "learning_rate": 0.0004834630168899242, | |
| "loss": 3.4122, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 9.749883558453657, | |
| "grad_norm": 0.33457499742507935, | |
| "learning_rate": 0.00048328829353523584, | |
| "loss": 3.4086, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 9.764438751746622, | |
| "grad_norm": 0.3492046594619751, | |
| "learning_rate": 0.00048311357018054744, | |
| "loss": 3.4056, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 9.77899394503959, | |
| "grad_norm": 0.3399643898010254, | |
| "learning_rate": 0.00048293884682585903, | |
| "loss": 3.3998, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 9.793549138332557, | |
| "grad_norm": 0.35208308696746826, | |
| "learning_rate": 0.00048276412347117057, | |
| "loss": 3.4036, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 9.808104331625524, | |
| "grad_norm": 0.33642035722732544, | |
| "learning_rate": 0.00048258940011648217, | |
| "loss": 3.4265, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 9.82265952491849, | |
| "grad_norm": 0.3507674038410187, | |
| "learning_rate": 0.0004824146767617938, | |
| "loss": 3.3974, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 9.837214718211458, | |
| "grad_norm": 0.33665731549263, | |
| "learning_rate": 0.0004822399534071054, | |
| "loss": 3.3932, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 9.851769911504425, | |
| "grad_norm": 0.3271755874156952, | |
| "learning_rate": 0.00048206523005241695, | |
| "loss": 3.4132, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 9.866325104797392, | |
| "grad_norm": 0.3454242944717407, | |
| "learning_rate": 0.00048189050669772854, | |
| "loss": 3.4135, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 9.880880298090359, | |
| "grad_norm": 0.3278016448020935, | |
| "learning_rate": 0.00048171578334304014, | |
| "loss": 3.417, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 9.895435491383326, | |
| "grad_norm": 0.37287411093711853, | |
| "learning_rate": 0.00048154105998835173, | |
| "loss": 3.4142, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.895435491383326, | |
| "eval_accuracy": 0.3683227008338915, | |
| "eval_loss": 3.561631441116333, | |
| "eval_runtime": 81.9686, | |
| "eval_samples_per_second": 203.139, | |
| "eval_steps_per_second": 12.7, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.909990684676293, | |
| "grad_norm": 0.34253352880477905, | |
| "learning_rate": 0.0004813663366336633, | |
| "loss": 3.4003, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 9.92454587796926, | |
| "grad_norm": 0.33108028769493103, | |
| "learning_rate": 0.0004811916132789749, | |
| "loss": 3.4168, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 9.939101071262227, | |
| "grad_norm": 0.34979739785194397, | |
| "learning_rate": 0.0004810168899242865, | |
| "loss": 3.4135, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 9.953656264555192, | |
| "grad_norm": 0.3498181700706482, | |
| "learning_rate": 0.0004808421665695981, | |
| "loss": 3.4033, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 9.96821145784816, | |
| "grad_norm": 0.34960389137268066, | |
| "learning_rate": 0.00048066744321490965, | |
| "loss": 3.403, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 9.982766651141127, | |
| "grad_norm": 0.32628920674324036, | |
| "learning_rate": 0.00048049271986022124, | |
| "loss": 3.4197, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 9.997321844434094, | |
| "grad_norm": 0.33822116255760193, | |
| "learning_rate": 0.0004803179965055329, | |
| "loss": 3.4017, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 10.011644154634373, | |
| "grad_norm": 0.3604332208633423, | |
| "learning_rate": 0.0004801432731508445, | |
| "loss": 3.3313, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 10.02619934792734, | |
| "grad_norm": 0.36177417635917664, | |
| "learning_rate": 0.000479968549796156, | |
| "loss": 3.3053, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 10.040754541220307, | |
| "grad_norm": 0.32567745447158813, | |
| "learning_rate": 0.0004797938264414676, | |
| "loss": 3.31, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 10.055309734513274, | |
| "grad_norm": 0.35699018836021423, | |
| "learning_rate": 0.0004796191030867792, | |
| "loss": 3.3134, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 10.069864927806242, | |
| "grad_norm": 0.3250761330127716, | |
| "learning_rate": 0.00047944437973209086, | |
| "loss": 3.3204, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 10.084420121099209, | |
| "grad_norm": 0.34438347816467285, | |
| "learning_rate": 0.0004792696563774024, | |
| "loss": 3.3265, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 10.098975314392176, | |
| "grad_norm": 0.3575981855392456, | |
| "learning_rate": 0.000479094933022714, | |
| "loss": 3.3176, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 10.113530507685143, | |
| "grad_norm": 0.3481749892234802, | |
| "learning_rate": 0.0004789202096680256, | |
| "loss": 3.3225, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 10.12808570097811, | |
| "grad_norm": 0.3449457585811615, | |
| "learning_rate": 0.00047874548631333713, | |
| "loss": 3.3293, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 10.142640894271075, | |
| "grad_norm": 0.3334158658981323, | |
| "learning_rate": 0.0004785707629586487, | |
| "loss": 3.3256, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 10.157196087564042, | |
| "grad_norm": 0.3844655156135559, | |
| "learning_rate": 0.0004783960396039604, | |
| "loss": 3.3205, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 10.17175128085701, | |
| "grad_norm": 0.33834201097488403, | |
| "learning_rate": 0.00047822131624927197, | |
| "loss": 3.3321, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 10.186306474149976, | |
| "grad_norm": 0.33722230792045593, | |
| "learning_rate": 0.0004780465928945835, | |
| "loss": 3.3329, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 10.186306474149976, | |
| "eval_accuracy": 0.3681863691596128, | |
| "eval_loss": 3.574190139770508, | |
| "eval_runtime": 82.3735, | |
| "eval_samples_per_second": 202.14, | |
| "eval_steps_per_second": 12.638, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 10.200861667442943, | |
| "grad_norm": 0.344668984413147, | |
| "learning_rate": 0.0004778718695398951, | |
| "loss": 3.3373, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 10.21541686073591, | |
| "grad_norm": 0.32399624586105347, | |
| "learning_rate": 0.0004776971461852067, | |
| "loss": 3.3416, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 10.229972054028877, | |
| "grad_norm": 0.35089051723480225, | |
| "learning_rate": 0.00047752242283051835, | |
| "loss": 3.3416, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 10.244527247321844, | |
| "grad_norm": 0.3465506434440613, | |
| "learning_rate": 0.00047734769947582994, | |
| "loss": 3.3431, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 10.259082440614812, | |
| "grad_norm": 0.39566588401794434, | |
| "learning_rate": 0.0004771729761211415, | |
| "loss": 3.3516, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 10.273637633907779, | |
| "grad_norm": 0.37718045711517334, | |
| "learning_rate": 0.0004769982527664531, | |
| "loss": 3.3555, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 10.288192827200746, | |
| "grad_norm": 0.3543698787689209, | |
| "learning_rate": 0.00047682352941176467, | |
| "loss": 3.3564, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 10.302748020493713, | |
| "grad_norm": 0.3657527565956116, | |
| "learning_rate": 0.0004766488060570762, | |
| "loss": 3.3525, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 10.31730321378668, | |
| "grad_norm": 0.3520412743091583, | |
| "learning_rate": 0.00047647408270238786, | |
| "loss": 3.3354, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 10.331858407079647, | |
| "grad_norm": 0.34476545453071594, | |
| "learning_rate": 0.00047629935934769945, | |
| "loss": 3.3476, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 10.346413600372612, | |
| "grad_norm": 0.3469763398170471, | |
| "learning_rate": 0.00047612463599301105, | |
| "loss": 3.361, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 10.36096879366558, | |
| "grad_norm": 0.34500664472579956, | |
| "learning_rate": 0.0004759499126383226, | |
| "loss": 3.3639, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 10.375523986958546, | |
| "grad_norm": 0.31661102175712585, | |
| "learning_rate": 0.0004757751892836342, | |
| "loss": 3.3501, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 10.390079180251513, | |
| "grad_norm": 0.3227953016757965, | |
| "learning_rate": 0.0004756004659289458, | |
| "loss": 3.3623, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 10.40463437354448, | |
| "grad_norm": 0.35375112295150757, | |
| "learning_rate": 0.0004754257425742574, | |
| "loss": 3.3638, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 10.419189566837447, | |
| "grad_norm": 0.3518407642841339, | |
| "learning_rate": 0.00047525101921956896, | |
| "loss": 3.3627, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 10.433744760130415, | |
| "grad_norm": 0.3641967177391052, | |
| "learning_rate": 0.00047507629586488056, | |
| "loss": 3.359, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 10.448299953423382, | |
| "grad_norm": 0.3373609185218811, | |
| "learning_rate": 0.00047490157251019215, | |
| "loss": 3.3723, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 10.462855146716349, | |
| "grad_norm": 0.3260798454284668, | |
| "learning_rate": 0.0004747268491555037, | |
| "loss": 3.3729, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 10.477410340009316, | |
| "grad_norm": 0.33899104595184326, | |
| "learning_rate": 0.00047455212580081534, | |
| "loss": 3.3651, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.477410340009316, | |
| "eval_accuracy": 0.3683405649843142, | |
| "eval_loss": 3.5703930854797363, | |
| "eval_runtime": 82.1171, | |
| "eval_samples_per_second": 202.771, | |
| "eval_steps_per_second": 12.677, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.491965533302283, | |
| "grad_norm": 0.3435630798339844, | |
| "learning_rate": 0.00047437740244612694, | |
| "loss": 3.366, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 10.50652072659525, | |
| "grad_norm": 0.34553924202919006, | |
| "learning_rate": 0.00047420267909143853, | |
| "loss": 3.372, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 10.521075919888217, | |
| "grad_norm": 0.34176772832870483, | |
| "learning_rate": 0.0004740279557367501, | |
| "loss": 3.3765, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 10.535631113181182, | |
| "grad_norm": 0.3592422604560852, | |
| "learning_rate": 0.00047385323238206166, | |
| "loss": 3.3675, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 10.55018630647415, | |
| "grad_norm": 0.332703560590744, | |
| "learning_rate": 0.00047367850902737326, | |
| "loss": 3.3689, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 10.564741499767116, | |
| "grad_norm": 0.3539530336856842, | |
| "learning_rate": 0.0004735037856726849, | |
| "loss": 3.3794, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 10.579296693060083, | |
| "grad_norm": 0.33807680010795593, | |
| "learning_rate": 0.0004733290623179965, | |
| "loss": 3.3887, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 10.59385188635305, | |
| "grad_norm": 0.3599720597267151, | |
| "learning_rate": 0.00047315433896330804, | |
| "loss": 3.3792, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 10.608407079646017, | |
| "grad_norm": 0.3515259325504303, | |
| "learning_rate": 0.00047297961560861964, | |
| "loss": 3.379, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 10.622962272938985, | |
| "grad_norm": 0.39134037494659424, | |
| "learning_rate": 0.00047280489225393123, | |
| "loss": 3.3765, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 10.637517466231952, | |
| "grad_norm": 0.3642610013484955, | |
| "learning_rate": 0.0004726301688992429, | |
| "loss": 3.3885, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 10.652072659524919, | |
| "grad_norm": 0.3466510772705078, | |
| "learning_rate": 0.0004724554455445544, | |
| "loss": 3.3814, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 10.666627852817886, | |
| "grad_norm": 0.3323917090892792, | |
| "learning_rate": 0.000472280722189866, | |
| "loss": 3.3916, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 10.681183046110853, | |
| "grad_norm": 0.3678683638572693, | |
| "learning_rate": 0.0004721059988351776, | |
| "loss": 3.3832, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 10.69573823940382, | |
| "grad_norm": 0.3605380058288574, | |
| "learning_rate": 0.00047193127548048915, | |
| "loss": 3.373, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 10.710293432696787, | |
| "grad_norm": 0.35462960600852966, | |
| "learning_rate": 0.00047175655212580074, | |
| "loss": 3.3662, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 10.724848625989754, | |
| "grad_norm": 0.33662113547325134, | |
| "learning_rate": 0.0004715818287711124, | |
| "loss": 3.3756, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 10.73940381928272, | |
| "grad_norm": 0.34134191274642944, | |
| "learning_rate": 0.000471407105416424, | |
| "loss": 3.3793, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 10.753959012575686, | |
| "grad_norm": 0.3451937735080719, | |
| "learning_rate": 0.0004712323820617355, | |
| "loss": 3.3809, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 10.768514205868653, | |
| "grad_norm": 0.3433123230934143, | |
| "learning_rate": 0.0004710576587070471, | |
| "loss": 3.3795, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 10.768514205868653, | |
| "eval_accuracy": 0.3691385753880663, | |
| "eval_loss": 3.562664270401001, | |
| "eval_runtime": 82.1014, | |
| "eval_samples_per_second": 202.81, | |
| "eval_steps_per_second": 12.679, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 10.78306939916162, | |
| "grad_norm": 0.3543647527694702, | |
| "learning_rate": 0.0004708829353523587, | |
| "loss": 3.3819, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 10.797624592454587, | |
| "grad_norm": 0.33797594904899597, | |
| "learning_rate": 0.0004707082119976703, | |
| "loss": 3.3753, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 10.812179785747555, | |
| "grad_norm": 0.3341057598590851, | |
| "learning_rate": 0.0004705334886429819, | |
| "loss": 3.3793, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 10.826734979040522, | |
| "grad_norm": 0.333950400352478, | |
| "learning_rate": 0.0004703587652882935, | |
| "loss": 3.3847, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 10.841290172333489, | |
| "grad_norm": 0.34306496381759644, | |
| "learning_rate": 0.0004701840419336051, | |
| "loss": 3.3819, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 10.855845365626456, | |
| "grad_norm": 0.34738656878471375, | |
| "learning_rate": 0.0004700093185789167, | |
| "loss": 3.3881, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 10.870400558919423, | |
| "grad_norm": 0.3217121362686157, | |
| "learning_rate": 0.0004698345952242282, | |
| "loss": 3.3716, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 10.88495575221239, | |
| "grad_norm": 0.3289602994918823, | |
| "learning_rate": 0.00046965987186953987, | |
| "loss": 3.3895, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 10.899510945505357, | |
| "grad_norm": 0.32252728939056396, | |
| "learning_rate": 0.00046948514851485147, | |
| "loss": 3.3939, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 10.914066138798324, | |
| "grad_norm": 0.32516682147979736, | |
| "learning_rate": 0.00046931042516016306, | |
| "loss": 3.3863, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 10.92862133209129, | |
| "grad_norm": 0.3544132709503174, | |
| "learning_rate": 0.0004691357018054746, | |
| "loss": 3.3824, | |
| "step": 37550 | |
| }, | |
| { | |
| "epoch": 10.943176525384256, | |
| "grad_norm": 0.3622342646121979, | |
| "learning_rate": 0.0004689609784507862, | |
| "loss": 3.3955, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 10.957731718677223, | |
| "grad_norm": 0.3408651351928711, | |
| "learning_rate": 0.0004687862550960978, | |
| "loss": 3.3951, | |
| "step": 37650 | |
| }, | |
| { | |
| "epoch": 10.97228691197019, | |
| "grad_norm": 0.3178887665271759, | |
| "learning_rate": 0.00046861153174140944, | |
| "loss": 3.3856, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 10.986842105263158, | |
| "grad_norm": 0.3369850218296051, | |
| "learning_rate": 0.000468436808386721, | |
| "loss": 3.3947, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 11.001164415463437, | |
| "grad_norm": 0.3850873112678528, | |
| "learning_rate": 0.0004682620850320326, | |
| "loss": 3.3788, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 11.015719608756404, | |
| "grad_norm": 0.3428777754306793, | |
| "learning_rate": 0.00046808736167734417, | |
| "loss": 3.287, | |
| "step": 37850 | |
| }, | |
| { | |
| "epoch": 11.030274802049371, | |
| "grad_norm": 0.3666342496871948, | |
| "learning_rate": 0.0004679126383226557, | |
| "loss": 3.2824, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 11.044829995342338, | |
| "grad_norm": 0.3504311442375183, | |
| "learning_rate": 0.00046773791496796736, | |
| "loss": 3.2866, | |
| "step": 37950 | |
| }, | |
| { | |
| "epoch": 11.059385188635305, | |
| "grad_norm": 0.3658325970172882, | |
| "learning_rate": 0.00046756319161327895, | |
| "loss": 3.298, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 11.059385188635305, | |
| "eval_accuracy": 0.3690424380522388, | |
| "eval_loss": 3.5693283081054688, | |
| "eval_runtime": 82.0824, | |
| "eval_samples_per_second": 202.857, | |
| "eval_steps_per_second": 12.682, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 11.073940381928272, | |
| "grad_norm": 0.37235787510871887, | |
| "learning_rate": 0.00046738846825859054, | |
| "loss": 3.2916, | |
| "step": 38050 | |
| }, | |
| { | |
| "epoch": 11.08849557522124, | |
| "grad_norm": 0.3587813973426819, | |
| "learning_rate": 0.0004672137449039021, | |
| "loss": 3.2948, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 11.103050768514207, | |
| "grad_norm": 0.3582722246646881, | |
| "learning_rate": 0.0004670390215492137, | |
| "loss": 3.3043, | |
| "step": 38150 | |
| }, | |
| { | |
| "epoch": 11.117605961807174, | |
| "grad_norm": 0.36907780170440674, | |
| "learning_rate": 0.0004668642981945253, | |
| "loss": 3.302, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 11.132161155100139, | |
| "grad_norm": 0.36020976305007935, | |
| "learning_rate": 0.0004666895748398369, | |
| "loss": 3.3076, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 11.146716348393106, | |
| "grad_norm": 0.34022605419158936, | |
| "learning_rate": 0.00046651485148514846, | |
| "loss": 3.3169, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 11.161271541686073, | |
| "grad_norm": 0.36474987864494324, | |
| "learning_rate": 0.00046634012813046006, | |
| "loss": 3.3017, | |
| "step": 38350 | |
| }, | |
| { | |
| "epoch": 11.17582673497904, | |
| "grad_norm": 0.3661261796951294, | |
| "learning_rate": 0.00046616540477577165, | |
| "loss": 3.3226, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 11.190381928272007, | |
| "grad_norm": 0.38486114144325256, | |
| "learning_rate": 0.00046599068142108324, | |
| "loss": 3.3116, | |
| "step": 38450 | |
| }, | |
| { | |
| "epoch": 11.204937121564974, | |
| "grad_norm": 0.36107924580574036, | |
| "learning_rate": 0.0004658159580663948, | |
| "loss": 3.3146, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 11.219492314857941, | |
| "grad_norm": 0.33989226818084717, | |
| "learning_rate": 0.00046564123471170643, | |
| "loss": 3.3181, | |
| "step": 38550 | |
| }, | |
| { | |
| "epoch": 11.234047508150908, | |
| "grad_norm": 0.3407087028026581, | |
| "learning_rate": 0.00046546651135701803, | |
| "loss": 3.3134, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 11.248602701443875, | |
| "grad_norm": 0.37018853425979614, | |
| "learning_rate": 0.0004652917880023296, | |
| "loss": 3.3243, | |
| "step": 38650 | |
| }, | |
| { | |
| "epoch": 11.263157894736842, | |
| "grad_norm": 0.3585454225540161, | |
| "learning_rate": 0.00046511706464764116, | |
| "loss": 3.3346, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 11.27771308802981, | |
| "grad_norm": 0.337131530046463, | |
| "learning_rate": 0.00046494234129295276, | |
| "loss": 3.3256, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 11.292268281322777, | |
| "grad_norm": 0.3319418430328369, | |
| "learning_rate": 0.0004647676179382644, | |
| "loss": 3.3292, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 11.306823474615744, | |
| "grad_norm": 0.3478842079639435, | |
| "learning_rate": 0.000464592894583576, | |
| "loss": 3.3188, | |
| "step": 38850 | |
| }, | |
| { | |
| "epoch": 11.32137866790871, | |
| "grad_norm": 0.33565735816955566, | |
| "learning_rate": 0.00046441817122888754, | |
| "loss": 3.3281, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 11.335933861201676, | |
| "grad_norm": 0.3470151126384735, | |
| "learning_rate": 0.00046424344787419913, | |
| "loss": 3.3334, | |
| "step": 38950 | |
| }, | |
| { | |
| "epoch": 11.350489054494643, | |
| "grad_norm": 0.3330710232257843, | |
| "learning_rate": 0.00046406872451951073, | |
| "loss": 3.3304, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 11.350489054494643, | |
| "eval_accuracy": 0.3694375648530362, | |
| "eval_loss": 3.5655887126922607, | |
| "eval_runtime": 82.0827, | |
| "eval_samples_per_second": 202.856, | |
| "eval_steps_per_second": 12.682, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 11.36504424778761, | |
| "grad_norm": 0.3645959794521332, | |
| "learning_rate": 0.00046389400116482227, | |
| "loss": 3.3429, | |
| "step": 39050 | |
| }, | |
| { | |
| "epoch": 11.379599441080577, | |
| "grad_norm": 0.3551097512245178, | |
| "learning_rate": 0.0004637192778101339, | |
| "loss": 3.3411, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 11.394154634373544, | |
| "grad_norm": 0.33103764057159424, | |
| "learning_rate": 0.0004635445544554455, | |
| "loss": 3.3374, | |
| "step": 39150 | |
| }, | |
| { | |
| "epoch": 11.408709827666511, | |
| "grad_norm": 0.35111624002456665, | |
| "learning_rate": 0.0004633698311007571, | |
| "loss": 3.3407, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 11.423265020959478, | |
| "grad_norm": 0.3588010370731354, | |
| "learning_rate": 0.0004631951077460687, | |
| "loss": 3.337, | |
| "step": 39250 | |
| }, | |
| { | |
| "epoch": 11.437820214252445, | |
| "grad_norm": 0.33674612641334534, | |
| "learning_rate": 0.00046302038439138024, | |
| "loss": 3.3563, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 11.452375407545413, | |
| "grad_norm": 0.34078797698020935, | |
| "learning_rate": 0.0004628456610366919, | |
| "loss": 3.3374, | |
| "step": 39350 | |
| }, | |
| { | |
| "epoch": 11.46693060083838, | |
| "grad_norm": 0.3409959375858307, | |
| "learning_rate": 0.0004626709376820035, | |
| "loss": 3.3382, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 11.481485794131347, | |
| "grad_norm": 0.39047813415527344, | |
| "learning_rate": 0.0004624962143273151, | |
| "loss": 3.3591, | |
| "step": 39450 | |
| }, | |
| { | |
| "epoch": 11.496040987424314, | |
| "grad_norm": 0.33389782905578613, | |
| "learning_rate": 0.0004623214909726266, | |
| "loss": 3.3587, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 11.51059618071728, | |
| "grad_norm": 0.34324008226394653, | |
| "learning_rate": 0.0004621467676179382, | |
| "loss": 3.3454, | |
| "step": 39550 | |
| }, | |
| { | |
| "epoch": 11.525151374010246, | |
| "grad_norm": 0.39526453614234924, | |
| "learning_rate": 0.0004619720442632498, | |
| "loss": 3.3345, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 11.539706567303213, | |
| "grad_norm": 0.3654821217060089, | |
| "learning_rate": 0.00046179732090856145, | |
| "loss": 3.3635, | |
| "step": 39650 | |
| }, | |
| { | |
| "epoch": 11.55426176059618, | |
| "grad_norm": 0.3344481885433197, | |
| "learning_rate": 0.000461622597553873, | |
| "loss": 3.3636, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 11.568816953889147, | |
| "grad_norm": 0.37102261185646057, | |
| "learning_rate": 0.0004614478741991846, | |
| "loss": 3.3544, | |
| "step": 39750 | |
| }, | |
| { | |
| "epoch": 11.583372147182114, | |
| "grad_norm": 0.35413968563079834, | |
| "learning_rate": 0.0004612731508444962, | |
| "loss": 3.3539, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 11.597927340475081, | |
| "grad_norm": 0.3358296751976013, | |
| "learning_rate": 0.0004610984274898077, | |
| "loss": 3.3517, | |
| "step": 39850 | |
| }, | |
| { | |
| "epoch": 11.612482533768048, | |
| "grad_norm": 0.37655940651893616, | |
| "learning_rate": 0.00046092370413511937, | |
| "loss": 3.3567, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 11.627037727061015, | |
| "grad_norm": 0.3454345762729645, | |
| "learning_rate": 0.00046074898078043096, | |
| "loss": 3.3466, | |
| "step": 39950 | |
| }, | |
| { | |
| "epoch": 11.641592920353983, | |
| "grad_norm": 0.3474576771259308, | |
| "learning_rate": 0.00046057425742574256, | |
| "loss": 3.3744, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 11.641592920353983, | |
| "eval_accuracy": 0.3696366561084053, | |
| "eval_loss": 3.5574264526367188, | |
| "eval_runtime": 81.977, | |
| "eval_samples_per_second": 203.118, | |
| "eval_steps_per_second": 12.699, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 11.65614811364695, | |
| "grad_norm": 0.3512968122959137, | |
| "learning_rate": 0.0004603995340710541, | |
| "loss": 3.362, | |
| "step": 40050 | |
| }, | |
| { | |
| "epoch": 11.670703306939917, | |
| "grad_norm": 0.33853769302368164, | |
| "learning_rate": 0.0004602248107163657, | |
| "loss": 3.3647, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 11.685258500232884, | |
| "grad_norm": 0.33497312664985657, | |
| "learning_rate": 0.0004600500873616773, | |
| "loss": 3.3709, | |
| "step": 40150 | |
| }, | |
| { | |
| "epoch": 11.69981369352585, | |
| "grad_norm": 0.34639179706573486, | |
| "learning_rate": 0.00045987536400698894, | |
| "loss": 3.3727, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 11.714368886818818, | |
| "grad_norm": 0.3486192524433136, | |
| "learning_rate": 0.0004597006406523005, | |
| "loss": 3.3618, | |
| "step": 40250 | |
| }, | |
| { | |
| "epoch": 11.728924080111783, | |
| "grad_norm": 0.3403240144252777, | |
| "learning_rate": 0.00045952591729761207, | |
| "loss": 3.3665, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 11.74347927340475, | |
| "grad_norm": 0.36133143305778503, | |
| "learning_rate": 0.00045935119394292367, | |
| "loss": 3.3613, | |
| "step": 40350 | |
| }, | |
| { | |
| "epoch": 11.758034466697717, | |
| "grad_norm": 0.3656970262527466, | |
| "learning_rate": 0.00045917647058823526, | |
| "loss": 3.3498, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 11.772589659990684, | |
| "grad_norm": 0.32589244842529297, | |
| "learning_rate": 0.0004590017472335468, | |
| "loss": 3.3633, | |
| "step": 40450 | |
| }, | |
| { | |
| "epoch": 11.787144853283651, | |
| "grad_norm": 0.3547135293483734, | |
| "learning_rate": 0.00045882702387885845, | |
| "loss": 3.365, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 11.801700046576618, | |
| "grad_norm": 0.34963423013687134, | |
| "learning_rate": 0.00045865230052417004, | |
| "loss": 3.3544, | |
| "step": 40550 | |
| }, | |
| { | |
| "epoch": 11.816255239869585, | |
| "grad_norm": 0.34507817029953003, | |
| "learning_rate": 0.00045847757716948164, | |
| "loss": 3.3681, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 11.830810433162553, | |
| "grad_norm": 0.34605300426483154, | |
| "learning_rate": 0.0004583028538147932, | |
| "loss": 3.3609, | |
| "step": 40650 | |
| }, | |
| { | |
| "epoch": 11.84536562645552, | |
| "grad_norm": 0.341431587934494, | |
| "learning_rate": 0.00045812813046010477, | |
| "loss": 3.3677, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 11.859920819748487, | |
| "grad_norm": 0.36213192343711853, | |
| "learning_rate": 0.0004579534071054164, | |
| "loss": 3.3776, | |
| "step": 40750 | |
| }, | |
| { | |
| "epoch": 11.874476013041454, | |
| "grad_norm": 0.3856731355190277, | |
| "learning_rate": 0.000457778683750728, | |
| "loss": 3.3732, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 11.88903120633442, | |
| "grad_norm": 0.34758904576301575, | |
| "learning_rate": 0.00045760396039603955, | |
| "loss": 3.3656, | |
| "step": 40850 | |
| }, | |
| { | |
| "epoch": 11.903586399627388, | |
| "grad_norm": 0.33880770206451416, | |
| "learning_rate": 0.00045742923704135115, | |
| "loss": 3.367, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 11.918141592920353, | |
| "grad_norm": 0.3616976737976074, | |
| "learning_rate": 0.00045725451368666274, | |
| "loss": 3.3745, | |
| "step": 40950 | |
| }, | |
| { | |
| "epoch": 11.93269678621332, | |
| "grad_norm": 0.34102290868759155, | |
| "learning_rate": 0.0004570797903319743, | |
| "loss": 3.3672, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 11.93269678621332, | |
| "eval_accuracy": 0.3704250292731136, | |
| "eval_loss": 3.5495550632476807, | |
| "eval_runtime": 81.9801, | |
| "eval_samples_per_second": 203.11, | |
| "eval_steps_per_second": 12.698, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 11.947251979506287, | |
| "grad_norm": 0.41637012362480164, | |
| "learning_rate": 0.00045690506697728593, | |
| "loss": 3.3784, | |
| "step": 41050 | |
| }, | |
| { | |
| "epoch": 11.961807172799254, | |
| "grad_norm": 0.3575046956539154, | |
| "learning_rate": 0.0004567303436225975, | |
| "loss": 3.3768, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 11.976362366092221, | |
| "grad_norm": 0.3339930772781372, | |
| "learning_rate": 0.0004565556202679091, | |
| "loss": 3.3723, | |
| "step": 41150 | |
| }, | |
| { | |
| "epoch": 11.990917559385188, | |
| "grad_norm": 0.34249818325042725, | |
| "learning_rate": 0.00045638089691322066, | |
| "loss": 3.3681, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 12.005239869585468, | |
| "grad_norm": 0.3680878281593323, | |
| "learning_rate": 0.00045620617355853225, | |
| "loss": 3.3332, | |
| "step": 41250 | |
| }, | |
| { | |
| "epoch": 12.019795062878435, | |
| "grad_norm": 0.3671158254146576, | |
| "learning_rate": 0.0004560314502038439, | |
| "loss": 3.2644, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 12.034350256171402, | |
| "grad_norm": 0.3463709056377411, | |
| "learning_rate": 0.0004558567268491555, | |
| "loss": 3.2579, | |
| "step": 41350 | |
| }, | |
| { | |
| "epoch": 12.04890544946437, | |
| "grad_norm": 0.337708443403244, | |
| "learning_rate": 0.00045568200349446704, | |
| "loss": 3.2639, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 12.063460642757336, | |
| "grad_norm": 0.3561146855354309, | |
| "learning_rate": 0.00045550728013977863, | |
| "loss": 3.2715, | |
| "step": 41450 | |
| }, | |
| { | |
| "epoch": 12.078015836050303, | |
| "grad_norm": 0.3562977910041809, | |
| "learning_rate": 0.0004553325567850902, | |
| "loss": 3.2779, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 12.09257102934327, | |
| "grad_norm": 0.3556326925754547, | |
| "learning_rate": 0.0004551578334304018, | |
| "loss": 3.282, | |
| "step": 41550 | |
| }, | |
| { | |
| "epoch": 12.107126222636236, | |
| "grad_norm": 0.35785195231437683, | |
| "learning_rate": 0.00045498311007571347, | |
| "loss": 3.2773, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 12.121681415929203, | |
| "grad_norm": 0.3818725645542145, | |
| "learning_rate": 0.000454808386721025, | |
| "loss": 3.2777, | |
| "step": 41650 | |
| }, | |
| { | |
| "epoch": 12.13623660922217, | |
| "grad_norm": 0.36099886894226074, | |
| "learning_rate": 0.0004546336633663366, | |
| "loss": 3.2908, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 12.150791802515137, | |
| "grad_norm": 0.3814826011657715, | |
| "learning_rate": 0.0004544589400116482, | |
| "loss": 3.2877, | |
| "step": 41750 | |
| }, | |
| { | |
| "epoch": 12.165346995808104, | |
| "grad_norm": 0.3702724277973175, | |
| "learning_rate": 0.00045428421665695974, | |
| "loss": 3.2946, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 12.179902189101071, | |
| "grad_norm": 0.40045592188835144, | |
| "learning_rate": 0.00045410949330227133, | |
| "loss": 3.3067, | |
| "step": 41850 | |
| }, | |
| { | |
| "epoch": 12.194457382394038, | |
| "grad_norm": 0.36000096797943115, | |
| "learning_rate": 0.000453934769947583, | |
| "loss": 3.3008, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 12.209012575687005, | |
| "grad_norm": 0.34837639331817627, | |
| "learning_rate": 0.0004537600465928946, | |
| "loss": 3.3113, | |
| "step": 41950 | |
| }, | |
| { | |
| "epoch": 12.223567768979972, | |
| "grad_norm": 0.34509870409965515, | |
| "learning_rate": 0.0004535853232382061, | |
| "loss": 3.3059, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 12.223567768979972, | |
| "eval_accuracy": 0.3697869735320281, | |
| "eval_loss": 3.5653076171875, | |
| "eval_runtime": 82.1975, | |
| "eval_samples_per_second": 202.573, | |
| "eval_steps_per_second": 12.665, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 12.23812296227294, | |
| "grad_norm": 0.35102787613868713, | |
| "learning_rate": 0.0004534105998835177, | |
| "loss": 3.3085, | |
| "step": 42050 | |
| }, | |
| { | |
| "epoch": 12.252678155565906, | |
| "grad_norm": 0.37202975153923035, | |
| "learning_rate": 0.0004532358765288293, | |
| "loss": 3.3011, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 12.267233348858873, | |
| "grad_norm": 0.3594837784767151, | |
| "learning_rate": 0.00045306115317414095, | |
| "loss": 3.3069, | |
| "step": 42150 | |
| }, | |
| { | |
| "epoch": 12.28178854215184, | |
| "grad_norm": 0.3870075047016144, | |
| "learning_rate": 0.0004528864298194525, | |
| "loss": 3.3075, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 12.296343735444808, | |
| "grad_norm": 0.3686104714870453, | |
| "learning_rate": 0.0004527117064647641, | |
| "loss": 3.309, | |
| "step": 42250 | |
| }, | |
| { | |
| "epoch": 12.310898928737773, | |
| "grad_norm": 0.34569546580314636, | |
| "learning_rate": 0.0004525369831100757, | |
| "loss": 3.3212, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 12.32545412203074, | |
| "grad_norm": 0.38856497406959534, | |
| "learning_rate": 0.0004523622597553872, | |
| "loss": 3.3199, | |
| "step": 42350 | |
| }, | |
| { | |
| "epoch": 12.340009315323707, | |
| "grad_norm": 0.35298582911491394, | |
| "learning_rate": 0.0004521875364006988, | |
| "loss": 3.3179, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 12.354564508616674, | |
| "grad_norm": 0.3694022595882416, | |
| "learning_rate": 0.00045201281304601046, | |
| "loss": 3.3065, | |
| "step": 42450 | |
| }, | |
| { | |
| "epoch": 12.369119701909641, | |
| "grad_norm": 0.34031233191490173, | |
| "learning_rate": 0.00045183808969132206, | |
| "loss": 3.3258, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 12.383674895202608, | |
| "grad_norm": 0.3369908928871155, | |
| "learning_rate": 0.00045166336633663365, | |
| "loss": 3.321, | |
| "step": 42550 | |
| }, | |
| { | |
| "epoch": 12.398230088495575, | |
| "grad_norm": 0.34123343229293823, | |
| "learning_rate": 0.0004514886429819452, | |
| "loss": 3.3198, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 12.412785281788542, | |
| "grad_norm": 0.3767509460449219, | |
| "learning_rate": 0.0004513139196272568, | |
| "loss": 3.3205, | |
| "step": 42650 | |
| }, | |
| { | |
| "epoch": 12.42734047508151, | |
| "grad_norm": 0.35374560952186584, | |
| "learning_rate": 0.00045113919627256843, | |
| "loss": 3.3304, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 12.441895668374476, | |
| "grad_norm": 0.3370795249938965, | |
| "learning_rate": 0.00045096447291788003, | |
| "loss": 3.3301, | |
| "step": 42750 | |
| }, | |
| { | |
| "epoch": 12.456450861667443, | |
| "grad_norm": 0.3659289479255676, | |
| "learning_rate": 0.00045078974956319157, | |
| "loss": 3.3291, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 12.47100605496041, | |
| "grad_norm": 0.36521872878074646, | |
| "learning_rate": 0.00045061502620850316, | |
| "loss": 3.3148, | |
| "step": 42850 | |
| }, | |
| { | |
| "epoch": 12.485561248253378, | |
| "grad_norm": 0.35979926586151123, | |
| "learning_rate": 0.00045044030285381476, | |
| "loss": 3.3168, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 12.500116441546343, | |
| "grad_norm": 0.35986512899398804, | |
| "learning_rate": 0.0004502655794991263, | |
| "loss": 3.3327, | |
| "step": 42950 | |
| }, | |
| { | |
| "epoch": 12.51467163483931, | |
| "grad_norm": 0.3704209327697754, | |
| "learning_rate": 0.00045009085614443795, | |
| "loss": 3.3392, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 12.51467163483931, | |
| "eval_accuracy": 0.3704196230170646, | |
| "eval_loss": 3.5552613735198975, | |
| "eval_runtime": 82.1603, | |
| "eval_samples_per_second": 202.665, | |
| "eval_steps_per_second": 12.67, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 12.529226828132277, | |
| "grad_norm": 0.3597569167613983, | |
| "learning_rate": 0.00044991613278974954, | |
| "loss": 3.3412, | |
| "step": 43050 | |
| }, | |
| { | |
| "epoch": 12.543782021425244, | |
| "grad_norm": 0.357021301984787, | |
| "learning_rate": 0.00044974140943506113, | |
| "loss": 3.3279, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 12.558337214718211, | |
| "grad_norm": 0.4145265221595764, | |
| "learning_rate": 0.0004495666860803727, | |
| "loss": 3.3404, | |
| "step": 43150 | |
| }, | |
| { | |
| "epoch": 12.572892408011178, | |
| "grad_norm": 0.361686110496521, | |
| "learning_rate": 0.00044939196272568427, | |
| "loss": 3.3318, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 12.587447601304145, | |
| "grad_norm": 0.3558717370033264, | |
| "learning_rate": 0.00044921723937099586, | |
| "loss": 3.3365, | |
| "step": 43250 | |
| }, | |
| { | |
| "epoch": 12.602002794597112, | |
| "grad_norm": 0.3709941506385803, | |
| "learning_rate": 0.0004490425160163075, | |
| "loss": 3.3402, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 12.61655798789008, | |
| "grad_norm": 0.3520445227622986, | |
| "learning_rate": 0.00044886779266161905, | |
| "loss": 3.3302, | |
| "step": 43350 | |
| }, | |
| { | |
| "epoch": 12.631113181183046, | |
| "grad_norm": 0.37871918082237244, | |
| "learning_rate": 0.00044869306930693065, | |
| "loss": 3.3402, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 12.645668374476013, | |
| "grad_norm": 0.3813333511352539, | |
| "learning_rate": 0.00044851834595224224, | |
| "loss": 3.335, | |
| "step": 43450 | |
| }, | |
| { | |
| "epoch": 12.66022356776898, | |
| "grad_norm": 0.3657941222190857, | |
| "learning_rate": 0.00044834362259755383, | |
| "loss": 3.3357, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 12.674778761061948, | |
| "grad_norm": 0.3644251227378845, | |
| "learning_rate": 0.00044816889924286543, | |
| "loss": 3.337, | |
| "step": 43550 | |
| }, | |
| { | |
| "epoch": 12.689333954354915, | |
| "grad_norm": 0.37018752098083496, | |
| "learning_rate": 0.000447994175888177, | |
| "loss": 3.3494, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 12.703889147647882, | |
| "grad_norm": 0.3695676326751709, | |
| "learning_rate": 0.0004478194525334886, | |
| "loss": 3.3402, | |
| "step": 43650 | |
| }, | |
| { | |
| "epoch": 12.718444340940847, | |
| "grad_norm": 0.34251827001571655, | |
| "learning_rate": 0.0004476447291788002, | |
| "loss": 3.3527, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 12.732999534233814, | |
| "grad_norm": 0.3531229496002197, | |
| "learning_rate": 0.00044747000582411175, | |
| "loss": 3.3461, | |
| "step": 43750 | |
| }, | |
| { | |
| "epoch": 12.747554727526781, | |
| "grad_norm": 0.3697497546672821, | |
| "learning_rate": 0.00044729528246942335, | |
| "loss": 3.3444, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 12.762109920819748, | |
| "grad_norm": 0.3860340714454651, | |
| "learning_rate": 0.000447120559114735, | |
| "loss": 3.3482, | |
| "step": 43850 | |
| }, | |
| { | |
| "epoch": 12.776665114112715, | |
| "grad_norm": 0.37207141518592834, | |
| "learning_rate": 0.0004469458357600466, | |
| "loss": 3.3535, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 12.791220307405682, | |
| "grad_norm": 0.3802737295627594, | |
| "learning_rate": 0.00044677111240535813, | |
| "loss": 3.3584, | |
| "step": 43950 | |
| }, | |
| { | |
| "epoch": 12.80577550069865, | |
| "grad_norm": 0.35027065873146057, | |
| "learning_rate": 0.0004465963890506697, | |
| "loss": 3.3481, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 12.80577550069865, | |
| "eval_accuracy": 0.37069616476670064, | |
| "eval_loss": 3.550422191619873, | |
| "eval_runtime": 82.1574, | |
| "eval_samples_per_second": 202.672, | |
| "eval_steps_per_second": 12.671, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 12.820330693991616, | |
| "grad_norm": 0.3465663194656372, | |
| "learning_rate": 0.0004464216656959813, | |
| "loss": 3.3454, | |
| "step": 44050 | |
| }, | |
| { | |
| "epoch": 12.834885887284583, | |
| "grad_norm": 0.37407419085502625, | |
| "learning_rate": 0.00044624694234129297, | |
| "loss": 3.3608, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 12.84944108057755, | |
| "grad_norm": 0.3533693850040436, | |
| "learning_rate": 0.0004460722189866045, | |
| "loss": 3.3603, | |
| "step": 44150 | |
| }, | |
| { | |
| "epoch": 12.863996273870518, | |
| "grad_norm": 0.3582472801208496, | |
| "learning_rate": 0.0004458974956319161, | |
| "loss": 3.351, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 12.878551467163485, | |
| "grad_norm": 0.3492004871368408, | |
| "learning_rate": 0.0004457227722772277, | |
| "loss": 3.3495, | |
| "step": 44250 | |
| }, | |
| { | |
| "epoch": 12.89310666045645, | |
| "grad_norm": 0.3565833568572998, | |
| "learning_rate": 0.00044554804892253923, | |
| "loss": 3.3494, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 12.907661853749417, | |
| "grad_norm": 0.35763654112815857, | |
| "learning_rate": 0.00044537332556785083, | |
| "loss": 3.3448, | |
| "step": 44350 | |
| }, | |
| { | |
| "epoch": 12.922217047042384, | |
| "grad_norm": 0.345533549785614, | |
| "learning_rate": 0.0004451986022131625, | |
| "loss": 3.3587, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 12.936772240335351, | |
| "grad_norm": 0.366068035364151, | |
| "learning_rate": 0.00044502387885847407, | |
| "loss": 3.3506, | |
| "step": 44450 | |
| }, | |
| { | |
| "epoch": 12.951327433628318, | |
| "grad_norm": 0.3595816493034363, | |
| "learning_rate": 0.0004448491555037856, | |
| "loss": 3.3578, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 12.965882626921285, | |
| "grad_norm": 0.38502204418182373, | |
| "learning_rate": 0.0004446744321490972, | |
| "loss": 3.3659, | |
| "step": 44550 | |
| }, | |
| { | |
| "epoch": 12.980437820214252, | |
| "grad_norm": 0.37005504965782166, | |
| "learning_rate": 0.0004444997087944088, | |
| "loss": 3.3525, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 12.99499301350722, | |
| "grad_norm": 0.3695824146270752, | |
| "learning_rate": 0.0004443249854397204, | |
| "loss": 3.3609, | |
| "step": 44650 | |
| }, | |
| { | |
| "epoch": 13.009315323707499, | |
| "grad_norm": 0.36318615078926086, | |
| "learning_rate": 0.000444150262085032, | |
| "loss": 3.2828, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 13.023870517000466, | |
| "grad_norm": 0.38115406036376953, | |
| "learning_rate": 0.0004439755387303436, | |
| "loss": 3.2503, | |
| "step": 44750 | |
| }, | |
| { | |
| "epoch": 13.038425710293433, | |
| "grad_norm": 0.38130733370780945, | |
| "learning_rate": 0.0004438008153756552, | |
| "loss": 3.2474, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 13.0529809035864, | |
| "grad_norm": 0.35700133442878723, | |
| "learning_rate": 0.00044362609202096677, | |
| "loss": 3.2482, | |
| "step": 44850 | |
| }, | |
| { | |
| "epoch": 13.067536096879367, | |
| "grad_norm": 0.36134034395217896, | |
| "learning_rate": 0.0004434513686662783, | |
| "loss": 3.2571, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 13.082091290172334, | |
| "grad_norm": 0.358569473028183, | |
| "learning_rate": 0.00044327664531158996, | |
| "loss": 3.2557, | |
| "step": 44950 | |
| }, | |
| { | |
| "epoch": 13.0966464834653, | |
| "grad_norm": 0.3622344434261322, | |
| "learning_rate": 0.00044310192195690155, | |
| "loss": 3.2857, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 13.0966464834653, | |
| "eval_accuracy": 0.370176811604082, | |
| "eval_loss": 3.56237530708313, | |
| "eval_runtime": 82.1495, | |
| "eval_samples_per_second": 202.692, | |
| "eval_steps_per_second": 12.672, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 13.111201676758267, | |
| "grad_norm": 0.3850332498550415, | |
| "learning_rate": 0.00044292719860221315, | |
| "loss": 3.2617, | |
| "step": 45050 | |
| }, | |
| { | |
| "epoch": 13.125756870051234, | |
| "grad_norm": 0.3421729505062103, | |
| "learning_rate": 0.0004427524752475247, | |
| "loss": 3.2774, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 13.1403120633442, | |
| "grad_norm": 0.37413474917411804, | |
| "learning_rate": 0.0004425777518928363, | |
| "loss": 3.2746, | |
| "step": 45150 | |
| }, | |
| { | |
| "epoch": 13.154867256637168, | |
| "grad_norm": 0.34968623518943787, | |
| "learning_rate": 0.0004424030285381479, | |
| "loss": 3.2768, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 13.169422449930135, | |
| "grad_norm": 0.3550263047218323, | |
| "learning_rate": 0.0004422283051834595, | |
| "loss": 3.2796, | |
| "step": 45250 | |
| }, | |
| { | |
| "epoch": 13.183977643223102, | |
| "grad_norm": 0.4278651475906372, | |
| "learning_rate": 0.00044205358182877107, | |
| "loss": 3.2781, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 13.198532836516069, | |
| "grad_norm": 0.37585729360580444, | |
| "learning_rate": 0.00044187885847408266, | |
| "loss": 3.2847, | |
| "step": 45350 | |
| }, | |
| { | |
| "epoch": 13.213088029809036, | |
| "grad_norm": 0.36649757623672485, | |
| "learning_rate": 0.00044170413511939425, | |
| "loss": 3.2954, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 13.227643223102003, | |
| "grad_norm": 0.3601733148097992, | |
| "learning_rate": 0.0004415294117647058, | |
| "loss": 3.2782, | |
| "step": 45450 | |
| }, | |
| { | |
| "epoch": 13.24219841639497, | |
| "grad_norm": 0.3437824249267578, | |
| "learning_rate": 0.00044135468841001744, | |
| "loss": 3.2904, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 13.256753609687937, | |
| "grad_norm": 0.3610261082649231, | |
| "learning_rate": 0.00044117996505532904, | |
| "loss": 3.2972, | |
| "step": 45550 | |
| }, | |
| { | |
| "epoch": 13.271308802980904, | |
| "grad_norm": 0.3433802127838135, | |
| "learning_rate": 0.00044100524170064063, | |
| "loss": 3.2888, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 13.285863996273871, | |
| "grad_norm": 0.3463800549507141, | |
| "learning_rate": 0.0004408305183459522, | |
| "loss": 3.296, | |
| "step": 45650 | |
| }, | |
| { | |
| "epoch": 13.300419189566837, | |
| "grad_norm": 0.3694781959056854, | |
| "learning_rate": 0.00044065579499126377, | |
| "loss": 3.3033, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 13.314974382859804, | |
| "grad_norm": 0.3468164801597595, | |
| "learning_rate": 0.00044048107163657536, | |
| "loss": 3.2977, | |
| "step": 45750 | |
| }, | |
| { | |
| "epoch": 13.32952957615277, | |
| "grad_norm": 0.37423449754714966, | |
| "learning_rate": 0.000440306348281887, | |
| "loss": 3.3157, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 13.344084769445738, | |
| "grad_norm": 0.3602600693702698, | |
| "learning_rate": 0.0004401316249271986, | |
| "loss": 3.2979, | |
| "step": 45850 | |
| }, | |
| { | |
| "epoch": 13.358639962738705, | |
| "grad_norm": 0.3587697744369507, | |
| "learning_rate": 0.00043995690157251014, | |
| "loss": 3.3154, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 13.373195156031672, | |
| "grad_norm": 0.36652612686157227, | |
| "learning_rate": 0.00043978217821782174, | |
| "loss": 3.2916, | |
| "step": 45950 | |
| }, | |
| { | |
| "epoch": 13.387750349324639, | |
| "grad_norm": 0.35178062319755554, | |
| "learning_rate": 0.00043960745486313333, | |
| "loss": 3.2993, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 13.387750349324639, | |
| "eval_accuracy": 0.37056277127505727, | |
| "eval_loss": 3.5573084354400635, | |
| "eval_runtime": 82.0399, | |
| "eval_samples_per_second": 202.962, | |
| "eval_steps_per_second": 12.689, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 13.402305542617606, | |
| "grad_norm": 0.34837648272514343, | |
| "learning_rate": 0.00043943273150844487, | |
| "loss": 3.3134, | |
| "step": 46050 | |
| }, | |
| { | |
| "epoch": 13.416860735910573, | |
| "grad_norm": 0.3943060338497162, | |
| "learning_rate": 0.0004392580081537565, | |
| "loss": 3.3082, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 13.43141592920354, | |
| "grad_norm": 0.3512008488178253, | |
| "learning_rate": 0.0004390832847990681, | |
| "loss": 3.3003, | |
| "step": 46150 | |
| }, | |
| { | |
| "epoch": 13.445971122496507, | |
| "grad_norm": 0.38048577308654785, | |
| "learning_rate": 0.0004389085614443797, | |
| "loss": 3.3134, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 13.460526315789474, | |
| "grad_norm": 0.3971075415611267, | |
| "learning_rate": 0.00043873383808969125, | |
| "loss": 3.3036, | |
| "step": 46250 | |
| }, | |
| { | |
| "epoch": 13.475081509082441, | |
| "grad_norm": 0.37152063846588135, | |
| "learning_rate": 0.00043855911473500284, | |
| "loss": 3.3094, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 13.489636702375407, | |
| "grad_norm": 0.3719848692417145, | |
| "learning_rate": 0.0004383843913803145, | |
| "loss": 3.3, | |
| "step": 46350 | |
| }, | |
| { | |
| "epoch": 13.504191895668374, | |
| "grad_norm": 0.3875105381011963, | |
| "learning_rate": 0.0004382096680256261, | |
| "loss": 3.3237, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 13.51874708896134, | |
| "grad_norm": 0.35684072971343994, | |
| "learning_rate": 0.0004380349446709376, | |
| "loss": 3.3102, | |
| "step": 46450 | |
| }, | |
| { | |
| "epoch": 13.533302282254308, | |
| "grad_norm": 0.3369705379009247, | |
| "learning_rate": 0.0004378602213162492, | |
| "loss": 3.3203, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 13.547857475547275, | |
| "grad_norm": 0.39235222339630127, | |
| "learning_rate": 0.0004376854979615608, | |
| "loss": 3.3057, | |
| "step": 46550 | |
| }, | |
| { | |
| "epoch": 13.562412668840242, | |
| "grad_norm": 0.37409457564353943, | |
| "learning_rate": 0.0004375107746068724, | |
| "loss": 3.3112, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 13.576967862133209, | |
| "grad_norm": 0.3468940854072571, | |
| "learning_rate": 0.000437336051252184, | |
| "loss": 3.3314, | |
| "step": 46650 | |
| }, | |
| { | |
| "epoch": 13.591523055426176, | |
| "grad_norm": 0.3738160729408264, | |
| "learning_rate": 0.0004371613278974956, | |
| "loss": 3.3202, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 13.606078248719143, | |
| "grad_norm": 0.3744378387928009, | |
| "learning_rate": 0.0004369866045428072, | |
| "loss": 3.3136, | |
| "step": 46750 | |
| }, | |
| { | |
| "epoch": 13.62063344201211, | |
| "grad_norm": 0.3563174307346344, | |
| "learning_rate": 0.0004368118811881188, | |
| "loss": 3.3121, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 13.635188635305077, | |
| "grad_norm": 0.3530181646347046, | |
| "learning_rate": 0.0004366371578334303, | |
| "loss": 3.3339, | |
| "step": 46850 | |
| }, | |
| { | |
| "epoch": 13.649743828598044, | |
| "grad_norm": 0.3653353154659271, | |
| "learning_rate": 0.000436462434478742, | |
| "loss": 3.3354, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 13.664299021891011, | |
| "grad_norm": 0.39778947830200195, | |
| "learning_rate": 0.00043628771112405357, | |
| "loss": 3.324, | |
| "step": 46950 | |
| }, | |
| { | |
| "epoch": 13.678854215183978, | |
| "grad_norm": 0.40979740023612976, | |
| "learning_rate": 0.00043611298776936516, | |
| "loss": 3.3103, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 13.678854215183978, | |
| "eval_accuracy": 0.37090536337033525, | |
| "eval_loss": 3.5535919666290283, | |
| "eval_runtime": 82.0895, | |
| "eval_samples_per_second": 202.84, | |
| "eval_steps_per_second": 12.681, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 13.693409408476944, | |
| "grad_norm": 0.3520899713039398, | |
| "learning_rate": 0.0004359382644146767, | |
| "loss": 3.3265, | |
| "step": 47050 | |
| }, | |
| { | |
| "epoch": 13.70796460176991, | |
| "grad_norm": 0.3737325072288513, | |
| "learning_rate": 0.0004357635410599883, | |
| "loss": 3.3396, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 13.722519795062878, | |
| "grad_norm": 0.3834449350833893, | |
| "learning_rate": 0.0004355888177052999, | |
| "loss": 3.3356, | |
| "step": 47150 | |
| }, | |
| { | |
| "epoch": 13.737074988355845, | |
| "grad_norm": 0.3495875597000122, | |
| "learning_rate": 0.00043541409435061154, | |
| "loss": 3.3153, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 13.751630181648812, | |
| "grad_norm": 0.3683016002178192, | |
| "learning_rate": 0.0004352393709959231, | |
| "loss": 3.3249, | |
| "step": 47250 | |
| }, | |
| { | |
| "epoch": 13.766185374941779, | |
| "grad_norm": 0.35474908351898193, | |
| "learning_rate": 0.0004350646476412347, | |
| "loss": 3.3378, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 13.780740568234746, | |
| "grad_norm": 0.3989929258823395, | |
| "learning_rate": 0.00043488992428654627, | |
| "loss": 3.3252, | |
| "step": 47350 | |
| }, | |
| { | |
| "epoch": 13.795295761527713, | |
| "grad_norm": 0.36630779504776, | |
| "learning_rate": 0.0004347152009318578, | |
| "loss": 3.3257, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 13.80985095482068, | |
| "grad_norm": 0.3555261194705963, | |
| "learning_rate": 0.00043454047757716946, | |
| "loss": 3.3469, | |
| "step": 47450 | |
| }, | |
| { | |
| "epoch": 13.824406148113647, | |
| "grad_norm": 0.3582708537578583, | |
| "learning_rate": 0.00043436575422248105, | |
| "loss": 3.3322, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 13.838961341406614, | |
| "grad_norm": 0.3934645354747772, | |
| "learning_rate": 0.00043419103086779265, | |
| "loss": 3.3315, | |
| "step": 47550 | |
| }, | |
| { | |
| "epoch": 13.853516534699581, | |
| "grad_norm": 0.36250174045562744, | |
| "learning_rate": 0.0004340163075131042, | |
| "loss": 3.3427, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 13.868071727992549, | |
| "grad_norm": 0.36690548062324524, | |
| "learning_rate": 0.0004338415841584158, | |
| "loss": 3.334, | |
| "step": 47650 | |
| }, | |
| { | |
| "epoch": 13.882626921285514, | |
| "grad_norm": 0.34529149532318115, | |
| "learning_rate": 0.0004336668608037274, | |
| "loss": 3.3337, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 13.89718211457848, | |
| "grad_norm": 0.3658097982406616, | |
| "learning_rate": 0.000433492137449039, | |
| "loss": 3.3386, | |
| "step": 47750 | |
| }, | |
| { | |
| "epoch": 13.911737307871448, | |
| "grad_norm": 0.3537824749946594, | |
| "learning_rate": 0.00043331741409435056, | |
| "loss": 3.3259, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 13.926292501164415, | |
| "grad_norm": 0.34315189719200134, | |
| "learning_rate": 0.00043314269073966216, | |
| "loss": 3.3408, | |
| "step": 47850 | |
| }, | |
| { | |
| "epoch": 13.940847694457382, | |
| "grad_norm": 0.36260849237442017, | |
| "learning_rate": 0.00043296796738497375, | |
| "loss": 3.3478, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 13.955402887750349, | |
| "grad_norm": 0.3735863268375397, | |
| "learning_rate": 0.00043279324403028535, | |
| "loss": 3.3229, | |
| "step": 47950 | |
| }, | |
| { | |
| "epoch": 13.969958081043316, | |
| "grad_norm": 0.370256245136261, | |
| "learning_rate": 0.0004326185206755969, | |
| "loss": 3.3482, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 13.969958081043316, | |
| "eval_accuracy": 0.3714039141998958, | |
| "eval_loss": 3.5428836345672607, | |
| "eval_runtime": 82.1666, | |
| "eval_samples_per_second": 202.649, | |
| "eval_steps_per_second": 12.669, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 13.984513274336283, | |
| "grad_norm": 0.33942341804504395, | |
| "learning_rate": 0.00043244379732090854, | |
| "loss": 3.3489, | |
| "step": 48050 | |
| }, | |
| { | |
| "epoch": 13.99906846762925, | |
| "grad_norm": 0.3954874873161316, | |
| "learning_rate": 0.00043226907396622013, | |
| "loss": 3.3318, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 14.01339077782953, | |
| "grad_norm": 0.35203859210014343, | |
| "learning_rate": 0.0004320943506115317, | |
| "loss": 3.2394, | |
| "step": 48150 | |
| }, | |
| { | |
| "epoch": 14.027945971122497, | |
| "grad_norm": 0.37223705649375916, | |
| "learning_rate": 0.00043191962725684326, | |
| "loss": 3.2273, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 14.042501164415464, | |
| "grad_norm": 0.3844163417816162, | |
| "learning_rate": 0.00043174490390215486, | |
| "loss": 3.2253, | |
| "step": 48250 | |
| }, | |
| { | |
| "epoch": 14.057056357708431, | |
| "grad_norm": 0.36918577551841736, | |
| "learning_rate": 0.0004315701805474665, | |
| "loss": 3.2226, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 14.071611551001398, | |
| "grad_norm": 0.3459338843822479, | |
| "learning_rate": 0.0004313954571927781, | |
| "loss": 3.2304, | |
| "step": 48350 | |
| }, | |
| { | |
| "epoch": 14.086166744294363, | |
| "grad_norm": 0.3728587031364441, | |
| "learning_rate": 0.00043122073383808964, | |
| "loss": 3.2517, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 14.10072193758733, | |
| "grad_norm": 0.36574020981788635, | |
| "learning_rate": 0.00043104601048340124, | |
| "loss": 3.2498, | |
| "step": 48450 | |
| }, | |
| { | |
| "epoch": 14.115277130880298, | |
| "grad_norm": 0.386724054813385, | |
| "learning_rate": 0.00043087128712871283, | |
| "loss": 3.2556, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 14.129832324173265, | |
| "grad_norm": 0.3503314256668091, | |
| "learning_rate": 0.00043069656377402437, | |
| "loss": 3.2518, | |
| "step": 48550 | |
| }, | |
| { | |
| "epoch": 14.144387517466232, | |
| "grad_norm": 0.367578387260437, | |
| "learning_rate": 0.000430521840419336, | |
| "loss": 3.261, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 14.158942710759199, | |
| "grad_norm": 0.3733024001121521, | |
| "learning_rate": 0.0004303471170646476, | |
| "loss": 3.2634, | |
| "step": 48650 | |
| }, | |
| { | |
| "epoch": 14.173497904052166, | |
| "grad_norm": 0.3978147804737091, | |
| "learning_rate": 0.0004301723937099592, | |
| "loss": 3.2637, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 14.188053097345133, | |
| "grad_norm": 0.37719839811325073, | |
| "learning_rate": 0.00042999767035527075, | |
| "loss": 3.2751, | |
| "step": 48750 | |
| }, | |
| { | |
| "epoch": 14.2026082906381, | |
| "grad_norm": 0.37800267338752747, | |
| "learning_rate": 0.00042982294700058234, | |
| "loss": 3.2761, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 14.217163483931067, | |
| "grad_norm": 0.39260202646255493, | |
| "learning_rate": 0.000429648223645894, | |
| "loss": 3.2652, | |
| "step": 48850 | |
| }, | |
| { | |
| "epoch": 14.231718677224034, | |
| "grad_norm": 0.3451938331127167, | |
| "learning_rate": 0.0004294735002912056, | |
| "loss": 3.2685, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 14.246273870517001, | |
| "grad_norm": 0.37575921416282654, | |
| "learning_rate": 0.0004292987769365172, | |
| "loss": 3.2839, | |
| "step": 48950 | |
| }, | |
| { | |
| "epoch": 14.260829063809968, | |
| "grad_norm": 0.3547285795211792, | |
| "learning_rate": 0.0004291240535818287, | |
| "loss": 3.2756, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 14.260829063809968, | |
| "eval_accuracy": 0.370588039645721, | |
| "eval_loss": 3.5564966201782227, | |
| "eval_runtime": 82.0493, | |
| "eval_samples_per_second": 202.939, | |
| "eval_steps_per_second": 12.687, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 14.275384257102935, | |
| "grad_norm": 0.3877018392086029, | |
| "learning_rate": 0.0004289493302271403, | |
| "loss": 3.283, | |
| "step": 49050 | |
| }, | |
| { | |
| "epoch": 14.2899394503959, | |
| "grad_norm": 0.37036946415901184, | |
| "learning_rate": 0.0004287746068724519, | |
| "loss": 3.2817, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 14.304494643688868, | |
| "grad_norm": 0.37683945894241333, | |
| "learning_rate": 0.00042859988351776356, | |
| "loss": 3.2818, | |
| "step": 49150 | |
| }, | |
| { | |
| "epoch": 14.319049836981835, | |
| "grad_norm": 0.35723647475242615, | |
| "learning_rate": 0.0004284251601630751, | |
| "loss": 3.2777, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 14.333605030274802, | |
| "grad_norm": 0.4197160601615906, | |
| "learning_rate": 0.0004282504368083867, | |
| "loss": 3.286, | |
| "step": 49250 | |
| }, | |
| { | |
| "epoch": 14.348160223567769, | |
| "grad_norm": 0.3613061308860779, | |
| "learning_rate": 0.0004280757134536983, | |
| "loss": 3.2961, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 14.362715416860736, | |
| "grad_norm": 0.34033647179603577, | |
| "learning_rate": 0.0004279009900990098, | |
| "loss": 3.2728, | |
| "step": 49350 | |
| }, | |
| { | |
| "epoch": 14.377270610153703, | |
| "grad_norm": 0.36049821972846985, | |
| "learning_rate": 0.0004277262667443214, | |
| "loss": 3.285, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 14.39182580344667, | |
| "grad_norm": 0.3611394762992859, | |
| "learning_rate": 0.00042755154338963307, | |
| "loss": 3.2885, | |
| "step": 49450 | |
| }, | |
| { | |
| "epoch": 14.406380996739637, | |
| "grad_norm": 0.36351141333580017, | |
| "learning_rate": 0.00042737682003494466, | |
| "loss": 3.2863, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 14.420936190032604, | |
| "grad_norm": 0.3709542751312256, | |
| "learning_rate": 0.0004272020966802562, | |
| "loss": 3.3028, | |
| "step": 49550 | |
| }, | |
| { | |
| "epoch": 14.435491383325571, | |
| "grad_norm": 0.37077513337135315, | |
| "learning_rate": 0.0004270273733255678, | |
| "loss": 3.2889, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 14.450046576618538, | |
| "grad_norm": 0.39414048194885254, | |
| "learning_rate": 0.0004268526499708794, | |
| "loss": 3.2961, | |
| "step": 49650 | |
| }, | |
| { | |
| "epoch": 14.464601769911505, | |
| "grad_norm": 0.3596315383911133, | |
| "learning_rate": 0.00042667792661619104, | |
| "loss": 3.3063, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 14.47915696320447, | |
| "grad_norm": 0.3694511353969574, | |
| "learning_rate": 0.0004265032032615026, | |
| "loss": 3.2969, | |
| "step": 49750 | |
| }, | |
| { | |
| "epoch": 14.493712156497438, | |
| "grad_norm": 0.3808734714984894, | |
| "learning_rate": 0.0004263284799068142, | |
| "loss": 3.2942, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 14.508267349790405, | |
| "grad_norm": 0.39453205466270447, | |
| "learning_rate": 0.00042615375655212577, | |
| "loss": 3.2974, | |
| "step": 49850 | |
| }, | |
| { | |
| "epoch": 14.522822543083372, | |
| "grad_norm": 0.43535178899765015, | |
| "learning_rate": 0.00042597903319743736, | |
| "loss": 3.3068, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 14.537377736376339, | |
| "grad_norm": 0.3692663013935089, | |
| "learning_rate": 0.0004258043098427489, | |
| "loss": 3.293, | |
| "step": 49950 | |
| }, | |
| { | |
| "epoch": 14.551932929669306, | |
| "grad_norm": 0.3517593443393707, | |
| "learning_rate": 0.00042562958648806055, | |
| "loss": 3.3011, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 14.551932929669306, | |
| "eval_accuracy": 0.371388283068276, | |
| "eval_loss": 3.5488150119781494, | |
| "eval_runtime": 82.0842, | |
| "eval_samples_per_second": 202.853, | |
| "eval_steps_per_second": 12.682, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 14.566488122962273, | |
| "grad_norm": 0.35088521242141724, | |
| "learning_rate": 0.00042545486313337214, | |
| "loss": 3.3012, | |
| "step": 50050 | |
| }, | |
| { | |
| "epoch": 14.58104331625524, | |
| "grad_norm": 0.3828219771385193, | |
| "learning_rate": 0.00042528013977868374, | |
| "loss": 3.3086, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 14.595598509548207, | |
| "grad_norm": 0.3716656267642975, | |
| "learning_rate": 0.0004251054164239953, | |
| "loss": 3.3194, | |
| "step": 50150 | |
| }, | |
| { | |
| "epoch": 14.610153702841174, | |
| "grad_norm": 0.3662955164909363, | |
| "learning_rate": 0.0004249306930693069, | |
| "loss": 3.3024, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 14.624708896134141, | |
| "grad_norm": 0.3820929229259491, | |
| "learning_rate": 0.0004247559697146185, | |
| "loss": 3.3135, | |
| "step": 50250 | |
| }, | |
| { | |
| "epoch": 14.639264089427108, | |
| "grad_norm": 0.36421287059783936, | |
| "learning_rate": 0.0004245812463599301, | |
| "loss": 3.3051, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 14.653819282720075, | |
| "grad_norm": 0.3510805070400238, | |
| "learning_rate": 0.00042440652300524166, | |
| "loss": 3.2964, | |
| "step": 50350 | |
| }, | |
| { | |
| "epoch": 14.668374476013042, | |
| "grad_norm": 0.36210376024246216, | |
| "learning_rate": 0.00042423179965055325, | |
| "loss": 3.3028, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 14.682929669306008, | |
| "grad_norm": 0.36954787373542786, | |
| "learning_rate": 0.00042405707629586484, | |
| "loss": 3.306, | |
| "step": 50450 | |
| }, | |
| { | |
| "epoch": 14.697484862598975, | |
| "grad_norm": 0.3595735430717468, | |
| "learning_rate": 0.0004238823529411764, | |
| "loss": 3.3066, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 14.712040055891942, | |
| "grad_norm": 0.34303489327430725, | |
| "learning_rate": 0.00042370762958648803, | |
| "loss": 3.3091, | |
| "step": 50550 | |
| }, | |
| { | |
| "epoch": 14.726595249184909, | |
| "grad_norm": 0.3539566695690155, | |
| "learning_rate": 0.00042353290623179963, | |
| "loss": 3.3257, | |
| "step": 50600 | |
| }, | |
| { | |
| "epoch": 14.741150442477876, | |
| "grad_norm": 0.3739614486694336, | |
| "learning_rate": 0.0004233581828771112, | |
| "loss": 3.3264, | |
| "step": 50650 | |
| }, | |
| { | |
| "epoch": 14.755705635770843, | |
| "grad_norm": 0.3727917969226837, | |
| "learning_rate": 0.00042318345952242276, | |
| "loss": 3.3148, | |
| "step": 50700 | |
| }, | |
| { | |
| "epoch": 14.77026082906381, | |
| "grad_norm": 0.3577497899532318, | |
| "learning_rate": 0.00042300873616773436, | |
| "loss": 3.3051, | |
| "step": 50750 | |
| }, | |
| { | |
| "epoch": 14.784816022356777, | |
| "grad_norm": 0.3757348954677582, | |
| "learning_rate": 0.00042283401281304595, | |
| "loss": 3.3205, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 14.799371215649744, | |
| "grad_norm": 0.38980767130851746, | |
| "learning_rate": 0.0004226592894583576, | |
| "loss": 3.3317, | |
| "step": 50850 | |
| }, | |
| { | |
| "epoch": 14.813926408942711, | |
| "grad_norm": 0.36649829149246216, | |
| "learning_rate": 0.00042248456610366914, | |
| "loss": 3.3159, | |
| "step": 50900 | |
| }, | |
| { | |
| "epoch": 14.828481602235678, | |
| "grad_norm": 0.36675921082496643, | |
| "learning_rate": 0.00042230984274898073, | |
| "loss": 3.3183, | |
| "step": 50950 | |
| }, | |
| { | |
| "epoch": 14.843036795528645, | |
| "grad_norm": 0.4589317739009857, | |
| "learning_rate": 0.00042213511939429233, | |
| "loss": 3.3204, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 14.843036795528645, | |
| "eval_accuracy": 0.3719695731208471, | |
| "eval_loss": 3.5412871837615967, | |
| "eval_runtime": 82.0514, | |
| "eval_samples_per_second": 202.934, | |
| "eval_steps_per_second": 12.687, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 14.857591988821612, | |
| "grad_norm": 0.37191376090049744, | |
| "learning_rate": 0.0004219603960396039, | |
| "loss": 3.3229, | |
| "step": 51050 | |
| }, | |
| { | |
| "epoch": 14.872147182114578, | |
| "grad_norm": 0.3777611255645752, | |
| "learning_rate": 0.0004217856726849155, | |
| "loss": 3.3371, | |
| "step": 51100 | |
| }, | |
| { | |
| "epoch": 14.886702375407545, | |
| "grad_norm": 0.33734753727912903, | |
| "learning_rate": 0.0004216109493302271, | |
| "loss": 3.3313, | |
| "step": 51150 | |
| }, | |
| { | |
| "epoch": 14.901257568700512, | |
| "grad_norm": 0.3532101511955261, | |
| "learning_rate": 0.0004214362259755387, | |
| "loss": 3.3218, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 14.915812761993479, | |
| "grad_norm": 0.37541526556015015, | |
| "learning_rate": 0.0004212615026208503, | |
| "loss": 3.3166, | |
| "step": 51250 | |
| }, | |
| { | |
| "epoch": 14.930367955286446, | |
| "grad_norm": 0.36779287457466125, | |
| "learning_rate": 0.00042108677926616184, | |
| "loss": 3.3204, | |
| "step": 51300 | |
| }, | |
| { | |
| "epoch": 14.944923148579413, | |
| "grad_norm": 0.38258588314056396, | |
| "learning_rate": 0.00042091205591147343, | |
| "loss": 3.322, | |
| "step": 51350 | |
| }, | |
| { | |
| "epoch": 14.95947834187238, | |
| "grad_norm": 0.3696751892566681, | |
| "learning_rate": 0.0004207373325567851, | |
| "loss": 3.332, | |
| "step": 51400 | |
| }, | |
| { | |
| "epoch": 14.974033535165347, | |
| "grad_norm": 0.34622684121131897, | |
| "learning_rate": 0.0004205626092020967, | |
| "loss": 3.3217, | |
| "step": 51450 | |
| }, | |
| { | |
| "epoch": 14.988588728458314, | |
| "grad_norm": 0.37381526827812195, | |
| "learning_rate": 0.0004203878858474082, | |
| "loss": 3.3326, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 15.002911038658594, | |
| "grad_norm": 0.3781229853630066, | |
| "learning_rate": 0.0004202131624927198, | |
| "loss": 3.311, | |
| "step": 51550 | |
| }, | |
| { | |
| "epoch": 15.01746623195156, | |
| "grad_norm": 0.35842445492744446, | |
| "learning_rate": 0.0004200384391380314, | |
| "loss": 3.2183, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 15.032021425244528, | |
| "grad_norm": 0.37834274768829346, | |
| "learning_rate": 0.00041986371578334305, | |
| "loss": 3.2173, | |
| "step": 51650 | |
| }, | |
| { | |
| "epoch": 15.046576618537495, | |
| "grad_norm": 0.3703083395957947, | |
| "learning_rate": 0.0004196889924286546, | |
| "loss": 3.2128, | |
| "step": 51700 | |
| }, | |
| { | |
| "epoch": 15.06113181183046, | |
| "grad_norm": 0.40328437089920044, | |
| "learning_rate": 0.0004195142690739662, | |
| "loss": 3.2262, | |
| "step": 51750 | |
| }, | |
| { | |
| "epoch": 15.075687005123427, | |
| "grad_norm": 0.38304898142814636, | |
| "learning_rate": 0.0004193395457192778, | |
| "loss": 3.2371, | |
| "step": 51800 | |
| }, | |
| { | |
| "epoch": 15.090242198416394, | |
| "grad_norm": 0.3861612379550934, | |
| "learning_rate": 0.0004191648223645893, | |
| "loss": 3.2384, | |
| "step": 51850 | |
| }, | |
| { | |
| "epoch": 15.104797391709361, | |
| "grad_norm": 0.3532731831073761, | |
| "learning_rate": 0.0004189900990099009, | |
| "loss": 3.2312, | |
| "step": 51900 | |
| }, | |
| { | |
| "epoch": 15.119352585002328, | |
| "grad_norm": 0.3712012469768524, | |
| "learning_rate": 0.00041881537565521256, | |
| "loss": 3.2342, | |
| "step": 51950 | |
| }, | |
| { | |
| "epoch": 15.133907778295296, | |
| "grad_norm": 0.39667704701423645, | |
| "learning_rate": 0.00041864065230052416, | |
| "loss": 3.2399, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 15.133907778295296, | |
| "eval_accuracy": 0.37107648312701613, | |
| "eval_loss": 3.5585224628448486, | |
| "eval_runtime": 81.9725, | |
| "eval_samples_per_second": 203.129, | |
| "eval_steps_per_second": 12.699, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 15.148462971588263, | |
| "grad_norm": 0.35259756445884705, | |
| "learning_rate": 0.00041846592894583575, | |
| "loss": 3.2348, | |
| "step": 52050 | |
| }, | |
| { | |
| "epoch": 15.16301816488123, | |
| "grad_norm": 0.407882958650589, | |
| "learning_rate": 0.0004182912055911473, | |
| "loss": 3.2392, | |
| "step": 52100 | |
| }, | |
| { | |
| "epoch": 15.177573358174197, | |
| "grad_norm": 0.38144275546073914, | |
| "learning_rate": 0.0004181164822364589, | |
| "loss": 3.2538, | |
| "step": 52150 | |
| }, | |
| { | |
| "epoch": 15.192128551467164, | |
| "grad_norm": 0.38322558999061584, | |
| "learning_rate": 0.0004179417588817705, | |
| "loss": 3.2524, | |
| "step": 52200 | |
| }, | |
| { | |
| "epoch": 15.20668374476013, | |
| "grad_norm": 0.3589724004268646, | |
| "learning_rate": 0.00041776703552708213, | |
| "loss": 3.2494, | |
| "step": 52250 | |
| }, | |
| { | |
| "epoch": 15.221238938053098, | |
| "grad_norm": 0.3652423620223999, | |
| "learning_rate": 0.00041759231217239367, | |
| "loss": 3.2579, | |
| "step": 52300 | |
| }, | |
| { | |
| "epoch": 15.235794131346065, | |
| "grad_norm": 0.3804478645324707, | |
| "learning_rate": 0.00041741758881770527, | |
| "loss": 3.2555, | |
| "step": 52350 | |
| }, | |
| { | |
| "epoch": 15.250349324639032, | |
| "grad_norm": 0.36880606412887573, | |
| "learning_rate": 0.00041724286546301686, | |
| "loss": 3.2671, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 15.264904517931997, | |
| "grad_norm": 0.3613564074039459, | |
| "learning_rate": 0.0004170681421083284, | |
| "loss": 3.2756, | |
| "step": 52450 | |
| }, | |
| { | |
| "epoch": 15.279459711224964, | |
| "grad_norm": 0.37377554178237915, | |
| "learning_rate": 0.00041689341875364005, | |
| "loss": 3.2598, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 15.294014904517931, | |
| "grad_norm": 0.3634202480316162, | |
| "learning_rate": 0.00041671869539895164, | |
| "loss": 3.2673, | |
| "step": 52550 | |
| }, | |
| { | |
| "epoch": 15.308570097810899, | |
| "grad_norm": 0.35884931683540344, | |
| "learning_rate": 0.00041654397204426324, | |
| "loss": 3.2676, | |
| "step": 52600 | |
| }, | |
| { | |
| "epoch": 15.323125291103866, | |
| "grad_norm": 0.3785131871700287, | |
| "learning_rate": 0.0004163692486895748, | |
| "loss": 3.2697, | |
| "step": 52650 | |
| }, | |
| { | |
| "epoch": 15.337680484396833, | |
| "grad_norm": 0.4078878164291382, | |
| "learning_rate": 0.00041619452533488637, | |
| "loss": 3.27, | |
| "step": 52700 | |
| }, | |
| { | |
| "epoch": 15.3522356776898, | |
| "grad_norm": 0.3731200098991394, | |
| "learning_rate": 0.00041601980198019797, | |
| "loss": 3.273, | |
| "step": 52750 | |
| }, | |
| { | |
| "epoch": 15.366790870982767, | |
| "grad_norm": 0.3864777386188507, | |
| "learning_rate": 0.0004158450786255096, | |
| "loss": 3.2788, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 15.381346064275734, | |
| "grad_norm": 0.3813350200653076, | |
| "learning_rate": 0.00041567035527082115, | |
| "loss": 3.2745, | |
| "step": 52850 | |
| }, | |
| { | |
| "epoch": 15.3959012575687, | |
| "grad_norm": 0.3717416524887085, | |
| "learning_rate": 0.00041549563191613275, | |
| "loss": 3.2718, | |
| "step": 52900 | |
| }, | |
| { | |
| "epoch": 15.410456450861668, | |
| "grad_norm": 0.38161566853523254, | |
| "learning_rate": 0.00041532090856144434, | |
| "loss": 3.2822, | |
| "step": 52950 | |
| }, | |
| { | |
| "epoch": 15.425011644154635, | |
| "grad_norm": 0.3814772665500641, | |
| "learning_rate": 0.00041514618520675594, | |
| "loss": 3.2816, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 15.425011644154635, | |
| "eval_accuracy": 0.37122033654884123, | |
| "eval_loss": 3.555321455001831, | |
| "eval_runtime": 82.1082, | |
| "eval_samples_per_second": 202.793, | |
| "eval_steps_per_second": 12.678, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 15.439566837447602, | |
| "grad_norm": 0.36565178632736206, | |
| "learning_rate": 0.00041497146185206753, | |
| "loss": 3.2769, | |
| "step": 53050 | |
| }, | |
| { | |
| "epoch": 15.454122030740567, | |
| "grad_norm": 0.3605792224407196, | |
| "learning_rate": 0.0004147967384973791, | |
| "loss": 3.2783, | |
| "step": 53100 | |
| }, | |
| { | |
| "epoch": 15.468677224033534, | |
| "grad_norm": 0.38313713669776917, | |
| "learning_rate": 0.0004146220151426907, | |
| "loss": 3.2978, | |
| "step": 53150 | |
| }, | |
| { | |
| "epoch": 15.483232417326501, | |
| "grad_norm": 0.38904285430908203, | |
| "learning_rate": 0.0004144472917880023, | |
| "loss": 3.2719, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 15.497787610619469, | |
| "grad_norm": 0.35261771082878113, | |
| "learning_rate": 0.00041427256843331385, | |
| "loss": 3.2977, | |
| "step": 53250 | |
| }, | |
| { | |
| "epoch": 15.512342803912436, | |
| "grad_norm": 0.3562575876712799, | |
| "learning_rate": 0.00041409784507862545, | |
| "loss": 3.2952, | |
| "step": 53300 | |
| }, | |
| { | |
| "epoch": 15.526897997205403, | |
| "grad_norm": 0.35312509536743164, | |
| "learning_rate": 0.0004139231217239371, | |
| "loss": 3.2916, | |
| "step": 53350 | |
| }, | |
| { | |
| "epoch": 15.54145319049837, | |
| "grad_norm": 0.36511388421058655, | |
| "learning_rate": 0.0004137483983692487, | |
| "loss": 3.2909, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 15.556008383791337, | |
| "grad_norm": 0.38250142335891724, | |
| "learning_rate": 0.00041357367501456023, | |
| "loss": 3.2858, | |
| "step": 53450 | |
| }, | |
| { | |
| "epoch": 15.570563577084304, | |
| "grad_norm": 0.3564223051071167, | |
| "learning_rate": 0.0004133989516598718, | |
| "loss": 3.283, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 15.585118770377271, | |
| "grad_norm": 0.39031845331192017, | |
| "learning_rate": 0.0004132242283051834, | |
| "loss": 3.2885, | |
| "step": 53550 | |
| }, | |
| { | |
| "epoch": 15.599673963670238, | |
| "grad_norm": 0.38138341903686523, | |
| "learning_rate": 0.00041304950495049496, | |
| "loss": 3.2858, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 15.614229156963205, | |
| "grad_norm": 0.3911750614643097, | |
| "learning_rate": 0.0004128747815958066, | |
| "loss": 3.2944, | |
| "step": 53650 | |
| }, | |
| { | |
| "epoch": 15.628784350256172, | |
| "grad_norm": 0.3727046847343445, | |
| "learning_rate": 0.0004127000582411182, | |
| "loss": 3.2835, | |
| "step": 53700 | |
| }, | |
| { | |
| "epoch": 15.64333954354914, | |
| "grad_norm": 0.36191901564598083, | |
| "learning_rate": 0.0004125253348864298, | |
| "loss": 3.2856, | |
| "step": 53750 | |
| }, | |
| { | |
| "epoch": 15.657894736842106, | |
| "grad_norm": 0.3556697964668274, | |
| "learning_rate": 0.00041235061153174134, | |
| "loss": 3.3035, | |
| "step": 53800 | |
| }, | |
| { | |
| "epoch": 15.672449930135071, | |
| "grad_norm": 0.3924063444137573, | |
| "learning_rate": 0.00041217588817705293, | |
| "loss": 3.2957, | |
| "step": 53850 | |
| }, | |
| { | |
| "epoch": 15.687005123428039, | |
| "grad_norm": 0.38262417912483215, | |
| "learning_rate": 0.0004120011648223646, | |
| "loss": 3.2899, | |
| "step": 53900 | |
| }, | |
| { | |
| "epoch": 15.701560316721006, | |
| "grad_norm": 0.37065044045448303, | |
| "learning_rate": 0.0004118264414676762, | |
| "loss": 3.2994, | |
| "step": 53950 | |
| }, | |
| { | |
| "epoch": 15.716115510013973, | |
| "grad_norm": 0.3529834747314453, | |
| "learning_rate": 0.0004116517181129877, | |
| "loss": 3.302, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 15.716115510013973, | |
| "eval_accuracy": 0.371962991591744, | |
| "eval_loss": 3.545299768447876, | |
| "eval_runtime": 82.0347, | |
| "eval_samples_per_second": 202.975, | |
| "eval_steps_per_second": 12.69, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 15.73067070330694, | |
| "grad_norm": 0.37337830662727356, | |
| "learning_rate": 0.0004114769947582993, | |
| "loss": 3.2992, | |
| "step": 54050 | |
| }, | |
| { | |
| "epoch": 15.745225896599907, | |
| "grad_norm": 0.3891293406486511, | |
| "learning_rate": 0.0004113022714036109, | |
| "loss": 3.302, | |
| "step": 54100 | |
| }, | |
| { | |
| "epoch": 15.759781089892874, | |
| "grad_norm": 0.35992804169654846, | |
| "learning_rate": 0.0004111275480489225, | |
| "loss": 3.3039, | |
| "step": 54150 | |
| }, | |
| { | |
| "epoch": 15.774336283185841, | |
| "grad_norm": 0.366534948348999, | |
| "learning_rate": 0.0004109528246942341, | |
| "loss": 3.3027, | |
| "step": 54200 | |
| }, | |
| { | |
| "epoch": 15.788891476478808, | |
| "grad_norm": 0.38169747591018677, | |
| "learning_rate": 0.0004107781013395457, | |
| "loss": 3.2929, | |
| "step": 54250 | |
| }, | |
| { | |
| "epoch": 15.803446669771775, | |
| "grad_norm": 0.370360791683197, | |
| "learning_rate": 0.0004106033779848573, | |
| "loss": 3.3094, | |
| "step": 54300 | |
| }, | |
| { | |
| "epoch": 15.818001863064742, | |
| "grad_norm": 0.3841591477394104, | |
| "learning_rate": 0.0004104286546301689, | |
| "loss": 3.3006, | |
| "step": 54350 | |
| }, | |
| { | |
| "epoch": 15.83255705635771, | |
| "grad_norm": 0.3520496189594269, | |
| "learning_rate": 0.0004102539312754804, | |
| "loss": 3.3061, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 15.847112249650674, | |
| "grad_norm": 0.38375771045684814, | |
| "learning_rate": 0.00041007920792079206, | |
| "loss": 3.2943, | |
| "step": 54450 | |
| }, | |
| { | |
| "epoch": 15.861667442943642, | |
| "grad_norm": 0.34675076603889465, | |
| "learning_rate": 0.00040990448456610366, | |
| "loss": 3.3062, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 15.876222636236609, | |
| "grad_norm": 0.38124361634254456, | |
| "learning_rate": 0.00040972976121141525, | |
| "loss": 3.3046, | |
| "step": 54550 | |
| }, | |
| { | |
| "epoch": 15.890777829529576, | |
| "grad_norm": 0.3727259039878845, | |
| "learning_rate": 0.0004095550378567268, | |
| "loss": 3.3123, | |
| "step": 54600 | |
| }, | |
| { | |
| "epoch": 15.905333022822543, | |
| "grad_norm": 0.3622484505176544, | |
| "learning_rate": 0.0004093803145020384, | |
| "loss": 3.311, | |
| "step": 54650 | |
| }, | |
| { | |
| "epoch": 15.91988821611551, | |
| "grad_norm": 0.39095252752304077, | |
| "learning_rate": 0.00040920559114735, | |
| "loss": 3.3284, | |
| "step": 54700 | |
| }, | |
| { | |
| "epoch": 15.934443409408477, | |
| "grad_norm": 0.4001758396625519, | |
| "learning_rate": 0.00040903086779266163, | |
| "loss": 3.3172, | |
| "step": 54750 | |
| }, | |
| { | |
| "epoch": 15.948998602701444, | |
| "grad_norm": 0.37889063358306885, | |
| "learning_rate": 0.00040885614443797317, | |
| "loss": 3.3177, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 15.963553795994411, | |
| "grad_norm": 0.38452765345573425, | |
| "learning_rate": 0.00040868142108328476, | |
| "loss": 3.3236, | |
| "step": 54850 | |
| }, | |
| { | |
| "epoch": 15.978108989287378, | |
| "grad_norm": 0.3674323260784149, | |
| "learning_rate": 0.00040850669772859636, | |
| "loss": 3.3143, | |
| "step": 54900 | |
| }, | |
| { | |
| "epoch": 15.992664182580345, | |
| "grad_norm": 0.3993707001209259, | |
| "learning_rate": 0.0004083319743739079, | |
| "loss": 3.3035, | |
| "step": 54950 | |
| }, | |
| { | |
| "epoch": 16.006986492780623, | |
| "grad_norm": 0.3856005370616913, | |
| "learning_rate": 0.0004081572510192195, | |
| "loss": 3.2544, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 16.006986492780623, | |
| "eval_accuracy": 0.3714996989538072, | |
| "eval_loss": 3.5561764240264893, | |
| "eval_runtime": 82.0924, | |
| "eval_samples_per_second": 202.832, | |
| "eval_steps_per_second": 12.681, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 16.02154168607359, | |
| "grad_norm": 0.3600333034992218, | |
| "learning_rate": 0.00040798252766453114, | |
| "loss": 3.2023, | |
| "step": 55050 | |
| }, | |
| { | |
| "epoch": 16.036096879366557, | |
| "grad_norm": 0.3678044080734253, | |
| "learning_rate": 0.00040780780430984273, | |
| "loss": 3.1991, | |
| "step": 55100 | |
| }, | |
| { | |
| "epoch": 16.050652072659524, | |
| "grad_norm": 0.34882304072380066, | |
| "learning_rate": 0.0004076330809551543, | |
| "loss": 3.2225, | |
| "step": 55150 | |
| }, | |
| { | |
| "epoch": 16.06520726595249, | |
| "grad_norm": 0.3687790632247925, | |
| "learning_rate": 0.00040745835760046587, | |
| "loss": 3.2103, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 16.079762459245458, | |
| "grad_norm": 0.38802239298820496, | |
| "learning_rate": 0.00040728363424577746, | |
| "loss": 3.2232, | |
| "step": 55250 | |
| }, | |
| { | |
| "epoch": 16.094317652538425, | |
| "grad_norm": 0.36999067664146423, | |
| "learning_rate": 0.0004071089108910891, | |
| "loss": 3.2233, | |
| "step": 55300 | |
| }, | |
| { | |
| "epoch": 16.108872845831392, | |
| "grad_norm": 0.38598641753196716, | |
| "learning_rate": 0.0004069341875364007, | |
| "loss": 3.2204, | |
| "step": 55350 | |
| }, | |
| { | |
| "epoch": 16.12342803912436, | |
| "grad_norm": 0.3784811794757843, | |
| "learning_rate": 0.00040675946418171225, | |
| "loss": 3.2383, | |
| "step": 55400 | |
| }, | |
| { | |
| "epoch": 16.137983232417326, | |
| "grad_norm": 0.359389990568161, | |
| "learning_rate": 0.00040658474082702384, | |
| "loss": 3.225, | |
| "step": 55450 | |
| }, | |
| { | |
| "epoch": 16.152538425710294, | |
| "grad_norm": 0.37588128447532654, | |
| "learning_rate": 0.00040641001747233543, | |
| "loss": 3.2353, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 16.16709361900326, | |
| "grad_norm": 0.39327552914619446, | |
| "learning_rate": 0.000406235294117647, | |
| "loss": 3.2232, | |
| "step": 55550 | |
| }, | |
| { | |
| "epoch": 16.181648812296228, | |
| "grad_norm": 0.35211512446403503, | |
| "learning_rate": 0.0004060605707629586, | |
| "loss": 3.2425, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 16.196204005589195, | |
| "grad_norm": 0.3947155475616455, | |
| "learning_rate": 0.0004058858474082702, | |
| "loss": 3.2406, | |
| "step": 55650 | |
| }, | |
| { | |
| "epoch": 16.21075919888216, | |
| "grad_norm": 0.3916556239128113, | |
| "learning_rate": 0.0004057111240535818, | |
| "loss": 3.2281, | |
| "step": 55700 | |
| }, | |
| { | |
| "epoch": 16.22531439217513, | |
| "grad_norm": 0.3654915690422058, | |
| "learning_rate": 0.00040553640069889335, | |
| "loss": 3.2394, | |
| "step": 55750 | |
| }, | |
| { | |
| "epoch": 16.239869585468096, | |
| "grad_norm": 0.40865710377693176, | |
| "learning_rate": 0.00040536167734420495, | |
| "loss": 3.248, | |
| "step": 55800 | |
| }, | |
| { | |
| "epoch": 16.254424778761063, | |
| "grad_norm": 0.38925623893737793, | |
| "learning_rate": 0.0004051869539895166, | |
| "loss": 3.2461, | |
| "step": 55850 | |
| }, | |
| { | |
| "epoch": 16.26897997205403, | |
| "grad_norm": 0.3794665038585663, | |
| "learning_rate": 0.0004050122306348282, | |
| "loss": 3.2597, | |
| "step": 55900 | |
| }, | |
| { | |
| "epoch": 16.283535165346997, | |
| "grad_norm": 0.3866353929042816, | |
| "learning_rate": 0.00040483750728013973, | |
| "loss": 3.2469, | |
| "step": 55950 | |
| }, | |
| { | |
| "epoch": 16.298090358639964, | |
| "grad_norm": 0.41859227418899536, | |
| "learning_rate": 0.0004046627839254513, | |
| "loss": 3.2429, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 16.298090358639964, | |
| "eval_accuracy": 0.37185322108848856, | |
| "eval_loss": 3.552354097366333, | |
| "eval_runtime": 82.1568, | |
| "eval_samples_per_second": 202.673, | |
| "eval_steps_per_second": 12.671, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 16.31264555193293, | |
| "grad_norm": 0.3689730763435364, | |
| "learning_rate": 0.0004044880605707629, | |
| "loss": 3.2499, | |
| "step": 56050 | |
| }, | |
| { | |
| "epoch": 16.3272007452259, | |
| "grad_norm": 0.39871639013290405, | |
| "learning_rate": 0.00040431333721607446, | |
| "loss": 3.2715, | |
| "step": 56100 | |
| }, | |
| { | |
| "epoch": 16.341755938518865, | |
| "grad_norm": 0.3783362805843353, | |
| "learning_rate": 0.0004041386138613861, | |
| "loss": 3.2662, | |
| "step": 56150 | |
| }, | |
| { | |
| "epoch": 16.35631113181183, | |
| "grad_norm": 0.3843650817871094, | |
| "learning_rate": 0.0004039638905066977, | |
| "loss": 3.2642, | |
| "step": 56200 | |
| }, | |
| { | |
| "epoch": 16.370866325104796, | |
| "grad_norm": 0.37258172035217285, | |
| "learning_rate": 0.0004037891671520093, | |
| "loss": 3.2785, | |
| "step": 56250 | |
| }, | |
| { | |
| "epoch": 16.385421518397763, | |
| "grad_norm": 0.36249375343322754, | |
| "learning_rate": 0.0004036144437973209, | |
| "loss": 3.2619, | |
| "step": 56300 | |
| }, | |
| { | |
| "epoch": 16.39997671169073, | |
| "grad_norm": 0.3838511109352112, | |
| "learning_rate": 0.00040343972044263243, | |
| "loss": 3.2749, | |
| "step": 56350 | |
| }, | |
| { | |
| "epoch": 16.414531904983697, | |
| "grad_norm": 0.4369628131389618, | |
| "learning_rate": 0.0004032649970879441, | |
| "loss": 3.267, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 16.429087098276664, | |
| "grad_norm": 0.4005928039550781, | |
| "learning_rate": 0.00040309027373325567, | |
| "loss": 3.278, | |
| "step": 56450 | |
| }, | |
| { | |
| "epoch": 16.44364229156963, | |
| "grad_norm": 0.38534489274024963, | |
| "learning_rate": 0.00040291555037856727, | |
| "loss": 3.2823, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 16.4581974848626, | |
| "grad_norm": 0.42992380261421204, | |
| "learning_rate": 0.0004027408270238788, | |
| "loss": 3.2716, | |
| "step": 56550 | |
| }, | |
| { | |
| "epoch": 16.472752678155565, | |
| "grad_norm": 0.3646402060985565, | |
| "learning_rate": 0.0004025661036691904, | |
| "loss": 3.2619, | |
| "step": 56600 | |
| }, | |
| { | |
| "epoch": 16.487307871448532, | |
| "grad_norm": 0.35474634170532227, | |
| "learning_rate": 0.000402391380314502, | |
| "loss": 3.2723, | |
| "step": 56650 | |
| }, | |
| { | |
| "epoch": 16.5018630647415, | |
| "grad_norm": 0.37521177530288696, | |
| "learning_rate": 0.00040221665695981364, | |
| "loss": 3.2719, | |
| "step": 56700 | |
| }, | |
| { | |
| "epoch": 16.516418258034467, | |
| "grad_norm": 0.37143152952194214, | |
| "learning_rate": 0.0004020419336051252, | |
| "loss": 3.2803, | |
| "step": 56750 | |
| }, | |
| { | |
| "epoch": 16.530973451327434, | |
| "grad_norm": 0.3749611973762512, | |
| "learning_rate": 0.0004018672102504368, | |
| "loss": 3.27, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 16.5455286446204, | |
| "grad_norm": 0.38165104389190674, | |
| "learning_rate": 0.00040169248689574837, | |
| "loss": 3.2689, | |
| "step": 56850 | |
| }, | |
| { | |
| "epoch": 16.560083837913368, | |
| "grad_norm": 0.3784690499305725, | |
| "learning_rate": 0.0004015177635410599, | |
| "loss": 3.2703, | |
| "step": 56900 | |
| }, | |
| { | |
| "epoch": 16.574639031206335, | |
| "grad_norm": 0.3373439311981201, | |
| "learning_rate": 0.0004013430401863715, | |
| "loss": 3.2823, | |
| "step": 56950 | |
| }, | |
| { | |
| "epoch": 16.589194224499302, | |
| "grad_norm": 0.378440260887146, | |
| "learning_rate": 0.00040116831683168315, | |
| "loss": 3.2782, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 16.589194224499302, | |
| "eval_accuracy": 0.3719844990886345, | |
| "eval_loss": 3.547348976135254, | |
| "eval_runtime": 82.0705, | |
| "eval_samples_per_second": 202.887, | |
| "eval_steps_per_second": 12.684, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 16.60374941779227, | |
| "grad_norm": 0.3788599967956543, | |
| "learning_rate": 0.00040099359347699475, | |
| "loss": 3.2877, | |
| "step": 57050 | |
| }, | |
| { | |
| "epoch": 16.618304611085236, | |
| "grad_norm": 0.3887740671634674, | |
| "learning_rate": 0.0004008188701223063, | |
| "loss": 3.2761, | |
| "step": 57100 | |
| }, | |
| { | |
| "epoch": 16.632859804378203, | |
| "grad_norm": 0.3760894536972046, | |
| "learning_rate": 0.0004006441467676179, | |
| "loss": 3.2788, | |
| "step": 57150 | |
| }, | |
| { | |
| "epoch": 16.64741499767117, | |
| "grad_norm": 0.3659774661064148, | |
| "learning_rate": 0.0004004694234129295, | |
| "loss": 3.2825, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 16.661970190964137, | |
| "grad_norm": 0.3534715473651886, | |
| "learning_rate": 0.0004002947000582411, | |
| "loss": 3.291, | |
| "step": 57250 | |
| }, | |
| { | |
| "epoch": 16.676525384257104, | |
| "grad_norm": 0.3567655384540558, | |
| "learning_rate": 0.00040011997670355267, | |
| "loss": 3.2833, | |
| "step": 57300 | |
| }, | |
| { | |
| "epoch": 16.69108057755007, | |
| "grad_norm": 0.3581436574459076, | |
| "learning_rate": 0.00039994525334886426, | |
| "loss": 3.2773, | |
| "step": 57350 | |
| }, | |
| { | |
| "epoch": 16.70563577084304, | |
| "grad_norm": 0.3718293309211731, | |
| "learning_rate": 0.00039977052999417585, | |
| "loss": 3.2781, | |
| "step": 57400 | |
| }, | |
| { | |
| "epoch": 16.720190964136005, | |
| "grad_norm": 0.3499705195426941, | |
| "learning_rate": 0.00039959580663948745, | |
| "loss": 3.2887, | |
| "step": 57450 | |
| }, | |
| { | |
| "epoch": 16.734746157428972, | |
| "grad_norm": 0.37727317214012146, | |
| "learning_rate": 0.000399421083284799, | |
| "loss": 3.2842, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 16.749301350721936, | |
| "grad_norm": 0.38723015785217285, | |
| "learning_rate": 0.00039924635993011064, | |
| "loss": 3.2862, | |
| "step": 57550 | |
| }, | |
| { | |
| "epoch": 16.763856544014903, | |
| "grad_norm": 0.3937668204307556, | |
| "learning_rate": 0.00039907163657542223, | |
| "loss": 3.281, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 16.77841173730787, | |
| "grad_norm": 0.3740237355232239, | |
| "learning_rate": 0.0003988969132207338, | |
| "loss": 3.2949, | |
| "step": 57650 | |
| }, | |
| { | |
| "epoch": 16.792966930600837, | |
| "grad_norm": 0.3847373127937317, | |
| "learning_rate": 0.00039872218986604537, | |
| "loss": 3.2916, | |
| "step": 57700 | |
| }, | |
| { | |
| "epoch": 16.807522123893804, | |
| "grad_norm": 0.37429681420326233, | |
| "learning_rate": 0.00039854746651135696, | |
| "loss": 3.2901, | |
| "step": 57750 | |
| }, | |
| { | |
| "epoch": 16.82207731718677, | |
| "grad_norm": 0.38341617584228516, | |
| "learning_rate": 0.0003983727431566686, | |
| "loss": 3.2947, | |
| "step": 57800 | |
| }, | |
| { | |
| "epoch": 16.83663251047974, | |
| "grad_norm": 0.40129104256629944, | |
| "learning_rate": 0.0003981980198019802, | |
| "loss": 3.2808, | |
| "step": 57850 | |
| }, | |
| { | |
| "epoch": 16.851187703772705, | |
| "grad_norm": 0.3500666320323944, | |
| "learning_rate": 0.00039802329644729174, | |
| "loss": 3.2971, | |
| "step": 57900 | |
| }, | |
| { | |
| "epoch": 16.865742897065672, | |
| "grad_norm": 0.38086992502212524, | |
| "learning_rate": 0.00039784857309260334, | |
| "loss": 3.3031, | |
| "step": 57950 | |
| }, | |
| { | |
| "epoch": 16.88029809035864, | |
| "grad_norm": 0.37816548347473145, | |
| "learning_rate": 0.00039767384973791493, | |
| "loss": 3.2818, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 16.88029809035864, | |
| "eval_accuracy": 0.3729222494585223, | |
| "eval_loss": 3.53808331489563, | |
| "eval_runtime": 81.8972, | |
| "eval_samples_per_second": 203.316, | |
| "eval_steps_per_second": 12.711, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 16.894853283651607, | |
| "grad_norm": 0.36165186762809753, | |
| "learning_rate": 0.00039749912638322647, | |
| "loss": 3.2994, | |
| "step": 58050 | |
| }, | |
| { | |
| "epoch": 16.909408476944574, | |
| "grad_norm": 0.38042792677879333, | |
| "learning_rate": 0.0003973244030285381, | |
| "loss": 3.3018, | |
| "step": 58100 | |
| }, | |
| { | |
| "epoch": 16.92396367023754, | |
| "grad_norm": 0.381496787071228, | |
| "learning_rate": 0.0003971496796738497, | |
| "loss": 3.2898, | |
| "step": 58150 | |
| }, | |
| { | |
| "epoch": 16.938518863530508, | |
| "grad_norm": 0.3894367218017578, | |
| "learning_rate": 0.0003969749563191613, | |
| "loss": 3.2943, | |
| "step": 58200 | |
| }, | |
| { | |
| "epoch": 16.953074056823475, | |
| "grad_norm": 0.3861422538757324, | |
| "learning_rate": 0.00039680023296447285, | |
| "loss": 3.2848, | |
| "step": 58250 | |
| }, | |
| { | |
| "epoch": 16.967629250116442, | |
| "grad_norm": 0.38313028216362, | |
| "learning_rate": 0.00039662550960978444, | |
| "loss": 3.2957, | |
| "step": 58300 | |
| }, | |
| { | |
| "epoch": 16.98218444340941, | |
| "grad_norm": 0.37070587277412415, | |
| "learning_rate": 0.00039645078625509604, | |
| "loss": 3.2976, | |
| "step": 58350 | |
| }, | |
| { | |
| "epoch": 16.996739636702376, | |
| "grad_norm": 0.38247257471084595, | |
| "learning_rate": 0.0003962760629004077, | |
| "loss": 3.296, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 17.011061946902654, | |
| "grad_norm": 0.3740757703781128, | |
| "learning_rate": 0.0003961013395457193, | |
| "loss": 3.2211, | |
| "step": 58450 | |
| }, | |
| { | |
| "epoch": 17.02561714019562, | |
| "grad_norm": 0.37241700291633606, | |
| "learning_rate": 0.0003959266161910308, | |
| "loss": 3.1996, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 17.040172333488588, | |
| "grad_norm": 0.41477492451667786, | |
| "learning_rate": 0.0003957518928363424, | |
| "loss": 3.2016, | |
| "step": 58550 | |
| }, | |
| { | |
| "epoch": 17.054727526781555, | |
| "grad_norm": 0.38573113083839417, | |
| "learning_rate": 0.000395577169481654, | |
| "loss": 3.2037, | |
| "step": 58600 | |
| }, | |
| { | |
| "epoch": 17.069282720074522, | |
| "grad_norm": 0.39705806970596313, | |
| "learning_rate": 0.00039540244612696566, | |
| "loss": 3.1926, | |
| "step": 58650 | |
| }, | |
| { | |
| "epoch": 17.08383791336749, | |
| "grad_norm": 0.40704676508903503, | |
| "learning_rate": 0.0003952277227722772, | |
| "loss": 3.2136, | |
| "step": 58700 | |
| }, | |
| { | |
| "epoch": 17.098393106660456, | |
| "grad_norm": 0.3840765655040741, | |
| "learning_rate": 0.0003950529994175888, | |
| "loss": 3.207, | |
| "step": 58750 | |
| }, | |
| { | |
| "epoch": 17.112948299953423, | |
| "grad_norm": 0.3852939307689667, | |
| "learning_rate": 0.0003948782760629004, | |
| "loss": 3.2109, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 17.12750349324639, | |
| "grad_norm": 0.38808417320251465, | |
| "learning_rate": 0.0003947035527082119, | |
| "loss": 3.217, | |
| "step": 58850 | |
| }, | |
| { | |
| "epoch": 17.142058686539357, | |
| "grad_norm": 0.3683142364025116, | |
| "learning_rate": 0.0003945288293535235, | |
| "loss": 3.2194, | |
| "step": 58900 | |
| }, | |
| { | |
| "epoch": 17.156613879832324, | |
| "grad_norm": 0.41905471682548523, | |
| "learning_rate": 0.00039435410599883517, | |
| "loss": 3.2273, | |
| "step": 58950 | |
| }, | |
| { | |
| "epoch": 17.17116907312529, | |
| "grad_norm": 0.4114116132259369, | |
| "learning_rate": 0.00039417938264414676, | |
| "loss": 3.2286, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 17.17116907312529, | |
| "eval_accuracy": 0.3720137633876823, | |
| "eval_loss": 3.5554277896881104, | |
| "eval_runtime": 81.7778, | |
| "eval_samples_per_second": 203.613, | |
| "eval_steps_per_second": 12.73, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 17.18572426641826, | |
| "grad_norm": 0.39674410223960876, | |
| "learning_rate": 0.0003940046592894583, | |
| "loss": 3.2277, | |
| "step": 59050 | |
| }, | |
| { | |
| "epoch": 17.200279459711226, | |
| "grad_norm": 0.39292237162590027, | |
| "learning_rate": 0.0003938299359347699, | |
| "loss": 3.2234, | |
| "step": 59100 | |
| }, | |
| { | |
| "epoch": 17.214834653004193, | |
| "grad_norm": 0.3567875027656555, | |
| "learning_rate": 0.0003936552125800815, | |
| "loss": 3.2267, | |
| "step": 59150 | |
| }, | |
| { | |
| "epoch": 17.22938984629716, | |
| "grad_norm": 0.40265992283821106, | |
| "learning_rate": 0.00039348048922539314, | |
| "loss": 3.2318, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 17.243945039590127, | |
| "grad_norm": 0.39265206456184387, | |
| "learning_rate": 0.0003933057658707047, | |
| "loss": 3.2338, | |
| "step": 59250 | |
| }, | |
| { | |
| "epoch": 17.258500232883094, | |
| "grad_norm": 0.37471911311149597, | |
| "learning_rate": 0.0003931310425160163, | |
| "loss": 3.2311, | |
| "step": 59300 | |
| }, | |
| { | |
| "epoch": 17.27305542617606, | |
| "grad_norm": 0.40049898624420166, | |
| "learning_rate": 0.00039295631916132787, | |
| "loss": 3.2349, | |
| "step": 59350 | |
| }, | |
| { | |
| "epoch": 17.287610619469028, | |
| "grad_norm": 0.4246598184108734, | |
| "learning_rate": 0.00039278159580663946, | |
| "loss": 3.2226, | |
| "step": 59400 | |
| }, | |
| { | |
| "epoch": 17.302165812761995, | |
| "grad_norm": 0.43034160137176514, | |
| "learning_rate": 0.000392606872451951, | |
| "loss": 3.2329, | |
| "step": 59450 | |
| }, | |
| { | |
| "epoch": 17.316721006054962, | |
| "grad_norm": 0.39037758111953735, | |
| "learning_rate": 0.00039243214909726265, | |
| "loss": 3.2463, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 17.331276199347926, | |
| "grad_norm": 0.38634437322616577, | |
| "learning_rate": 0.00039225742574257425, | |
| "loss": 3.2473, | |
| "step": 59550 | |
| }, | |
| { | |
| "epoch": 17.345831392640893, | |
| "grad_norm": 0.3770222067832947, | |
| "learning_rate": 0.00039208270238788584, | |
| "loss": 3.243, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 17.36038658593386, | |
| "grad_norm": 0.40775683522224426, | |
| "learning_rate": 0.0003919079790331974, | |
| "loss": 3.2465, | |
| "step": 59650 | |
| }, | |
| { | |
| "epoch": 17.374941779226827, | |
| "grad_norm": 0.39329975843429565, | |
| "learning_rate": 0.000391733255678509, | |
| "loss": 3.2406, | |
| "step": 59700 | |
| }, | |
| { | |
| "epoch": 17.389496972519794, | |
| "grad_norm": 0.4144160747528076, | |
| "learning_rate": 0.00039155853232382057, | |
| "loss": 3.2606, | |
| "step": 59750 | |
| }, | |
| { | |
| "epoch": 17.40405216581276, | |
| "grad_norm": 0.3929508328437805, | |
| "learning_rate": 0.0003913838089691322, | |
| "loss": 3.2598, | |
| "step": 59800 | |
| }, | |
| { | |
| "epoch": 17.418607359105728, | |
| "grad_norm": 0.3838026225566864, | |
| "learning_rate": 0.00039120908561444376, | |
| "loss": 3.2525, | |
| "step": 59850 | |
| }, | |
| { | |
| "epoch": 17.433162552398695, | |
| "grad_norm": 0.3785262107849121, | |
| "learning_rate": 0.00039103436225975535, | |
| "loss": 3.2438, | |
| "step": 59900 | |
| }, | |
| { | |
| "epoch": 17.447717745691662, | |
| "grad_norm": 0.4059605598449707, | |
| "learning_rate": 0.00039085963890506695, | |
| "loss": 3.2611, | |
| "step": 59950 | |
| }, | |
| { | |
| "epoch": 17.46227293898463, | |
| "grad_norm": 0.38235408067703247, | |
| "learning_rate": 0.0003906849155503785, | |
| "loss": 3.2575, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 17.46227293898463, | |
| "eval_accuracy": 0.3723366108956509, | |
| "eval_loss": 3.5501465797424316, | |
| "eval_runtime": 81.843, | |
| "eval_samples_per_second": 203.45, | |
| "eval_steps_per_second": 12.719, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 17.476828132277596, | |
| "grad_norm": 0.37318482995033264, | |
| "learning_rate": 0.00039051019219569014, | |
| "loss": 3.2644, | |
| "step": 60050 | |
| }, | |
| { | |
| "epoch": 17.491383325570563, | |
| "grad_norm": 0.398237019777298, | |
| "learning_rate": 0.00039033546884100173, | |
| "loss": 3.2658, | |
| "step": 60100 | |
| }, | |
| { | |
| "epoch": 17.50593851886353, | |
| "grad_norm": 0.3602536916732788, | |
| "learning_rate": 0.0003901607454863133, | |
| "loss": 3.2693, | |
| "step": 60150 | |
| }, | |
| { | |
| "epoch": 17.520493712156497, | |
| "grad_norm": 0.39579930901527405, | |
| "learning_rate": 0.00038998602213162486, | |
| "loss": 3.2603, | |
| "step": 60200 | |
| }, | |
| { | |
| "epoch": 17.535048905449464, | |
| "grad_norm": 0.38210001587867737, | |
| "learning_rate": 0.00038981129877693646, | |
| "loss": 3.2689, | |
| "step": 60250 | |
| }, | |
| { | |
| "epoch": 17.54960409874243, | |
| "grad_norm": 0.3901123106479645, | |
| "learning_rate": 0.00038963657542224805, | |
| "loss": 3.2603, | |
| "step": 60300 | |
| }, | |
| { | |
| "epoch": 17.5641592920354, | |
| "grad_norm": 0.41346633434295654, | |
| "learning_rate": 0.0003894618520675597, | |
| "loss": 3.2657, | |
| "step": 60350 | |
| }, | |
| { | |
| "epoch": 17.578714485328366, | |
| "grad_norm": 0.38458341360092163, | |
| "learning_rate": 0.00038928712871287124, | |
| "loss": 3.2738, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 17.593269678621333, | |
| "grad_norm": 0.4368564188480377, | |
| "learning_rate": 0.00038911240535818284, | |
| "loss": 3.2623, | |
| "step": 60450 | |
| }, | |
| { | |
| "epoch": 17.6078248719143, | |
| "grad_norm": 0.3944424092769623, | |
| "learning_rate": 0.00038893768200349443, | |
| "loss": 3.27, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 17.622380065207267, | |
| "grad_norm": 0.37598180770874023, | |
| "learning_rate": 0.000388762958648806, | |
| "loss": 3.2751, | |
| "step": 60550 | |
| }, | |
| { | |
| "epoch": 17.636935258500234, | |
| "grad_norm": 0.3644326627254486, | |
| "learning_rate": 0.0003885882352941176, | |
| "loss": 3.2671, | |
| "step": 60600 | |
| }, | |
| { | |
| "epoch": 17.6514904517932, | |
| "grad_norm": 0.37024378776550293, | |
| "learning_rate": 0.0003884135119394292, | |
| "loss": 3.2701, | |
| "step": 60650 | |
| }, | |
| { | |
| "epoch": 17.666045645086168, | |
| "grad_norm": 0.3599623143672943, | |
| "learning_rate": 0.0003882387885847408, | |
| "loss": 3.2719, | |
| "step": 60700 | |
| }, | |
| { | |
| "epoch": 17.680600838379135, | |
| "grad_norm": 0.3559845983982086, | |
| "learning_rate": 0.0003880640652300524, | |
| "loss": 3.2644, | |
| "step": 60750 | |
| }, | |
| { | |
| "epoch": 17.695156031672102, | |
| "grad_norm": 0.3839481770992279, | |
| "learning_rate": 0.00038788934187536394, | |
| "loss": 3.2719, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 17.70971122496507, | |
| "grad_norm": 0.38800716400146484, | |
| "learning_rate": 0.00038771461852067554, | |
| "loss": 3.2772, | |
| "step": 60850 | |
| }, | |
| { | |
| "epoch": 17.724266418258033, | |
| "grad_norm": 0.38624536991119385, | |
| "learning_rate": 0.0003875398951659872, | |
| "loss": 3.2561, | |
| "step": 60900 | |
| }, | |
| { | |
| "epoch": 17.738821611551, | |
| "grad_norm": 0.3983277976512909, | |
| "learning_rate": 0.0003873651718112988, | |
| "loss": 3.2824, | |
| "step": 60950 | |
| }, | |
| { | |
| "epoch": 17.753376804843967, | |
| "grad_norm": 0.41950204968452454, | |
| "learning_rate": 0.0003871904484566103, | |
| "loss": 3.2729, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 17.753376804843967, | |
| "eval_accuracy": 0.37320231702732076, | |
| "eval_loss": 3.536574602127075, | |
| "eval_runtime": 81.988, | |
| "eval_samples_per_second": 203.091, | |
| "eval_steps_per_second": 12.697, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 17.767931998136934, | |
| "grad_norm": 0.3762170970439911, | |
| "learning_rate": 0.0003870157251019219, | |
| "loss": 3.2706, | |
| "step": 61050 | |
| }, | |
| { | |
| "epoch": 17.7824871914299, | |
| "grad_norm": 0.3924969732761383, | |
| "learning_rate": 0.0003868410017472335, | |
| "loss": 3.27, | |
| "step": 61100 | |
| }, | |
| { | |
| "epoch": 17.797042384722868, | |
| "grad_norm": 0.37834253907203674, | |
| "learning_rate": 0.00038666627839254505, | |
| "loss": 3.279, | |
| "step": 61150 | |
| }, | |
| { | |
| "epoch": 17.811597578015835, | |
| "grad_norm": 0.379597932100296, | |
| "learning_rate": 0.0003864915550378567, | |
| "loss": 3.2809, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 17.826152771308802, | |
| "grad_norm": 0.4007846713066101, | |
| "learning_rate": 0.0003863168316831683, | |
| "loss": 3.2846, | |
| "step": 61250 | |
| }, | |
| { | |
| "epoch": 17.84070796460177, | |
| "grad_norm": 0.3896792232990265, | |
| "learning_rate": 0.0003861421083284799, | |
| "loss": 3.2866, | |
| "step": 61300 | |
| }, | |
| { | |
| "epoch": 17.855263157894736, | |
| "grad_norm": 0.3664165139198303, | |
| "learning_rate": 0.0003859673849737914, | |
| "loss": 3.2843, | |
| "step": 61350 | |
| }, | |
| { | |
| "epoch": 17.869818351187703, | |
| "grad_norm": 0.3755553960800171, | |
| "learning_rate": 0.000385792661619103, | |
| "loss": 3.2877, | |
| "step": 61400 | |
| }, | |
| { | |
| "epoch": 17.88437354448067, | |
| "grad_norm": 0.3720515966415405, | |
| "learning_rate": 0.00038561793826441467, | |
| "loss": 3.2816, | |
| "step": 61450 | |
| }, | |
| { | |
| "epoch": 17.898928737773637, | |
| "grad_norm": 0.4346085786819458, | |
| "learning_rate": 0.00038544321490972626, | |
| "loss": 3.2794, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 17.913483931066605, | |
| "grad_norm": 0.40573427081108093, | |
| "learning_rate": 0.0003852684915550378, | |
| "loss": 3.2831, | |
| "step": 61550 | |
| }, | |
| { | |
| "epoch": 17.92803912435957, | |
| "grad_norm": 0.4129865765571594, | |
| "learning_rate": 0.0003850937682003494, | |
| "loss": 3.2839, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 17.94259431765254, | |
| "grad_norm": 0.38543757796287537, | |
| "learning_rate": 0.000384919044845661, | |
| "loss": 3.2923, | |
| "step": 61650 | |
| }, | |
| { | |
| "epoch": 17.957149510945506, | |
| "grad_norm": 0.41463637351989746, | |
| "learning_rate": 0.0003847443214909726, | |
| "loss": 3.2815, | |
| "step": 61700 | |
| }, | |
| { | |
| "epoch": 17.971704704238473, | |
| "grad_norm": 0.37263813614845276, | |
| "learning_rate": 0.00038456959813628423, | |
| "loss": 3.275, | |
| "step": 61750 | |
| }, | |
| { | |
| "epoch": 17.98625989753144, | |
| "grad_norm": 0.4147428870201111, | |
| "learning_rate": 0.0003843948747815958, | |
| "loss": 3.2894, | |
| "step": 61800 | |
| }, | |
| { | |
| "epoch": 18.000582207731718, | |
| "grad_norm": 0.3885459899902344, | |
| "learning_rate": 0.00038422015142690737, | |
| "loss": 3.2639, | |
| "step": 61850 | |
| }, | |
| { | |
| "epoch": 18.015137401024685, | |
| "grad_norm": 0.3864575922489166, | |
| "learning_rate": 0.00038404542807221896, | |
| "loss": 3.1798, | |
| "step": 61900 | |
| }, | |
| { | |
| "epoch": 18.029692594317652, | |
| "grad_norm": 0.4014991819858551, | |
| "learning_rate": 0.0003838707047175305, | |
| "loss": 3.1696, | |
| "step": 61950 | |
| }, | |
| { | |
| "epoch": 18.04424778761062, | |
| "grad_norm": 0.37803328037261963, | |
| "learning_rate": 0.00038369598136284215, | |
| "loss": 3.185, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 18.04424778761062, | |
| "eval_accuracy": 0.37221332475227303, | |
| "eval_loss": 3.5547287464141846, | |
| "eval_runtime": 81.9371, | |
| "eval_samples_per_second": 203.217, | |
| "eval_steps_per_second": 12.705, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 18.058802980903586, | |
| "grad_norm": 0.37901580333709717, | |
| "learning_rate": 0.00038352125800815374, | |
| "loss": 3.2009, | |
| "step": 62050 | |
| }, | |
| { | |
| "epoch": 18.073358174196553, | |
| "grad_norm": 0.3777700662612915, | |
| "learning_rate": 0.00038334653465346534, | |
| "loss": 3.1895, | |
| "step": 62100 | |
| }, | |
| { | |
| "epoch": 18.08791336748952, | |
| "grad_norm": 0.4100801348686218, | |
| "learning_rate": 0.0003831718112987769, | |
| "loss": 3.2006, | |
| "step": 62150 | |
| }, | |
| { | |
| "epoch": 18.102468560782487, | |
| "grad_norm": 0.4076967239379883, | |
| "learning_rate": 0.0003829970879440885, | |
| "loss": 3.193, | |
| "step": 62200 | |
| }, | |
| { | |
| "epoch": 18.117023754075454, | |
| "grad_norm": 0.40547990798950195, | |
| "learning_rate": 0.00038282236458940007, | |
| "loss": 3.2015, | |
| "step": 62250 | |
| }, | |
| { | |
| "epoch": 18.13157894736842, | |
| "grad_norm": 0.4088784158229828, | |
| "learning_rate": 0.0003826476412347117, | |
| "loss": 3.2018, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 18.14613414066139, | |
| "grad_norm": 0.37998151779174805, | |
| "learning_rate": 0.00038247291788002326, | |
| "loss": 3.2136, | |
| "step": 62350 | |
| }, | |
| { | |
| "epoch": 18.160689333954355, | |
| "grad_norm": 0.4111088812351227, | |
| "learning_rate": 0.00038229819452533485, | |
| "loss": 3.2258, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 18.175244527247322, | |
| "grad_norm": 0.3822641670703888, | |
| "learning_rate": 0.00038212347117064644, | |
| "loss": 3.212, | |
| "step": 62450 | |
| }, | |
| { | |
| "epoch": 18.18979972054029, | |
| "grad_norm": 0.4376724660396576, | |
| "learning_rate": 0.000381948747815958, | |
| "loss": 3.2178, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 18.204354913833257, | |
| "grad_norm": 0.38312190771102905, | |
| "learning_rate": 0.0003817740244612696, | |
| "loss": 3.2225, | |
| "step": 62550 | |
| }, | |
| { | |
| "epoch": 18.218910107126224, | |
| "grad_norm": 0.38128742575645447, | |
| "learning_rate": 0.00038159930110658123, | |
| "loss": 3.2175, | |
| "step": 62600 | |
| }, | |
| { | |
| "epoch": 18.23346530041919, | |
| "grad_norm": 0.40080228447914124, | |
| "learning_rate": 0.0003814245777518928, | |
| "loss": 3.2085, | |
| "step": 62650 | |
| }, | |
| { | |
| "epoch": 18.248020493712158, | |
| "grad_norm": 0.4063158333301544, | |
| "learning_rate": 0.0003812498543972044, | |
| "loss": 3.2146, | |
| "step": 62700 | |
| }, | |
| { | |
| "epoch": 18.262575687005125, | |
| "grad_norm": 0.39129626750946045, | |
| "learning_rate": 0.00038107513104251596, | |
| "loss": 3.2384, | |
| "step": 62750 | |
| }, | |
| { | |
| "epoch": 18.277130880298092, | |
| "grad_norm": 0.4309450387954712, | |
| "learning_rate": 0.00038090040768782755, | |
| "loss": 3.2169, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 18.29168607359106, | |
| "grad_norm": 0.4128737151622772, | |
| "learning_rate": 0.0003807256843331392, | |
| "loss": 3.2208, | |
| "step": 62850 | |
| }, | |
| { | |
| "epoch": 18.306241266884022, | |
| "grad_norm": 0.40919506549835205, | |
| "learning_rate": 0.0003805509609784508, | |
| "loss": 3.2207, | |
| "step": 62900 | |
| }, | |
| { | |
| "epoch": 18.32079646017699, | |
| "grad_norm": 0.40840670466423035, | |
| "learning_rate": 0.00038037623762376233, | |
| "loss": 3.2292, | |
| "step": 62950 | |
| }, | |
| { | |
| "epoch": 18.335351653469957, | |
| "grad_norm": 0.4014434218406677, | |
| "learning_rate": 0.00038020151426907393, | |
| "loss": 3.2309, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 18.335351653469957, | |
| "eval_accuracy": 0.3726277260311581, | |
| "eval_loss": 3.551807165145874, | |
| "eval_runtime": 81.8396, | |
| "eval_samples_per_second": 203.459, | |
| "eval_steps_per_second": 12.72, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 18.349906846762924, | |
| "grad_norm": 0.38082602620124817, | |
| "learning_rate": 0.0003800267909143855, | |
| "loss": 3.2401, | |
| "step": 63050 | |
| }, | |
| { | |
| "epoch": 18.36446204005589, | |
| "grad_norm": 0.3994186818599701, | |
| "learning_rate": 0.00037985206755969706, | |
| "loss": 3.2373, | |
| "step": 63100 | |
| }, | |
| { | |
| "epoch": 18.379017233348858, | |
| "grad_norm": 0.4042767882347107, | |
| "learning_rate": 0.0003796773442050087, | |
| "loss": 3.2355, | |
| "step": 63150 | |
| }, | |
| { | |
| "epoch": 18.393572426641825, | |
| "grad_norm": 0.38262176513671875, | |
| "learning_rate": 0.0003795026208503203, | |
| "loss": 3.2415, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 18.408127619934792, | |
| "grad_norm": 0.3814568817615509, | |
| "learning_rate": 0.0003793278974956319, | |
| "loss": 3.2483, | |
| "step": 63250 | |
| }, | |
| { | |
| "epoch": 18.42268281322776, | |
| "grad_norm": 0.4272101819515228, | |
| "learning_rate": 0.00037915317414094344, | |
| "loss": 3.248, | |
| "step": 63300 | |
| }, | |
| { | |
| "epoch": 18.437238006520726, | |
| "grad_norm": 0.4181455075740814, | |
| "learning_rate": 0.00037897845078625503, | |
| "loss": 3.2363, | |
| "step": 63350 | |
| }, | |
| { | |
| "epoch": 18.451793199813693, | |
| "grad_norm": 0.3957022428512573, | |
| "learning_rate": 0.0003788037274315667, | |
| "loss": 3.2492, | |
| "step": 63400 | |
| }, | |
| { | |
| "epoch": 18.46634839310666, | |
| "grad_norm": 0.37056073546409607, | |
| "learning_rate": 0.0003786290040768783, | |
| "loss": 3.2438, | |
| "step": 63450 | |
| }, | |
| { | |
| "epoch": 18.480903586399627, | |
| "grad_norm": 0.3788374960422516, | |
| "learning_rate": 0.0003784542807221898, | |
| "loss": 3.2402, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 18.495458779692594, | |
| "grad_norm": 0.3857443034648895, | |
| "learning_rate": 0.0003782795573675014, | |
| "loss": 3.2553, | |
| "step": 63550 | |
| }, | |
| { | |
| "epoch": 18.51001397298556, | |
| "grad_norm": 0.40392574667930603, | |
| "learning_rate": 0.000378104834012813, | |
| "loss": 3.2377, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 18.52456916627853, | |
| "grad_norm": 0.38231658935546875, | |
| "learning_rate": 0.0003779301106581246, | |
| "loss": 3.2502, | |
| "step": 63650 | |
| }, | |
| { | |
| "epoch": 18.539124359571495, | |
| "grad_norm": 0.39797940850257874, | |
| "learning_rate": 0.0003777553873034362, | |
| "loss": 3.248, | |
| "step": 63700 | |
| }, | |
| { | |
| "epoch": 18.553679552864462, | |
| "grad_norm": 0.3928861618041992, | |
| "learning_rate": 0.0003775806639487478, | |
| "loss": 3.2569, | |
| "step": 63750 | |
| }, | |
| { | |
| "epoch": 18.56823474615743, | |
| "grad_norm": 0.40223222970962524, | |
| "learning_rate": 0.0003774059405940594, | |
| "loss": 3.2601, | |
| "step": 63800 | |
| }, | |
| { | |
| "epoch": 18.582789939450397, | |
| "grad_norm": 0.40717190504074097, | |
| "learning_rate": 0.000377231217239371, | |
| "loss": 3.2462, | |
| "step": 63850 | |
| }, | |
| { | |
| "epoch": 18.597345132743364, | |
| "grad_norm": 0.3861567974090576, | |
| "learning_rate": 0.0003770564938846825, | |
| "loss": 3.2681, | |
| "step": 63900 | |
| }, | |
| { | |
| "epoch": 18.61190032603633, | |
| "grad_norm": 0.37393781542778015, | |
| "learning_rate": 0.00037688177052999416, | |
| "loss": 3.2719, | |
| "step": 63950 | |
| }, | |
| { | |
| "epoch": 18.626455519329298, | |
| "grad_norm": 0.4040031433105469, | |
| "learning_rate": 0.00037670704717530576, | |
| "loss": 3.2619, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 18.626455519329298, | |
| "eval_accuracy": 0.37300146286237046, | |
| "eval_loss": 3.5419111251831055, | |
| "eval_runtime": 81.7616, | |
| "eval_samples_per_second": 203.653, | |
| "eval_steps_per_second": 12.732, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 18.641010712622265, | |
| "grad_norm": 0.3954145908355713, | |
| "learning_rate": 0.00037653232382061735, | |
| "loss": 3.2617, | |
| "step": 64050 | |
| }, | |
| { | |
| "epoch": 18.655565905915232, | |
| "grad_norm": 0.42322805523872375, | |
| "learning_rate": 0.0003763576004659289, | |
| "loss": 3.2647, | |
| "step": 64100 | |
| }, | |
| { | |
| "epoch": 18.6701210992082, | |
| "grad_norm": 0.4017442464828491, | |
| "learning_rate": 0.0003761828771112405, | |
| "loss": 3.2474, | |
| "step": 64150 | |
| }, | |
| { | |
| "epoch": 18.684676292501166, | |
| "grad_norm": 0.4153529405593872, | |
| "learning_rate": 0.0003760081537565521, | |
| "loss": 3.2492, | |
| "step": 64200 | |
| }, | |
| { | |
| "epoch": 18.69923148579413, | |
| "grad_norm": 0.4200185537338257, | |
| "learning_rate": 0.00037583343040186373, | |
| "loss": 3.2658, | |
| "step": 64250 | |
| }, | |
| { | |
| "epoch": 18.713786679087097, | |
| "grad_norm": 0.4172070324420929, | |
| "learning_rate": 0.00037565870704717527, | |
| "loss": 3.2668, | |
| "step": 64300 | |
| }, | |
| { | |
| "epoch": 18.728341872380064, | |
| "grad_norm": 0.3836353123188019, | |
| "learning_rate": 0.00037548398369248687, | |
| "loss": 3.2579, | |
| "step": 64350 | |
| }, | |
| { | |
| "epoch": 18.74289706567303, | |
| "grad_norm": 0.4007646441459656, | |
| "learning_rate": 0.00037530926033779846, | |
| "loss": 3.257, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 18.757452258965998, | |
| "grad_norm": 0.4102405607700348, | |
| "learning_rate": 0.00037513453698311, | |
| "loss": 3.2623, | |
| "step": 64450 | |
| }, | |
| { | |
| "epoch": 18.772007452258965, | |
| "grad_norm": 0.3884364366531372, | |
| "learning_rate": 0.0003749598136284216, | |
| "loss": 3.2618, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 18.786562645551932, | |
| "grad_norm": 0.38656288385391235, | |
| "learning_rate": 0.00037478509027373324, | |
| "loss": 3.2583, | |
| "step": 64550 | |
| }, | |
| { | |
| "epoch": 18.8011178388449, | |
| "grad_norm": 0.36120912432670593, | |
| "learning_rate": 0.00037461036691904484, | |
| "loss": 3.253, | |
| "step": 64600 | |
| }, | |
| { | |
| "epoch": 18.815673032137866, | |
| "grad_norm": 0.3647724986076355, | |
| "learning_rate": 0.0003744356435643564, | |
| "loss": 3.274, | |
| "step": 64650 | |
| }, | |
| { | |
| "epoch": 18.830228225430833, | |
| "grad_norm": 0.37409770488739014, | |
| "learning_rate": 0.00037426092020966797, | |
| "loss": 3.2688, | |
| "step": 64700 | |
| }, | |
| { | |
| "epoch": 18.8447834187238, | |
| "grad_norm": 0.38123470544815063, | |
| "learning_rate": 0.00037408619685497957, | |
| "loss": 3.259, | |
| "step": 64750 | |
| }, | |
| { | |
| "epoch": 18.859338612016767, | |
| "grad_norm": 0.3785369396209717, | |
| "learning_rate": 0.0003739114735002912, | |
| "loss": 3.2745, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 18.873893805309734, | |
| "grad_norm": 0.3723326325416565, | |
| "learning_rate": 0.0003737367501456028, | |
| "loss": 3.2774, | |
| "step": 64850 | |
| }, | |
| { | |
| "epoch": 18.8884489986027, | |
| "grad_norm": 0.3873491883277893, | |
| "learning_rate": 0.00037356202679091435, | |
| "loss": 3.2709, | |
| "step": 64900 | |
| }, | |
| { | |
| "epoch": 18.90300419189567, | |
| "grad_norm": 0.36850690841674805, | |
| "learning_rate": 0.00037338730343622594, | |
| "loss": 3.2752, | |
| "step": 64950 | |
| }, | |
| { | |
| "epoch": 18.917559385188635, | |
| "grad_norm": 0.40277478098869324, | |
| "learning_rate": 0.00037321258008153754, | |
| "loss": 3.2775, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 18.917559385188635, | |
| "eval_accuracy": 0.3732211213961868, | |
| "eval_loss": 3.537968635559082, | |
| "eval_runtime": 81.7984, | |
| "eval_samples_per_second": 203.561, | |
| "eval_steps_per_second": 12.726, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 18.932114578481603, | |
| "grad_norm": 0.37106573581695557, | |
| "learning_rate": 0.0003730378567268491, | |
| "loss": 3.2777, | |
| "step": 65050 | |
| }, | |
| { | |
| "epoch": 18.94666977177457, | |
| "grad_norm": 0.383087694644928, | |
| "learning_rate": 0.0003728631333721607, | |
| "loss": 3.2752, | |
| "step": 65100 | |
| }, | |
| { | |
| "epoch": 18.961224965067537, | |
| "grad_norm": 0.37902796268463135, | |
| "learning_rate": 0.0003726884100174723, | |
| "loss": 3.2741, | |
| "step": 65150 | |
| }, | |
| { | |
| "epoch": 18.975780158360504, | |
| "grad_norm": 0.3779529929161072, | |
| "learning_rate": 0.0003725136866627839, | |
| "loss": 3.2794, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 18.99033535165347, | |
| "grad_norm": 0.39796483516693115, | |
| "learning_rate": 0.00037233896330809545, | |
| "loss": 3.2705, | |
| "step": 65250 | |
| }, | |
| { | |
| "epoch": 19.00465766185375, | |
| "grad_norm": 0.3695457875728607, | |
| "learning_rate": 0.00037216423995340705, | |
| "loss": 3.2391, | |
| "step": 65300 | |
| }, | |
| { | |
| "epoch": 19.019212855146716, | |
| "grad_norm": 0.3714453876018524, | |
| "learning_rate": 0.0003719895165987187, | |
| "loss": 3.1749, | |
| "step": 65350 | |
| }, | |
| { | |
| "epoch": 19.033768048439683, | |
| "grad_norm": 0.39892369508743286, | |
| "learning_rate": 0.0003718147932440303, | |
| "loss": 3.1603, | |
| "step": 65400 | |
| }, | |
| { | |
| "epoch": 19.04832324173265, | |
| "grad_norm": 0.4224001169204712, | |
| "learning_rate": 0.00037164006988934183, | |
| "loss": 3.1646, | |
| "step": 65450 | |
| }, | |
| { | |
| "epoch": 19.062878435025617, | |
| "grad_norm": 0.3941407799720764, | |
| "learning_rate": 0.0003714653465346534, | |
| "loss": 3.1812, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 19.077433628318584, | |
| "grad_norm": 0.4209772050380707, | |
| "learning_rate": 0.000371290623179965, | |
| "loss": 3.1734, | |
| "step": 65550 | |
| }, | |
| { | |
| "epoch": 19.09198882161155, | |
| "grad_norm": 0.43055638670921326, | |
| "learning_rate": 0.00037111589982527656, | |
| "loss": 3.1875, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 19.106544014904518, | |
| "grad_norm": 0.4523344337940216, | |
| "learning_rate": 0.0003709411764705882, | |
| "loss": 3.1893, | |
| "step": 65650 | |
| }, | |
| { | |
| "epoch": 19.121099208197485, | |
| "grad_norm": 0.3762241005897522, | |
| "learning_rate": 0.0003707664531158998, | |
| "loss": 3.1981, | |
| "step": 65700 | |
| }, | |
| { | |
| "epoch": 19.135654401490452, | |
| "grad_norm": 0.42582032084465027, | |
| "learning_rate": 0.0003705917297612114, | |
| "loss": 3.2033, | |
| "step": 65750 | |
| }, | |
| { | |
| "epoch": 19.15020959478342, | |
| "grad_norm": 0.4270772635936737, | |
| "learning_rate": 0.000370417006406523, | |
| "loss": 3.1948, | |
| "step": 65800 | |
| }, | |
| { | |
| "epoch": 19.164764788076386, | |
| "grad_norm": 0.4280288815498352, | |
| "learning_rate": 0.00037024228305183453, | |
| "loss": 3.2055, | |
| "step": 65850 | |
| }, | |
| { | |
| "epoch": 19.179319981369353, | |
| "grad_norm": 0.37996238470077515, | |
| "learning_rate": 0.0003700675596971461, | |
| "loss": 3.2012, | |
| "step": 65900 | |
| }, | |
| { | |
| "epoch": 19.19387517466232, | |
| "grad_norm": 0.39729738235473633, | |
| "learning_rate": 0.0003698928363424578, | |
| "loss": 3.2163, | |
| "step": 65950 | |
| }, | |
| { | |
| "epoch": 19.208430367955287, | |
| "grad_norm": 0.3996589481830597, | |
| "learning_rate": 0.00036971811298776937, | |
| "loss": 3.2032, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 19.208430367955287, | |
| "eval_accuracy": 0.3724144139718341, | |
| "eval_loss": 3.5546491146087646, | |
| "eval_runtime": 81.7323, | |
| "eval_samples_per_second": 203.726, | |
| "eval_steps_per_second": 12.737, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 19.222985561248255, | |
| "grad_norm": 0.3845502436161041, | |
| "learning_rate": 0.0003695433896330809, | |
| "loss": 3.2176, | |
| "step": 66050 | |
| }, | |
| { | |
| "epoch": 19.23754075454122, | |
| "grad_norm": 0.4252937436103821, | |
| "learning_rate": 0.0003693686662783925, | |
| "loss": 3.2114, | |
| "step": 66100 | |
| }, | |
| { | |
| "epoch": 19.25209594783419, | |
| "grad_norm": 0.4127264618873596, | |
| "learning_rate": 0.0003691939429237041, | |
| "loss": 3.2133, | |
| "step": 66150 | |
| }, | |
| { | |
| "epoch": 19.266651141127156, | |
| "grad_norm": 0.4106455147266388, | |
| "learning_rate": 0.00036901921956901575, | |
| "loss": 3.2114, | |
| "step": 66200 | |
| }, | |
| { | |
| "epoch": 19.281206334420123, | |
| "grad_norm": 0.40589675307273865, | |
| "learning_rate": 0.0003688444962143273, | |
| "loss": 3.2264, | |
| "step": 66250 | |
| }, | |
| { | |
| "epoch": 19.29576152771309, | |
| "grad_norm": 0.37764689326286316, | |
| "learning_rate": 0.0003686697728596389, | |
| "loss": 3.2144, | |
| "step": 66300 | |
| }, | |
| { | |
| "epoch": 19.310316721006053, | |
| "grad_norm": 0.36274319887161255, | |
| "learning_rate": 0.0003684950495049505, | |
| "loss": 3.221, | |
| "step": 66350 | |
| }, | |
| { | |
| "epoch": 19.32487191429902, | |
| "grad_norm": 0.4248831868171692, | |
| "learning_rate": 0.000368320326150262, | |
| "loss": 3.2154, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 19.339427107591987, | |
| "grad_norm": 0.38562580943107605, | |
| "learning_rate": 0.0003681456027955736, | |
| "loss": 3.2229, | |
| "step": 66450 | |
| }, | |
| { | |
| "epoch": 19.353982300884955, | |
| "grad_norm": 0.4277852773666382, | |
| "learning_rate": 0.00036797087944088526, | |
| "loss": 3.2293, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 19.36853749417792, | |
| "grad_norm": 0.41313090920448303, | |
| "learning_rate": 0.00036779615608619685, | |
| "loss": 3.2204, | |
| "step": 66550 | |
| }, | |
| { | |
| "epoch": 19.38309268747089, | |
| "grad_norm": 0.38125768303871155, | |
| "learning_rate": 0.0003676214327315084, | |
| "loss": 3.2203, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 19.397647880763856, | |
| "grad_norm": 0.39847925305366516, | |
| "learning_rate": 0.00036744670937682, | |
| "loss": 3.2334, | |
| "step": 66650 | |
| }, | |
| { | |
| "epoch": 19.412203074056823, | |
| "grad_norm": 0.4110550880432129, | |
| "learning_rate": 0.0003672719860221316, | |
| "loss": 3.2226, | |
| "step": 66700 | |
| }, | |
| { | |
| "epoch": 19.42675826734979, | |
| "grad_norm": 0.3868572413921356, | |
| "learning_rate": 0.00036709726266744323, | |
| "loss": 3.2229, | |
| "step": 66750 | |
| }, | |
| { | |
| "epoch": 19.441313460642757, | |
| "grad_norm": 0.3978685438632965, | |
| "learning_rate": 0.00036692253931275477, | |
| "loss": 3.2135, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 19.455868653935724, | |
| "grad_norm": 0.3937489092350006, | |
| "learning_rate": 0.00036674781595806636, | |
| "loss": 3.2323, | |
| "step": 66850 | |
| }, | |
| { | |
| "epoch": 19.47042384722869, | |
| "grad_norm": 0.404356449842453, | |
| "learning_rate": 0.00036657309260337796, | |
| "loss": 3.224, | |
| "step": 66900 | |
| }, | |
| { | |
| "epoch": 19.484979040521658, | |
| "grad_norm": 0.39086246490478516, | |
| "learning_rate": 0.00036639836924868955, | |
| "loss": 3.2444, | |
| "step": 66950 | |
| }, | |
| { | |
| "epoch": 19.499534233814625, | |
| "grad_norm": 0.42522621154785156, | |
| "learning_rate": 0.0003662236458940011, | |
| "loss": 3.2309, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 19.499534233814625, | |
| "eval_accuracy": 0.3729412888819992, | |
| "eval_loss": 3.546234607696533, | |
| "eval_runtime": 81.8426, | |
| "eval_samples_per_second": 203.452, | |
| "eval_steps_per_second": 12.72, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 19.514089427107592, | |
| "grad_norm": 0.4020242691040039, | |
| "learning_rate": 0.00036604892253931274, | |
| "loss": 3.2416, | |
| "step": 67050 | |
| }, | |
| { | |
| "epoch": 19.52864462040056, | |
| "grad_norm": 0.3763948678970337, | |
| "learning_rate": 0.00036587419918462433, | |
| "loss": 3.2385, | |
| "step": 67100 | |
| }, | |
| { | |
| "epoch": 19.543199813693526, | |
| "grad_norm": 0.456543892621994, | |
| "learning_rate": 0.00036569947582993593, | |
| "loss": 3.2479, | |
| "step": 67150 | |
| }, | |
| { | |
| "epoch": 19.557755006986493, | |
| "grad_norm": 0.38159674406051636, | |
| "learning_rate": 0.00036552475247524747, | |
| "loss": 3.2304, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 19.57231020027946, | |
| "grad_norm": 0.4304100275039673, | |
| "learning_rate": 0.00036535002912055906, | |
| "loss": 3.2482, | |
| "step": 67250 | |
| }, | |
| { | |
| "epoch": 19.586865393572428, | |
| "grad_norm": 0.3742108941078186, | |
| "learning_rate": 0.00036517530576587066, | |
| "loss": 3.246, | |
| "step": 67300 | |
| }, | |
| { | |
| "epoch": 19.601420586865395, | |
| "grad_norm": 0.3841767907142639, | |
| "learning_rate": 0.0003650005824111823, | |
| "loss": 3.249, | |
| "step": 67350 | |
| }, | |
| { | |
| "epoch": 19.61597578015836, | |
| "grad_norm": 0.3809995949268341, | |
| "learning_rate": 0.00036482585905649385, | |
| "loss": 3.237, | |
| "step": 67400 | |
| }, | |
| { | |
| "epoch": 19.63053097345133, | |
| "grad_norm": 0.37713685631752014, | |
| "learning_rate": 0.00036465113570180544, | |
| "loss": 3.253, | |
| "step": 67450 | |
| }, | |
| { | |
| "epoch": 19.645086166744296, | |
| "grad_norm": 0.3750007152557373, | |
| "learning_rate": 0.00036447641234711703, | |
| "loss": 3.2377, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 19.659641360037263, | |
| "grad_norm": 0.41841328144073486, | |
| "learning_rate": 0.0003643016889924286, | |
| "loss": 3.2503, | |
| "step": 67550 | |
| }, | |
| { | |
| "epoch": 19.67419655333023, | |
| "grad_norm": 0.3925545811653137, | |
| "learning_rate": 0.0003641269656377402, | |
| "loss": 3.2513, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 19.688751746623197, | |
| "grad_norm": 0.3966905474662781, | |
| "learning_rate": 0.0003639522422830518, | |
| "loss": 3.2452, | |
| "step": 67650 | |
| }, | |
| { | |
| "epoch": 19.70330693991616, | |
| "grad_norm": 0.3865147531032562, | |
| "learning_rate": 0.0003637775189283634, | |
| "loss": 3.2528, | |
| "step": 67700 | |
| }, | |
| { | |
| "epoch": 19.717862133209128, | |
| "grad_norm": 0.3921230137348175, | |
| "learning_rate": 0.00036360279557367495, | |
| "loss": 3.2593, | |
| "step": 67750 | |
| }, | |
| { | |
| "epoch": 19.732417326502095, | |
| "grad_norm": 0.4088384807109833, | |
| "learning_rate": 0.00036342807221898655, | |
| "loss": 3.252, | |
| "step": 67800 | |
| }, | |
| { | |
| "epoch": 19.74697251979506, | |
| "grad_norm": 0.36401990056037903, | |
| "learning_rate": 0.00036325334886429814, | |
| "loss": 3.2391, | |
| "step": 67850 | |
| }, | |
| { | |
| "epoch": 19.76152771308803, | |
| "grad_norm": 0.42498478293418884, | |
| "learning_rate": 0.0003630786255096098, | |
| "loss": 3.2565, | |
| "step": 67900 | |
| }, | |
| { | |
| "epoch": 19.776082906380996, | |
| "grad_norm": 0.38812682032585144, | |
| "learning_rate": 0.00036290390215492133, | |
| "loss": 3.2531, | |
| "step": 67950 | |
| }, | |
| { | |
| "epoch": 19.790638099673963, | |
| "grad_norm": 0.3900016248226166, | |
| "learning_rate": 0.0003627291788002329, | |
| "loss": 3.2505, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 19.790638099673963, | |
| "eval_accuracy": 0.3735195232246296, | |
| "eval_loss": 3.5360209941864014, | |
| "eval_runtime": 81.8176, | |
| "eval_samples_per_second": 203.514, | |
| "eval_steps_per_second": 12.723, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 19.80519329296693, | |
| "grad_norm": 0.38015520572662354, | |
| "learning_rate": 0.0003625544554455445, | |
| "loss": 3.2531, | |
| "step": 68050 | |
| }, | |
| { | |
| "epoch": 19.819748486259897, | |
| "grad_norm": 0.42031189799308777, | |
| "learning_rate": 0.0003623797320908561, | |
| "loss": 3.2618, | |
| "step": 68100 | |
| }, | |
| { | |
| "epoch": 19.834303679552864, | |
| "grad_norm": 0.3681032061576843, | |
| "learning_rate": 0.00036220500873616776, | |
| "loss": 3.2649, | |
| "step": 68150 | |
| }, | |
| { | |
| "epoch": 19.84885887284583, | |
| "grad_norm": 0.3757990300655365, | |
| "learning_rate": 0.0003620302853814793, | |
| "loss": 3.2503, | |
| "step": 68200 | |
| }, | |
| { | |
| "epoch": 19.863414066138798, | |
| "grad_norm": 0.3976788818836212, | |
| "learning_rate": 0.0003618555620267909, | |
| "loss": 3.2689, | |
| "step": 68250 | |
| }, | |
| { | |
| "epoch": 19.877969259431765, | |
| "grad_norm": 0.40321457386016846, | |
| "learning_rate": 0.0003616808386721025, | |
| "loss": 3.2462, | |
| "step": 68300 | |
| }, | |
| { | |
| "epoch": 19.892524452724732, | |
| "grad_norm": 0.37532952427864075, | |
| "learning_rate": 0.00036150611531741403, | |
| "loss": 3.2738, | |
| "step": 68350 | |
| }, | |
| { | |
| "epoch": 19.9070796460177, | |
| "grad_norm": 0.4277494549751282, | |
| "learning_rate": 0.0003613313919627256, | |
| "loss": 3.256, | |
| "step": 68400 | |
| }, | |
| { | |
| "epoch": 19.921634839310666, | |
| "grad_norm": 0.3703059256076813, | |
| "learning_rate": 0.00036115666860803727, | |
| "loss": 3.2696, | |
| "step": 68450 | |
| }, | |
| { | |
| "epoch": 19.936190032603633, | |
| "grad_norm": 0.3833157420158386, | |
| "learning_rate": 0.00036098194525334887, | |
| "loss": 3.2654, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 19.9507452258966, | |
| "grad_norm": 0.38514086604118347, | |
| "learning_rate": 0.0003608072218986604, | |
| "loss": 3.2569, | |
| "step": 68550 | |
| }, | |
| { | |
| "epoch": 19.965300419189568, | |
| "grad_norm": 0.3989536166191101, | |
| "learning_rate": 0.000360632498543972, | |
| "loss": 3.2681, | |
| "step": 68600 | |
| }, | |
| { | |
| "epoch": 19.979855612482535, | |
| "grad_norm": 0.4035058915615082, | |
| "learning_rate": 0.0003604577751892836, | |
| "loss": 3.2657, | |
| "step": 68650 | |
| }, | |
| { | |
| "epoch": 19.9944108057755, | |
| "grad_norm": 0.3832719027996063, | |
| "learning_rate": 0.00036028305183459513, | |
| "loss": 3.2651, | |
| "step": 68700 | |
| }, | |
| { | |
| "epoch": 20.00873311597578, | |
| "grad_norm": 0.377940833568573, | |
| "learning_rate": 0.0003601083284799068, | |
| "loss": 3.2075, | |
| "step": 68750 | |
| }, | |
| { | |
| "epoch": 20.023288309268747, | |
| "grad_norm": 0.3859306275844574, | |
| "learning_rate": 0.0003599336051252184, | |
| "loss": 3.1599, | |
| "step": 68800 | |
| }, | |
| { | |
| "epoch": 20.037843502561714, | |
| "grad_norm": 0.40021654963493347, | |
| "learning_rate": 0.00035975888177052997, | |
| "loss": 3.1607, | |
| "step": 68850 | |
| }, | |
| { | |
| "epoch": 20.05239869585468, | |
| "grad_norm": 0.47861674427986145, | |
| "learning_rate": 0.0003595841584158415, | |
| "loss": 3.1678, | |
| "step": 68900 | |
| }, | |
| { | |
| "epoch": 20.066953889147648, | |
| "grad_norm": 0.39029547572135925, | |
| "learning_rate": 0.0003594094350611531, | |
| "loss": 3.1692, | |
| "step": 68950 | |
| }, | |
| { | |
| "epoch": 20.081509082440615, | |
| "grad_norm": 0.4111054241657257, | |
| "learning_rate": 0.00035923471170646475, | |
| "loss": 3.1711, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 20.081509082440615, | |
| "eval_accuracy": 0.37272445100351276, | |
| "eval_loss": 3.5536580085754395, | |
| "eval_runtime": 82.7697, | |
| "eval_samples_per_second": 201.173, | |
| "eval_steps_per_second": 12.577, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 20.096064275733582, | |
| "grad_norm": 0.39829033613204956, | |
| "learning_rate": 0.00035905998835177635, | |
| "loss": 3.1751, | |
| "step": 69050 | |
| }, | |
| { | |
| "epoch": 20.11061946902655, | |
| "grad_norm": 0.41842859983444214, | |
| "learning_rate": 0.00035888526499708794, | |
| "loss": 3.1834, | |
| "step": 69100 | |
| }, | |
| { | |
| "epoch": 20.125174662319516, | |
| "grad_norm": 0.41401928663253784, | |
| "learning_rate": 0.0003587105416423995, | |
| "loss": 3.1759, | |
| "step": 69150 | |
| }, | |
| { | |
| "epoch": 20.139729855612483, | |
| "grad_norm": 0.3995038568973541, | |
| "learning_rate": 0.0003585358182877111, | |
| "loss": 3.1831, | |
| "step": 69200 | |
| }, | |
| { | |
| "epoch": 20.15428504890545, | |
| "grad_norm": 0.3919394910335541, | |
| "learning_rate": 0.00035836109493302267, | |
| "loss": 3.1903, | |
| "step": 69250 | |
| }, | |
| { | |
| "epoch": 20.168840242198417, | |
| "grad_norm": 0.39510226249694824, | |
| "learning_rate": 0.0003581863715783343, | |
| "loss": 3.1842, | |
| "step": 69300 | |
| }, | |
| { | |
| "epoch": 20.183395435491384, | |
| "grad_norm": 0.3905065059661865, | |
| "learning_rate": 0.00035801164822364586, | |
| "loss": 3.192, | |
| "step": 69350 | |
| }, | |
| { | |
| "epoch": 20.19795062878435, | |
| "grad_norm": 0.39700841903686523, | |
| "learning_rate": 0.00035783692486895745, | |
| "loss": 3.1958, | |
| "step": 69400 | |
| }, | |
| { | |
| "epoch": 20.21250582207732, | |
| "grad_norm": 0.4083441495895386, | |
| "learning_rate": 0.00035766220151426905, | |
| "loss": 3.2051, | |
| "step": 69450 | |
| }, | |
| { | |
| "epoch": 20.227061015370285, | |
| "grad_norm": 0.41492146253585815, | |
| "learning_rate": 0.0003574874781595806, | |
| "loss": 3.1953, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 20.241616208663253, | |
| "grad_norm": 0.3910234570503235, | |
| "learning_rate": 0.00035731275480489224, | |
| "loss": 3.1984, | |
| "step": 69550 | |
| }, | |
| { | |
| "epoch": 20.25617140195622, | |
| "grad_norm": 0.370971143245697, | |
| "learning_rate": 0.00035713803145020383, | |
| "loss": 3.1984, | |
| "step": 69600 | |
| }, | |
| { | |
| "epoch": 20.270726595249187, | |
| "grad_norm": 0.40494441986083984, | |
| "learning_rate": 0.0003569633080955154, | |
| "loss": 3.2031, | |
| "step": 69650 | |
| }, | |
| { | |
| "epoch": 20.28528178854215, | |
| "grad_norm": 0.38996946811676025, | |
| "learning_rate": 0.00035678858474082697, | |
| "loss": 3.2088, | |
| "step": 69700 | |
| }, | |
| { | |
| "epoch": 20.299836981835117, | |
| "grad_norm": 0.4138713479042053, | |
| "learning_rate": 0.00035661386138613856, | |
| "loss": 3.2059, | |
| "step": 69750 | |
| }, | |
| { | |
| "epoch": 20.314392175128084, | |
| "grad_norm": 0.37535014748573303, | |
| "learning_rate": 0.00035643913803145015, | |
| "loss": 3.2303, | |
| "step": 69800 | |
| }, | |
| { | |
| "epoch": 20.32894736842105, | |
| "grad_norm": 0.40611809492111206, | |
| "learning_rate": 0.0003562644146767618, | |
| "loss": 3.2088, | |
| "step": 69850 | |
| }, | |
| { | |
| "epoch": 20.34350256171402, | |
| "grad_norm": 0.42331892251968384, | |
| "learning_rate": 0.00035608969132207334, | |
| "loss": 3.2122, | |
| "step": 69900 | |
| }, | |
| { | |
| "epoch": 20.358057755006985, | |
| "grad_norm": 0.4339621961116791, | |
| "learning_rate": 0.00035591496796738494, | |
| "loss": 3.2288, | |
| "step": 69950 | |
| }, | |
| { | |
| "epoch": 20.372612948299953, | |
| "grad_norm": 0.3983519673347473, | |
| "learning_rate": 0.00035574024461269653, | |
| "loss": 3.2195, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 20.372612948299953, | |
| "eval_accuracy": 0.3729008594889372, | |
| "eval_loss": 3.5468862056732178, | |
| "eval_runtime": 81.8656, | |
| "eval_samples_per_second": 203.394, | |
| "eval_steps_per_second": 12.716, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 20.38716814159292, | |
| "grad_norm": 0.3767089247703552, | |
| "learning_rate": 0.0003555655212580081, | |
| "loss": 3.2216, | |
| "step": 70050 | |
| }, | |
| { | |
| "epoch": 20.401723334885887, | |
| "grad_norm": 0.3922146260738373, | |
| "learning_rate": 0.00035539079790331967, | |
| "loss": 3.2206, | |
| "step": 70100 | |
| }, | |
| { | |
| "epoch": 20.416278528178854, | |
| "grad_norm": 0.4167845845222473, | |
| "learning_rate": 0.0003552160745486313, | |
| "loss": 3.2289, | |
| "step": 70150 | |
| }, | |
| { | |
| "epoch": 20.43083372147182, | |
| "grad_norm": 0.39565253257751465, | |
| "learning_rate": 0.0003550413511939429, | |
| "loss": 3.2027, | |
| "step": 70200 | |
| }, | |
| { | |
| "epoch": 20.445388914764788, | |
| "grad_norm": 0.3959178626537323, | |
| "learning_rate": 0.0003548666278392545, | |
| "loss": 3.2218, | |
| "step": 70250 | |
| }, | |
| { | |
| "epoch": 20.459944108057755, | |
| "grad_norm": 0.4540488123893738, | |
| "learning_rate": 0.00035469190448456604, | |
| "loss": 3.215, | |
| "step": 70300 | |
| }, | |
| { | |
| "epoch": 20.474499301350722, | |
| "grad_norm": 0.4284607768058777, | |
| "learning_rate": 0.00035451718112987764, | |
| "loss": 3.2352, | |
| "step": 70350 | |
| }, | |
| { | |
| "epoch": 20.48905449464369, | |
| "grad_norm": 0.41205471754074097, | |
| "learning_rate": 0.0003543424577751893, | |
| "loss": 3.2305, | |
| "step": 70400 | |
| }, | |
| { | |
| "epoch": 20.503609687936656, | |
| "grad_norm": 0.4357890784740448, | |
| "learning_rate": 0.0003541677344205009, | |
| "loss": 3.2222, | |
| "step": 70450 | |
| }, | |
| { | |
| "epoch": 20.518164881229623, | |
| "grad_norm": 0.4063858985900879, | |
| "learning_rate": 0.0003539930110658124, | |
| "loss": 3.2274, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 20.53272007452259, | |
| "grad_norm": 0.43921324610710144, | |
| "learning_rate": 0.000353818287711124, | |
| "loss": 3.2388, | |
| "step": 70550 | |
| }, | |
| { | |
| "epoch": 20.547275267815557, | |
| "grad_norm": 0.40713071823120117, | |
| "learning_rate": 0.0003536435643564356, | |
| "loss": 3.2267, | |
| "step": 70600 | |
| }, | |
| { | |
| "epoch": 20.561830461108524, | |
| "grad_norm": 0.40011587738990784, | |
| "learning_rate": 0.00035346884100174715, | |
| "loss": 3.2305, | |
| "step": 70650 | |
| }, | |
| { | |
| "epoch": 20.57638565440149, | |
| "grad_norm": 0.387611985206604, | |
| "learning_rate": 0.0003532941176470588, | |
| "loss": 3.2329, | |
| "step": 70700 | |
| }, | |
| { | |
| "epoch": 20.59094084769446, | |
| "grad_norm": 0.38848719000816345, | |
| "learning_rate": 0.0003531193942923704, | |
| "loss": 3.2272, | |
| "step": 70750 | |
| }, | |
| { | |
| "epoch": 20.605496040987425, | |
| "grad_norm": 0.39772090315818787, | |
| "learning_rate": 0.000352944670937682, | |
| "loss": 3.2313, | |
| "step": 70800 | |
| }, | |
| { | |
| "epoch": 20.620051234280393, | |
| "grad_norm": 0.4233378767967224, | |
| "learning_rate": 0.0003527699475829935, | |
| "loss": 3.2331, | |
| "step": 70850 | |
| }, | |
| { | |
| "epoch": 20.63460642757336, | |
| "grad_norm": 0.38568368554115295, | |
| "learning_rate": 0.0003525952242283051, | |
| "loss": 3.2328, | |
| "step": 70900 | |
| }, | |
| { | |
| "epoch": 20.649161620866327, | |
| "grad_norm": 0.396078884601593, | |
| "learning_rate": 0.00035242050087361677, | |
| "loss": 3.2334, | |
| "step": 70950 | |
| }, | |
| { | |
| "epoch": 20.663716814159294, | |
| "grad_norm": 0.3970303535461426, | |
| "learning_rate": 0.00035224577751892836, | |
| "loss": 3.223, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 20.663716814159294, | |
| "eval_accuracy": 0.37352022838846205, | |
| "eval_loss": 3.5412790775299072, | |
| "eval_runtime": 81.7614, | |
| "eval_samples_per_second": 203.654, | |
| "eval_steps_per_second": 12.732, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 20.678272007452257, | |
| "grad_norm": 0.3995371460914612, | |
| "learning_rate": 0.0003520710541642399, | |
| "loss": 3.2358, | |
| "step": 71050 | |
| }, | |
| { | |
| "epoch": 20.692827200745224, | |
| "grad_norm": 0.40351197123527527, | |
| "learning_rate": 0.0003518963308095515, | |
| "loss": 3.2642, | |
| "step": 71100 | |
| }, | |
| { | |
| "epoch": 20.70738239403819, | |
| "grad_norm": 0.41050276160240173, | |
| "learning_rate": 0.0003517216074548631, | |
| "loss": 3.2379, | |
| "step": 71150 | |
| }, | |
| { | |
| "epoch": 20.72193758733116, | |
| "grad_norm": 0.4151143729686737, | |
| "learning_rate": 0.0003515468841001747, | |
| "loss": 3.2469, | |
| "step": 71200 | |
| }, | |
| { | |
| "epoch": 20.736492780624125, | |
| "grad_norm": 0.3841537535190582, | |
| "learning_rate": 0.00035137216074548634, | |
| "loss": 3.2465, | |
| "step": 71250 | |
| }, | |
| { | |
| "epoch": 20.751047973917093, | |
| "grad_norm": 0.3815242052078247, | |
| "learning_rate": 0.0003511974373907979, | |
| "loss": 3.2391, | |
| "step": 71300 | |
| }, | |
| { | |
| "epoch": 20.76560316721006, | |
| "grad_norm": 0.3913818597793579, | |
| "learning_rate": 0.00035102271403610947, | |
| "loss": 3.2556, | |
| "step": 71350 | |
| }, | |
| { | |
| "epoch": 20.780158360503027, | |
| "grad_norm": 0.3985620141029358, | |
| "learning_rate": 0.00035084799068142106, | |
| "loss": 3.2301, | |
| "step": 71400 | |
| }, | |
| { | |
| "epoch": 20.794713553795994, | |
| "grad_norm": 0.42872658371925354, | |
| "learning_rate": 0.0003506732673267326, | |
| "loss": 3.237, | |
| "step": 71450 | |
| }, | |
| { | |
| "epoch": 20.80926874708896, | |
| "grad_norm": 0.4328446388244629, | |
| "learning_rate": 0.00035049854397204425, | |
| "loss": 3.239, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 20.823823940381928, | |
| "grad_norm": 0.379326730966568, | |
| "learning_rate": 0.00035032382061735585, | |
| "loss": 3.2493, | |
| "step": 71550 | |
| }, | |
| { | |
| "epoch": 20.838379133674895, | |
| "grad_norm": 0.4020197093486786, | |
| "learning_rate": 0.00035014909726266744, | |
| "loss": 3.2575, | |
| "step": 71600 | |
| }, | |
| { | |
| "epoch": 20.852934326967862, | |
| "grad_norm": 0.4066036641597748, | |
| "learning_rate": 0.000349974373907979, | |
| "loss": 3.2382, | |
| "step": 71650 | |
| }, | |
| { | |
| "epoch": 20.86748952026083, | |
| "grad_norm": 0.39925462007522583, | |
| "learning_rate": 0.0003497996505532906, | |
| "loss": 3.2511, | |
| "step": 71700 | |
| }, | |
| { | |
| "epoch": 20.882044713553796, | |
| "grad_norm": 0.4000072181224823, | |
| "learning_rate": 0.00034962492719860217, | |
| "loss": 3.2396, | |
| "step": 71750 | |
| }, | |
| { | |
| "epoch": 20.896599906846763, | |
| "grad_norm": 0.4050699770450592, | |
| "learning_rate": 0.0003494502038439138, | |
| "loss": 3.2442, | |
| "step": 71800 | |
| }, | |
| { | |
| "epoch": 20.91115510013973, | |
| "grad_norm": 0.37977081537246704, | |
| "learning_rate": 0.00034927548048922536, | |
| "loss": 3.2459, | |
| "step": 71850 | |
| }, | |
| { | |
| "epoch": 20.925710293432697, | |
| "grad_norm": 0.4020400941371918, | |
| "learning_rate": 0.00034910075713453695, | |
| "loss": 3.2413, | |
| "step": 71900 | |
| }, | |
| { | |
| "epoch": 20.940265486725664, | |
| "grad_norm": 0.4111315906047821, | |
| "learning_rate": 0.00034892603377984855, | |
| "loss": 3.239, | |
| "step": 71950 | |
| }, | |
| { | |
| "epoch": 20.95482068001863, | |
| "grad_norm": 0.39822298288345337, | |
| "learning_rate": 0.0003487513104251601, | |
| "loss": 3.2548, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 20.95482068001863, | |
| "eval_accuracy": 0.3741189124822343, | |
| "eval_loss": 3.531515598297119, | |
| "eval_runtime": 81.8708, | |
| "eval_samples_per_second": 203.382, | |
| "eval_steps_per_second": 12.715, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 20.9693758733116, | |
| "grad_norm": 0.408417671918869, | |
| "learning_rate": 0.0003485765870704717, | |
| "loss": 3.2567, | |
| "step": 72050 | |
| }, | |
| { | |
| "epoch": 20.983931066604566, | |
| "grad_norm": 0.41261881589889526, | |
| "learning_rate": 0.00034840186371578333, | |
| "loss": 3.2501, | |
| "step": 72100 | |
| }, | |
| { | |
| "epoch": 20.998486259897533, | |
| "grad_norm": 0.3897479474544525, | |
| "learning_rate": 0.0003482271403610949, | |
| "loss": 3.2588, | |
| "step": 72150 | |
| }, | |
| { | |
| "epoch": 21.01280857009781, | |
| "grad_norm": 0.39372244477272034, | |
| "learning_rate": 0.0003480524170064065, | |
| "loss": 3.1648, | |
| "step": 72200 | |
| }, | |
| { | |
| "epoch": 21.027363763390778, | |
| "grad_norm": 0.42088451981544495, | |
| "learning_rate": 0.00034787769365171806, | |
| "loss": 3.1406, | |
| "step": 72250 | |
| }, | |
| { | |
| "epoch": 21.041918956683745, | |
| "grad_norm": 0.4015001058578491, | |
| "learning_rate": 0.00034770297029702965, | |
| "loss": 3.1476, | |
| "step": 72300 | |
| }, | |
| { | |
| "epoch": 21.05647414997671, | |
| "grad_norm": 0.3994520306587219, | |
| "learning_rate": 0.0003475282469423413, | |
| "loss": 3.1524, | |
| "step": 72350 | |
| }, | |
| { | |
| "epoch": 21.07102934326968, | |
| "grad_norm": 0.4160577356815338, | |
| "learning_rate": 0.0003473535235876529, | |
| "loss": 3.1735, | |
| "step": 72400 | |
| }, | |
| { | |
| "epoch": 21.085584536562646, | |
| "grad_norm": 0.419156938791275, | |
| "learning_rate": 0.00034717880023296444, | |
| "loss": 3.1664, | |
| "step": 72450 | |
| }, | |
| { | |
| "epoch": 21.100139729855613, | |
| "grad_norm": 0.38583138585090637, | |
| "learning_rate": 0.00034700407687827603, | |
| "loss": 3.1782, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 21.11469492314858, | |
| "grad_norm": 0.41160887479782104, | |
| "learning_rate": 0.0003468293535235876, | |
| "loss": 3.1683, | |
| "step": 72550 | |
| }, | |
| { | |
| "epoch": 21.129250116441547, | |
| "grad_norm": 0.44171571731567383, | |
| "learning_rate": 0.00034665463016889916, | |
| "loss": 3.1763, | |
| "step": 72600 | |
| }, | |
| { | |
| "epoch": 21.143805309734514, | |
| "grad_norm": 0.43221530318260193, | |
| "learning_rate": 0.0003464799068142108, | |
| "loss": 3.1839, | |
| "step": 72650 | |
| }, | |
| { | |
| "epoch": 21.15836050302748, | |
| "grad_norm": 0.38897183537483215, | |
| "learning_rate": 0.0003463051834595224, | |
| "loss": 3.1693, | |
| "step": 72700 | |
| }, | |
| { | |
| "epoch": 21.172915696320448, | |
| "grad_norm": 0.3928532302379608, | |
| "learning_rate": 0.000346130460104834, | |
| "loss": 3.178, | |
| "step": 72750 | |
| }, | |
| { | |
| "epoch": 21.187470889613415, | |
| "grad_norm": 0.3817063570022583, | |
| "learning_rate": 0.00034595573675014554, | |
| "loss": 3.1774, | |
| "step": 72800 | |
| }, | |
| { | |
| "epoch": 21.202026082906382, | |
| "grad_norm": 0.40175697207450867, | |
| "learning_rate": 0.00034578101339545714, | |
| "loss": 3.1962, | |
| "step": 72850 | |
| }, | |
| { | |
| "epoch": 21.21658127619935, | |
| "grad_norm": 0.4562869966030121, | |
| "learning_rate": 0.0003456062900407688, | |
| "loss": 3.188, | |
| "step": 72900 | |
| }, | |
| { | |
| "epoch": 21.231136469492316, | |
| "grad_norm": 0.43213754892349243, | |
| "learning_rate": 0.0003454315666860804, | |
| "loss": 3.1863, | |
| "step": 72950 | |
| }, | |
| { | |
| "epoch": 21.245691662785283, | |
| "grad_norm": 0.4060748517513275, | |
| "learning_rate": 0.0003452568433313919, | |
| "loss": 3.1821, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 21.245691662785283, | |
| "eval_accuracy": 0.3730456531292056, | |
| "eval_loss": 3.551159381866455, | |
| "eval_runtime": 81.8177, | |
| "eval_samples_per_second": 203.513, | |
| "eval_steps_per_second": 12.723, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 21.260246856078247, | |
| "grad_norm": 0.3947387635707855, | |
| "learning_rate": 0.0003450821199767035, | |
| "loss": 3.1929, | |
| "step": 73050 | |
| }, | |
| { | |
| "epoch": 21.274802049371214, | |
| "grad_norm": 0.3846012353897095, | |
| "learning_rate": 0.0003449073966220151, | |
| "loss": 3.1886, | |
| "step": 73100 | |
| }, | |
| { | |
| "epoch": 21.28935724266418, | |
| "grad_norm": 0.4448850750923157, | |
| "learning_rate": 0.0003447326732673267, | |
| "loss": 3.1991, | |
| "step": 73150 | |
| }, | |
| { | |
| "epoch": 21.303912435957148, | |
| "grad_norm": 0.40468770265579224, | |
| "learning_rate": 0.0003445579499126383, | |
| "loss": 3.2074, | |
| "step": 73200 | |
| }, | |
| { | |
| "epoch": 21.318467629250115, | |
| "grad_norm": 0.44057130813598633, | |
| "learning_rate": 0.0003443832265579499, | |
| "loss": 3.2045, | |
| "step": 73250 | |
| }, | |
| { | |
| "epoch": 21.333022822543082, | |
| "grad_norm": 0.395723432302475, | |
| "learning_rate": 0.0003442085032032615, | |
| "loss": 3.1955, | |
| "step": 73300 | |
| }, | |
| { | |
| "epoch": 21.34757801583605, | |
| "grad_norm": 0.4368989169597626, | |
| "learning_rate": 0.0003440337798485731, | |
| "loss": 3.192, | |
| "step": 73350 | |
| }, | |
| { | |
| "epoch": 21.362133209129016, | |
| "grad_norm": 0.4212827980518341, | |
| "learning_rate": 0.0003438590564938846, | |
| "loss": 3.2079, | |
| "step": 73400 | |
| }, | |
| { | |
| "epoch": 21.376688402421983, | |
| "grad_norm": 0.3966172933578491, | |
| "learning_rate": 0.0003436843331391962, | |
| "loss": 3.2161, | |
| "step": 73450 | |
| }, | |
| { | |
| "epoch": 21.39124359571495, | |
| "grad_norm": 0.42010778188705444, | |
| "learning_rate": 0.00034350960978450786, | |
| "loss": 3.2037, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 21.405798789007918, | |
| "grad_norm": 0.41357988119125366, | |
| "learning_rate": 0.00034333488642981946, | |
| "loss": 3.204, | |
| "step": 73550 | |
| }, | |
| { | |
| "epoch": 21.420353982300885, | |
| "grad_norm": 0.41483357548713684, | |
| "learning_rate": 0.000343160163075131, | |
| "loss": 3.203, | |
| "step": 73600 | |
| }, | |
| { | |
| "epoch": 21.43490917559385, | |
| "grad_norm": 0.4366847276687622, | |
| "learning_rate": 0.0003429854397204426, | |
| "loss": 3.2135, | |
| "step": 73650 | |
| }, | |
| { | |
| "epoch": 21.44946436888682, | |
| "grad_norm": 0.41424912214279175, | |
| "learning_rate": 0.0003428107163657542, | |
| "loss": 3.2261, | |
| "step": 73700 | |
| }, | |
| { | |
| "epoch": 21.464019562179786, | |
| "grad_norm": 0.40357306599617004, | |
| "learning_rate": 0.00034263599301106583, | |
| "loss": 3.2133, | |
| "step": 73750 | |
| }, | |
| { | |
| "epoch": 21.478574755472753, | |
| "grad_norm": 0.40712884068489075, | |
| "learning_rate": 0.0003424612696563774, | |
| "loss": 3.2114, | |
| "step": 73800 | |
| }, | |
| { | |
| "epoch": 21.49312994876572, | |
| "grad_norm": 0.4078884422779083, | |
| "learning_rate": 0.00034228654630168897, | |
| "loss": 3.2173, | |
| "step": 73850 | |
| }, | |
| { | |
| "epoch": 21.507685142058687, | |
| "grad_norm": 0.3806934952735901, | |
| "learning_rate": 0.00034211182294700056, | |
| "loss": 3.216, | |
| "step": 73900 | |
| }, | |
| { | |
| "epoch": 21.522240335351654, | |
| "grad_norm": 0.41860878467559814, | |
| "learning_rate": 0.0003419370995923121, | |
| "loss": 3.2196, | |
| "step": 73950 | |
| }, | |
| { | |
| "epoch": 21.53679552864462, | |
| "grad_norm": 0.41007712483406067, | |
| "learning_rate": 0.0003417623762376237, | |
| "loss": 3.2097, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 21.53679552864462, | |
| "eval_accuracy": 0.3736710159213065, | |
| "eval_loss": 3.5420477390289307, | |
| "eval_runtime": 81.9078, | |
| "eval_samples_per_second": 203.29, | |
| "eval_steps_per_second": 12.709, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 21.551350721937588, | |
| "grad_norm": 0.3961637616157532, | |
| "learning_rate": 0.00034158765288293534, | |
| "loss": 3.229, | |
| "step": 74050 | |
| }, | |
| { | |
| "epoch": 21.565905915230555, | |
| "grad_norm": 0.41290947794914246, | |
| "learning_rate": 0.00034141292952824694, | |
| "loss": 3.2077, | |
| "step": 74100 | |
| }, | |
| { | |
| "epoch": 21.580461108523522, | |
| "grad_norm": 0.4003162086009979, | |
| "learning_rate": 0.0003412382061735585, | |
| "loss": 3.217, | |
| "step": 74150 | |
| }, | |
| { | |
| "epoch": 21.59501630181649, | |
| "grad_norm": 0.41298526525497437, | |
| "learning_rate": 0.0003410634828188701, | |
| "loss": 3.2159, | |
| "step": 74200 | |
| }, | |
| { | |
| "epoch": 21.609571495109456, | |
| "grad_norm": 0.44664058089256287, | |
| "learning_rate": 0.00034088875946418167, | |
| "loss": 3.2394, | |
| "step": 74250 | |
| }, | |
| { | |
| "epoch": 21.624126688402423, | |
| "grad_norm": 0.39693984389305115, | |
| "learning_rate": 0.0003407140361094933, | |
| "loss": 3.2228, | |
| "step": 74300 | |
| }, | |
| { | |
| "epoch": 21.63868188169539, | |
| "grad_norm": 0.4212318956851959, | |
| "learning_rate": 0.00034053931275480486, | |
| "loss": 3.2259, | |
| "step": 74350 | |
| }, | |
| { | |
| "epoch": 21.653237074988354, | |
| "grad_norm": 0.3891969621181488, | |
| "learning_rate": 0.00034036458940011645, | |
| "loss": 3.2305, | |
| "step": 74400 | |
| }, | |
| { | |
| "epoch": 21.66779226828132, | |
| "grad_norm": 0.4122307300567627, | |
| "learning_rate": 0.00034018986604542804, | |
| "loss": 3.2215, | |
| "step": 74450 | |
| }, | |
| { | |
| "epoch": 21.682347461574288, | |
| "grad_norm": 0.39441734552383423, | |
| "learning_rate": 0.00034001514269073964, | |
| "loss": 3.2349, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 21.696902654867255, | |
| "grad_norm": 0.411464124917984, | |
| "learning_rate": 0.0003398404193360512, | |
| "loss": 3.2323, | |
| "step": 74550 | |
| }, | |
| { | |
| "epoch": 21.711457848160222, | |
| "grad_norm": 0.3994988203048706, | |
| "learning_rate": 0.00033966569598136283, | |
| "loss": 3.2277, | |
| "step": 74600 | |
| }, | |
| { | |
| "epoch": 21.72601304145319, | |
| "grad_norm": 0.40555524826049805, | |
| "learning_rate": 0.0003394909726266744, | |
| "loss": 3.2333, | |
| "step": 74650 | |
| }, | |
| { | |
| "epoch": 21.740568234746156, | |
| "grad_norm": 0.39020857214927673, | |
| "learning_rate": 0.000339316249271986, | |
| "loss": 3.2299, | |
| "step": 74700 | |
| }, | |
| { | |
| "epoch": 21.755123428039123, | |
| "grad_norm": 0.4349067807197571, | |
| "learning_rate": 0.00033914152591729756, | |
| "loss": 3.2264, | |
| "step": 74750 | |
| }, | |
| { | |
| "epoch": 21.76967862133209, | |
| "grad_norm": 0.4634840190410614, | |
| "learning_rate": 0.00033896680256260915, | |
| "loss": 3.237, | |
| "step": 74800 | |
| }, | |
| { | |
| "epoch": 21.784233814625058, | |
| "grad_norm": 0.40544989705085754, | |
| "learning_rate": 0.00033879207920792074, | |
| "loss": 3.2314, | |
| "step": 74850 | |
| }, | |
| { | |
| "epoch": 21.798789007918025, | |
| "grad_norm": 0.4065469205379486, | |
| "learning_rate": 0.0003386173558532324, | |
| "loss": 3.2311, | |
| "step": 74900 | |
| }, | |
| { | |
| "epoch": 21.81334420121099, | |
| "grad_norm": 0.39246711134910583, | |
| "learning_rate": 0.00033844263249854393, | |
| "loss": 3.2345, | |
| "step": 74950 | |
| }, | |
| { | |
| "epoch": 21.82789939450396, | |
| "grad_norm": 0.40202024579048157, | |
| "learning_rate": 0.00033826790914385553, | |
| "loss": 3.228, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 21.82789939450396, | |
| "eval_accuracy": 0.37403382271311547, | |
| "eval_loss": 3.532505512237549, | |
| "eval_runtime": 81.8937, | |
| "eval_samples_per_second": 203.325, | |
| "eval_steps_per_second": 12.712, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 21.842454587796926, | |
| "grad_norm": 0.41669344902038574, | |
| "learning_rate": 0.0003380931857891671, | |
| "loss": 3.2383, | |
| "step": 75050 | |
| }, | |
| { | |
| "epoch": 21.857009781089893, | |
| "grad_norm": 0.4140610098838806, | |
| "learning_rate": 0.00033791846243447866, | |
| "loss": 3.24, | |
| "step": 75100 | |
| }, | |
| { | |
| "epoch": 21.87156497438286, | |
| "grad_norm": 0.40579092502593994, | |
| "learning_rate": 0.0003377437390797903, | |
| "loss": 3.2404, | |
| "step": 75150 | |
| }, | |
| { | |
| "epoch": 21.886120167675827, | |
| "grad_norm": 0.4353466331958771, | |
| "learning_rate": 0.0003375690157251019, | |
| "loss": 3.2362, | |
| "step": 75200 | |
| }, | |
| { | |
| "epoch": 21.900675360968794, | |
| "grad_norm": 0.42096754908561707, | |
| "learning_rate": 0.0003373942923704135, | |
| "loss": 3.2439, | |
| "step": 75250 | |
| }, | |
| { | |
| "epoch": 21.91523055426176, | |
| "grad_norm": 0.40233075618743896, | |
| "learning_rate": 0.00033721956901572504, | |
| "loss": 3.2391, | |
| "step": 75300 | |
| }, | |
| { | |
| "epoch": 21.929785747554728, | |
| "grad_norm": 0.38952115178108215, | |
| "learning_rate": 0.00033704484566103663, | |
| "loss": 3.2471, | |
| "step": 75350 | |
| }, | |
| { | |
| "epoch": 21.944340940847695, | |
| "grad_norm": 0.4135924279689789, | |
| "learning_rate": 0.00033687012230634823, | |
| "loss": 3.2398, | |
| "step": 75400 | |
| }, | |
| { | |
| "epoch": 21.958896134140662, | |
| "grad_norm": 0.38750022649765015, | |
| "learning_rate": 0.0003366953989516599, | |
| "loss": 3.2335, | |
| "step": 75450 | |
| }, | |
| { | |
| "epoch": 21.97345132743363, | |
| "grad_norm": 0.40610578656196594, | |
| "learning_rate": 0.00033652067559697147, | |
| "loss": 3.2499, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 21.988006520726596, | |
| "grad_norm": 0.44946858286857605, | |
| "learning_rate": 0.000336345952242283, | |
| "loss": 3.2562, | |
| "step": 75550 | |
| }, | |
| { | |
| "epoch": 22.002328830926874, | |
| "grad_norm": 0.40398386120796204, | |
| "learning_rate": 0.0003361712288875946, | |
| "loss": 3.2144, | |
| "step": 75600 | |
| }, | |
| { | |
| "epoch": 22.01688402421984, | |
| "grad_norm": 0.4144744873046875, | |
| "learning_rate": 0.0003359965055329062, | |
| "loss": 3.143, | |
| "step": 75650 | |
| }, | |
| { | |
| "epoch": 22.03143921751281, | |
| "grad_norm": 0.41628196835517883, | |
| "learning_rate": 0.00033582178217821785, | |
| "loss": 3.137, | |
| "step": 75700 | |
| }, | |
| { | |
| "epoch": 22.045994410805775, | |
| "grad_norm": 0.4395162761211395, | |
| "learning_rate": 0.0003356470588235294, | |
| "loss": 3.15, | |
| "step": 75750 | |
| }, | |
| { | |
| "epoch": 22.060549604098743, | |
| "grad_norm": 0.43346747756004333, | |
| "learning_rate": 0.000335472335468841, | |
| "loss": 3.1455, | |
| "step": 75800 | |
| }, | |
| { | |
| "epoch": 22.07510479739171, | |
| "grad_norm": 0.39059779047966003, | |
| "learning_rate": 0.0003352976121141526, | |
| "loss": 3.1503, | |
| "step": 75850 | |
| }, | |
| { | |
| "epoch": 22.089659990684677, | |
| "grad_norm": 0.43091872334480286, | |
| "learning_rate": 0.0003351228887594641, | |
| "loss": 3.1665, | |
| "step": 75900 | |
| }, | |
| { | |
| "epoch": 22.104215183977644, | |
| "grad_norm": 0.4411010146141052, | |
| "learning_rate": 0.0003349481654047757, | |
| "loss": 3.1573, | |
| "step": 75950 | |
| }, | |
| { | |
| "epoch": 22.11877037727061, | |
| "grad_norm": 0.4069999158382416, | |
| "learning_rate": 0.00033477344205008736, | |
| "loss": 3.1735, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 22.11877037727061, | |
| "eval_accuracy": 0.3730900784506516, | |
| "eval_loss": 3.5555386543273926, | |
| "eval_runtime": 81.9047, | |
| "eval_samples_per_second": 203.297, | |
| "eval_steps_per_second": 12.71, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 22.133325570563578, | |
| "grad_norm": 0.4292955696582794, | |
| "learning_rate": 0.00033459871869539895, | |
| "loss": 3.1558, | |
| "step": 76050 | |
| }, | |
| { | |
| "epoch": 22.147880763856545, | |
| "grad_norm": 0.4282265901565552, | |
| "learning_rate": 0.0003344239953407105, | |
| "loss": 3.1646, | |
| "step": 76100 | |
| }, | |
| { | |
| "epoch": 22.162435957149512, | |
| "grad_norm": 0.42730513215065, | |
| "learning_rate": 0.0003342492719860221, | |
| "loss": 3.1547, | |
| "step": 76150 | |
| }, | |
| { | |
| "epoch": 22.17699115044248, | |
| "grad_norm": 0.4199974834918976, | |
| "learning_rate": 0.0003340745486313337, | |
| "loss": 3.1713, | |
| "step": 76200 | |
| }, | |
| { | |
| "epoch": 22.191546343735446, | |
| "grad_norm": 0.43740731477737427, | |
| "learning_rate": 0.0003338998252766453, | |
| "loss": 3.1632, | |
| "step": 76250 | |
| }, | |
| { | |
| "epoch": 22.206101537028413, | |
| "grad_norm": 0.4053819179534912, | |
| "learning_rate": 0.00033372510192195687, | |
| "loss": 3.1757, | |
| "step": 76300 | |
| }, | |
| { | |
| "epoch": 22.22065673032138, | |
| "grad_norm": 0.4016455411911011, | |
| "learning_rate": 0.00033355037856726847, | |
| "loss": 3.1808, | |
| "step": 76350 | |
| }, | |
| { | |
| "epoch": 22.235211923614347, | |
| "grad_norm": 0.43176504969596863, | |
| "learning_rate": 0.00033337565521258006, | |
| "loss": 3.183, | |
| "step": 76400 | |
| }, | |
| { | |
| "epoch": 22.24976711690731, | |
| "grad_norm": 0.40624263882637024, | |
| "learning_rate": 0.00033320093185789165, | |
| "loss": 3.1848, | |
| "step": 76450 | |
| }, | |
| { | |
| "epoch": 22.264322310200278, | |
| "grad_norm": 0.41378945112228394, | |
| "learning_rate": 0.0003330262085032032, | |
| "loss": 3.1851, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 22.278877503493245, | |
| "grad_norm": 0.42694327235221863, | |
| "learning_rate": 0.00033285148514851484, | |
| "loss": 3.1808, | |
| "step": 76550 | |
| }, | |
| { | |
| "epoch": 22.293432696786212, | |
| "grad_norm": 0.3923320472240448, | |
| "learning_rate": 0.00033267676179382644, | |
| "loss": 3.1834, | |
| "step": 76600 | |
| }, | |
| { | |
| "epoch": 22.30798789007918, | |
| "grad_norm": 0.42751583456993103, | |
| "learning_rate": 0.00033250203843913803, | |
| "loss": 3.177, | |
| "step": 76650 | |
| }, | |
| { | |
| "epoch": 22.322543083372146, | |
| "grad_norm": 0.3884715139865875, | |
| "learning_rate": 0.00033232731508444957, | |
| "loss": 3.209, | |
| "step": 76700 | |
| }, | |
| { | |
| "epoch": 22.337098276665113, | |
| "grad_norm": 0.3948022127151489, | |
| "learning_rate": 0.00033215259172976117, | |
| "loss": 3.1882, | |
| "step": 76750 | |
| }, | |
| { | |
| "epoch": 22.35165346995808, | |
| "grad_norm": 0.4485670030117035, | |
| "learning_rate": 0.00033197786837507276, | |
| "loss": 3.202, | |
| "step": 76800 | |
| }, | |
| { | |
| "epoch": 22.366208663251047, | |
| "grad_norm": 0.42500367760658264, | |
| "learning_rate": 0.0003318031450203844, | |
| "loss": 3.2016, | |
| "step": 76850 | |
| }, | |
| { | |
| "epoch": 22.380763856544014, | |
| "grad_norm": 0.42744216322898865, | |
| "learning_rate": 0.00033162842166569595, | |
| "loss": 3.1826, | |
| "step": 76900 | |
| }, | |
| { | |
| "epoch": 22.39531904983698, | |
| "grad_norm": 0.4164310097694397, | |
| "learning_rate": 0.00033145369831100754, | |
| "loss": 3.1992, | |
| "step": 76950 | |
| }, | |
| { | |
| "epoch": 22.40987424312995, | |
| "grad_norm": 0.39495959877967834, | |
| "learning_rate": 0.00033127897495631914, | |
| "loss": 3.1979, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 22.40987424312995, | |
| "eval_accuracy": 0.3734558234250959, | |
| "eval_loss": 3.54536509513855, | |
| "eval_runtime": 81.8351, | |
| "eval_samples_per_second": 203.47, | |
| "eval_steps_per_second": 12.721, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 22.424429436422916, | |
| "grad_norm": 0.4242117702960968, | |
| "learning_rate": 0.0003311042516016307, | |
| "loss": 3.21, | |
| "step": 77050 | |
| }, | |
| { | |
| "epoch": 22.438984629715883, | |
| "grad_norm": 0.4658028483390808, | |
| "learning_rate": 0.0003309295282469423, | |
| "loss": 3.1979, | |
| "step": 77100 | |
| }, | |
| { | |
| "epoch": 22.45353982300885, | |
| "grad_norm": 0.4351952373981476, | |
| "learning_rate": 0.0003307548048922539, | |
| "loss": 3.1984, | |
| "step": 77150 | |
| }, | |
| { | |
| "epoch": 22.468095016301817, | |
| "grad_norm": 0.4169571101665497, | |
| "learning_rate": 0.0003305800815375655, | |
| "loss": 3.2123, | |
| "step": 77200 | |
| }, | |
| { | |
| "epoch": 22.482650209594784, | |
| "grad_norm": 0.4277021586894989, | |
| "learning_rate": 0.00033040535818287705, | |
| "loss": 3.2051, | |
| "step": 77250 | |
| }, | |
| { | |
| "epoch": 22.49720540288775, | |
| "grad_norm": 0.4114350378513336, | |
| "learning_rate": 0.00033023063482818865, | |
| "loss": 3.1944, | |
| "step": 77300 | |
| }, | |
| { | |
| "epoch": 22.511760596180718, | |
| "grad_norm": 0.43812093138694763, | |
| "learning_rate": 0.00033005591147350024, | |
| "loss": 3.2119, | |
| "step": 77350 | |
| }, | |
| { | |
| "epoch": 22.526315789473685, | |
| "grad_norm": 0.440188467502594, | |
| "learning_rate": 0.0003298811881188119, | |
| "loss": 3.2025, | |
| "step": 77400 | |
| }, | |
| { | |
| "epoch": 22.540870982766652, | |
| "grad_norm": 0.42282989621162415, | |
| "learning_rate": 0.00032970646476412343, | |
| "loss": 3.2128, | |
| "step": 77450 | |
| }, | |
| { | |
| "epoch": 22.55542617605962, | |
| "grad_norm": 0.44431719183921814, | |
| "learning_rate": 0.000329531741409435, | |
| "loss": 3.2088, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 22.569981369352586, | |
| "grad_norm": 0.4037800133228302, | |
| "learning_rate": 0.0003293570180547466, | |
| "loss": 3.2099, | |
| "step": 77550 | |
| }, | |
| { | |
| "epoch": 22.584536562645553, | |
| "grad_norm": 0.4212678372859955, | |
| "learning_rate": 0.0003291822947000582, | |
| "loss": 3.2043, | |
| "step": 77600 | |
| }, | |
| { | |
| "epoch": 22.59909175593852, | |
| "grad_norm": 0.4073200523853302, | |
| "learning_rate": 0.00032900757134536975, | |
| "loss": 3.2156, | |
| "step": 77650 | |
| }, | |
| { | |
| "epoch": 22.613646949231487, | |
| "grad_norm": 0.4021799564361572, | |
| "learning_rate": 0.0003288328479906814, | |
| "loss": 3.2298, | |
| "step": 77700 | |
| }, | |
| { | |
| "epoch": 22.628202142524454, | |
| "grad_norm": 0.3939412236213684, | |
| "learning_rate": 0.000328658124635993, | |
| "loss": 3.2119, | |
| "step": 77750 | |
| }, | |
| { | |
| "epoch": 22.64275733581742, | |
| "grad_norm": 0.393204003572464, | |
| "learning_rate": 0.0003284834012813046, | |
| "loss": 3.2148, | |
| "step": 77800 | |
| }, | |
| { | |
| "epoch": 22.657312529110385, | |
| "grad_norm": 0.4322431981563568, | |
| "learning_rate": 0.00032830867792661613, | |
| "loss": 3.2223, | |
| "step": 77850 | |
| }, | |
| { | |
| "epoch": 22.671867722403352, | |
| "grad_norm": 0.3913043141365051, | |
| "learning_rate": 0.0003281339545719277, | |
| "loss": 3.2141, | |
| "step": 77900 | |
| }, | |
| { | |
| "epoch": 22.68642291569632, | |
| "grad_norm": 0.39243510365486145, | |
| "learning_rate": 0.0003279592312172394, | |
| "loss": 3.2261, | |
| "step": 77950 | |
| }, | |
| { | |
| "epoch": 22.700978108989286, | |
| "grad_norm": 0.4269677698612213, | |
| "learning_rate": 0.00032778450786255097, | |
| "loss": 3.2138, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 22.700978108989286, | |
| "eval_accuracy": 0.3735578371261941, | |
| "eval_loss": 3.539123058319092, | |
| "eval_runtime": 81.8143, | |
| "eval_samples_per_second": 203.522, | |
| "eval_steps_per_second": 12.724, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 22.715533302282253, | |
| "grad_norm": 0.4153009057044983, | |
| "learning_rate": 0.0003276097845078625, | |
| "loss": 3.222, | |
| "step": 78050 | |
| }, | |
| { | |
| "epoch": 22.73008849557522, | |
| "grad_norm": 0.41230499744415283, | |
| "learning_rate": 0.0003274350611531741, | |
| "loss": 3.2244, | |
| "step": 78100 | |
| }, | |
| { | |
| "epoch": 22.744643688868187, | |
| "grad_norm": 0.42509111762046814, | |
| "learning_rate": 0.0003272603377984857, | |
| "loss": 3.2102, | |
| "step": 78150 | |
| }, | |
| { | |
| "epoch": 22.759198882161154, | |
| "grad_norm": 0.39669370651245117, | |
| "learning_rate": 0.00032708561444379724, | |
| "loss": 3.239, | |
| "step": 78200 | |
| }, | |
| { | |
| "epoch": 22.77375407545412, | |
| "grad_norm": 0.3981836140155792, | |
| "learning_rate": 0.0003269108910891089, | |
| "loss": 3.2214, | |
| "step": 78250 | |
| }, | |
| { | |
| "epoch": 22.78830926874709, | |
| "grad_norm": 0.4179050326347351, | |
| "learning_rate": 0.0003267361677344205, | |
| "loss": 3.2081, | |
| "step": 78300 | |
| }, | |
| { | |
| "epoch": 22.802864462040056, | |
| "grad_norm": 0.4550701379776001, | |
| "learning_rate": 0.0003265614443797321, | |
| "loss": 3.2302, | |
| "step": 78350 | |
| }, | |
| { | |
| "epoch": 22.817419655333023, | |
| "grad_norm": 0.4033101201057434, | |
| "learning_rate": 0.0003263867210250436, | |
| "loss": 3.2237, | |
| "step": 78400 | |
| }, | |
| { | |
| "epoch": 22.83197484862599, | |
| "grad_norm": 0.4091835916042328, | |
| "learning_rate": 0.0003262119976703552, | |
| "loss": 3.2316, | |
| "step": 78450 | |
| }, | |
| { | |
| "epoch": 22.846530041918957, | |
| "grad_norm": 0.4019656181335449, | |
| "learning_rate": 0.00032603727431566686, | |
| "loss": 3.2261, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 22.861085235211924, | |
| "grad_norm": 0.40655606985092163, | |
| "learning_rate": 0.00032586255096097845, | |
| "loss": 3.2196, | |
| "step": 78550 | |
| }, | |
| { | |
| "epoch": 22.87564042850489, | |
| "grad_norm": 0.43008360266685486, | |
| "learning_rate": 0.00032568782760629005, | |
| "loss": 3.2287, | |
| "step": 78600 | |
| }, | |
| { | |
| "epoch": 22.890195621797858, | |
| "grad_norm": 0.43322986364364624, | |
| "learning_rate": 0.0003255131042516016, | |
| "loss": 3.2303, | |
| "step": 78650 | |
| }, | |
| { | |
| "epoch": 22.904750815090825, | |
| "grad_norm": 0.38893595337867737, | |
| "learning_rate": 0.0003253383808969132, | |
| "loss": 3.2292, | |
| "step": 78700 | |
| }, | |
| { | |
| "epoch": 22.919306008383792, | |
| "grad_norm": 0.4001709818840027, | |
| "learning_rate": 0.0003251636575422248, | |
| "loss": 3.2377, | |
| "step": 78750 | |
| }, | |
| { | |
| "epoch": 22.93386120167676, | |
| "grad_norm": 0.3891727924346924, | |
| "learning_rate": 0.0003249889341875364, | |
| "loss": 3.2377, | |
| "step": 78800 | |
| }, | |
| { | |
| "epoch": 22.948416394969726, | |
| "grad_norm": 0.4228573143482208, | |
| "learning_rate": 0.00032481421083284796, | |
| "loss": 3.2485, | |
| "step": 78850 | |
| }, | |
| { | |
| "epoch": 22.962971588262693, | |
| "grad_norm": 0.41021063923835754, | |
| "learning_rate": 0.00032463948747815956, | |
| "loss": 3.232, | |
| "step": 78900 | |
| }, | |
| { | |
| "epoch": 22.97752678155566, | |
| "grad_norm": 0.3969871997833252, | |
| "learning_rate": 0.00032446476412347115, | |
| "loss": 3.223, | |
| "step": 78950 | |
| }, | |
| { | |
| "epoch": 22.992081974848627, | |
| "grad_norm": 0.415117472410202, | |
| "learning_rate": 0.0003242900407687827, | |
| "loss": 3.2247, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 22.992081974848627, | |
| "eval_accuracy": 0.3743449174905429, | |
| "eval_loss": 3.5317611694335938, | |
| "eval_runtime": 81.971, | |
| "eval_samples_per_second": 203.133, | |
| "eval_steps_per_second": 12.7, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 23.006404285048905, | |
| "grad_norm": 0.4239928722381592, | |
| "learning_rate": 0.00032411531741409434, | |
| "loss": 3.1901, | |
| "step": 79050 | |
| }, | |
| { | |
| "epoch": 23.020959478341872, | |
| "grad_norm": 0.41547757387161255, | |
| "learning_rate": 0.00032394059405940593, | |
| "loss": 3.1375, | |
| "step": 79100 | |
| }, | |
| { | |
| "epoch": 23.03551467163484, | |
| "grad_norm": 0.4446639120578766, | |
| "learning_rate": 0.00032376587070471753, | |
| "loss": 3.13, | |
| "step": 79150 | |
| }, | |
| { | |
| "epoch": 23.050069864927806, | |
| "grad_norm": 0.40464508533477783, | |
| "learning_rate": 0.00032359114735002907, | |
| "loss": 3.1342, | |
| "step": 79200 | |
| }, | |
| { | |
| "epoch": 23.064625058220773, | |
| "grad_norm": 0.4241481125354767, | |
| "learning_rate": 0.00032341642399534066, | |
| "loss": 3.1453, | |
| "step": 79250 | |
| }, | |
| { | |
| "epoch": 23.07918025151374, | |
| "grad_norm": 0.4222959280014038, | |
| "learning_rate": 0.00032324170064065226, | |
| "loss": 3.1496, | |
| "step": 79300 | |
| }, | |
| { | |
| "epoch": 23.093735444806708, | |
| "grad_norm": 0.4314693808555603, | |
| "learning_rate": 0.0003230669772859639, | |
| "loss": 3.146, | |
| "step": 79350 | |
| }, | |
| { | |
| "epoch": 23.108290638099675, | |
| "grad_norm": 0.4298093914985657, | |
| "learning_rate": 0.00032289225393127545, | |
| "loss": 3.1474, | |
| "step": 79400 | |
| }, | |
| { | |
| "epoch": 23.12284583139264, | |
| "grad_norm": 0.45423945784568787, | |
| "learning_rate": 0.00032271753057658704, | |
| "loss": 3.1648, | |
| "step": 79450 | |
| }, | |
| { | |
| "epoch": 23.13740102468561, | |
| "grad_norm": 0.41885605454444885, | |
| "learning_rate": 0.00032254280722189863, | |
| "loss": 3.164, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 23.151956217978576, | |
| "grad_norm": 0.46855849027633667, | |
| "learning_rate": 0.00032236808386721023, | |
| "loss": 3.1667, | |
| "step": 79550 | |
| }, | |
| { | |
| "epoch": 23.166511411271543, | |
| "grad_norm": 0.5857585072517395, | |
| "learning_rate": 0.00032219336051252177, | |
| "loss": 3.1693, | |
| "step": 79600 | |
| }, | |
| { | |
| "epoch": 23.18106660456451, | |
| "grad_norm": 0.4339720904827118, | |
| "learning_rate": 0.0003220186371578334, | |
| "loss": 3.1597, | |
| "step": 79650 | |
| }, | |
| { | |
| "epoch": 23.195621797857477, | |
| "grad_norm": 0.4228954017162323, | |
| "learning_rate": 0.000321843913803145, | |
| "loss": 3.1633, | |
| "step": 79700 | |
| }, | |
| { | |
| "epoch": 23.210176991150444, | |
| "grad_norm": 0.40704774856567383, | |
| "learning_rate": 0.0003216691904484566, | |
| "loss": 3.1762, | |
| "step": 79750 | |
| }, | |
| { | |
| "epoch": 23.22473218444341, | |
| "grad_norm": 0.41080963611602783, | |
| "learning_rate": 0.00032149446709376815, | |
| "loss": 3.1541, | |
| "step": 79800 | |
| }, | |
| { | |
| "epoch": 23.239287377736375, | |
| "grad_norm": 0.435896098613739, | |
| "learning_rate": 0.00032131974373907974, | |
| "loss": 3.1701, | |
| "step": 79850 | |
| }, | |
| { | |
| "epoch": 23.25384257102934, | |
| "grad_norm": 0.47746923565864563, | |
| "learning_rate": 0.0003211450203843914, | |
| "loss": 3.1742, | |
| "step": 79900 | |
| }, | |
| { | |
| "epoch": 23.26839776432231, | |
| "grad_norm": 0.41168075799942017, | |
| "learning_rate": 0.000320970297029703, | |
| "loss": 3.1756, | |
| "step": 79950 | |
| }, | |
| { | |
| "epoch": 23.282952957615276, | |
| "grad_norm": 0.4239851236343384, | |
| "learning_rate": 0.0003207955736750145, | |
| "loss": 3.1887, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 23.282952957615276, | |
| "eval_accuracy": 0.3736681952659766, | |
| "eval_loss": 3.5504491329193115, | |
| "eval_runtime": 81.7978, | |
| "eval_samples_per_second": 203.563, | |
| "eval_steps_per_second": 12.726, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 23.297508150908243, | |
| "grad_norm": 0.4188058376312256, | |
| "learning_rate": 0.0003206208503203261, | |
| "loss": 3.1669, | |
| "step": 80050 | |
| }, | |
| { | |
| "epoch": 23.31206334420121, | |
| "grad_norm": 0.4258587062358856, | |
| "learning_rate": 0.0003204461269656377, | |
| "loss": 3.1854, | |
| "step": 80100 | |
| }, | |
| { | |
| "epoch": 23.326618537494177, | |
| "grad_norm": 0.40632376074790955, | |
| "learning_rate": 0.00032027140361094925, | |
| "loss": 3.1751, | |
| "step": 80150 | |
| }, | |
| { | |
| "epoch": 23.341173730787144, | |
| "grad_norm": 0.41706207394599915, | |
| "learning_rate": 0.0003200966802562609, | |
| "loss": 3.1802, | |
| "step": 80200 | |
| }, | |
| { | |
| "epoch": 23.35572892408011, | |
| "grad_norm": 0.43828704953193665, | |
| "learning_rate": 0.0003199219569015725, | |
| "loss": 3.1794, | |
| "step": 80250 | |
| }, | |
| { | |
| "epoch": 23.370284117373078, | |
| "grad_norm": 0.4298463463783264, | |
| "learning_rate": 0.0003197472335468841, | |
| "loss": 3.1733, | |
| "step": 80300 | |
| }, | |
| { | |
| "epoch": 23.384839310666045, | |
| "grad_norm": 0.42979755997657776, | |
| "learning_rate": 0.00031957251019219563, | |
| "loss": 3.1799, | |
| "step": 80350 | |
| }, | |
| { | |
| "epoch": 23.399394503959012, | |
| "grad_norm": 0.44446930289268494, | |
| "learning_rate": 0.0003193977868375072, | |
| "loss": 3.1785, | |
| "step": 80400 | |
| }, | |
| { | |
| "epoch": 23.41394969725198, | |
| "grad_norm": 0.4211976230144501, | |
| "learning_rate": 0.00031922306348281887, | |
| "loss": 3.1918, | |
| "step": 80450 | |
| }, | |
| { | |
| "epoch": 23.428504890544946, | |
| "grad_norm": 0.407622754573822, | |
| "learning_rate": 0.00031904834012813047, | |
| "loss": 3.1993, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 23.443060083837914, | |
| "grad_norm": 0.41485804319381714, | |
| "learning_rate": 0.000318873616773442, | |
| "loss": 3.1961, | |
| "step": 80550 | |
| }, | |
| { | |
| "epoch": 23.45761527713088, | |
| "grad_norm": 0.40271124243736267, | |
| "learning_rate": 0.0003186988934187536, | |
| "loss": 3.1924, | |
| "step": 80600 | |
| }, | |
| { | |
| "epoch": 23.472170470423848, | |
| "grad_norm": 0.41611066460609436, | |
| "learning_rate": 0.0003185241700640652, | |
| "loss": 3.2113, | |
| "step": 80650 | |
| }, | |
| { | |
| "epoch": 23.486725663716815, | |
| "grad_norm": 0.43158844113349915, | |
| "learning_rate": 0.0003183494467093768, | |
| "loss": 3.1921, | |
| "step": 80700 | |
| }, | |
| { | |
| "epoch": 23.50128085700978, | |
| "grad_norm": 0.4268464148044586, | |
| "learning_rate": 0.0003181747233546884, | |
| "loss": 3.192, | |
| "step": 80750 | |
| }, | |
| { | |
| "epoch": 23.51583605030275, | |
| "grad_norm": 0.4124521017074585, | |
| "learning_rate": 0.000318, | |
| "loss": 3.1799, | |
| "step": 80800 | |
| }, | |
| { | |
| "epoch": 23.530391243595716, | |
| "grad_norm": 0.4254520535469055, | |
| "learning_rate": 0.00031782527664531157, | |
| "loss": 3.197, | |
| "step": 80850 | |
| }, | |
| { | |
| "epoch": 23.544946436888683, | |
| "grad_norm": 0.41922834515571594, | |
| "learning_rate": 0.00031765055329062317, | |
| "loss": 3.1995, | |
| "step": 80900 | |
| }, | |
| { | |
| "epoch": 23.55950163018165, | |
| "grad_norm": 0.43130457401275635, | |
| "learning_rate": 0.0003174758299359347, | |
| "loss": 3.1999, | |
| "step": 80950 | |
| }, | |
| { | |
| "epoch": 23.574056823474617, | |
| "grad_norm": 0.4318077564239502, | |
| "learning_rate": 0.0003173011065812463, | |
| "loss": 3.2066, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 23.574056823474617, | |
| "eval_accuracy": 0.3741489994724199, | |
| "eval_loss": 3.5403668880462646, | |
| "eval_runtime": 81.918, | |
| "eval_samples_per_second": 203.264, | |
| "eval_steps_per_second": 12.708, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 23.588612016767584, | |
| "grad_norm": 0.41116848587989807, | |
| "learning_rate": 0.00031712638322655795, | |
| "loss": 3.2045, | |
| "step": 81050 | |
| }, | |
| { | |
| "epoch": 23.60316721006055, | |
| "grad_norm": 0.406704843044281, | |
| "learning_rate": 0.00031695165987186954, | |
| "loss": 3.1971, | |
| "step": 81100 | |
| }, | |
| { | |
| "epoch": 23.61772240335352, | |
| "grad_norm": 0.42053982615470886, | |
| "learning_rate": 0.0003167769365171811, | |
| "loss": 3.2032, | |
| "step": 81150 | |
| }, | |
| { | |
| "epoch": 23.63227759664648, | |
| "grad_norm": 0.4305388629436493, | |
| "learning_rate": 0.0003166022131624927, | |
| "loss": 3.2036, | |
| "step": 81200 | |
| }, | |
| { | |
| "epoch": 23.64683278993945, | |
| "grad_norm": 0.41716063022613525, | |
| "learning_rate": 0.00031642748980780427, | |
| "loss": 3.207, | |
| "step": 81250 | |
| }, | |
| { | |
| "epoch": 23.661387983232416, | |
| "grad_norm": 0.4108598828315735, | |
| "learning_rate": 0.0003162527664531159, | |
| "loss": 3.1971, | |
| "step": 81300 | |
| }, | |
| { | |
| "epoch": 23.675943176525383, | |
| "grad_norm": 0.44729098677635193, | |
| "learning_rate": 0.00031607804309842746, | |
| "loss": 3.2043, | |
| "step": 81350 | |
| }, | |
| { | |
| "epoch": 23.69049836981835, | |
| "grad_norm": 0.4198160469532013, | |
| "learning_rate": 0.00031590331974373905, | |
| "loss": 3.2116, | |
| "step": 81400 | |
| }, | |
| { | |
| "epoch": 23.705053563111317, | |
| "grad_norm": 0.3962330222129822, | |
| "learning_rate": 0.00031572859638905065, | |
| "loss": 3.2211, | |
| "step": 81450 | |
| }, | |
| { | |
| "epoch": 23.719608756404284, | |
| "grad_norm": 0.4206036627292633, | |
| "learning_rate": 0.0003155538730343622, | |
| "loss": 3.2011, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 23.73416394969725, | |
| "grad_norm": 0.40233752131462097, | |
| "learning_rate": 0.0003153791496796738, | |
| "loss": 3.2254, | |
| "step": 81550 | |
| }, | |
| { | |
| "epoch": 23.74871914299022, | |
| "grad_norm": 0.4157436490058899, | |
| "learning_rate": 0.00031520442632498543, | |
| "loss": 3.2163, | |
| "step": 81600 | |
| }, | |
| { | |
| "epoch": 23.763274336283185, | |
| "grad_norm": 0.3998465836048126, | |
| "learning_rate": 0.000315029702970297, | |
| "loss": 3.2075, | |
| "step": 81650 | |
| }, | |
| { | |
| "epoch": 23.777829529576152, | |
| "grad_norm": 0.43314433097839355, | |
| "learning_rate": 0.0003148549796156086, | |
| "loss": 3.2226, | |
| "step": 81700 | |
| }, | |
| { | |
| "epoch": 23.79238472286912, | |
| "grad_norm": 0.404647558927536, | |
| "learning_rate": 0.00031468025626092016, | |
| "loss": 3.2219, | |
| "step": 81750 | |
| }, | |
| { | |
| "epoch": 23.806939916162086, | |
| "grad_norm": 0.4033067524433136, | |
| "learning_rate": 0.00031450553290623175, | |
| "loss": 3.2164, | |
| "step": 81800 | |
| }, | |
| { | |
| "epoch": 23.821495109455054, | |
| "grad_norm": 0.4236786663532257, | |
| "learning_rate": 0.0003143308095515434, | |
| "loss": 3.2106, | |
| "step": 81850 | |
| }, | |
| { | |
| "epoch": 23.83605030274802, | |
| "grad_norm": 0.41767993569374084, | |
| "learning_rate": 0.000314156086196855, | |
| "loss": 3.2031, | |
| "step": 81900 | |
| }, | |
| { | |
| "epoch": 23.850605496040988, | |
| "grad_norm": 0.41541624069213867, | |
| "learning_rate": 0.00031398136284216654, | |
| "loss": 3.2279, | |
| "step": 81950 | |
| }, | |
| { | |
| "epoch": 23.865160689333955, | |
| "grad_norm": 0.4575217366218567, | |
| "learning_rate": 0.00031380663948747813, | |
| "loss": 3.2304, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 23.865160689333955, | |
| "eval_accuracy": 0.37419659803111205, | |
| "eval_loss": 3.532099723815918, | |
| "eval_runtime": 81.7874, | |
| "eval_samples_per_second": 203.589, | |
| "eval_steps_per_second": 12.728, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 23.879715882626922, | |
| "grad_norm": 0.4547325372695923, | |
| "learning_rate": 0.0003136319161327897, | |
| "loss": 3.2115, | |
| "step": 82050 | |
| }, | |
| { | |
| "epoch": 23.89427107591989, | |
| "grad_norm": 0.4252195358276367, | |
| "learning_rate": 0.00031345719277810127, | |
| "loss": 3.2221, | |
| "step": 82100 | |
| }, | |
| { | |
| "epoch": 23.908826269212856, | |
| "grad_norm": 0.45146381855010986, | |
| "learning_rate": 0.0003132824694234129, | |
| "loss": 3.2145, | |
| "step": 82150 | |
| }, | |
| { | |
| "epoch": 23.923381462505823, | |
| "grad_norm": 0.4394795000553131, | |
| "learning_rate": 0.0003131077460687245, | |
| "loss": 3.2262, | |
| "step": 82200 | |
| }, | |
| { | |
| "epoch": 23.93793665579879, | |
| "grad_norm": 0.4240601658821106, | |
| "learning_rate": 0.0003129330227140361, | |
| "loss": 3.2171, | |
| "step": 82250 | |
| }, | |
| { | |
| "epoch": 23.952491849091757, | |
| "grad_norm": 0.40560463070869446, | |
| "learning_rate": 0.00031275829935934764, | |
| "loss": 3.2246, | |
| "step": 82300 | |
| }, | |
| { | |
| "epoch": 23.967047042384724, | |
| "grad_norm": 0.46602579951286316, | |
| "learning_rate": 0.00031258357600465924, | |
| "loss": 3.23, | |
| "step": 82350 | |
| }, | |
| { | |
| "epoch": 23.98160223567769, | |
| "grad_norm": 0.44112420082092285, | |
| "learning_rate": 0.00031240885264997083, | |
| "loss": 3.2296, | |
| "step": 82400 | |
| }, | |
| { | |
| "epoch": 23.99615742897066, | |
| "grad_norm": 0.4227476716041565, | |
| "learning_rate": 0.0003122341292952825, | |
| "loss": 3.217, | |
| "step": 82450 | |
| }, | |
| { | |
| "epoch": 24.010479739170936, | |
| "grad_norm": 0.46348699927330017, | |
| "learning_rate": 0.000312059405940594, | |
| "loss": 3.1503, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 24.025034932463903, | |
| "grad_norm": 0.4232129156589508, | |
| "learning_rate": 0.0003118846825859056, | |
| "loss": 3.1212, | |
| "step": 82550 | |
| }, | |
| { | |
| "epoch": 24.03959012575687, | |
| "grad_norm": 0.44914910197257996, | |
| "learning_rate": 0.0003117099592312172, | |
| "loss": 3.1077, | |
| "step": 82600 | |
| }, | |
| { | |
| "epoch": 24.054145319049837, | |
| "grad_norm": 0.38889068365097046, | |
| "learning_rate": 0.0003115352358765288, | |
| "loss": 3.1287, | |
| "step": 82650 | |
| }, | |
| { | |
| "epoch": 24.068700512342804, | |
| "grad_norm": 0.44905269145965576, | |
| "learning_rate": 0.0003113605125218404, | |
| "loss": 3.1385, | |
| "step": 82700 | |
| }, | |
| { | |
| "epoch": 24.08325570563577, | |
| "grad_norm": 0.41245752573013306, | |
| "learning_rate": 0.000311185789167152, | |
| "loss": 3.1464, | |
| "step": 82750 | |
| }, | |
| { | |
| "epoch": 24.09781089892874, | |
| "grad_norm": 0.4094231128692627, | |
| "learning_rate": 0.0003110110658124636, | |
| "loss": 3.1469, | |
| "step": 82800 | |
| }, | |
| { | |
| "epoch": 24.112366092221706, | |
| "grad_norm": 0.42689046263694763, | |
| "learning_rate": 0.0003108363424577752, | |
| "loss": 3.1436, | |
| "step": 82850 | |
| }, | |
| { | |
| "epoch": 24.126921285514673, | |
| "grad_norm": 0.41381365060806274, | |
| "learning_rate": 0.0003106616191030867, | |
| "loss": 3.1472, | |
| "step": 82900 | |
| }, | |
| { | |
| "epoch": 24.14147647880764, | |
| "grad_norm": 0.42648911476135254, | |
| "learning_rate": 0.0003104868957483983, | |
| "loss": 3.1439, | |
| "step": 82950 | |
| }, | |
| { | |
| "epoch": 24.156031672100607, | |
| "grad_norm": 0.40339183807373047, | |
| "learning_rate": 0.00031031217239370996, | |
| "loss": 3.1437, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 24.156031672100607, | |
| "eval_accuracy": 0.37356829705637584, | |
| "eval_loss": 3.5519988536834717, | |
| "eval_runtime": 81.8175, | |
| "eval_samples_per_second": 203.514, | |
| "eval_steps_per_second": 12.723, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 24.170586865393574, | |
| "grad_norm": 0.4012400805950165, | |
| "learning_rate": 0.00031013744903902156, | |
| "loss": 3.1582, | |
| "step": 83050 | |
| }, | |
| { | |
| "epoch": 24.18514205868654, | |
| "grad_norm": 0.4456459581851959, | |
| "learning_rate": 0.0003099627256843331, | |
| "loss": 3.1617, | |
| "step": 83100 | |
| }, | |
| { | |
| "epoch": 24.199697251979508, | |
| "grad_norm": 0.47571277618408203, | |
| "learning_rate": 0.0003097880023296447, | |
| "loss": 3.1632, | |
| "step": 83150 | |
| }, | |
| { | |
| "epoch": 24.21425244527247, | |
| "grad_norm": 0.4243694841861725, | |
| "learning_rate": 0.0003096132789749563, | |
| "loss": 3.152, | |
| "step": 83200 | |
| }, | |
| { | |
| "epoch": 24.22880763856544, | |
| "grad_norm": 0.4189963638782501, | |
| "learning_rate": 0.00030943855562026794, | |
| "loss": 3.1717, | |
| "step": 83250 | |
| }, | |
| { | |
| "epoch": 24.243362831858406, | |
| "grad_norm": 0.43693459033966064, | |
| "learning_rate": 0.0003092638322655795, | |
| "loss": 3.1673, | |
| "step": 83300 | |
| }, | |
| { | |
| "epoch": 24.257918025151373, | |
| "grad_norm": 0.42013630270957947, | |
| "learning_rate": 0.00030908910891089107, | |
| "loss": 3.1529, | |
| "step": 83350 | |
| }, | |
| { | |
| "epoch": 24.27247321844434, | |
| "grad_norm": 0.4308550953865051, | |
| "learning_rate": 0.00030891438555620266, | |
| "loss": 3.1636, | |
| "step": 83400 | |
| }, | |
| { | |
| "epoch": 24.287028411737307, | |
| "grad_norm": 0.44483253359794617, | |
| "learning_rate": 0.0003087396622015142, | |
| "loss": 3.1698, | |
| "step": 83450 | |
| }, | |
| { | |
| "epoch": 24.301583605030274, | |
| "grad_norm": 0.44508424401283264, | |
| "learning_rate": 0.0003085649388468258, | |
| "loss": 3.1733, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 24.31613879832324, | |
| "grad_norm": 0.41914376616477966, | |
| "learning_rate": 0.00030839021549213745, | |
| "loss": 3.1694, | |
| "step": 83550 | |
| }, | |
| { | |
| "epoch": 24.330693991616208, | |
| "grad_norm": 0.4346310794353485, | |
| "learning_rate": 0.00030821549213744904, | |
| "loss": 3.1721, | |
| "step": 83600 | |
| }, | |
| { | |
| "epoch": 24.345249184909175, | |
| "grad_norm": 0.41712722182273865, | |
| "learning_rate": 0.0003080407687827606, | |
| "loss": 3.1733, | |
| "step": 83650 | |
| }, | |
| { | |
| "epoch": 24.359804378202142, | |
| "grad_norm": 0.41999542713165283, | |
| "learning_rate": 0.0003078660454280722, | |
| "loss": 3.1791, | |
| "step": 83700 | |
| }, | |
| { | |
| "epoch": 24.37435957149511, | |
| "grad_norm": 0.41637396812438965, | |
| "learning_rate": 0.00030769132207338377, | |
| "loss": 3.184, | |
| "step": 83750 | |
| }, | |
| { | |
| "epoch": 24.388914764788076, | |
| "grad_norm": 0.42585980892181396, | |
| "learning_rate": 0.00030751659871869536, | |
| "loss": 3.1754, | |
| "step": 83800 | |
| }, | |
| { | |
| "epoch": 24.403469958081043, | |
| "grad_norm": 0.42245015501976013, | |
| "learning_rate": 0.00030734187536400696, | |
| "loss": 3.1847, | |
| "step": 83850 | |
| }, | |
| { | |
| "epoch": 24.41802515137401, | |
| "grad_norm": 0.41260600090026855, | |
| "learning_rate": 0.00030716715200931855, | |
| "loss": 3.1734, | |
| "step": 83900 | |
| }, | |
| { | |
| "epoch": 24.432580344666977, | |
| "grad_norm": 0.43477895855903625, | |
| "learning_rate": 0.00030699242865463015, | |
| "loss": 3.1787, | |
| "step": 83950 | |
| }, | |
| { | |
| "epoch": 24.447135537959944, | |
| "grad_norm": 0.41777095198631287, | |
| "learning_rate": 0.00030681770529994174, | |
| "loss": 3.182, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 24.447135537959944, | |
| "eval_accuracy": 0.37411327117157445, | |
| "eval_loss": 3.545283317565918, | |
| "eval_runtime": 81.7239, | |
| "eval_samples_per_second": 203.747, | |
| "eval_steps_per_second": 12.738, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 24.46169073125291, | |
| "grad_norm": 0.4202840328216553, | |
| "learning_rate": 0.0003066429819452533, | |
| "loss": 3.1861, | |
| "step": 84050 | |
| }, | |
| { | |
| "epoch": 24.47624592454588, | |
| "grad_norm": 0.4049651026725769, | |
| "learning_rate": 0.00030646825859056493, | |
| "loss": 3.1817, | |
| "step": 84100 | |
| }, | |
| { | |
| "epoch": 24.490801117838846, | |
| "grad_norm": 0.4313950538635254, | |
| "learning_rate": 0.0003062935352358765, | |
| "loss": 3.1855, | |
| "step": 84150 | |
| }, | |
| { | |
| "epoch": 24.505356311131813, | |
| "grad_norm": 0.4170263409614563, | |
| "learning_rate": 0.0003061188118811881, | |
| "loss": 3.1877, | |
| "step": 84200 | |
| }, | |
| { | |
| "epoch": 24.51991150442478, | |
| "grad_norm": 0.4345579445362091, | |
| "learning_rate": 0.00030594408852649966, | |
| "loss": 3.2028, | |
| "step": 84250 | |
| }, | |
| { | |
| "epoch": 24.534466697717747, | |
| "grad_norm": 0.41852110624313354, | |
| "learning_rate": 0.00030576936517181125, | |
| "loss": 3.1948, | |
| "step": 84300 | |
| }, | |
| { | |
| "epoch": 24.549021891010714, | |
| "grad_norm": 0.44190332293510437, | |
| "learning_rate": 0.00030559464181712285, | |
| "loss": 3.1929, | |
| "step": 84350 | |
| }, | |
| { | |
| "epoch": 24.56357708430368, | |
| "grad_norm": 0.42110559344291687, | |
| "learning_rate": 0.0003054199184624345, | |
| "loss": 3.1904, | |
| "step": 84400 | |
| }, | |
| { | |
| "epoch": 24.578132277596648, | |
| "grad_norm": 0.4028767943382263, | |
| "learning_rate": 0.00030524519510774604, | |
| "loss": 3.1896, | |
| "step": 84450 | |
| }, | |
| { | |
| "epoch": 24.592687470889615, | |
| "grad_norm": 0.42882096767425537, | |
| "learning_rate": 0.00030507047175305763, | |
| "loss": 3.1972, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 24.60724266418258, | |
| "grad_norm": 0.4360930919647217, | |
| "learning_rate": 0.0003048957483983692, | |
| "loss": 3.191, | |
| "step": 84550 | |
| }, | |
| { | |
| "epoch": 24.621797857475546, | |
| "grad_norm": 0.49519217014312744, | |
| "learning_rate": 0.00030472102504368076, | |
| "loss": 3.1951, | |
| "step": 84600 | |
| }, | |
| { | |
| "epoch": 24.636353050768513, | |
| "grad_norm": 0.44893336296081543, | |
| "learning_rate": 0.0003045463016889924, | |
| "loss": 3.1949, | |
| "step": 84650 | |
| }, | |
| { | |
| "epoch": 24.65090824406148, | |
| "grad_norm": 0.4005873501300812, | |
| "learning_rate": 0.000304371578334304, | |
| "loss": 3.1966, | |
| "step": 84700 | |
| }, | |
| { | |
| "epoch": 24.665463437354447, | |
| "grad_norm": 0.4110483229160309, | |
| "learning_rate": 0.0003041968549796156, | |
| "loss": 3.1874, | |
| "step": 84750 | |
| }, | |
| { | |
| "epoch": 24.680018630647414, | |
| "grad_norm": 0.38776257634162903, | |
| "learning_rate": 0.00030402213162492714, | |
| "loss": 3.1853, | |
| "step": 84800 | |
| }, | |
| { | |
| "epoch": 24.69457382394038, | |
| "grad_norm": 0.4316357672214508, | |
| "learning_rate": 0.00030384740827023874, | |
| "loss": 3.2007, | |
| "step": 84850 | |
| }, | |
| { | |
| "epoch": 24.709129017233348, | |
| "grad_norm": 0.4216737151145935, | |
| "learning_rate": 0.00030367268491555033, | |
| "loss": 3.1997, | |
| "step": 84900 | |
| }, | |
| { | |
| "epoch": 24.723684210526315, | |
| "grad_norm": 0.4443216621875763, | |
| "learning_rate": 0.000303497961560862, | |
| "loss": 3.2027, | |
| "step": 84950 | |
| }, | |
| { | |
| "epoch": 24.738239403819282, | |
| "grad_norm": 0.4177134335041046, | |
| "learning_rate": 0.00030332323820617357, | |
| "loss": 3.2048, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 24.738239403819282, | |
| "eval_accuracy": 0.3743467979274295, | |
| "eval_loss": 3.537952423095703, | |
| "eval_runtime": 81.7346, | |
| "eval_samples_per_second": 203.72, | |
| "eval_steps_per_second": 12.736, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 24.75279459711225, | |
| "grad_norm": 0.44068464636802673, | |
| "learning_rate": 0.0003031485148514851, | |
| "loss": 3.2005, | |
| "step": 85050 | |
| }, | |
| { | |
| "epoch": 24.767349790405216, | |
| "grad_norm": 0.41270115971565247, | |
| "learning_rate": 0.0003029737914967967, | |
| "loss": 3.2162, | |
| "step": 85100 | |
| }, | |
| { | |
| "epoch": 24.781904983698183, | |
| "grad_norm": 0.45513081550598145, | |
| "learning_rate": 0.0003027990681421083, | |
| "loss": 3.2025, | |
| "step": 85150 | |
| }, | |
| { | |
| "epoch": 24.79646017699115, | |
| "grad_norm": 0.42816856503486633, | |
| "learning_rate": 0.00030262434478741984, | |
| "loss": 3.2076, | |
| "step": 85200 | |
| }, | |
| { | |
| "epoch": 24.811015370284117, | |
| "grad_norm": 0.4347361624240875, | |
| "learning_rate": 0.0003024496214327315, | |
| "loss": 3.2133, | |
| "step": 85250 | |
| }, | |
| { | |
| "epoch": 24.825570563577084, | |
| "grad_norm": 0.4000208079814911, | |
| "learning_rate": 0.0003022748980780431, | |
| "loss": 3.2123, | |
| "step": 85300 | |
| }, | |
| { | |
| "epoch": 24.84012575687005, | |
| "grad_norm": 0.414603054523468, | |
| "learning_rate": 0.0003021001747233547, | |
| "loss": 3.1984, | |
| "step": 85350 | |
| }, | |
| { | |
| "epoch": 24.85468095016302, | |
| "grad_norm": 0.4231418967247009, | |
| "learning_rate": 0.0003019254513686662, | |
| "loss": 3.2012, | |
| "step": 85400 | |
| }, | |
| { | |
| "epoch": 24.869236143455986, | |
| "grad_norm": 0.4129658639431, | |
| "learning_rate": 0.0003017507280139778, | |
| "loss": 3.201, | |
| "step": 85450 | |
| }, | |
| { | |
| "epoch": 24.883791336748953, | |
| "grad_norm": 0.4171248972415924, | |
| "learning_rate": 0.00030157600465928946, | |
| "loss": 3.2035, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 24.89834653004192, | |
| "grad_norm": 0.41114258766174316, | |
| "learning_rate": 0.00030140128130460106, | |
| "loss": 3.218, | |
| "step": 85550 | |
| }, | |
| { | |
| "epoch": 24.912901723334887, | |
| "grad_norm": 0.4402022659778595, | |
| "learning_rate": 0.0003012265579499126, | |
| "loss": 3.2088, | |
| "step": 85600 | |
| }, | |
| { | |
| "epoch": 24.927456916627854, | |
| "grad_norm": 0.42849573493003845, | |
| "learning_rate": 0.0003010518345952242, | |
| "loss": 3.2107, | |
| "step": 85650 | |
| }, | |
| { | |
| "epoch": 24.94201210992082, | |
| "grad_norm": 0.42676955461502075, | |
| "learning_rate": 0.0003008771112405358, | |
| "loss": 3.2172, | |
| "step": 85700 | |
| }, | |
| { | |
| "epoch": 24.956567303213788, | |
| "grad_norm": 0.4381151497364044, | |
| "learning_rate": 0.0003007023878858473, | |
| "loss": 3.2152, | |
| "step": 85750 | |
| }, | |
| { | |
| "epoch": 24.971122496506755, | |
| "grad_norm": 0.435381144285202, | |
| "learning_rate": 0.000300527664531159, | |
| "loss": 3.2086, | |
| "step": 85800 | |
| }, | |
| { | |
| "epoch": 24.985677689799722, | |
| "grad_norm": 0.41387078166007996, | |
| "learning_rate": 0.00030035294117647057, | |
| "loss": 3.2194, | |
| "step": 85850 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "grad_norm": null, | |
| "learning_rate": 0.00030017821782178216, | |
| "loss": 3.2097, | |
| "step": 85900 | |
| }, | |
| { | |
| "epoch": 25.014555193292967, | |
| "grad_norm": 0.426144003868103, | |
| "learning_rate": 0.00030000349446709376, | |
| "loss": 3.1111, | |
| "step": 85950 | |
| }, | |
| { | |
| "epoch": 25.029110386585934, | |
| "grad_norm": 0.42004653811454773, | |
| "learning_rate": 0.00029982877111240535, | |
| "loss": 3.1086, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 25.029110386585934, | |
| "eval_accuracy": 0.37363046900093916, | |
| "eval_loss": 3.551327705383301, | |
| "eval_runtime": 81.9198, | |
| "eval_samples_per_second": 203.26, | |
| "eval_steps_per_second": 12.708, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 25.0436655798789, | |
| "grad_norm": 0.4292803108692169, | |
| "learning_rate": 0.00029965404775771694, | |
| "loss": 3.1121, | |
| "step": 86050 | |
| }, | |
| { | |
| "epoch": 25.05822077317187, | |
| "grad_norm": 0.42284587025642395, | |
| "learning_rate": 0.0002994793244030285, | |
| "loss": 3.1223, | |
| "step": 86100 | |
| }, | |
| { | |
| "epoch": 25.072775966464835, | |
| "grad_norm": 0.43743574619293213, | |
| "learning_rate": 0.00029930460104834013, | |
| "loss": 3.1226, | |
| "step": 86150 | |
| }, | |
| { | |
| "epoch": 25.087331159757802, | |
| "grad_norm": 0.4359569847583771, | |
| "learning_rate": 0.0002991298776936517, | |
| "loss": 3.1408, | |
| "step": 86200 | |
| }, | |
| { | |
| "epoch": 25.10188635305077, | |
| "grad_norm": 0.4496668577194214, | |
| "learning_rate": 0.00029895515433896327, | |
| "loss": 3.1368, | |
| "step": 86250 | |
| }, | |
| { | |
| "epoch": 25.116441546343736, | |
| "grad_norm": 0.43323931097984314, | |
| "learning_rate": 0.00029878043098427486, | |
| "loss": 3.1302, | |
| "step": 86300 | |
| }, | |
| { | |
| "epoch": 25.130996739636704, | |
| "grad_norm": 0.43633973598480225, | |
| "learning_rate": 0.00029860570762958646, | |
| "loss": 3.1457, | |
| "step": 86350 | |
| }, | |
| { | |
| "epoch": 25.14555193292967, | |
| "grad_norm": 0.45209595561027527, | |
| "learning_rate": 0.00029843098427489805, | |
| "loss": 3.1418, | |
| "step": 86400 | |
| }, | |
| { | |
| "epoch": 25.160107126222638, | |
| "grad_norm": 0.46032625436782837, | |
| "learning_rate": 0.00029825626092020964, | |
| "loss": 3.1339, | |
| "step": 86450 | |
| }, | |
| { | |
| "epoch": 25.174662319515605, | |
| "grad_norm": 0.4778590202331543, | |
| "learning_rate": 0.00029808153756552124, | |
| "loss": 3.1404, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 25.189217512808572, | |
| "grad_norm": 0.4342440664768219, | |
| "learning_rate": 0.00029790681421083283, | |
| "loss": 3.1359, | |
| "step": 86550 | |
| }, | |
| { | |
| "epoch": 25.203772706101535, | |
| "grad_norm": 0.4489634931087494, | |
| "learning_rate": 0.00029773209085614443, | |
| "loss": 3.1442, | |
| "step": 86600 | |
| }, | |
| { | |
| "epoch": 25.218327899394502, | |
| "grad_norm": 0.42772072553634644, | |
| "learning_rate": 0.00029755736750145597, | |
| "loss": 3.1501, | |
| "step": 86650 | |
| }, | |
| { | |
| "epoch": 25.23288309268747, | |
| "grad_norm": 0.44538307189941406, | |
| "learning_rate": 0.0002973826441467676, | |
| "loss": 3.1409, | |
| "step": 86700 | |
| }, | |
| { | |
| "epoch": 25.247438285980436, | |
| "grad_norm": 0.4433085322380066, | |
| "learning_rate": 0.00029720792079207916, | |
| "loss": 3.1628, | |
| "step": 86750 | |
| }, | |
| { | |
| "epoch": 25.261993479273404, | |
| "grad_norm": 0.41855108737945557, | |
| "learning_rate": 0.00029703319743739075, | |
| "loss": 3.152, | |
| "step": 86800 | |
| }, | |
| { | |
| "epoch": 25.27654867256637, | |
| "grad_norm": 0.45861804485321045, | |
| "learning_rate": 0.00029685847408270234, | |
| "loss": 3.1597, | |
| "step": 86850 | |
| }, | |
| { | |
| "epoch": 25.291103865859338, | |
| "grad_norm": 0.4248652458190918, | |
| "learning_rate": 0.00029668375072801394, | |
| "loss": 3.1514, | |
| "step": 86900 | |
| }, | |
| { | |
| "epoch": 25.305659059152305, | |
| "grad_norm": 0.40772923827171326, | |
| "learning_rate": 0.00029650902737332553, | |
| "loss": 3.1586, | |
| "step": 86950 | |
| }, | |
| { | |
| "epoch": 25.320214252445272, | |
| "grad_norm": 0.4152853488922119, | |
| "learning_rate": 0.00029633430401863713, | |
| "loss": 3.1688, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 25.320214252445272, | |
| "eval_accuracy": 0.37409528949384635, | |
| "eval_loss": 3.5473685264587402, | |
| "eval_runtime": 81.9034, | |
| "eval_samples_per_second": 203.3, | |
| "eval_steps_per_second": 12.71, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 25.33476944573824, | |
| "grad_norm": 0.4570411145687103, | |
| "learning_rate": 0.0002961595806639487, | |
| "loss": 3.1683, | |
| "step": 87050 | |
| }, | |
| { | |
| "epoch": 25.349324639031206, | |
| "grad_norm": 0.4372624456882477, | |
| "learning_rate": 0.0002959848573092603, | |
| "loss": 3.1787, | |
| "step": 87100 | |
| }, | |
| { | |
| "epoch": 25.363879832324173, | |
| "grad_norm": 0.42800650000572205, | |
| "learning_rate": 0.0002958101339545719, | |
| "loss": 3.1582, | |
| "step": 87150 | |
| }, | |
| { | |
| "epoch": 25.37843502561714, | |
| "grad_norm": 0.41939014196395874, | |
| "learning_rate": 0.0002956354105998835, | |
| "loss": 3.1663, | |
| "step": 87200 | |
| }, | |
| { | |
| "epoch": 25.392990218910107, | |
| "grad_norm": 0.4626615643501282, | |
| "learning_rate": 0.0002954606872451951, | |
| "loss": 3.1721, | |
| "step": 87250 | |
| }, | |
| { | |
| "epoch": 25.407545412203074, | |
| "grad_norm": 0.4040308892726898, | |
| "learning_rate": 0.0002952859638905067, | |
| "loss": 3.1573, | |
| "step": 87300 | |
| }, | |
| { | |
| "epoch": 25.42210060549604, | |
| "grad_norm": 0.4421527683734894, | |
| "learning_rate": 0.00029511124053581823, | |
| "loss": 3.1697, | |
| "step": 87350 | |
| }, | |
| { | |
| "epoch": 25.43665579878901, | |
| "grad_norm": 0.45232364535331726, | |
| "learning_rate": 0.0002949365171811299, | |
| "loss": 3.1709, | |
| "step": 87400 | |
| }, | |
| { | |
| "epoch": 25.451210992081975, | |
| "grad_norm": 0.42024415731430054, | |
| "learning_rate": 0.0002947617938264414, | |
| "loss": 3.1763, | |
| "step": 87450 | |
| }, | |
| { | |
| "epoch": 25.465766185374942, | |
| "grad_norm": 0.4415486752986908, | |
| "learning_rate": 0.000294587070471753, | |
| "loss": 3.165, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 25.48032137866791, | |
| "grad_norm": 0.4679517447948456, | |
| "learning_rate": 0.0002944123471170646, | |
| "loss": 3.1779, | |
| "step": 87550 | |
| }, | |
| { | |
| "epoch": 25.494876571960877, | |
| "grad_norm": 0.45526716113090515, | |
| "learning_rate": 0.0002942376237623762, | |
| "loss": 3.1854, | |
| "step": 87600 | |
| }, | |
| { | |
| "epoch": 25.509431765253844, | |
| "grad_norm": 0.4492674767971039, | |
| "learning_rate": 0.0002940629004076878, | |
| "loss": 3.2033, | |
| "step": 87650 | |
| }, | |
| { | |
| "epoch": 25.52398695854681, | |
| "grad_norm": 0.44568589329719543, | |
| "learning_rate": 0.0002938881770529994, | |
| "loss": 3.1885, | |
| "step": 87700 | |
| }, | |
| { | |
| "epoch": 25.538542151839778, | |
| "grad_norm": 0.422615110874176, | |
| "learning_rate": 0.000293713453698311, | |
| "loss": 3.1796, | |
| "step": 87750 | |
| }, | |
| { | |
| "epoch": 25.553097345132745, | |
| "grad_norm": 0.4262188971042633, | |
| "learning_rate": 0.0002935387303436226, | |
| "loss": 3.1867, | |
| "step": 87800 | |
| }, | |
| { | |
| "epoch": 25.567652538425712, | |
| "grad_norm": 0.42963874340057373, | |
| "learning_rate": 0.0002933640069889342, | |
| "loss": 3.1725, | |
| "step": 87850 | |
| }, | |
| { | |
| "epoch": 25.58220773171868, | |
| "grad_norm": 0.453431636095047, | |
| "learning_rate": 0.0002931892836342457, | |
| "loss": 3.197, | |
| "step": 87900 | |
| }, | |
| { | |
| "epoch": 25.596762925011646, | |
| "grad_norm": 0.41915765404701233, | |
| "learning_rate": 0.00029301456027955736, | |
| "loss": 3.1897, | |
| "step": 87950 | |
| }, | |
| { | |
| "epoch": 25.61131811830461, | |
| "grad_norm": 0.4209059476852417, | |
| "learning_rate": 0.0002928398369248689, | |
| "loss": 3.1861, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 25.61131811830461, | |
| "eval_accuracy": 0.3746568349591081, | |
| "eval_loss": 3.541332244873047, | |
| "eval_runtime": 81.821, | |
| "eval_samples_per_second": 203.505, | |
| "eval_steps_per_second": 12.723, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 25.625873311597577, | |
| "grad_norm": 0.4396921992301941, | |
| "learning_rate": 0.0002926651135701805, | |
| "loss": 3.1893, | |
| "step": 88050 | |
| }, | |
| { | |
| "epoch": 25.640428504890544, | |
| "grad_norm": 0.4290963411331177, | |
| "learning_rate": 0.00029249039021549215, | |
| "loss": 3.1838, | |
| "step": 88100 | |
| }, | |
| { | |
| "epoch": 25.65498369818351, | |
| "grad_norm": 0.4338902235031128, | |
| "learning_rate": 0.0002923156668608037, | |
| "loss": 3.1875, | |
| "step": 88150 | |
| }, | |
| { | |
| "epoch": 25.669538891476478, | |
| "grad_norm": 0.4225952923297882, | |
| "learning_rate": 0.0002921409435061153, | |
| "loss": 3.1995, | |
| "step": 88200 | |
| }, | |
| { | |
| "epoch": 25.684094084769445, | |
| "grad_norm": 0.4523755609989166, | |
| "learning_rate": 0.0002919662201514269, | |
| "loss": 3.1956, | |
| "step": 88250 | |
| }, | |
| { | |
| "epoch": 25.698649278062412, | |
| "grad_norm": 0.4463915526866913, | |
| "learning_rate": 0.00029179149679673847, | |
| "loss": 3.1864, | |
| "step": 88300 | |
| }, | |
| { | |
| "epoch": 25.71320447135538, | |
| "grad_norm": 0.4488876461982727, | |
| "learning_rate": 0.00029161677344205007, | |
| "loss": 3.1963, | |
| "step": 88350 | |
| }, | |
| { | |
| "epoch": 25.727759664648346, | |
| "grad_norm": 0.44967707991600037, | |
| "learning_rate": 0.00029144205008736166, | |
| "loss": 3.1899, | |
| "step": 88400 | |
| }, | |
| { | |
| "epoch": 25.742314857941313, | |
| "grad_norm": 0.4437671899795532, | |
| "learning_rate": 0.00029126732673267325, | |
| "loss": 3.1875, | |
| "step": 88450 | |
| }, | |
| { | |
| "epoch": 25.75687005123428, | |
| "grad_norm": 0.42447367310523987, | |
| "learning_rate": 0.00029109260337798485, | |
| "loss": 3.1903, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 25.771425244527247, | |
| "grad_norm": 0.43196901679039, | |
| "learning_rate": 0.00029091788002329644, | |
| "loss": 3.2045, | |
| "step": 88550 | |
| }, | |
| { | |
| "epoch": 25.785980437820214, | |
| "grad_norm": 0.4452676773071289, | |
| "learning_rate": 0.000290743156668608, | |
| "loss": 3.206, | |
| "step": 88600 | |
| }, | |
| { | |
| "epoch": 25.80053563111318, | |
| "grad_norm": 0.4434965252876282, | |
| "learning_rate": 0.00029056843331391963, | |
| "loss": 3.1847, | |
| "step": 88650 | |
| }, | |
| { | |
| "epoch": 25.81509082440615, | |
| "grad_norm": 0.460601270198822, | |
| "learning_rate": 0.00029039370995923117, | |
| "loss": 3.1989, | |
| "step": 88700 | |
| }, | |
| { | |
| "epoch": 25.829646017699115, | |
| "grad_norm": 0.442313015460968, | |
| "learning_rate": 0.00029021898660454277, | |
| "loss": 3.2118, | |
| "step": 88750 | |
| }, | |
| { | |
| "epoch": 25.844201210992082, | |
| "grad_norm": 0.4145523011684418, | |
| "learning_rate": 0.00029004426324985436, | |
| "loss": 3.1971, | |
| "step": 88800 | |
| }, | |
| { | |
| "epoch": 25.85875640428505, | |
| "grad_norm": 0.45460519194602966, | |
| "learning_rate": 0.00028986953989516595, | |
| "loss": 3.2089, | |
| "step": 88850 | |
| }, | |
| { | |
| "epoch": 25.873311597578017, | |
| "grad_norm": 0.42569699883461, | |
| "learning_rate": 0.00028969481654047755, | |
| "loss": 3.207, | |
| "step": 88900 | |
| }, | |
| { | |
| "epoch": 25.887866790870984, | |
| "grad_norm": 0.4331372380256653, | |
| "learning_rate": 0.00028952009318578914, | |
| "loss": 3.1861, | |
| "step": 88950 | |
| }, | |
| { | |
| "epoch": 25.90242198416395, | |
| "grad_norm": 0.4346257150173187, | |
| "learning_rate": 0.00028934536983110074, | |
| "loss": 3.2058, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 25.90242198416395, | |
| "eval_accuracy": 0.37475250218571404, | |
| "eval_loss": 3.5308263301849365, | |
| "eval_runtime": 81.1582, | |
| "eval_samples_per_second": 205.167, | |
| "eval_steps_per_second": 12.827, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 25.916977177456918, | |
| "grad_norm": 0.42136168479919434, | |
| "learning_rate": 0.00028917064647641233, | |
| "loss": 3.1966, | |
| "step": 89050 | |
| }, | |
| { | |
| "epoch": 25.931532370749885, | |
| "grad_norm": 0.4284227192401886, | |
| "learning_rate": 0.0002889959231217239, | |
| "loss": 3.1927, | |
| "step": 89100 | |
| }, | |
| { | |
| "epoch": 25.946087564042852, | |
| "grad_norm": 0.44938603043556213, | |
| "learning_rate": 0.0002888211997670355, | |
| "loss": 3.2034, | |
| "step": 89150 | |
| }, | |
| { | |
| "epoch": 25.96064275733582, | |
| "grad_norm": 0.4403515160083771, | |
| "learning_rate": 0.0002886464764123471, | |
| "loss": 3.2132, | |
| "step": 89200 | |
| }, | |
| { | |
| "epoch": 25.975197950628786, | |
| "grad_norm": 0.43645545840263367, | |
| "learning_rate": 0.0002884717530576587, | |
| "loss": 3.1954, | |
| "step": 89250 | |
| }, | |
| { | |
| "epoch": 25.989753143921753, | |
| "grad_norm": 0.44919466972351074, | |
| "learning_rate": 0.00028829702970297025, | |
| "loss": 3.2042, | |
| "step": 89300 | |
| }, | |
| { | |
| "epoch": 26.00407545412203, | |
| "grad_norm": 0.433603435754776, | |
| "learning_rate": 0.0002881223063482819, | |
| "loss": 3.1714, | |
| "step": 89350 | |
| }, | |
| { | |
| "epoch": 26.018630647414998, | |
| "grad_norm": 0.4180433452129364, | |
| "learning_rate": 0.00028794758299359344, | |
| "loss": 3.0901, | |
| "step": 89400 | |
| }, | |
| { | |
| "epoch": 26.033185840707965, | |
| "grad_norm": 0.4278295338153839, | |
| "learning_rate": 0.00028777285963890503, | |
| "loss": 3.1051, | |
| "step": 89450 | |
| }, | |
| { | |
| "epoch": 26.047741034000932, | |
| "grad_norm": 0.4252666234970093, | |
| "learning_rate": 0.0002875981362842166, | |
| "loss": 3.1074, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 26.0622962272939, | |
| "grad_norm": 0.4354073107242584, | |
| "learning_rate": 0.0002874234129295282, | |
| "loss": 3.1061, | |
| "step": 89550 | |
| }, | |
| { | |
| "epoch": 26.076851420586866, | |
| "grad_norm": 0.4766174852848053, | |
| "learning_rate": 0.0002872486895748398, | |
| "loss": 3.1193, | |
| "step": 89600 | |
| }, | |
| { | |
| "epoch": 26.091406613879833, | |
| "grad_norm": 0.45337629318237305, | |
| "learning_rate": 0.0002870739662201514, | |
| "loss": 3.1276, | |
| "step": 89650 | |
| }, | |
| { | |
| "epoch": 26.1059618071728, | |
| "grad_norm": 0.4593164026737213, | |
| "learning_rate": 0.000286899242865463, | |
| "loss": 3.1249, | |
| "step": 89700 | |
| }, | |
| { | |
| "epoch": 26.120517000465767, | |
| "grad_norm": 0.45432472229003906, | |
| "learning_rate": 0.0002867245195107746, | |
| "loss": 3.1335, | |
| "step": 89750 | |
| }, | |
| { | |
| "epoch": 26.135072193758734, | |
| "grad_norm": 0.4611753225326538, | |
| "learning_rate": 0.0002865497961560862, | |
| "loss": 3.1344, | |
| "step": 89800 | |
| }, | |
| { | |
| "epoch": 26.1496273870517, | |
| "grad_norm": 0.4392394423484802, | |
| "learning_rate": 0.00028637507280139773, | |
| "loss": 3.1237, | |
| "step": 89850 | |
| }, | |
| { | |
| "epoch": 26.16418258034467, | |
| "grad_norm": 0.4329069256782532, | |
| "learning_rate": 0.0002862003494467094, | |
| "loss": 3.1266, | |
| "step": 89900 | |
| }, | |
| { | |
| "epoch": 26.178737773637636, | |
| "grad_norm": 0.44258376955986023, | |
| "learning_rate": 0.0002860256260920209, | |
| "loss": 3.135, | |
| "step": 89950 | |
| }, | |
| { | |
| "epoch": 26.1932929669306, | |
| "grad_norm": 0.4630243182182312, | |
| "learning_rate": 0.0002858509027373325, | |
| "loss": 3.1451, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 26.1932929669306, | |
| "eval_accuracy": 0.3735493751602044, | |
| "eval_loss": 3.5569868087768555, | |
| "eval_runtime": 81.1934, | |
| "eval_samples_per_second": 205.078, | |
| "eval_steps_per_second": 12.821, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 26.207848160223566, | |
| "grad_norm": 0.4608689248561859, | |
| "learning_rate": 0.0002856761793826441, | |
| "loss": 3.1444, | |
| "step": 90050 | |
| }, | |
| { | |
| "epoch": 26.222403353516533, | |
| "grad_norm": 0.4300023317337036, | |
| "learning_rate": 0.0002855014560279557, | |
| "loss": 3.1476, | |
| "step": 90100 | |
| }, | |
| { | |
| "epoch": 26.2369585468095, | |
| "grad_norm": 0.4109968841075897, | |
| "learning_rate": 0.0002853267326732673, | |
| "loss": 3.1403, | |
| "step": 90150 | |
| }, | |
| { | |
| "epoch": 26.251513740102467, | |
| "grad_norm": 0.43651413917541504, | |
| "learning_rate": 0.0002851520093185789, | |
| "loss": 3.1376, | |
| "step": 90200 | |
| }, | |
| { | |
| "epoch": 26.266068933395434, | |
| "grad_norm": 0.4517920911312103, | |
| "learning_rate": 0.0002849772859638905, | |
| "loss": 3.1515, | |
| "step": 90250 | |
| }, | |
| { | |
| "epoch": 26.2806241266884, | |
| "grad_norm": 0.43993499875068665, | |
| "learning_rate": 0.0002848025626092021, | |
| "loss": 3.1637, | |
| "step": 90300 | |
| }, | |
| { | |
| "epoch": 26.29517931998137, | |
| "grad_norm": 0.42418476939201355, | |
| "learning_rate": 0.0002846278392545137, | |
| "loss": 3.1535, | |
| "step": 90350 | |
| }, | |
| { | |
| "epoch": 26.309734513274336, | |
| "grad_norm": 0.4256189167499542, | |
| "learning_rate": 0.00028445311589982527, | |
| "loss": 3.1549, | |
| "step": 90400 | |
| }, | |
| { | |
| "epoch": 26.324289706567303, | |
| "grad_norm": 0.45599597692489624, | |
| "learning_rate": 0.00028427839254513686, | |
| "loss": 3.1589, | |
| "step": 90450 | |
| }, | |
| { | |
| "epoch": 26.33884489986027, | |
| "grad_norm": 0.46493786573410034, | |
| "learning_rate": 0.00028410366919044846, | |
| "loss": 3.1597, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 26.353400093153237, | |
| "grad_norm": 0.4365712106227875, | |
| "learning_rate": 0.00028392894583576, | |
| "loss": 3.1467, | |
| "step": 90550 | |
| }, | |
| { | |
| "epoch": 26.367955286446204, | |
| "grad_norm": 0.41159552335739136, | |
| "learning_rate": 0.00028375422248107165, | |
| "loss": 3.157, | |
| "step": 90600 | |
| }, | |
| { | |
| "epoch": 26.38251047973917, | |
| "grad_norm": 0.4257558286190033, | |
| "learning_rate": 0.0002835794991263832, | |
| "loss": 3.15, | |
| "step": 90650 | |
| }, | |
| { | |
| "epoch": 26.397065673032138, | |
| "grad_norm": 0.42205342650413513, | |
| "learning_rate": 0.0002834047757716948, | |
| "loss": 3.1705, | |
| "step": 90700 | |
| }, | |
| { | |
| "epoch": 26.411620866325105, | |
| "grad_norm": 0.4496673047542572, | |
| "learning_rate": 0.0002832300524170064, | |
| "loss": 3.1612, | |
| "step": 90750 | |
| }, | |
| { | |
| "epoch": 26.426176059618072, | |
| "grad_norm": 0.433896541595459, | |
| "learning_rate": 0.00028305532906231797, | |
| "loss": 3.1524, | |
| "step": 90800 | |
| }, | |
| { | |
| "epoch": 26.44073125291104, | |
| "grad_norm": 0.4470045864582062, | |
| "learning_rate": 0.00028288060570762956, | |
| "loss": 3.1676, | |
| "step": 90850 | |
| }, | |
| { | |
| "epoch": 26.455286446204006, | |
| "grad_norm": 0.42775487899780273, | |
| "learning_rate": 0.00028270588235294116, | |
| "loss": 3.1672, | |
| "step": 90900 | |
| }, | |
| { | |
| "epoch": 26.469841639496973, | |
| "grad_norm": 0.4461595416069031, | |
| "learning_rate": 0.00028253115899825275, | |
| "loss": 3.1658, | |
| "step": 90950 | |
| }, | |
| { | |
| "epoch": 26.48439683278994, | |
| "grad_norm": 0.4303746819496155, | |
| "learning_rate": 0.0002823564356435643, | |
| "loss": 3.1654, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 26.48439683278994, | |
| "eval_accuracy": 0.3743263481762877, | |
| "eval_loss": 3.54341983795166, | |
| "eval_runtime": 81.1797, | |
| "eval_samples_per_second": 205.113, | |
| "eval_steps_per_second": 12.823, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 26.498952026082907, | |
| "grad_norm": 0.42270606756210327, | |
| "learning_rate": 0.00028218171228887594, | |
| "loss": 3.1608, | |
| "step": 91050 | |
| }, | |
| { | |
| "epoch": 26.513507219375875, | |
| "grad_norm": 0.4338577389717102, | |
| "learning_rate": 0.0002820069889341875, | |
| "loss": 3.1673, | |
| "step": 91100 | |
| }, | |
| { | |
| "epoch": 26.52806241266884, | |
| "grad_norm": 0.4756627380847931, | |
| "learning_rate": 0.00028183226557949913, | |
| "loss": 3.1703, | |
| "step": 91150 | |
| }, | |
| { | |
| "epoch": 26.54261760596181, | |
| "grad_norm": 0.4332279860973358, | |
| "learning_rate": 0.00028165754222481067, | |
| "loss": 3.1729, | |
| "step": 91200 | |
| }, | |
| { | |
| "epoch": 26.557172799254776, | |
| "grad_norm": 0.42473283410072327, | |
| "learning_rate": 0.00028148281887012226, | |
| "loss": 3.1701, | |
| "step": 91250 | |
| }, | |
| { | |
| "epoch": 26.571727992547743, | |
| "grad_norm": 0.4678240716457367, | |
| "learning_rate": 0.0002813080955154339, | |
| "loss": 3.1605, | |
| "step": 91300 | |
| }, | |
| { | |
| "epoch": 26.586283185840706, | |
| "grad_norm": 0.421902060508728, | |
| "learning_rate": 0.00028113337216074545, | |
| "loss": 3.1824, | |
| "step": 91350 | |
| }, | |
| { | |
| "epoch": 26.600838379133673, | |
| "grad_norm": 0.4535716772079468, | |
| "learning_rate": 0.00028095864880605705, | |
| "loss": 3.1759, | |
| "step": 91400 | |
| }, | |
| { | |
| "epoch": 26.61539357242664, | |
| "grad_norm": 0.43654313683509827, | |
| "learning_rate": 0.00028078392545136864, | |
| "loss": 3.1928, | |
| "step": 91450 | |
| }, | |
| { | |
| "epoch": 26.629948765719607, | |
| "grad_norm": 0.4195038378238678, | |
| "learning_rate": 0.00028060920209668023, | |
| "loss": 3.1797, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 26.644503959012575, | |
| "grad_norm": 0.4246092140674591, | |
| "learning_rate": 0.00028043447874199183, | |
| "loss": 3.1745, | |
| "step": 91550 | |
| }, | |
| { | |
| "epoch": 26.65905915230554, | |
| "grad_norm": 0.43412160873413086, | |
| "learning_rate": 0.0002802597553873034, | |
| "loss": 3.1865, | |
| "step": 91600 | |
| }, | |
| { | |
| "epoch": 26.67361434559851, | |
| "grad_norm": 0.4211180508136749, | |
| "learning_rate": 0.000280085032032615, | |
| "loss": 3.1898, | |
| "step": 91650 | |
| }, | |
| { | |
| "epoch": 26.688169538891476, | |
| "grad_norm": 0.4517876207828522, | |
| "learning_rate": 0.00027991030867792656, | |
| "loss": 3.1716, | |
| "step": 91700 | |
| }, | |
| { | |
| "epoch": 26.702724732184443, | |
| "grad_norm": 0.41437435150146484, | |
| "learning_rate": 0.0002797355853232382, | |
| "loss": 3.1881, | |
| "step": 91750 | |
| }, | |
| { | |
| "epoch": 26.71727992547741, | |
| "grad_norm": 0.4272013306617737, | |
| "learning_rate": 0.00027956086196854975, | |
| "loss": 3.1891, | |
| "step": 91800 | |
| }, | |
| { | |
| "epoch": 26.731835118770377, | |
| "grad_norm": 0.4276150166988373, | |
| "learning_rate": 0.0002793861386138614, | |
| "loss": 3.184, | |
| "step": 91850 | |
| }, | |
| { | |
| "epoch": 26.746390312063344, | |
| "grad_norm": 0.47696298360824585, | |
| "learning_rate": 0.00027921141525917293, | |
| "loss": 3.1895, | |
| "step": 91900 | |
| }, | |
| { | |
| "epoch": 26.76094550535631, | |
| "grad_norm": 0.43110692501068115, | |
| "learning_rate": 0.00027903669190448453, | |
| "loss": 3.1882, | |
| "step": 91950 | |
| }, | |
| { | |
| "epoch": 26.775500698649278, | |
| "grad_norm": 0.419421911239624, | |
| "learning_rate": 0.0002788619685497961, | |
| "loss": 3.1848, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 26.775500698649278, | |
| "eval_accuracy": 0.3747238255231934, | |
| "eval_loss": 3.5384514331817627, | |
| "eval_runtime": 81.0971, | |
| "eval_samples_per_second": 205.322, | |
| "eval_steps_per_second": 12.836, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 26.790055891942245, | |
| "grad_norm": 0.43019166588783264, | |
| "learning_rate": 0.0002786872451951077, | |
| "loss": 3.1901, | |
| "step": 92050 | |
| }, | |
| { | |
| "epoch": 26.804611085235212, | |
| "grad_norm": 0.4565688967704773, | |
| "learning_rate": 0.0002785125218404193, | |
| "loss": 3.201, | |
| "step": 92100 | |
| }, | |
| { | |
| "epoch": 26.81916627852818, | |
| "grad_norm": 0.4938756227493286, | |
| "learning_rate": 0.0002783377984857309, | |
| "loss": 3.1854, | |
| "step": 92150 | |
| }, | |
| { | |
| "epoch": 26.833721471821146, | |
| "grad_norm": 0.4353455603122711, | |
| "learning_rate": 0.0002781630751310425, | |
| "loss": 3.1896, | |
| "step": 92200 | |
| }, | |
| { | |
| "epoch": 26.848276665114113, | |
| "grad_norm": 0.43991193175315857, | |
| "learning_rate": 0.0002779883517763541, | |
| "loss": 3.1931, | |
| "step": 92250 | |
| }, | |
| { | |
| "epoch": 26.86283185840708, | |
| "grad_norm": 0.4256923198699951, | |
| "learning_rate": 0.0002778136284216657, | |
| "loss": 3.1795, | |
| "step": 92300 | |
| }, | |
| { | |
| "epoch": 26.877387051700047, | |
| "grad_norm": 0.44413042068481445, | |
| "learning_rate": 0.0002776389050669773, | |
| "loss": 3.1945, | |
| "step": 92350 | |
| }, | |
| { | |
| "epoch": 26.891942244993015, | |
| "grad_norm": 0.44300082325935364, | |
| "learning_rate": 0.0002774641817122888, | |
| "loss": 3.1939, | |
| "step": 92400 | |
| }, | |
| { | |
| "epoch": 26.90649743828598, | |
| "grad_norm": 0.4375019371509552, | |
| "learning_rate": 0.00027728945835760047, | |
| "loss": 3.1873, | |
| "step": 92450 | |
| }, | |
| { | |
| "epoch": 26.92105263157895, | |
| "grad_norm": 0.4764561057090759, | |
| "learning_rate": 0.000277114735002912, | |
| "loss": 3.1943, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 26.935607824871916, | |
| "grad_norm": 0.46133366227149963, | |
| "learning_rate": 0.00027694001164822366, | |
| "loss": 3.2093, | |
| "step": 92550 | |
| }, | |
| { | |
| "epoch": 26.950163018164883, | |
| "grad_norm": 0.46737441420555115, | |
| "learning_rate": 0.0002767652882935352, | |
| "loss": 3.1834, | |
| "step": 92600 | |
| }, | |
| { | |
| "epoch": 26.96471821145785, | |
| "grad_norm": 0.4142431914806366, | |
| "learning_rate": 0.0002765905649388468, | |
| "loss": 3.1919, | |
| "step": 92650 | |
| }, | |
| { | |
| "epoch": 26.979273404750813, | |
| "grad_norm": 0.4396675229072571, | |
| "learning_rate": 0.0002764158415841584, | |
| "loss": 3.2041, | |
| "step": 92700 | |
| }, | |
| { | |
| "epoch": 26.99382859804378, | |
| "grad_norm": 0.4369431138038635, | |
| "learning_rate": 0.00027624111822947, | |
| "loss": 3.1963, | |
| "step": 92750 | |
| }, | |
| { | |
| "epoch": 27.008150908244062, | |
| "grad_norm": 0.4310619831085205, | |
| "learning_rate": 0.0002760663948747816, | |
| "loss": 3.1384, | |
| "step": 92800 | |
| }, | |
| { | |
| "epoch": 27.02270610153703, | |
| "grad_norm": 0.4611195921897888, | |
| "learning_rate": 0.00027589167152009317, | |
| "loss": 3.098, | |
| "step": 92850 | |
| }, | |
| { | |
| "epoch": 27.037261294829996, | |
| "grad_norm": 0.46494999527931213, | |
| "learning_rate": 0.00027571694816540477, | |
| "loss": 3.1123, | |
| "step": 92900 | |
| }, | |
| { | |
| "epoch": 27.051816488122963, | |
| "grad_norm": 0.46609047055244446, | |
| "learning_rate": 0.0002755422248107163, | |
| "loss": 3.1114, | |
| "step": 92950 | |
| }, | |
| { | |
| "epoch": 27.06637168141593, | |
| "grad_norm": 0.44031664729118347, | |
| "learning_rate": 0.00027536750145602795, | |
| "loss": 3.1256, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 27.06637168141593, | |
| "eval_accuracy": 0.37408541720019167, | |
| "eval_loss": 3.5510995388031006, | |
| "eval_runtime": 80.4791, | |
| "eval_samples_per_second": 206.898, | |
| "eval_steps_per_second": 12.935, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 27.080926874708897, | |
| "grad_norm": 0.4661605656147003, | |
| "learning_rate": 0.0002751927781013395, | |
| "loss": 3.1129, | |
| "step": 93050 | |
| }, | |
| { | |
| "epoch": 27.095482068001864, | |
| "grad_norm": 0.4188961088657379, | |
| "learning_rate": 0.0002750180547466511, | |
| "loss": 3.1093, | |
| "step": 93100 | |
| }, | |
| { | |
| "epoch": 27.11003726129483, | |
| "grad_norm": 0.5129013061523438, | |
| "learning_rate": 0.0002748433313919627, | |
| "loss": 3.1095, | |
| "step": 93150 | |
| }, | |
| { | |
| "epoch": 27.1245924545878, | |
| "grad_norm": 0.4390321969985962, | |
| "learning_rate": 0.0002746686080372743, | |
| "loss": 3.1068, | |
| "step": 93200 | |
| }, | |
| { | |
| "epoch": 27.139147647880765, | |
| "grad_norm": 0.44985267519950867, | |
| "learning_rate": 0.00027449388468258587, | |
| "loss": 3.1134, | |
| "step": 93250 | |
| }, | |
| { | |
| "epoch": 27.153702841173732, | |
| "grad_norm": 0.4393739402294159, | |
| "learning_rate": 0.00027431916132789747, | |
| "loss": 3.1153, | |
| "step": 93300 | |
| }, | |
| { | |
| "epoch": 27.168258034466696, | |
| "grad_norm": 0.4716503918170929, | |
| "learning_rate": 0.00027414443797320906, | |
| "loss": 3.1233, | |
| "step": 93350 | |
| }, | |
| { | |
| "epoch": 27.182813227759663, | |
| "grad_norm": 0.43752142786979675, | |
| "learning_rate": 0.00027396971461852065, | |
| "loss": 3.1209, | |
| "step": 93400 | |
| }, | |
| { | |
| "epoch": 27.19736842105263, | |
| "grad_norm": 0.4430052936077118, | |
| "learning_rate": 0.00027379499126383225, | |
| "loss": 3.1439, | |
| "step": 93450 | |
| }, | |
| { | |
| "epoch": 27.211923614345597, | |
| "grad_norm": 0.46688228845596313, | |
| "learning_rate": 0.00027362026790914384, | |
| "loss": 3.1287, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 27.226478807638564, | |
| "grad_norm": 0.4711000621318817, | |
| "learning_rate": 0.00027344554455445544, | |
| "loss": 3.1325, | |
| "step": 93550 | |
| }, | |
| { | |
| "epoch": 27.24103400093153, | |
| "grad_norm": 0.4563055634498596, | |
| "learning_rate": 0.00027327082119976703, | |
| "loss": 3.1549, | |
| "step": 93600 | |
| }, | |
| { | |
| "epoch": 27.2555891942245, | |
| "grad_norm": 0.4919802248477936, | |
| "learning_rate": 0.00027309609784507857, | |
| "loss": 3.1281, | |
| "step": 93650 | |
| }, | |
| { | |
| "epoch": 27.270144387517465, | |
| "grad_norm": 0.41443052887916565, | |
| "learning_rate": 0.0002729213744903902, | |
| "loss": 3.1317, | |
| "step": 93700 | |
| }, | |
| { | |
| "epoch": 27.284699580810432, | |
| "grad_norm": 0.46947622299194336, | |
| "learning_rate": 0.00027274665113570176, | |
| "loss": 3.1429, | |
| "step": 93750 | |
| }, | |
| { | |
| "epoch": 27.2992547741034, | |
| "grad_norm": 0.4295712113380432, | |
| "learning_rate": 0.00027257192778101335, | |
| "loss": 3.1363, | |
| "step": 93800 | |
| }, | |
| { | |
| "epoch": 27.313809967396367, | |
| "grad_norm": 0.45617058873176575, | |
| "learning_rate": 0.00027239720442632495, | |
| "loss": 3.1512, | |
| "step": 93850 | |
| }, | |
| { | |
| "epoch": 27.328365160689334, | |
| "grad_norm": 0.46890562772750854, | |
| "learning_rate": 0.00027222248107163654, | |
| "loss": 3.1501, | |
| "step": 93900 | |
| }, | |
| { | |
| "epoch": 27.3429203539823, | |
| "grad_norm": 0.45236900448799133, | |
| "learning_rate": 0.00027204775771694814, | |
| "loss": 3.1608, | |
| "step": 93950 | |
| }, | |
| { | |
| "epoch": 27.357475547275268, | |
| "grad_norm": 0.47922393679618835, | |
| "learning_rate": 0.00027187303436225973, | |
| "loss": 3.1486, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 27.357475547275268, | |
| "eval_accuracy": 0.374177793662246, | |
| "eval_loss": 3.5487077236175537, | |
| "eval_runtime": 80.4179, | |
| "eval_samples_per_second": 207.056, | |
| "eval_steps_per_second": 12.945, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 27.372030740568235, | |
| "grad_norm": 0.44851404428482056, | |
| "learning_rate": 0.0002716983110075713, | |
| "loss": 3.152, | |
| "step": 94050 | |
| }, | |
| { | |
| "epoch": 27.386585933861202, | |
| "grad_norm": 0.46306702494621277, | |
| "learning_rate": 0.0002715235876528829, | |
| "loss": 3.1602, | |
| "step": 94100 | |
| }, | |
| { | |
| "epoch": 27.40114112715417, | |
| "grad_norm": 0.43412405252456665, | |
| "learning_rate": 0.0002713488642981945, | |
| "loss": 3.1475, | |
| "step": 94150 | |
| }, | |
| { | |
| "epoch": 27.415696320447136, | |
| "grad_norm": 0.4401833415031433, | |
| "learning_rate": 0.00027117414094350606, | |
| "loss": 3.1472, | |
| "step": 94200 | |
| }, | |
| { | |
| "epoch": 27.430251513740103, | |
| "grad_norm": 0.447394996881485, | |
| "learning_rate": 0.0002709994175888177, | |
| "loss": 3.1628, | |
| "step": 94250 | |
| }, | |
| { | |
| "epoch": 27.44480670703307, | |
| "grad_norm": 0.44516995549201965, | |
| "learning_rate": 0.00027082469423412924, | |
| "loss": 3.1578, | |
| "step": 94300 | |
| }, | |
| { | |
| "epoch": 27.459361900326037, | |
| "grad_norm": 0.4865111708641052, | |
| "learning_rate": 0.00027064997087944084, | |
| "loss": 3.1528, | |
| "step": 94350 | |
| }, | |
| { | |
| "epoch": 27.473917093619004, | |
| "grad_norm": 0.4723801016807556, | |
| "learning_rate": 0.00027047524752475243, | |
| "loss": 3.1541, | |
| "step": 94400 | |
| }, | |
| { | |
| "epoch": 27.48847228691197, | |
| "grad_norm": 0.4521695077419281, | |
| "learning_rate": 0.000270300524170064, | |
| "loss": 3.1602, | |
| "step": 94450 | |
| }, | |
| { | |
| "epoch": 27.50302748020494, | |
| "grad_norm": 0.4700624942779541, | |
| "learning_rate": 0.0002701258008153757, | |
| "loss": 3.1491, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 27.517582673497905, | |
| "grad_norm": 0.44565245509147644, | |
| "learning_rate": 0.0002699510774606872, | |
| "loss": 3.1646, | |
| "step": 94550 | |
| }, | |
| { | |
| "epoch": 27.532137866790872, | |
| "grad_norm": 0.45179468393325806, | |
| "learning_rate": 0.0002697763541059988, | |
| "loss": 3.1661, | |
| "step": 94600 | |
| }, | |
| { | |
| "epoch": 27.54669306008384, | |
| "grad_norm": 0.4487448036670685, | |
| "learning_rate": 0.0002696016307513104, | |
| "loss": 3.1692, | |
| "step": 94650 | |
| }, | |
| { | |
| "epoch": 27.561248253376803, | |
| "grad_norm": 0.45618101954460144, | |
| "learning_rate": 0.000269426907396622, | |
| "loss": 3.1503, | |
| "step": 94700 | |
| }, | |
| { | |
| "epoch": 27.57580344666977, | |
| "grad_norm": 0.44323694705963135, | |
| "learning_rate": 0.0002692521840419336, | |
| "loss": 3.1664, | |
| "step": 94750 | |
| }, | |
| { | |
| "epoch": 27.590358639962737, | |
| "grad_norm": 0.432827353477478, | |
| "learning_rate": 0.0002690774606872452, | |
| "loss": 3.1684, | |
| "step": 94800 | |
| }, | |
| { | |
| "epoch": 27.604913833255704, | |
| "grad_norm": 0.49907568097114563, | |
| "learning_rate": 0.0002689027373325568, | |
| "loss": 3.1648, | |
| "step": 94850 | |
| }, | |
| { | |
| "epoch": 27.61946902654867, | |
| "grad_norm": 0.43723028898239136, | |
| "learning_rate": 0.0002687280139778683, | |
| "loss": 3.1692, | |
| "step": 94900 | |
| }, | |
| { | |
| "epoch": 27.63402421984164, | |
| "grad_norm": 0.49944910407066345, | |
| "learning_rate": 0.00026855329062317997, | |
| "loss": 3.1727, | |
| "step": 94950 | |
| }, | |
| { | |
| "epoch": 27.648579413134605, | |
| "grad_norm": 0.4763164520263672, | |
| "learning_rate": 0.0002683785672684915, | |
| "loss": 3.1817, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 27.648579413134605, | |
| "eval_accuracy": 0.37482924751614854, | |
| "eval_loss": 3.541757106781006, | |
| "eval_runtime": 80.4551, | |
| "eval_samples_per_second": 206.96, | |
| "eval_steps_per_second": 12.939, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 27.663134606427572, | |
| "grad_norm": 0.45794692635536194, | |
| "learning_rate": 0.0002682038439138031, | |
| "loss": 3.168, | |
| "step": 95050 | |
| }, | |
| { | |
| "epoch": 27.67768979972054, | |
| "grad_norm": 0.431690514087677, | |
| "learning_rate": 0.0002680291205591147, | |
| "loss": 3.1673, | |
| "step": 95100 | |
| }, | |
| { | |
| "epoch": 27.692244993013507, | |
| "grad_norm": 0.47746652364730835, | |
| "learning_rate": 0.0002678543972044263, | |
| "loss": 3.1731, | |
| "step": 95150 | |
| }, | |
| { | |
| "epoch": 27.706800186306474, | |
| "grad_norm": 0.42426300048828125, | |
| "learning_rate": 0.0002676796738497379, | |
| "loss": 3.1812, | |
| "step": 95200 | |
| }, | |
| { | |
| "epoch": 27.72135537959944, | |
| "grad_norm": 0.4418182671070099, | |
| "learning_rate": 0.0002675049504950495, | |
| "loss": 3.1785, | |
| "step": 95250 | |
| }, | |
| { | |
| "epoch": 27.735910572892408, | |
| "grad_norm": 0.43584927916526794, | |
| "learning_rate": 0.0002673302271403611, | |
| "loss": 3.1769, | |
| "step": 95300 | |
| }, | |
| { | |
| "epoch": 27.750465766185375, | |
| "grad_norm": 0.46387040615081787, | |
| "learning_rate": 0.00026715550378567267, | |
| "loss": 3.173, | |
| "step": 95350 | |
| }, | |
| { | |
| "epoch": 27.765020959478342, | |
| "grad_norm": 0.4367853105068207, | |
| "learning_rate": 0.00026698078043098426, | |
| "loss": 3.1708, | |
| "step": 95400 | |
| }, | |
| { | |
| "epoch": 27.77957615277131, | |
| "grad_norm": 0.4488929212093353, | |
| "learning_rate": 0.00026680605707629586, | |
| "loss": 3.182, | |
| "step": 95450 | |
| }, | |
| { | |
| "epoch": 27.794131346064276, | |
| "grad_norm": 0.43413597345352173, | |
| "learning_rate": 0.00026663133372160745, | |
| "loss": 3.19, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 27.808686539357243, | |
| "grad_norm": 0.45167261362075806, | |
| "learning_rate": 0.00026645661036691905, | |
| "loss": 3.1795, | |
| "step": 95550 | |
| }, | |
| { | |
| "epoch": 27.82324173265021, | |
| "grad_norm": 0.5294893980026245, | |
| "learning_rate": 0.0002662818870122306, | |
| "loss": 3.1757, | |
| "step": 95600 | |
| }, | |
| { | |
| "epoch": 27.837796925943177, | |
| "grad_norm": 0.4377014935016632, | |
| "learning_rate": 0.00026610716365754224, | |
| "loss": 3.177, | |
| "step": 95650 | |
| }, | |
| { | |
| "epoch": 27.852352119236144, | |
| "grad_norm": 0.4920569658279419, | |
| "learning_rate": 0.0002659324403028538, | |
| "loss": 3.1722, | |
| "step": 95700 | |
| }, | |
| { | |
| "epoch": 27.86690731252911, | |
| "grad_norm": 0.46538761258125305, | |
| "learning_rate": 0.00026575771694816537, | |
| "loss": 3.1875, | |
| "step": 95750 | |
| }, | |
| { | |
| "epoch": 27.88146250582208, | |
| "grad_norm": 0.4536549746990204, | |
| "learning_rate": 0.00026558299359347696, | |
| "loss": 3.1862, | |
| "step": 95800 | |
| }, | |
| { | |
| "epoch": 27.896017699115045, | |
| "grad_norm": 0.4198532998561859, | |
| "learning_rate": 0.00026540827023878856, | |
| "loss": 3.1889, | |
| "step": 95850 | |
| }, | |
| { | |
| "epoch": 27.910572892408013, | |
| "grad_norm": 0.4229876399040222, | |
| "learning_rate": 0.00026523354688410015, | |
| "loss": 3.1849, | |
| "step": 95900 | |
| }, | |
| { | |
| "epoch": 27.92512808570098, | |
| "grad_norm": 0.46028637886047363, | |
| "learning_rate": 0.00026505882352941175, | |
| "loss": 3.1886, | |
| "step": 95950 | |
| }, | |
| { | |
| "epoch": 27.939683278993947, | |
| "grad_norm": 0.46924006938934326, | |
| "learning_rate": 0.00026488410017472334, | |
| "loss": 3.1771, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 27.939683278993947, | |
| "eval_accuracy": 0.3749334942360496, | |
| "eval_loss": 3.5343446731567383, | |
| "eval_runtime": 80.3858, | |
| "eval_samples_per_second": 207.139, | |
| "eval_steps_per_second": 12.95, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 27.95423847228691, | |
| "grad_norm": 0.48700371384620667, | |
| "learning_rate": 0.00026470937682003494, | |
| "loss": 3.1772, | |
| "step": 96050 | |
| }, | |
| { | |
| "epoch": 27.968793665579877, | |
| "grad_norm": 0.42400476336479187, | |
| "learning_rate": 0.00026453465346534653, | |
| "loss": 3.2, | |
| "step": 96100 | |
| }, | |
| { | |
| "epoch": 27.983348858872844, | |
| "grad_norm": 0.4213753342628479, | |
| "learning_rate": 0.00026435993011065807, | |
| "loss": 3.1773, | |
| "step": 96150 | |
| }, | |
| { | |
| "epoch": 27.99790405216581, | |
| "grad_norm": 0.4452514946460724, | |
| "learning_rate": 0.0002641852067559697, | |
| "loss": 3.1917, | |
| "step": 96200 | |
| }, | |
| { | |
| "epoch": 28.012226362366093, | |
| "grad_norm": 0.4369601905345917, | |
| "learning_rate": 0.00026401048340128126, | |
| "loss": 3.092, | |
| "step": 96250 | |
| }, | |
| { | |
| "epoch": 28.02678155565906, | |
| "grad_norm": 0.44504931569099426, | |
| "learning_rate": 0.00026383576004659285, | |
| "loss": 3.099, | |
| "step": 96300 | |
| }, | |
| { | |
| "epoch": 28.041336748952027, | |
| "grad_norm": 0.4424644410610199, | |
| "learning_rate": 0.00026366103669190445, | |
| "loss": 3.0901, | |
| "step": 96350 | |
| }, | |
| { | |
| "epoch": 28.055891942244994, | |
| "grad_norm": 0.43492698669433594, | |
| "learning_rate": 0.00026348631333721604, | |
| "loss": 3.0946, | |
| "step": 96400 | |
| }, | |
| { | |
| "epoch": 28.07044713553796, | |
| "grad_norm": 0.4465198814868927, | |
| "learning_rate": 0.00026331158998252764, | |
| "loss": 3.1013, | |
| "step": 96450 | |
| }, | |
| { | |
| "epoch": 28.085002328830928, | |
| "grad_norm": 0.4630950689315796, | |
| "learning_rate": 0.00026313686662783923, | |
| "loss": 3.099, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 28.099557522123895, | |
| "grad_norm": 0.4329875707626343, | |
| "learning_rate": 0.0002629621432731508, | |
| "loss": 3.1049, | |
| "step": 96550 | |
| }, | |
| { | |
| "epoch": 28.114112715416862, | |
| "grad_norm": 0.446515828371048, | |
| "learning_rate": 0.0002627874199184624, | |
| "loss": 3.091, | |
| "step": 96600 | |
| }, | |
| { | |
| "epoch": 28.12866790870983, | |
| "grad_norm": 0.470591276884079, | |
| "learning_rate": 0.000262612696563774, | |
| "loss": 3.1097, | |
| "step": 96650 | |
| }, | |
| { | |
| "epoch": 28.143223102002796, | |
| "grad_norm": 0.46814674139022827, | |
| "learning_rate": 0.0002624379732090856, | |
| "loss": 3.1056, | |
| "step": 96700 | |
| }, | |
| { | |
| "epoch": 28.15777829529576, | |
| "grad_norm": 0.4513624608516693, | |
| "learning_rate": 0.0002622632498543972, | |
| "loss": 3.1202, | |
| "step": 96750 | |
| }, | |
| { | |
| "epoch": 28.172333488588727, | |
| "grad_norm": 0.45544955134391785, | |
| "learning_rate": 0.0002620885264997088, | |
| "loss": 3.1104, | |
| "step": 96800 | |
| }, | |
| { | |
| "epoch": 28.186888681881694, | |
| "grad_norm": 0.4420503079891205, | |
| "learning_rate": 0.00026191380314502034, | |
| "loss": 3.1108, | |
| "step": 96850 | |
| }, | |
| { | |
| "epoch": 28.20144387517466, | |
| "grad_norm": 0.44920092821121216, | |
| "learning_rate": 0.000261739079790332, | |
| "loss": 3.1187, | |
| "step": 96900 | |
| }, | |
| { | |
| "epoch": 28.215999068467628, | |
| "grad_norm": 0.4485904574394226, | |
| "learning_rate": 0.0002615643564356435, | |
| "loss": 3.1261, | |
| "step": 96950 | |
| }, | |
| { | |
| "epoch": 28.230554261760595, | |
| "grad_norm": 0.4277605712413788, | |
| "learning_rate": 0.0002613896330809551, | |
| "loss": 3.1297, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 28.230554261760595, | |
| "eval_accuracy": 0.37414453343481424, | |
| "eval_loss": 3.5508644580841064, | |
| "eval_runtime": 80.2808, | |
| "eval_samples_per_second": 207.409, | |
| "eval_steps_per_second": 12.967, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 28.245109455053562, | |
| "grad_norm": 0.47483253479003906, | |
| "learning_rate": 0.0002612149097262667, | |
| "loss": 3.1326, | |
| "step": 97050 | |
| }, | |
| { | |
| "epoch": 28.25966464834653, | |
| "grad_norm": 0.47239866852760315, | |
| "learning_rate": 0.0002610401863715783, | |
| "loss": 3.1326, | |
| "step": 97100 | |
| }, | |
| { | |
| "epoch": 28.274219841639496, | |
| "grad_norm": 0.47539469599723816, | |
| "learning_rate": 0.0002608654630168899, | |
| "loss": 3.1338, | |
| "step": 97150 | |
| }, | |
| { | |
| "epoch": 28.288775034932463, | |
| "grad_norm": 0.4352509081363678, | |
| "learning_rate": 0.0002606907396622015, | |
| "loss": 3.1348, | |
| "step": 97200 | |
| }, | |
| { | |
| "epoch": 28.30333022822543, | |
| "grad_norm": 0.4521329700946808, | |
| "learning_rate": 0.0002605160163075131, | |
| "loss": 3.1398, | |
| "step": 97250 | |
| }, | |
| { | |
| "epoch": 28.317885421518397, | |
| "grad_norm": 0.47049468755722046, | |
| "learning_rate": 0.0002603412929528247, | |
| "loss": 3.1329, | |
| "step": 97300 | |
| }, | |
| { | |
| "epoch": 28.332440614811365, | |
| "grad_norm": 0.424907386302948, | |
| "learning_rate": 0.0002601665695981363, | |
| "loss": 3.1297, | |
| "step": 97350 | |
| }, | |
| { | |
| "epoch": 28.34699580810433, | |
| "grad_norm": 0.4480956196784973, | |
| "learning_rate": 0.0002599918462434478, | |
| "loss": 3.1348, | |
| "step": 97400 | |
| }, | |
| { | |
| "epoch": 28.3615510013973, | |
| "grad_norm": 0.4295244514942169, | |
| "learning_rate": 0.00025981712288875947, | |
| "loss": 3.1483, | |
| "step": 97450 | |
| }, | |
| { | |
| "epoch": 28.376106194690266, | |
| "grad_norm": 0.43365004658699036, | |
| "learning_rate": 0.000259642399534071, | |
| "loss": 3.133, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 28.390661387983233, | |
| "grad_norm": 0.4339531660079956, | |
| "learning_rate": 0.0002594676761793826, | |
| "loss": 3.147, | |
| "step": 97550 | |
| }, | |
| { | |
| "epoch": 28.4052165812762, | |
| "grad_norm": 0.45043033361434937, | |
| "learning_rate": 0.0002592929528246942, | |
| "loss": 3.1389, | |
| "step": 97600 | |
| }, | |
| { | |
| "epoch": 28.419771774569167, | |
| "grad_norm": 0.4544854164123535, | |
| "learning_rate": 0.0002591182294700058, | |
| "loss": 3.1416, | |
| "step": 97650 | |
| }, | |
| { | |
| "epoch": 28.434326967862134, | |
| "grad_norm": 0.4301966428756714, | |
| "learning_rate": 0.0002589435061153174, | |
| "loss": 3.1475, | |
| "step": 97700 | |
| }, | |
| { | |
| "epoch": 28.4488821611551, | |
| "grad_norm": 0.46877866983413696, | |
| "learning_rate": 0.000258768782760629, | |
| "loss": 3.1577, | |
| "step": 97750 | |
| }, | |
| { | |
| "epoch": 28.463437354448068, | |
| "grad_norm": 0.4570932388305664, | |
| "learning_rate": 0.0002585940594059406, | |
| "loss": 3.1484, | |
| "step": 97800 | |
| }, | |
| { | |
| "epoch": 28.477992547741035, | |
| "grad_norm": 0.4491603672504425, | |
| "learning_rate": 0.00025841933605125217, | |
| "loss": 3.1506, | |
| "step": 97850 | |
| }, | |
| { | |
| "epoch": 28.492547741034002, | |
| "grad_norm": 0.4457694888114929, | |
| "learning_rate": 0.00025824461269656376, | |
| "loss": 3.1444, | |
| "step": 97900 | |
| }, | |
| { | |
| "epoch": 28.50710293432697, | |
| "grad_norm": 0.4521375596523285, | |
| "learning_rate": 0.00025806988934187536, | |
| "loss": 3.1559, | |
| "step": 97950 | |
| }, | |
| { | |
| "epoch": 28.521658127619936, | |
| "grad_norm": 0.47815680503845215, | |
| "learning_rate": 0.00025789516598718695, | |
| "loss": 3.1628, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 28.521658127619936, | |
| "eval_accuracy": 0.37483218569878385, | |
| "eval_loss": 3.544330358505249, | |
| "eval_runtime": 80.3463, | |
| "eval_samples_per_second": 207.24, | |
| "eval_steps_per_second": 12.956, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 28.536213320912903, | |
| "grad_norm": 0.462468683719635, | |
| "learning_rate": 0.00025772044263249854, | |
| "loss": 3.1523, | |
| "step": 98050 | |
| }, | |
| { | |
| "epoch": 28.55076851420587, | |
| "grad_norm": 0.4511658847332001, | |
| "learning_rate": 0.0002575457192778101, | |
| "loss": 3.1666, | |
| "step": 98100 | |
| }, | |
| { | |
| "epoch": 28.565323707498834, | |
| "grad_norm": 0.46320784091949463, | |
| "learning_rate": 0.00025737099592312173, | |
| "loss": 3.1564, | |
| "step": 98150 | |
| }, | |
| { | |
| "epoch": 28.5798789007918, | |
| "grad_norm": 0.4806447923183441, | |
| "learning_rate": 0.0002571962725684333, | |
| "loss": 3.156, | |
| "step": 98200 | |
| }, | |
| { | |
| "epoch": 28.594434094084768, | |
| "grad_norm": 0.4460219740867615, | |
| "learning_rate": 0.00025702154921374487, | |
| "loss": 3.1584, | |
| "step": 98250 | |
| }, | |
| { | |
| "epoch": 28.608989287377735, | |
| "grad_norm": 0.44343358278274536, | |
| "learning_rate": 0.00025684682585905646, | |
| "loss": 3.1706, | |
| "step": 98300 | |
| }, | |
| { | |
| "epoch": 28.623544480670702, | |
| "grad_norm": 0.46316957473754883, | |
| "learning_rate": 0.00025667210250436806, | |
| "loss": 3.1585, | |
| "step": 98350 | |
| }, | |
| { | |
| "epoch": 28.63809967396367, | |
| "grad_norm": 0.44434890151023865, | |
| "learning_rate": 0.00025649737914967965, | |
| "loss": 3.1626, | |
| "step": 98400 | |
| }, | |
| { | |
| "epoch": 28.652654867256636, | |
| "grad_norm": 0.444489985704422, | |
| "learning_rate": 0.00025632265579499124, | |
| "loss": 3.1608, | |
| "step": 98450 | |
| }, | |
| { | |
| "epoch": 28.667210060549603, | |
| "grad_norm": 0.45586320757865906, | |
| "learning_rate": 0.00025614793244030284, | |
| "loss": 3.1596, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 28.68176525384257, | |
| "grad_norm": 0.46997010707855225, | |
| "learning_rate": 0.0002559732090856144, | |
| "loss": 3.1762, | |
| "step": 98550 | |
| }, | |
| { | |
| "epoch": 28.696320447135538, | |
| "grad_norm": 0.45185792446136475, | |
| "learning_rate": 0.00025579848573092603, | |
| "loss": 3.1652, | |
| "step": 98600 | |
| }, | |
| { | |
| "epoch": 28.710875640428505, | |
| "grad_norm": 0.47553834319114685, | |
| "learning_rate": 0.0002556237623762376, | |
| "loss": 3.1608, | |
| "step": 98650 | |
| }, | |
| { | |
| "epoch": 28.72543083372147, | |
| "grad_norm": 0.4305116534233093, | |
| "learning_rate": 0.0002554490390215492, | |
| "loss": 3.1611, | |
| "step": 98700 | |
| }, | |
| { | |
| "epoch": 28.73998602701444, | |
| "grad_norm": 0.4502272605895996, | |
| "learning_rate": 0.0002552743156668608, | |
| "loss": 3.1671, | |
| "step": 98750 | |
| }, | |
| { | |
| "epoch": 28.754541220307406, | |
| "grad_norm": 0.46674883365631104, | |
| "learning_rate": 0.00025509959231217235, | |
| "loss": 3.181, | |
| "step": 98800 | |
| }, | |
| { | |
| "epoch": 28.769096413600373, | |
| "grad_norm": 0.43325966596603394, | |
| "learning_rate": 0.000254924868957484, | |
| "loss": 3.1769, | |
| "step": 98850 | |
| }, | |
| { | |
| "epoch": 28.78365160689334, | |
| "grad_norm": 0.45992568135261536, | |
| "learning_rate": 0.00025475014560279554, | |
| "loss": 3.1732, | |
| "step": 98900 | |
| }, | |
| { | |
| "epoch": 28.798206800186307, | |
| "grad_norm": 0.4627813994884491, | |
| "learning_rate": 0.00025457542224810713, | |
| "loss": 3.1692, | |
| "step": 98950 | |
| }, | |
| { | |
| "epoch": 28.812761993479274, | |
| "grad_norm": 0.4586414396762848, | |
| "learning_rate": 0.00025440069889341873, | |
| "loss": 3.1719, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 28.812761993479274, | |
| "eval_accuracy": 0.37507299914757447, | |
| "eval_loss": 3.53705096244812, | |
| "eval_runtime": 80.3009, | |
| "eval_samples_per_second": 207.358, | |
| "eval_steps_per_second": 12.964, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 28.82731718677224, | |
| "grad_norm": 0.43554526567459106, | |
| "learning_rate": 0.0002542259755387303, | |
| "loss": 3.1731, | |
| "step": 99050 | |
| }, | |
| { | |
| "epoch": 28.841872380065208, | |
| "grad_norm": 0.435739129781723, | |
| "learning_rate": 0.0002540512521840419, | |
| "loss": 3.1709, | |
| "step": 99100 | |
| }, | |
| { | |
| "epoch": 28.856427573358175, | |
| "grad_norm": 0.4667801856994629, | |
| "learning_rate": 0.0002538765288293535, | |
| "loss": 3.1689, | |
| "step": 99150 | |
| }, | |
| { | |
| "epoch": 28.870982766651142, | |
| "grad_norm": 0.4495719075202942, | |
| "learning_rate": 0.0002537018054746651, | |
| "loss": 3.1832, | |
| "step": 99200 | |
| }, | |
| { | |
| "epoch": 28.88553795994411, | |
| "grad_norm": 0.43852946162223816, | |
| "learning_rate": 0.00025352708211997664, | |
| "loss": 3.1725, | |
| "step": 99250 | |
| }, | |
| { | |
| "epoch": 28.900093153237076, | |
| "grad_norm": 0.4768344759941101, | |
| "learning_rate": 0.0002533523587652883, | |
| "loss": 3.1751, | |
| "step": 99300 | |
| }, | |
| { | |
| "epoch": 28.914648346530043, | |
| "grad_norm": 0.4631161391735077, | |
| "learning_rate": 0.00025317763541059983, | |
| "loss": 3.1825, | |
| "step": 99350 | |
| }, | |
| { | |
| "epoch": 28.92920353982301, | |
| "grad_norm": 0.4493507742881775, | |
| "learning_rate": 0.0002530029120559115, | |
| "loss": 3.178, | |
| "step": 99400 | |
| }, | |
| { | |
| "epoch": 28.943758733115978, | |
| "grad_norm": 0.4749862849712372, | |
| "learning_rate": 0.000252828188701223, | |
| "loss": 3.1833, | |
| "step": 99450 | |
| }, | |
| { | |
| "epoch": 28.95831392640894, | |
| "grad_norm": 0.44799110293388367, | |
| "learning_rate": 0.0002526534653465346, | |
| "loss": 3.1771, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 28.972869119701908, | |
| "grad_norm": 0.49781110882759094, | |
| "learning_rate": 0.0002524787419918462, | |
| "loss": 3.1796, | |
| "step": 99550 | |
| }, | |
| { | |
| "epoch": 28.987424312994875, | |
| "grad_norm": 0.44419634342193604, | |
| "learning_rate": 0.0002523040186371578, | |
| "loss": 3.1856, | |
| "step": 99600 | |
| }, | |
| { | |
| "epoch": 29.001746623195157, | |
| "grad_norm": 0.48502570390701294, | |
| "learning_rate": 0.0002521292952824694, | |
| "loss": 3.156, | |
| "step": 99650 | |
| }, | |
| { | |
| "epoch": 29.016301816488124, | |
| "grad_norm": 0.44910928606987, | |
| "learning_rate": 0.000251954571927781, | |
| "loss": 3.0855, | |
| "step": 99700 | |
| }, | |
| { | |
| "epoch": 29.03085700978109, | |
| "grad_norm": 0.43617284297943115, | |
| "learning_rate": 0.0002517798485730926, | |
| "loss": 3.0835, | |
| "step": 99750 | |
| }, | |
| { | |
| "epoch": 29.045412203074058, | |
| "grad_norm": 0.4414200484752655, | |
| "learning_rate": 0.0002516051252184042, | |
| "loss": 3.1009, | |
| "step": 99800 | |
| }, | |
| { | |
| "epoch": 29.059967396367025, | |
| "grad_norm": 0.46186599135398865, | |
| "learning_rate": 0.0002514304018637158, | |
| "loss": 3.0889, | |
| "step": 99850 | |
| }, | |
| { | |
| "epoch": 29.074522589659992, | |
| "grad_norm": 0.46511024236679077, | |
| "learning_rate": 0.00025125567850902737, | |
| "loss": 3.0986, | |
| "step": 99900 | |
| }, | |
| { | |
| "epoch": 29.08907778295296, | |
| "grad_norm": 0.4477766454219818, | |
| "learning_rate": 0.0002510809551543389, | |
| "loss": 3.0979, | |
| "step": 99950 | |
| }, | |
| { | |
| "epoch": 29.103632976245926, | |
| "grad_norm": 0.49588531255722046, | |
| "learning_rate": 0.00025090623179965056, | |
| "loss": 3.0902, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 29.103632976245926, | |
| "eval_accuracy": 0.3744825419651811, | |
| "eval_loss": 3.548532009124756, | |
| "eval_runtime": 80.3279, | |
| "eval_samples_per_second": 207.288, | |
| "eval_steps_per_second": 12.959, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 29.118188169538893, | |
| "grad_norm": 0.46471527218818665, | |
| "learning_rate": 0.0002507315084449621, | |
| "loss": 3.0988, | |
| "step": 100050 | |
| }, | |
| { | |
| "epoch": 29.13274336283186, | |
| "grad_norm": 0.43220189213752747, | |
| "learning_rate": 0.00025055678509027375, | |
| "loss": 3.1027, | |
| "step": 100100 | |
| }, | |
| { | |
| "epoch": 29.147298556124824, | |
| "grad_norm": 0.4506238102912903, | |
| "learning_rate": 0.0002503820617355853, | |
| "loss": 3.107, | |
| "step": 100150 | |
| }, | |
| { | |
| "epoch": 29.16185374941779, | |
| "grad_norm": 0.4406176507472992, | |
| "learning_rate": 0.0002502073383808969, | |
| "loss": 3.1207, | |
| "step": 100200 | |
| }, | |
| { | |
| "epoch": 29.176408942710758, | |
| "grad_norm": 0.4619928300380707, | |
| "learning_rate": 0.0002500326150262085, | |
| "loss": 3.1107, | |
| "step": 100250 | |
| }, | |
| { | |
| "epoch": 29.190964136003725, | |
| "grad_norm": 0.4569091796875, | |
| "learning_rate": 0.00024985789167152007, | |
| "loss": 3.1172, | |
| "step": 100300 | |
| }, | |
| { | |
| "epoch": 29.205519329296692, | |
| "grad_norm": 0.46741896867752075, | |
| "learning_rate": 0.00024968316831683167, | |
| "loss": 3.1108, | |
| "step": 100350 | |
| }, | |
| { | |
| "epoch": 29.22007452258966, | |
| "grad_norm": 0.4463886320590973, | |
| "learning_rate": 0.00024950844496214326, | |
| "loss": 3.124, | |
| "step": 100400 | |
| }, | |
| { | |
| "epoch": 29.234629715882626, | |
| "grad_norm": 0.4393543303012848, | |
| "learning_rate": 0.00024933372160745485, | |
| "loss": 3.138, | |
| "step": 100450 | |
| }, | |
| { | |
| "epoch": 29.249184909175593, | |
| "grad_norm": 0.48772090673446655, | |
| "learning_rate": 0.0002491589982527664, | |
| "loss": 3.1266, | |
| "step": 100500 | |
| }, | |
| { | |
| "epoch": 29.26374010246856, | |
| "grad_norm": 0.44827088713645935, | |
| "learning_rate": 0.00024898427489807804, | |
| "loss": 3.1277, | |
| "step": 100550 | |
| }, | |
| { | |
| "epoch": 29.278295295761527, | |
| "grad_norm": 0.46728524565696716, | |
| "learning_rate": 0.0002488095515433896, | |
| "loss": 3.1136, | |
| "step": 100600 | |
| }, | |
| { | |
| "epoch": 29.292850489054494, | |
| "grad_norm": 0.4493448734283447, | |
| "learning_rate": 0.0002486348281887012, | |
| "loss": 3.1216, | |
| "step": 100650 | |
| }, | |
| { | |
| "epoch": 29.30740568234746, | |
| "grad_norm": 0.4877857565879822, | |
| "learning_rate": 0.00024846010483401277, | |
| "loss": 3.129, | |
| "step": 100700 | |
| }, | |
| { | |
| "epoch": 29.32196087564043, | |
| "grad_norm": 0.4457970857620239, | |
| "learning_rate": 0.00024828538147932437, | |
| "loss": 3.1237, | |
| "step": 100750 | |
| }, | |
| { | |
| "epoch": 29.336516068933395, | |
| "grad_norm": 0.4798702895641327, | |
| "learning_rate": 0.00024811065812463596, | |
| "loss": 3.1244, | |
| "step": 100800 | |
| }, | |
| { | |
| "epoch": 29.351071262226363, | |
| "grad_norm": 0.5078426599502563, | |
| "learning_rate": 0.00024793593476994755, | |
| "loss": 3.1281, | |
| "step": 100850 | |
| }, | |
| { | |
| "epoch": 29.36562645551933, | |
| "grad_norm": 0.45045414566993713, | |
| "learning_rate": 0.00024776121141525915, | |
| "loss": 3.1232, | |
| "step": 100900 | |
| }, | |
| { | |
| "epoch": 29.380181648812297, | |
| "grad_norm": 0.47196251153945923, | |
| "learning_rate": 0.00024758648806057074, | |
| "loss": 3.132, | |
| "step": 100950 | |
| }, | |
| { | |
| "epoch": 29.394736842105264, | |
| "grad_norm": 0.43645259737968445, | |
| "learning_rate": 0.00024741176470588234, | |
| "loss": 3.1338, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 29.394736842105264, | |
| "eval_accuracy": 0.37430589842514583, | |
| "eval_loss": 3.549154758453369, | |
| "eval_runtime": 80.3182, | |
| "eval_samples_per_second": 207.313, | |
| "eval_steps_per_second": 12.961, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 29.40929203539823, | |
| "grad_norm": 0.48996931314468384, | |
| "learning_rate": 0.00024723704135119393, | |
| "loss": 3.1365, | |
| "step": 101050 | |
| }, | |
| { | |
| "epoch": 29.423847228691198, | |
| "grad_norm": 0.46889182925224304, | |
| "learning_rate": 0.0002470623179965055, | |
| "loss": 3.1343, | |
| "step": 101100 | |
| }, | |
| { | |
| "epoch": 29.438402421984165, | |
| "grad_norm": 0.44414785504341125, | |
| "learning_rate": 0.0002468875946418171, | |
| "loss": 3.1348, | |
| "step": 101150 | |
| }, | |
| { | |
| "epoch": 29.452957615277132, | |
| "grad_norm": 0.46217817068099976, | |
| "learning_rate": 0.00024671287128712866, | |
| "loss": 3.1539, | |
| "step": 101200 | |
| }, | |
| { | |
| "epoch": 29.4675128085701, | |
| "grad_norm": 0.49297675490379333, | |
| "learning_rate": 0.0002465381479324403, | |
| "loss": 3.1401, | |
| "step": 101250 | |
| }, | |
| { | |
| "epoch": 29.482068001863066, | |
| "grad_norm": 0.4565764367580414, | |
| "learning_rate": 0.00024636342457775185, | |
| "loss": 3.1343, | |
| "step": 101300 | |
| }, | |
| { | |
| "epoch": 29.496623195156033, | |
| "grad_norm": 0.4355997145175934, | |
| "learning_rate": 0.00024618870122306344, | |
| "loss": 3.1409, | |
| "step": 101350 | |
| }, | |
| { | |
| "epoch": 29.511178388449, | |
| "grad_norm": 0.4661673307418823, | |
| "learning_rate": 0.00024601397786837504, | |
| "loss": 3.1388, | |
| "step": 101400 | |
| }, | |
| { | |
| "epoch": 29.525733581741967, | |
| "grad_norm": 0.463667631149292, | |
| "learning_rate": 0.00024583925451368663, | |
| "loss": 3.1538, | |
| "step": 101450 | |
| }, | |
| { | |
| "epoch": 29.54028877503493, | |
| "grad_norm": 0.47080424427986145, | |
| "learning_rate": 0.0002456645311589982, | |
| "loss": 3.1582, | |
| "step": 101500 | |
| }, | |
| { | |
| "epoch": 29.554843968327898, | |
| "grad_norm": 0.46760010719299316, | |
| "learning_rate": 0.0002454898078043098, | |
| "loss": 3.1369, | |
| "step": 101550 | |
| }, | |
| { | |
| "epoch": 29.569399161620865, | |
| "grad_norm": 0.46279507875442505, | |
| "learning_rate": 0.0002453150844496214, | |
| "loss": 3.1559, | |
| "step": 101600 | |
| }, | |
| { | |
| "epoch": 29.583954354913832, | |
| "grad_norm": 0.44417810440063477, | |
| "learning_rate": 0.000245140361094933, | |
| "loss": 3.1451, | |
| "step": 101650 | |
| }, | |
| { | |
| "epoch": 29.5985095482068, | |
| "grad_norm": 0.4647783637046814, | |
| "learning_rate": 0.0002449656377402446, | |
| "loss": 3.1481, | |
| "step": 101700 | |
| }, | |
| { | |
| "epoch": 29.613064741499766, | |
| "grad_norm": 0.4733027219772339, | |
| "learning_rate": 0.00024479091438555614, | |
| "loss": 3.1549, | |
| "step": 101750 | |
| }, | |
| { | |
| "epoch": 29.627619934792733, | |
| "grad_norm": 0.4805038571357727, | |
| "learning_rate": 0.0002446161910308678, | |
| "loss": 3.1594, | |
| "step": 101800 | |
| }, | |
| { | |
| "epoch": 29.6421751280857, | |
| "grad_norm": 0.44757434725761414, | |
| "learning_rate": 0.0002444414676761794, | |
| "loss": 3.1532, | |
| "step": 101850 | |
| }, | |
| { | |
| "epoch": 29.656730321378667, | |
| "grad_norm": 0.448430597782135, | |
| "learning_rate": 0.0002442667443214909, | |
| "loss": 3.1406, | |
| "step": 101900 | |
| }, | |
| { | |
| "epoch": 29.671285514671634, | |
| "grad_norm": 0.4703189432621002, | |
| "learning_rate": 0.00024409202096680255, | |
| "loss": 3.1423, | |
| "step": 101950 | |
| }, | |
| { | |
| "epoch": 29.6858407079646, | |
| "grad_norm": 0.4756447672843933, | |
| "learning_rate": 0.00024391729761211411, | |
| "loss": 3.1602, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 29.6858407079646, | |
| "eval_accuracy": 0.3748789615663381, | |
| "eval_loss": 3.545452356338501, | |
| "eval_runtime": 80.353, | |
| "eval_samples_per_second": 207.223, | |
| "eval_steps_per_second": 12.955, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 29.70039590125757, | |
| "grad_norm": 0.43251651525497437, | |
| "learning_rate": 0.00024374257425742574, | |
| "loss": 3.1574, | |
| "step": 102050 | |
| }, | |
| { | |
| "epoch": 29.714951094550536, | |
| "grad_norm": 0.447337806224823, | |
| "learning_rate": 0.0002435678509027373, | |
| "loss": 3.1717, | |
| "step": 102100 | |
| }, | |
| { | |
| "epoch": 29.729506287843503, | |
| "grad_norm": 0.46876469254493713, | |
| "learning_rate": 0.0002433931275480489, | |
| "loss": 3.1685, | |
| "step": 102150 | |
| }, | |
| { | |
| "epoch": 29.74406148113647, | |
| "grad_norm": 0.46718913316726685, | |
| "learning_rate": 0.00024321840419336052, | |
| "loss": 3.1622, | |
| "step": 102200 | |
| }, | |
| { | |
| "epoch": 29.758616674429437, | |
| "grad_norm": 0.47372883558273315, | |
| "learning_rate": 0.00024304368083867209, | |
| "loss": 3.1583, | |
| "step": 102250 | |
| }, | |
| { | |
| "epoch": 29.773171867722404, | |
| "grad_norm": 0.44791609048843384, | |
| "learning_rate": 0.00024286895748398365, | |
| "loss": 3.1614, | |
| "step": 102300 | |
| }, | |
| { | |
| "epoch": 29.78772706101537, | |
| "grad_norm": 0.46482592821121216, | |
| "learning_rate": 0.00024269423412929527, | |
| "loss": 3.1564, | |
| "step": 102350 | |
| }, | |
| { | |
| "epoch": 29.802282254308338, | |
| "grad_norm": 0.4608810245990753, | |
| "learning_rate": 0.00024251951077460684, | |
| "loss": 3.1684, | |
| "step": 102400 | |
| }, | |
| { | |
| "epoch": 29.816837447601305, | |
| "grad_norm": 0.4510408043861389, | |
| "learning_rate": 0.00024234478741991844, | |
| "loss": 3.1559, | |
| "step": 102450 | |
| }, | |
| { | |
| "epoch": 29.831392640894272, | |
| "grad_norm": 0.46691054105758667, | |
| "learning_rate": 0.00024217006406523003, | |
| "loss": 3.1683, | |
| "step": 102500 | |
| }, | |
| { | |
| "epoch": 29.84594783418724, | |
| "grad_norm": 0.45704054832458496, | |
| "learning_rate": 0.00024199534071054162, | |
| "loss": 3.1638, | |
| "step": 102550 | |
| }, | |
| { | |
| "epoch": 29.860503027480206, | |
| "grad_norm": 0.4454781711101532, | |
| "learning_rate": 0.0002418206173558532, | |
| "loss": 3.1578, | |
| "step": 102600 | |
| }, | |
| { | |
| "epoch": 29.875058220773173, | |
| "grad_norm": 0.47819724678993225, | |
| "learning_rate": 0.0002416458940011648, | |
| "loss": 3.1529, | |
| "step": 102650 | |
| }, | |
| { | |
| "epoch": 29.88961341406614, | |
| "grad_norm": 0.4584260880947113, | |
| "learning_rate": 0.00024147117064647638, | |
| "loss": 3.1502, | |
| "step": 102700 | |
| }, | |
| { | |
| "epoch": 29.904168607359107, | |
| "grad_norm": 0.44759097695350647, | |
| "learning_rate": 0.000241296447291788, | |
| "loss": 3.1743, | |
| "step": 102750 | |
| }, | |
| { | |
| "epoch": 29.918723800652074, | |
| "grad_norm": 0.4492528736591339, | |
| "learning_rate": 0.00024112172393709957, | |
| "loss": 3.1636, | |
| "step": 102800 | |
| }, | |
| { | |
| "epoch": 29.933278993945038, | |
| "grad_norm": 0.46391233801841736, | |
| "learning_rate": 0.00024094700058241116, | |
| "loss": 3.1667, | |
| "step": 102850 | |
| }, | |
| { | |
| "epoch": 29.947834187238005, | |
| "grad_norm": 0.4565926194190979, | |
| "learning_rate": 0.00024077227722772276, | |
| "loss": 3.1645, | |
| "step": 102900 | |
| }, | |
| { | |
| "epoch": 29.962389380530972, | |
| "grad_norm": 0.43850329518318176, | |
| "learning_rate": 0.00024059755387303435, | |
| "loss": 3.1554, | |
| "step": 102950 | |
| }, | |
| { | |
| "epoch": 29.97694457382394, | |
| "grad_norm": 0.47163861989974976, | |
| "learning_rate": 0.00024042283051834592, | |
| "loss": 3.1786, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 29.97694457382394, | |
| "eval_accuracy": 0.37558541819917374, | |
| "eval_loss": 3.5311474800109863, | |
| "eval_runtime": 80.297, | |
| "eval_samples_per_second": 207.368, | |
| "eval_steps_per_second": 12.964, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 29.991499767116906, | |
| "grad_norm": 0.46710267663002014, | |
| "learning_rate": 0.00024024810716365754, | |
| "loss": 3.1669, | |
| "step": 103050 | |
| }, | |
| { | |
| "epoch": 30.005822077317188, | |
| "grad_norm": 0.44763559103012085, | |
| "learning_rate": 0.0002400733838089691, | |
| "loss": 3.1516, | |
| "step": 103100 | |
| }, | |
| { | |
| "epoch": 30.020377270610155, | |
| "grad_norm": 0.46965065598487854, | |
| "learning_rate": 0.0002398986604542807, | |
| "loss": 3.0713, | |
| "step": 103150 | |
| }, | |
| { | |
| "epoch": 30.03493246390312, | |
| "grad_norm": 0.47300857305526733, | |
| "learning_rate": 0.0002397239370995923, | |
| "loss": 3.091, | |
| "step": 103200 | |
| }, | |
| { | |
| "epoch": 30.04948765719609, | |
| "grad_norm": 0.474572092294693, | |
| "learning_rate": 0.0002395492137449039, | |
| "loss": 3.0854, | |
| "step": 103250 | |
| }, | |
| { | |
| "epoch": 30.064042850489056, | |
| "grad_norm": 0.4572301506996155, | |
| "learning_rate": 0.00023937449039021546, | |
| "loss": 3.0722, | |
| "step": 103300 | |
| }, | |
| { | |
| "epoch": 30.078598043782023, | |
| "grad_norm": 0.5108086466789246, | |
| "learning_rate": 0.00023919976703552708, | |
| "loss": 3.0864, | |
| "step": 103350 | |
| }, | |
| { | |
| "epoch": 30.09315323707499, | |
| "grad_norm": 0.4534626603126526, | |
| "learning_rate": 0.00023902504368083865, | |
| "loss": 3.0909, | |
| "step": 103400 | |
| }, | |
| { | |
| "epoch": 30.107708430367957, | |
| "grad_norm": 0.4674241840839386, | |
| "learning_rate": 0.00023885032032615027, | |
| "loss": 3.1027, | |
| "step": 103450 | |
| }, | |
| { | |
| "epoch": 30.12226362366092, | |
| "grad_norm": 0.45966845750808716, | |
| "learning_rate": 0.00023867559697146183, | |
| "loss": 3.0979, | |
| "step": 103500 | |
| }, | |
| { | |
| "epoch": 30.136818816953888, | |
| "grad_norm": 0.45984208583831787, | |
| "learning_rate": 0.0002385008736167734, | |
| "loss": 3.1086, | |
| "step": 103550 | |
| }, | |
| { | |
| "epoch": 30.151374010246855, | |
| "grad_norm": 0.49848610162734985, | |
| "learning_rate": 0.00023832615026208502, | |
| "loss": 3.1022, | |
| "step": 103600 | |
| }, | |
| { | |
| "epoch": 30.16592920353982, | |
| "grad_norm": 0.4627673923969269, | |
| "learning_rate": 0.0002381514269073966, | |
| "loss": 3.09, | |
| "step": 103650 | |
| }, | |
| { | |
| "epoch": 30.18048439683279, | |
| "grad_norm": 0.4531181752681732, | |
| "learning_rate": 0.00023797670355270818, | |
| "loss": 3.0946, | |
| "step": 103700 | |
| }, | |
| { | |
| "epoch": 30.195039590125756, | |
| "grad_norm": 0.5123842358589172, | |
| "learning_rate": 0.0002378019801980198, | |
| "loss": 3.1133, | |
| "step": 103750 | |
| }, | |
| { | |
| "epoch": 30.209594783418723, | |
| "grad_norm": 0.45647183060646057, | |
| "learning_rate": 0.00023762725684333137, | |
| "loss": 3.1016, | |
| "step": 103800 | |
| }, | |
| { | |
| "epoch": 30.22414997671169, | |
| "grad_norm": 0.4563472270965576, | |
| "learning_rate": 0.00023745253348864294, | |
| "loss": 3.1145, | |
| "step": 103850 | |
| }, | |
| { | |
| "epoch": 30.238705170004657, | |
| "grad_norm": 0.47021499276161194, | |
| "learning_rate": 0.00023727781013395456, | |
| "loss": 3.1111, | |
| "step": 103900 | |
| }, | |
| { | |
| "epoch": 30.253260363297624, | |
| "grad_norm": 0.4799538552761078, | |
| "learning_rate": 0.00023710308677926613, | |
| "loss": 3.1034, | |
| "step": 103950 | |
| }, | |
| { | |
| "epoch": 30.26781555659059, | |
| "grad_norm": 0.4379008412361145, | |
| "learning_rate": 0.00023692836342457772, | |
| "loss": 3.1097, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 30.26781555659059, | |
| "eval_accuracy": 0.3746229870951493, | |
| "eval_loss": 3.5509490966796875, | |
| "eval_runtime": 80.2542, | |
| "eval_samples_per_second": 207.478, | |
| "eval_steps_per_second": 12.971, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 30.282370749883558, | |
| "grad_norm": 0.4403291344642639, | |
| "learning_rate": 0.00023675364006988932, | |
| "loss": 3.1262, | |
| "step": 104050 | |
| }, | |
| { | |
| "epoch": 30.296925943176525, | |
| "grad_norm": 0.498970627784729, | |
| "learning_rate": 0.0002365789167152009, | |
| "loss": 3.1209, | |
| "step": 104100 | |
| }, | |
| { | |
| "epoch": 30.311481136469492, | |
| "grad_norm": 0.4662904143333435, | |
| "learning_rate": 0.0002364041933605125, | |
| "loss": 3.1023, | |
| "step": 104150 | |
| }, | |
| { | |
| "epoch": 30.32603632976246, | |
| "grad_norm": 0.4825027585029602, | |
| "learning_rate": 0.0002362294700058241, | |
| "loss": 3.1207, | |
| "step": 104200 | |
| }, | |
| { | |
| "epoch": 30.340591523055426, | |
| "grad_norm": 0.45922690629959106, | |
| "learning_rate": 0.00023605474665113567, | |
| "loss": 3.1177, | |
| "step": 104250 | |
| }, | |
| { | |
| "epoch": 30.355146716348393, | |
| "grad_norm": 0.4470551609992981, | |
| "learning_rate": 0.0002358800232964473, | |
| "loss": 3.1206, | |
| "step": 104300 | |
| }, | |
| { | |
| "epoch": 30.36970190964136, | |
| "grad_norm": 0.48644015192985535, | |
| "learning_rate": 0.00023570529994175886, | |
| "loss": 3.1, | |
| "step": 104350 | |
| }, | |
| { | |
| "epoch": 30.384257102934328, | |
| "grad_norm": 0.46883222460746765, | |
| "learning_rate": 0.00023553057658707045, | |
| "loss": 3.1205, | |
| "step": 104400 | |
| }, | |
| { | |
| "epoch": 30.398812296227295, | |
| "grad_norm": 0.43868565559387207, | |
| "learning_rate": 0.00023535585323238204, | |
| "loss": 3.1294, | |
| "step": 104450 | |
| }, | |
| { | |
| "epoch": 30.41336748952026, | |
| "grad_norm": 0.45560604333877563, | |
| "learning_rate": 0.00023518112987769364, | |
| "loss": 3.1203, | |
| "step": 104500 | |
| }, | |
| { | |
| "epoch": 30.42792268281323, | |
| "grad_norm": 0.4556736350059509, | |
| "learning_rate": 0.0002350064065230052, | |
| "loss": 3.1194, | |
| "step": 104550 | |
| }, | |
| { | |
| "epoch": 30.442477876106196, | |
| "grad_norm": 0.4928584396839142, | |
| "learning_rate": 0.00023483168316831683, | |
| "loss": 3.1254, | |
| "step": 104600 | |
| }, | |
| { | |
| "epoch": 30.457033069399163, | |
| "grad_norm": 0.47284579277038574, | |
| "learning_rate": 0.0002346569598136284, | |
| "loss": 3.1416, | |
| "step": 104650 | |
| }, | |
| { | |
| "epoch": 30.47158826269213, | |
| "grad_norm": 0.45248204469680786, | |
| "learning_rate": 0.00023448223645894, | |
| "loss": 3.1351, | |
| "step": 104700 | |
| }, | |
| { | |
| "epoch": 30.486143455985097, | |
| "grad_norm": 0.4520478844642639, | |
| "learning_rate": 0.00023430751310425158, | |
| "loss": 3.1353, | |
| "step": 104750 | |
| }, | |
| { | |
| "epoch": 30.500698649278064, | |
| "grad_norm": 0.4718460142612457, | |
| "learning_rate": 0.00023413278974956318, | |
| "loss": 3.139, | |
| "step": 104800 | |
| }, | |
| { | |
| "epoch": 30.515253842571028, | |
| "grad_norm": 0.4471622705459595, | |
| "learning_rate": 0.00023395806639487477, | |
| "loss": 3.1299, | |
| "step": 104850 | |
| }, | |
| { | |
| "epoch": 30.529809035863995, | |
| "grad_norm": 0.45347538590431213, | |
| "learning_rate": 0.00023378334304018637, | |
| "loss": 3.139, | |
| "step": 104900 | |
| }, | |
| { | |
| "epoch": 30.54436422915696, | |
| "grad_norm": 0.47260230779647827, | |
| "learning_rate": 0.00023360861968549793, | |
| "loss": 3.1388, | |
| "step": 104950 | |
| }, | |
| { | |
| "epoch": 30.55891942244993, | |
| "grad_norm": 0.4715934097766876, | |
| "learning_rate": 0.00023343389633080955, | |
| "loss": 3.1368, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 30.55891942244993, | |
| "eval_accuracy": 0.3750912158799134, | |
| "eval_loss": 3.542126178741455, | |
| "eval_runtime": 80.2607, | |
| "eval_samples_per_second": 207.461, | |
| "eval_steps_per_second": 12.97, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 30.573474615742896, | |
| "grad_norm": 0.47947555780410767, | |
| "learning_rate": 0.00023325917297612112, | |
| "loss": 3.1314, | |
| "step": 105050 | |
| }, | |
| { | |
| "epoch": 30.588029809035863, | |
| "grad_norm": 0.46303045749664307, | |
| "learning_rate": 0.0002330844496214327, | |
| "loss": 3.1404, | |
| "step": 105100 | |
| }, | |
| { | |
| "epoch": 30.60258500232883, | |
| "grad_norm": 0.4715004563331604, | |
| "learning_rate": 0.0002329097262667443, | |
| "loss": 3.1454, | |
| "step": 105150 | |
| }, | |
| { | |
| "epoch": 30.617140195621797, | |
| "grad_norm": 0.47624292969703674, | |
| "learning_rate": 0.00023273500291205588, | |
| "loss": 3.1342, | |
| "step": 105200 | |
| }, | |
| { | |
| "epoch": 30.631695388914764, | |
| "grad_norm": 0.463158518075943, | |
| "learning_rate": 0.00023256027955736747, | |
| "loss": 3.1333, | |
| "step": 105250 | |
| }, | |
| { | |
| "epoch": 30.64625058220773, | |
| "grad_norm": 0.4352867007255554, | |
| "learning_rate": 0.00023238555620267907, | |
| "loss": 3.1517, | |
| "step": 105300 | |
| }, | |
| { | |
| "epoch": 30.660805775500698, | |
| "grad_norm": 0.4497668743133545, | |
| "learning_rate": 0.00023221083284799066, | |
| "loss": 3.1466, | |
| "step": 105350 | |
| }, | |
| { | |
| "epoch": 30.675360968793665, | |
| "grad_norm": 0.49091213941574097, | |
| "learning_rate": 0.00023203610949330223, | |
| "loss": 3.148, | |
| "step": 105400 | |
| }, | |
| { | |
| "epoch": 30.689916162086632, | |
| "grad_norm": 0.46702882647514343, | |
| "learning_rate": 0.00023186138613861385, | |
| "loss": 3.1498, | |
| "step": 105450 | |
| }, | |
| { | |
| "epoch": 30.7044713553796, | |
| "grad_norm": 0.47715649008750916, | |
| "learning_rate": 0.00023168666278392542, | |
| "loss": 3.1448, | |
| "step": 105500 | |
| }, | |
| { | |
| "epoch": 30.719026548672566, | |
| "grad_norm": 0.4723684787750244, | |
| "learning_rate": 0.00023151193942923704, | |
| "loss": 3.1628, | |
| "step": 105550 | |
| }, | |
| { | |
| "epoch": 30.733581741965533, | |
| "grad_norm": 0.44101059436798096, | |
| "learning_rate": 0.0002313372160745486, | |
| "loss": 3.1588, | |
| "step": 105600 | |
| }, | |
| { | |
| "epoch": 30.7481369352585, | |
| "grad_norm": 0.48599541187286377, | |
| "learning_rate": 0.0002311624927198602, | |
| "loss": 3.1535, | |
| "step": 105650 | |
| }, | |
| { | |
| "epoch": 30.762692128551468, | |
| "grad_norm": 0.47461456060409546, | |
| "learning_rate": 0.0002309877693651718, | |
| "loss": 3.149, | |
| "step": 105700 | |
| }, | |
| { | |
| "epoch": 30.777247321844435, | |
| "grad_norm": 0.4594041705131531, | |
| "learning_rate": 0.0002308130460104834, | |
| "loss": 3.1451, | |
| "step": 105750 | |
| }, | |
| { | |
| "epoch": 30.7918025151374, | |
| "grad_norm": 0.4538593590259552, | |
| "learning_rate": 0.00023063832265579496, | |
| "loss": 3.1707, | |
| "step": 105800 | |
| }, | |
| { | |
| "epoch": 30.80635770843037, | |
| "grad_norm": 0.45425233244895935, | |
| "learning_rate": 0.00023046359930110658, | |
| "loss": 3.1544, | |
| "step": 105850 | |
| }, | |
| { | |
| "epoch": 30.820912901723336, | |
| "grad_norm": 0.46237096190452576, | |
| "learning_rate": 0.00023028887594641814, | |
| "loss": 3.1544, | |
| "step": 105900 | |
| }, | |
| { | |
| "epoch": 30.835468095016303, | |
| "grad_norm": 0.47854360938072205, | |
| "learning_rate": 0.00023011415259172974, | |
| "loss": 3.1603, | |
| "step": 105950 | |
| }, | |
| { | |
| "epoch": 30.85002328830927, | |
| "grad_norm": 0.47883960604667664, | |
| "learning_rate": 0.00022993942923704133, | |
| "loss": 3.1502, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 30.85002328830927, | |
| "eval_accuracy": 0.3753248601630738, | |
| "eval_loss": 3.5363810062408447, | |
| "eval_runtime": 80.2896, | |
| "eval_samples_per_second": 207.387, | |
| "eval_steps_per_second": 12.966, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 30.864578481602237, | |
| "grad_norm": 0.4617387056350708, | |
| "learning_rate": 0.00022976470588235293, | |
| "loss": 3.1629, | |
| "step": 106050 | |
| }, | |
| { | |
| "epoch": 30.879133674895204, | |
| "grad_norm": 0.44905003905296326, | |
| "learning_rate": 0.0002295899825276645, | |
| "loss": 3.156, | |
| "step": 106100 | |
| }, | |
| { | |
| "epoch": 30.89368886818817, | |
| "grad_norm": 0.4597944915294647, | |
| "learning_rate": 0.00022941525917297612, | |
| "loss": 3.1543, | |
| "step": 106150 | |
| }, | |
| { | |
| "epoch": 30.908244061481135, | |
| "grad_norm": 0.4842374324798584, | |
| "learning_rate": 0.00022924053581828768, | |
| "loss": 3.1633, | |
| "step": 106200 | |
| }, | |
| { | |
| "epoch": 30.9227992547741, | |
| "grad_norm": 0.4724351465702057, | |
| "learning_rate": 0.0002290658124635993, | |
| "loss": 3.1588, | |
| "step": 106250 | |
| }, | |
| { | |
| "epoch": 30.93735444806707, | |
| "grad_norm": 0.47688618302345276, | |
| "learning_rate": 0.00022889108910891087, | |
| "loss": 3.162, | |
| "step": 106300 | |
| }, | |
| { | |
| "epoch": 30.951909641360036, | |
| "grad_norm": 0.45577678084373474, | |
| "learning_rate": 0.00022871636575422247, | |
| "loss": 3.1551, | |
| "step": 106350 | |
| }, | |
| { | |
| "epoch": 30.966464834653003, | |
| "grad_norm": 0.42968910932540894, | |
| "learning_rate": 0.00022854164239953406, | |
| "loss": 3.1604, | |
| "step": 106400 | |
| }, | |
| { | |
| "epoch": 30.98102002794597, | |
| "grad_norm": 0.45901602506637573, | |
| "learning_rate": 0.00022836691904484565, | |
| "loss": 3.166, | |
| "step": 106450 | |
| }, | |
| { | |
| "epoch": 30.995575221238937, | |
| "grad_norm": 0.46381086111068726, | |
| "learning_rate": 0.00022819219569015722, | |
| "loss": 3.1736, | |
| "step": 106500 | |
| }, | |
| { | |
| "epoch": 31.00989753143922, | |
| "grad_norm": 0.4824746251106262, | |
| "learning_rate": 0.00022801747233546884, | |
| "loss": 3.0928, | |
| "step": 106550 | |
| }, | |
| { | |
| "epoch": 31.024452724732186, | |
| "grad_norm": 0.462884783744812, | |
| "learning_rate": 0.0002278427489807804, | |
| "loss": 3.0647, | |
| "step": 106600 | |
| }, | |
| { | |
| "epoch": 31.039007918025153, | |
| "grad_norm": 0.4834712743759155, | |
| "learning_rate": 0.00022766802562609198, | |
| "loss": 3.0746, | |
| "step": 106650 | |
| }, | |
| { | |
| "epoch": 31.05356311131812, | |
| "grad_norm": 0.4722846448421478, | |
| "learning_rate": 0.0002274933022714036, | |
| "loss": 3.0618, | |
| "step": 106700 | |
| }, | |
| { | |
| "epoch": 31.068118304611087, | |
| "grad_norm": 0.46291640400886536, | |
| "learning_rate": 0.00022731857891671517, | |
| "loss": 3.077, | |
| "step": 106750 | |
| }, | |
| { | |
| "epoch": 31.082673497904054, | |
| "grad_norm": 0.46960365772247314, | |
| "learning_rate": 0.00022714385556202676, | |
| "loss": 3.1025, | |
| "step": 106800 | |
| }, | |
| { | |
| "epoch": 31.09722869119702, | |
| "grad_norm": 0.46258553862571716, | |
| "learning_rate": 0.00022696913220733835, | |
| "loss": 3.0774, | |
| "step": 106850 | |
| }, | |
| { | |
| "epoch": 31.111783884489984, | |
| "grad_norm": 0.4631222188472748, | |
| "learning_rate": 0.00022679440885264995, | |
| "loss": 3.0794, | |
| "step": 106900 | |
| }, | |
| { | |
| "epoch": 31.12633907778295, | |
| "grad_norm": 0.4621471166610718, | |
| "learning_rate": 0.00022661968549796157, | |
| "loss": 3.0947, | |
| "step": 106950 | |
| }, | |
| { | |
| "epoch": 31.14089427107592, | |
| "grad_norm": 0.4729008376598358, | |
| "learning_rate": 0.00022644496214327314, | |
| "loss": 3.0948, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 31.14089427107592, | |
| "eval_accuracy": 0.37452673223201627, | |
| "eval_loss": 3.553650140762329, | |
| "eval_runtime": 80.3763, | |
| "eval_samples_per_second": 207.163, | |
| "eval_steps_per_second": 12.952, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 31.155449464368886, | |
| "grad_norm": 0.5043581128120422, | |
| "learning_rate": 0.0002262702387885847, | |
| "loss": 3.1044, | |
| "step": 107050 | |
| }, | |
| { | |
| "epoch": 31.170004657661853, | |
| "grad_norm": 0.4716866910457611, | |
| "learning_rate": 0.00022609551543389633, | |
| "loss": 3.0918, | |
| "step": 107100 | |
| }, | |
| { | |
| "epoch": 31.18455985095482, | |
| "grad_norm": 0.5288137197494507, | |
| "learning_rate": 0.0002259207920792079, | |
| "loss": 3.0925, | |
| "step": 107150 | |
| }, | |
| { | |
| "epoch": 31.199115044247787, | |
| "grad_norm": 0.4575735032558441, | |
| "learning_rate": 0.0002257460687245195, | |
| "loss": 3.1032, | |
| "step": 107200 | |
| }, | |
| { | |
| "epoch": 31.213670237540754, | |
| "grad_norm": 0.469389945268631, | |
| "learning_rate": 0.00022557134536983108, | |
| "loss": 3.113, | |
| "step": 107250 | |
| }, | |
| { | |
| "epoch": 31.22822543083372, | |
| "grad_norm": 0.4693560004234314, | |
| "learning_rate": 0.00022539662201514268, | |
| "loss": 3.0918, | |
| "step": 107300 | |
| }, | |
| { | |
| "epoch": 31.242780624126688, | |
| "grad_norm": 0.4661855101585388, | |
| "learning_rate": 0.00022522189866045424, | |
| "loss": 3.1046, | |
| "step": 107350 | |
| }, | |
| { | |
| "epoch": 31.257335817419655, | |
| "grad_norm": 0.4602469205856323, | |
| "learning_rate": 0.00022504717530576586, | |
| "loss": 3.1107, | |
| "step": 107400 | |
| }, | |
| { | |
| "epoch": 31.271891010712622, | |
| "grad_norm": 0.4572893977165222, | |
| "learning_rate": 0.00022487245195107743, | |
| "loss": 3.107, | |
| "step": 107450 | |
| }, | |
| { | |
| "epoch": 31.28644620400559, | |
| "grad_norm": 0.4895777404308319, | |
| "learning_rate": 0.00022469772859638903, | |
| "loss": 3.0992, | |
| "step": 107500 | |
| }, | |
| { | |
| "epoch": 31.301001397298556, | |
| "grad_norm": 0.48202449083328247, | |
| "learning_rate": 0.00022452300524170062, | |
| "loss": 3.1149, | |
| "step": 107550 | |
| }, | |
| { | |
| "epoch": 31.315556590591523, | |
| "grad_norm": 0.476492702960968, | |
| "learning_rate": 0.00022434828188701221, | |
| "loss": 3.1106, | |
| "step": 107600 | |
| }, | |
| { | |
| "epoch": 31.33011178388449, | |
| "grad_norm": 0.4884395897388458, | |
| "learning_rate": 0.0002241735585323238, | |
| "loss": 3.1125, | |
| "step": 107650 | |
| }, | |
| { | |
| "epoch": 31.344666977177457, | |
| "grad_norm": 0.4597252905368805, | |
| "learning_rate": 0.0002239988351776354, | |
| "loss": 3.1009, | |
| "step": 107700 | |
| }, | |
| { | |
| "epoch": 31.359222170470424, | |
| "grad_norm": 0.4771318733692169, | |
| "learning_rate": 0.00022382411182294697, | |
| "loss": 3.1216, | |
| "step": 107750 | |
| }, | |
| { | |
| "epoch": 31.37377736376339, | |
| "grad_norm": 0.4635736346244812, | |
| "learning_rate": 0.0002236493884682586, | |
| "loss": 3.1188, | |
| "step": 107800 | |
| }, | |
| { | |
| "epoch": 31.38833255705636, | |
| "grad_norm": 0.4771345257759094, | |
| "learning_rate": 0.00022347466511357016, | |
| "loss": 3.1104, | |
| "step": 107850 | |
| }, | |
| { | |
| "epoch": 31.402887750349326, | |
| "grad_norm": 0.4752862751483917, | |
| "learning_rate": 0.00022329994175888175, | |
| "loss": 3.117, | |
| "step": 107900 | |
| }, | |
| { | |
| "epoch": 31.417442943642293, | |
| "grad_norm": 0.4715939462184906, | |
| "learning_rate": 0.00022312521840419335, | |
| "loss": 3.1216, | |
| "step": 107950 | |
| }, | |
| { | |
| "epoch": 31.43199813693526, | |
| "grad_norm": 0.46059268712997437, | |
| "learning_rate": 0.00022295049504950494, | |
| "loss": 3.1172, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 31.43199813693526, | |
| "eval_accuracy": 0.3750154107679222, | |
| "eval_loss": 3.542569875717163, | |
| "eval_runtime": 80.265, | |
| "eval_samples_per_second": 207.45, | |
| "eval_steps_per_second": 12.97, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 31.446553330228227, | |
| "grad_norm": 0.44773054122924805, | |
| "learning_rate": 0.0002227757716948165, | |
| "loss": 3.1192, | |
| "step": 108050 | |
| }, | |
| { | |
| "epoch": 31.461108523521194, | |
| "grad_norm": 0.46515557169914246, | |
| "learning_rate": 0.00022260104834012813, | |
| "loss": 3.1129, | |
| "step": 108100 | |
| }, | |
| { | |
| "epoch": 31.47566371681416, | |
| "grad_norm": 0.4887338876724243, | |
| "learning_rate": 0.0002224263249854397, | |
| "loss": 3.1288, | |
| "step": 108150 | |
| }, | |
| { | |
| "epoch": 31.490218910107128, | |
| "grad_norm": 0.47401952743530273, | |
| "learning_rate": 0.00022225160163075126, | |
| "loss": 3.1282, | |
| "step": 108200 | |
| }, | |
| { | |
| "epoch": 31.50477410340009, | |
| "grad_norm": 0.4582172632217407, | |
| "learning_rate": 0.00022207687827606289, | |
| "loss": 3.1299, | |
| "step": 108250 | |
| }, | |
| { | |
| "epoch": 31.51932929669306, | |
| "grad_norm": 0.4628421366214752, | |
| "learning_rate": 0.00022190215492137445, | |
| "loss": 3.1279, | |
| "step": 108300 | |
| }, | |
| { | |
| "epoch": 31.533884489986026, | |
| "grad_norm": 0.5074931383132935, | |
| "learning_rate": 0.00022172743156668607, | |
| "loss": 3.1378, | |
| "step": 108350 | |
| }, | |
| { | |
| "epoch": 31.548439683278993, | |
| "grad_norm": 0.47616952657699585, | |
| "learning_rate": 0.00022155270821199764, | |
| "loss": 3.1277, | |
| "step": 108400 | |
| }, | |
| { | |
| "epoch": 31.56299487657196, | |
| "grad_norm": 0.4730508327484131, | |
| "learning_rate": 0.00022137798485730924, | |
| "loss": 3.1306, | |
| "step": 108450 | |
| }, | |
| { | |
| "epoch": 31.577550069864927, | |
| "grad_norm": 0.4669469892978668, | |
| "learning_rate": 0.00022120326150262083, | |
| "loss": 3.136, | |
| "step": 108500 | |
| }, | |
| { | |
| "epoch": 31.592105263157894, | |
| "grad_norm": 0.4832284152507782, | |
| "learning_rate": 0.00022102853814793242, | |
| "loss": 3.1306, | |
| "step": 108550 | |
| }, | |
| { | |
| "epoch": 31.60666045645086, | |
| "grad_norm": 0.4845399558544159, | |
| "learning_rate": 0.000220853814793244, | |
| "loss": 3.1184, | |
| "step": 108600 | |
| }, | |
| { | |
| "epoch": 31.621215649743828, | |
| "grad_norm": 0.513660192489624, | |
| "learning_rate": 0.0002206790914385556, | |
| "loss": 3.1356, | |
| "step": 108650 | |
| }, | |
| { | |
| "epoch": 31.635770843036795, | |
| "grad_norm": 0.47942274808883667, | |
| "learning_rate": 0.00022050436808386718, | |
| "loss": 3.1312, | |
| "step": 108700 | |
| }, | |
| { | |
| "epoch": 31.650326036329762, | |
| "grad_norm": 0.47694769501686096, | |
| "learning_rate": 0.00022032964472917877, | |
| "loss": 3.1322, | |
| "step": 108750 | |
| }, | |
| { | |
| "epoch": 31.66488122962273, | |
| "grad_norm": 0.4673755466938019, | |
| "learning_rate": 0.00022015492137449037, | |
| "loss": 3.144, | |
| "step": 108800 | |
| }, | |
| { | |
| "epoch": 31.679436422915696, | |
| "grad_norm": 0.4699643850326538, | |
| "learning_rate": 0.00021998019801980196, | |
| "loss": 3.1473, | |
| "step": 108850 | |
| }, | |
| { | |
| "epoch": 31.693991616208663, | |
| "grad_norm": 0.4737618863582611, | |
| "learning_rate": 0.00021980547466511353, | |
| "loss": 3.1314, | |
| "step": 108900 | |
| }, | |
| { | |
| "epoch": 31.70854680950163, | |
| "grad_norm": 0.5028552412986755, | |
| "learning_rate": 0.00021963075131042515, | |
| "loss": 3.1386, | |
| "step": 108950 | |
| }, | |
| { | |
| "epoch": 31.723102002794597, | |
| "grad_norm": 0.46567708253860474, | |
| "learning_rate": 0.00021945602795573672, | |
| "loss": 3.145, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 31.723102002794597, | |
| "eval_accuracy": 0.3752102710402965, | |
| "eval_loss": 3.5423614978790283, | |
| "eval_runtime": 80.3489, | |
| "eval_samples_per_second": 207.234, | |
| "eval_steps_per_second": 12.956, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 31.723102002794597, | |
| "step": 109000, | |
| "total_flos": 2.277875215171584e+18, | |
| "train_loss": 3.368869836929741, | |
| "train_runtime": 80556.64, | |
| "train_samples_per_second": 170.568, | |
| "train_steps_per_second": 2.133 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 171800, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 10000, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 20, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 11 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.277875215171584e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |