| { |
| "best_global_step": 72000, |
| "best_metric": 3.5321924686431885, |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_drop_frequency_40817/checkpoint-40000", |
| "epoch": 29.11239881195038, |
| "eval_steps": 1000, |
| "global_step": 100000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.014559431599790344, |
| "grad_norm": 1.5551334619522095, |
| "learning_rate": 0.000294, |
| "loss": 8.4667, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.029118863199580687, |
| "grad_norm": 0.7336986064910889, |
| "learning_rate": 0.0005939999999999999, |
| "loss": 6.7245, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.043678294799371034, |
| "grad_norm": 0.4792507588863373, |
| "learning_rate": 0.0005998287212350713, |
| "loss": 6.3255, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.058237726399161374, |
| "grad_norm": 0.47392821311950684, |
| "learning_rate": 0.0005996539469851441, |
| "loss": 6.1138, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.07279715799895171, |
| "grad_norm": 0.442217618227005, |
| "learning_rate": 0.000599479172735217, |
| "loss": 5.9746, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.08735658959874207, |
| "grad_norm": 0.4978708028793335, |
| "learning_rate": 0.0005993043984852897, |
| "loss": 5.8573, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.10191602119853241, |
| "grad_norm": 0.5078408122062683, |
| "learning_rate": 0.0005991296242353626, |
| "loss": 5.7377, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.11647545279832275, |
| "grad_norm": 0.4501552879810333, |
| "learning_rate": 0.0005989548499854355, |
| "loss": 5.613, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.1310348843981131, |
| "grad_norm": 0.41562119126319885, |
| "learning_rate": 0.0005987800757355083, |
| "loss": 5.5049, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.14559431599790343, |
| "grad_norm": 0.39685097336769104, |
| "learning_rate": 0.0005986053014855811, |
| "loss": 5.4153, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.1601537475976938, |
| "grad_norm": 0.4735598862171173, |
| "learning_rate": 0.000598430527235654, |
| "loss": 5.3339, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.17471317919748414, |
| "grad_norm": 0.4490765929222107, |
| "learning_rate": 0.0005982557529857267, |
| "loss": 5.2571, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.18927261079727448, |
| "grad_norm": 0.5662270188331604, |
| "learning_rate": 0.0005980809787357995, |
| "loss": 5.183, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.20383204239706482, |
| "grad_norm": 0.4178728759288788, |
| "learning_rate": 0.0005979062044858724, |
| "loss": 5.1337, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.21839147399685516, |
| "grad_norm": 0.4277268648147583, |
| "learning_rate": 0.0005977314302359452, |
| "loss": 5.082, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.2329509055966455, |
| "grad_norm": 0.49093976616859436, |
| "learning_rate": 0.0005975566559860181, |
| "loss": 5.0414, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.24751033719643586, |
| "grad_norm": 0.40832236409187317, |
| "learning_rate": 0.0005973818817360908, |
| "loss": 4.9782, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.2620697687962262, |
| "grad_norm": 0.42992544174194336, |
| "learning_rate": 0.0005972071074861636, |
| "loss": 4.9343, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.2766292003960165, |
| "grad_norm": 0.5416184067726135, |
| "learning_rate": 0.0005970323332362365, |
| "loss": 4.8685, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.29118863199580686, |
| "grad_norm": 0.5198241472244263, |
| "learning_rate": 0.0005968575589863093, |
| "loss": 4.849, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.29118863199580686, |
| "eval_accuracy": 0.25379510529217636, |
| "eval_loss": 4.761143684387207, |
| "eval_runtime": 183.641, |
| "eval_samples_per_second": 90.628, |
| "eval_steps_per_second": 5.669, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.30574806359559725, |
| "grad_norm": 0.47911056876182556, |
| "learning_rate": 0.0005966827847363822, |
| "loss": 4.7726, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.3203074951953876, |
| "grad_norm": 0.4758392572402954, |
| "learning_rate": 0.000596508010486455, |
| "loss": 4.7537, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.33486692679517793, |
| "grad_norm": 0.47129762172698975, |
| "learning_rate": 0.0005963332362365277, |
| "loss": 4.7115, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.3494263583949683, |
| "grad_norm": 0.42803141474723816, |
| "learning_rate": 0.0005961584619866006, |
| "loss": 4.6751, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.3639857899947586, |
| "grad_norm": 0.4740878641605377, |
| "learning_rate": 0.0005959836877366734, |
| "loss": 4.6417, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.37854522159454895, |
| "grad_norm": 0.40221309661865234, |
| "learning_rate": 0.0005958089134867463, |
| "loss": 4.6053, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.3931046531943393, |
| "grad_norm": 0.44672706723213196, |
| "learning_rate": 0.0005956341392368191, |
| "loss": 4.5801, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.40766408479412963, |
| "grad_norm": 0.4823697507381439, |
| "learning_rate": 0.0005954593649868918, |
| "loss": 4.5599, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.42222351639392, |
| "grad_norm": 0.5122449398040771, |
| "learning_rate": 0.0005952845907369647, |
| "loss": 4.5344, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.4367829479937103, |
| "grad_norm": 0.4088864028453827, |
| "learning_rate": 0.0005951098164870375, |
| "loss": 4.4951, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.45134237959350065, |
| "grad_norm": 0.40731462836265564, |
| "learning_rate": 0.0005949350422371104, |
| "loss": 4.5018, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.465901811193291, |
| "grad_norm": 0.4263319671154022, |
| "learning_rate": 0.0005947602679871832, |
| "loss": 4.4755, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.48046124279308133, |
| "grad_norm": 0.38340768218040466, |
| "learning_rate": 0.000594585493737256, |
| "loss": 4.4569, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.49502067439287173, |
| "grad_norm": 0.3979549705982208, |
| "learning_rate": 0.0005944107194873288, |
| "loss": 4.4444, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.509580105992662, |
| "grad_norm": 0.4176700711250305, |
| "learning_rate": 0.0005942359452374016, |
| "loss": 4.4173, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.5241395375924524, |
| "grad_norm": 0.3926246464252472, |
| "learning_rate": 0.0005940611709874745, |
| "loss": 4.4004, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.5386989691922427, |
| "grad_norm": 0.3877740502357483, |
| "learning_rate": 0.0005938863967375473, |
| "loss": 4.3869, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.553258400792033, |
| "grad_norm": 0.4315814971923828, |
| "learning_rate": 0.0005937116224876201, |
| "loss": 4.3636, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.5678178323918234, |
| "grad_norm": 0.403978556394577, |
| "learning_rate": 0.000593536848237693, |
| "loss": 4.3628, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.5823772639916137, |
| "grad_norm": 0.39984941482543945, |
| "learning_rate": 0.0005933620739877657, |
| "loss": 4.3402, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5823772639916137, |
| "eval_accuracy": 0.29928804185701036, |
| "eval_loss": 4.288719177246094, |
| "eval_runtime": 180.6232, |
| "eval_samples_per_second": 92.142, |
| "eval_steps_per_second": 5.763, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5969366955914042, |
| "grad_norm": 0.4173935055732727, |
| "learning_rate": 0.0005931872997378385, |
| "loss": 4.3411, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.6114961271911945, |
| "grad_norm": 0.37241849303245544, |
| "learning_rate": 0.0005930125254879114, |
| "loss": 4.3243, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.6260555587909848, |
| "grad_norm": 0.4274754822254181, |
| "learning_rate": 0.0005928377512379842, |
| "loss": 4.2883, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.6406149903907752, |
| "grad_norm": 0.4375714063644409, |
| "learning_rate": 0.0005926629769880571, |
| "loss": 4.2941, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.6551744219905655, |
| "grad_norm": 0.39245837926864624, |
| "learning_rate": 0.0005924882027381298, |
| "loss": 4.2863, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.6697338535903559, |
| "grad_norm": 0.3508373498916626, |
| "learning_rate": 0.0005923134284882026, |
| "loss": 4.2683, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.6842932851901462, |
| "grad_norm": 0.37966057658195496, |
| "learning_rate": 0.0005921386542382755, |
| "loss": 4.268, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.6988527167899365, |
| "grad_norm": 0.4270515441894531, |
| "learning_rate": 0.0005919638799883483, |
| "loss": 4.2548, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.7134121483897269, |
| "grad_norm": 0.36582618951797485, |
| "learning_rate": 0.0005917891057384212, |
| "loss": 4.2418, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.7279715799895172, |
| "grad_norm": 0.3588745594024658, |
| "learning_rate": 0.000591614331488494, |
| "loss": 4.2315, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.7425310115893076, |
| "grad_norm": 0.3805822730064392, |
| "learning_rate": 0.0005914395572385667, |
| "loss": 4.2263, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.7570904431890979, |
| "grad_norm": 0.37862271070480347, |
| "learning_rate": 0.0005912647829886396, |
| "loss": 4.2177, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.7716498747888882, |
| "grad_norm": 0.40694668889045715, |
| "learning_rate": 0.0005910900087387124, |
| "loss": 4.1886, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.7862093063886786, |
| "grad_norm": 0.3988340497016907, |
| "learning_rate": 0.0005909152344887853, |
| "loss": 4.1907, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.8007687379884689, |
| "grad_norm": 0.4412493109703064, |
| "learning_rate": 0.0005907404602388581, |
| "loss": 4.1929, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.8153281695882593, |
| "grad_norm": 0.37306517362594604, |
| "learning_rate": 0.0005905656859889308, |
| "loss": 4.1721, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.8298876011880496, |
| "grad_norm": 0.36752834916114807, |
| "learning_rate": 0.0005903909117390037, |
| "loss": 4.1729, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.84444703278784, |
| "grad_norm": 0.38249292969703674, |
| "learning_rate": 0.0005902161374890766, |
| "loss": 4.17, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.8590064643876303, |
| "grad_norm": 0.3479909598827362, |
| "learning_rate": 0.0005900413632391494, |
| "loss": 4.1629, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.8735658959874206, |
| "grad_norm": 0.34885624051094055, |
| "learning_rate": 0.0005898665889892223, |
| "loss": 4.1563, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8735658959874206, |
| "eval_accuracy": 0.31519818808069494, |
| "eval_loss": 4.099164009094238, |
| "eval_runtime": 183.4247, |
| "eval_samples_per_second": 90.735, |
| "eval_steps_per_second": 5.675, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.888125327587211, |
| "grad_norm": 0.38681846857070923, |
| "learning_rate": 0.0005896918147392951, |
| "loss": 4.1567, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.9026847591870013, |
| "grad_norm": 0.3432327210903168, |
| "learning_rate": 0.0005895170404893678, |
| "loss": 4.1293, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.9172441907867916, |
| "grad_norm": 0.3937830626964569, |
| "learning_rate": 0.0005893422662394407, |
| "loss": 4.1285, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.931803622386582, |
| "grad_norm": 0.39171546697616577, |
| "learning_rate": 0.0005891674919895135, |
| "loss": 4.1279, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.9463630539863723, |
| "grad_norm": 0.37026646733283997, |
| "learning_rate": 0.0005889927177395864, |
| "loss": 4.1106, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.9609224855861627, |
| "grad_norm": 0.3460790812969208, |
| "learning_rate": 0.0005888179434896592, |
| "loss": 4.1132, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.975481917185953, |
| "grad_norm": 0.36886388063430786, |
| "learning_rate": 0.000588643169239732, |
| "loss": 4.0977, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.9900413487857435, |
| "grad_norm": 0.36020082235336304, |
| "learning_rate": 0.0005884683949898048, |
| "loss": 4.0966, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.0043678294799372, |
| "grad_norm": 0.33763444423675537, |
| "learning_rate": 0.0005882936207398776, |
| "loss": 4.0577, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.0189272610797275, |
| "grad_norm": 0.34525808691978455, |
| "learning_rate": 0.0005881188464899504, |
| "loss": 4.0248, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.0334866926795179, |
| "grad_norm": 0.37068355083465576, |
| "learning_rate": 0.0005879440722400233, |
| "loss": 4.0183, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.0480461242793082, |
| "grad_norm": 0.34973421692848206, |
| "learning_rate": 0.0005877692979900961, |
| "loss": 4.0291, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.0626055558790986, |
| "grad_norm": 0.3637358248233795, |
| "learning_rate": 0.000587594523740169, |
| "loss": 4.0199, |
| "step": 3650 |
| }, |
| { |
| "epoch": 1.077164987478889, |
| "grad_norm": 0.34920114278793335, |
| "learning_rate": 0.0005874197494902417, |
| "loss": 4.0247, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.0917244190786792, |
| "grad_norm": 0.3420464098453522, |
| "learning_rate": 0.0005872449752403145, |
| "loss": 4.0189, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.1062838506784696, |
| "grad_norm": 0.34696176648139954, |
| "learning_rate": 0.0005870702009903874, |
| "loss": 4.0089, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.12084328227826, |
| "grad_norm": 0.3416752815246582, |
| "learning_rate": 0.0005868954267404602, |
| "loss": 3.9978, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.1354027138780503, |
| "grad_norm": 0.3729047179222107, |
| "learning_rate": 0.0005867206524905331, |
| "loss": 3.9976, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.1499621454778406, |
| "grad_norm": 0.34707263112068176, |
| "learning_rate": 0.0005865458782406058, |
| "loss": 3.9927, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.164521577077631, |
| "grad_norm": 0.3424519896507263, |
| "learning_rate": 0.0005863711039906786, |
| "loss": 3.9798, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.164521577077631, |
| "eval_accuracy": 0.32528629009357674, |
| "eval_loss": 3.9908077716827393, |
| "eval_runtime": 180.5563, |
| "eval_samples_per_second": 92.176, |
| "eval_steps_per_second": 5.766, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.1790810086774213, |
| "grad_norm": 0.3473677635192871, |
| "learning_rate": 0.0005861963297407515, |
| "loss": 3.9837, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.1936404402772116, |
| "grad_norm": 0.3695130944252014, |
| "learning_rate": 0.0005860215554908243, |
| "loss": 3.9857, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.208199871877002, |
| "grad_norm": 0.3494517207145691, |
| "learning_rate": 0.0005858467812408972, |
| "loss": 3.9749, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.2227593034767923, |
| "grad_norm": 0.3514440655708313, |
| "learning_rate": 0.00058567200699097, |
| "loss": 3.9773, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.2373187350765826, |
| "grad_norm": 0.33939051628112793, |
| "learning_rate": 0.0005854972327410427, |
| "loss": 3.9868, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.251878166676373, |
| "grad_norm": 0.39269140362739563, |
| "learning_rate": 0.0005853224584911156, |
| "loss": 3.9676, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.2664375982761633, |
| "grad_norm": 0.3487934470176697, |
| "learning_rate": 0.0005851476842411884, |
| "loss": 3.973, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.2809970298759537, |
| "grad_norm": 0.33803650736808777, |
| "learning_rate": 0.0005849729099912613, |
| "loss": 3.9805, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.295556461475744, |
| "grad_norm": 0.34375283122062683, |
| "learning_rate": 0.0005847981357413341, |
| "loss": 3.9729, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.3101158930755343, |
| "grad_norm": 0.3429529070854187, |
| "learning_rate": 0.0005846233614914068, |
| "loss": 3.9492, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.3246753246753247, |
| "grad_norm": 0.3482668399810791, |
| "learning_rate": 0.0005844485872414797, |
| "loss": 3.9654, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.339234756275115, |
| "grad_norm": 0.3361050486564636, |
| "learning_rate": 0.0005842738129915525, |
| "loss": 3.9693, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.3537941878749054, |
| "grad_norm": 0.34350207448005676, |
| "learning_rate": 0.0005840990387416253, |
| "loss": 3.9628, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.3683536194746957, |
| "grad_norm": 0.35732749104499817, |
| "learning_rate": 0.0005839242644916982, |
| "loss": 3.9383, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.382913051074486, |
| "grad_norm": 0.32812654972076416, |
| "learning_rate": 0.000583749490241771, |
| "loss": 3.9402, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.3974724826742764, |
| "grad_norm": 0.3359614312648773, |
| "learning_rate": 0.0005835747159918438, |
| "loss": 3.9409, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.4120319142740667, |
| "grad_norm": 0.36291930079460144, |
| "learning_rate": 0.0005833999417419166, |
| "loss": 3.9373, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.426591345873857, |
| "grad_norm": 0.3357282876968384, |
| "learning_rate": 0.0005832251674919894, |
| "loss": 3.9373, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.4411507774736474, |
| "grad_norm": 0.3662075996398926, |
| "learning_rate": 0.0005830503932420623, |
| "loss": 3.9326, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.4557102090734377, |
| "grad_norm": 0.3387506604194641, |
| "learning_rate": 0.0005828756189921351, |
| "loss": 3.9189, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.4557102090734377, |
| "eval_accuracy": 0.3320894535210645, |
| "eval_loss": 3.91398549079895, |
| "eval_runtime": 185.101, |
| "eval_samples_per_second": 89.913, |
| "eval_steps_per_second": 5.624, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.470269640673228, |
| "grad_norm": 0.32989710569381714, |
| "learning_rate": 0.000582700844742208, |
| "loss": 3.9282, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.4848290722730184, |
| "grad_norm": 0.3328815996646881, |
| "learning_rate": 0.0005825260704922807, |
| "loss": 3.9183, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.4993885038728088, |
| "grad_norm": 0.33961018919944763, |
| "learning_rate": 0.0005823512962423535, |
| "loss": 3.9253, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.5139479354725993, |
| "grad_norm": 0.33562958240509033, |
| "learning_rate": 0.0005821765219924264, |
| "loss": 3.9222, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.5285073670723897, |
| "grad_norm": 0.3406899571418762, |
| "learning_rate": 0.0005820017477424992, |
| "loss": 3.9185, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.54306679867218, |
| "grad_norm": 0.3406858742237091, |
| "learning_rate": 0.0005818269734925721, |
| "loss": 3.9156, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.5576262302719703, |
| "grad_norm": 0.34090015292167664, |
| "learning_rate": 0.0005816521992426448, |
| "loss": 3.8969, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.5721856618717607, |
| "grad_norm": 0.31158268451690674, |
| "learning_rate": 0.0005814774249927176, |
| "loss": 3.9143, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.586745093471551, |
| "grad_norm": 0.34926122426986694, |
| "learning_rate": 0.0005813026507427905, |
| "loss": 3.9132, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.6013045250713414, |
| "grad_norm": 0.34333717823028564, |
| "learning_rate": 0.0005811278764928634, |
| "loss": 3.9041, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.6158639566711317, |
| "grad_norm": 0.3164921998977661, |
| "learning_rate": 0.0005809531022429362, |
| "loss": 3.908, |
| "step": 5550 |
| }, |
| { |
| "epoch": 1.630423388270922, |
| "grad_norm": 0.3325600028038025, |
| "learning_rate": 0.0005807783279930091, |
| "loss": 3.8937, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.6449828198707124, |
| "grad_norm": 0.3716844916343689, |
| "learning_rate": 0.0005806035537430818, |
| "loss": 3.913, |
| "step": 5650 |
| }, |
| { |
| "epoch": 1.6595422514705027, |
| "grad_norm": 0.3302454352378845, |
| "learning_rate": 0.0005804287794931546, |
| "loss": 3.8894, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.674101683070293, |
| "grad_norm": 0.3286576271057129, |
| "learning_rate": 0.0005802540052432275, |
| "loss": 3.9061, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.6886611146700834, |
| "grad_norm": 0.31899774074554443, |
| "learning_rate": 0.0005800792309933003, |
| "loss": 3.885, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.7032205462698737, |
| "grad_norm": 0.38346347212791443, |
| "learning_rate": 0.0005799044567433732, |
| "loss": 3.8978, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.717779977869664, |
| "grad_norm": 0.32501021027565, |
| "learning_rate": 0.000579729682493446, |
| "loss": 3.8928, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.7323394094694544, |
| "grad_norm": 0.33264926075935364, |
| "learning_rate": 0.0005795549082435187, |
| "loss": 3.8917, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.7468988410692448, |
| "grad_norm": 0.35515546798706055, |
| "learning_rate": 0.0005793801339935916, |
| "loss": 3.8836, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.7468988410692448, |
| "eval_accuracy": 0.33675024013551297, |
| "eval_loss": 3.8585171699523926, |
| "eval_runtime": 185.0399, |
| "eval_samples_per_second": 89.943, |
| "eval_steps_per_second": 5.626, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.761458272669035, |
| "grad_norm": 0.3250105679035187, |
| "learning_rate": 0.0005792053597436644, |
| "loss": 3.8774, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.7760177042688254, |
| "grad_norm": 0.333280473947525, |
| "learning_rate": 0.0005790305854937372, |
| "loss": 3.8726, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.7905771358686158, |
| "grad_norm": 0.32873275876045227, |
| "learning_rate": 0.0005788558112438101, |
| "loss": 3.8701, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.8051365674684061, |
| "grad_norm": 0.3332742154598236, |
| "learning_rate": 0.0005786810369938828, |
| "loss": 3.8699, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.8196959990681965, |
| "grad_norm": 0.3222472369670868, |
| "learning_rate": 0.0005785062627439557, |
| "loss": 3.874, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.8342554306679868, |
| "grad_norm": 0.3324868381023407, |
| "learning_rate": 0.0005783314884940285, |
| "loss": 3.869, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.8488148622677771, |
| "grad_norm": 0.32730036973953247, |
| "learning_rate": 0.0005781567142441013, |
| "loss": 3.8536, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.8633742938675675, |
| "grad_norm": 0.3353622257709503, |
| "learning_rate": 0.0005779819399941742, |
| "loss": 3.869, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.8779337254673578, |
| "grad_norm": 0.33830076456069946, |
| "learning_rate": 0.000577807165744247, |
| "loss": 3.8726, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.8924931570671482, |
| "grad_norm": 0.31618306040763855, |
| "learning_rate": 0.0005776323914943198, |
| "loss": 3.8508, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.9070525886669385, |
| "grad_norm": 0.33165860176086426, |
| "learning_rate": 0.0005774576172443926, |
| "loss": 3.8566, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.9216120202667288, |
| "grad_norm": 0.3387751579284668, |
| "learning_rate": 0.0005772828429944654, |
| "loss": 3.8548, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.9361714518665192, |
| "grad_norm": 0.3364385664463043, |
| "learning_rate": 0.0005771080687445383, |
| "loss": 3.8539, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.9507308834663095, |
| "grad_norm": 0.34390878677368164, |
| "learning_rate": 0.0005769332944946111, |
| "loss": 3.8631, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.9652903150660999, |
| "grad_norm": 0.3324083685874939, |
| "learning_rate": 0.0005767585202446839, |
| "loss": 3.8482, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.9798497466658902, |
| "grad_norm": 0.32365697622299194, |
| "learning_rate": 0.0005765837459947567, |
| "loss": 3.8303, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.9944091782656805, |
| "grad_norm": 0.3342290222644806, |
| "learning_rate": 0.0005764089717448295, |
| "loss": 3.8508, |
| "step": 6850 |
| }, |
| { |
| "epoch": 2.0087356589598744, |
| "grad_norm": 0.3290010392665863, |
| "learning_rate": 0.0005762341974949024, |
| "loss": 3.7915, |
| "step": 6900 |
| }, |
| { |
| "epoch": 2.0232950905596647, |
| "grad_norm": 0.3240971565246582, |
| "learning_rate": 0.0005760594232449752, |
| "loss": 3.7587, |
| "step": 6950 |
| }, |
| { |
| "epoch": 2.037854522159455, |
| "grad_norm": 0.3391764163970947, |
| "learning_rate": 0.0005758846489950481, |
| "loss": 3.7526, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.037854522159455, |
| "eval_accuracy": 0.34109519666654636, |
| "eval_loss": 3.816195249557495, |
| "eval_runtime": 184.953, |
| "eval_samples_per_second": 89.985, |
| "eval_steps_per_second": 5.628, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.0524139537592454, |
| "grad_norm": 0.33266958594322205, |
| "learning_rate": 0.0005757098747451208, |
| "loss": 3.7541, |
| "step": 7050 |
| }, |
| { |
| "epoch": 2.0669733853590357, |
| "grad_norm": 0.34850549697875977, |
| "learning_rate": 0.0005755351004951936, |
| "loss": 3.7518, |
| "step": 7100 |
| }, |
| { |
| "epoch": 2.081532816958826, |
| "grad_norm": 0.3229345679283142, |
| "learning_rate": 0.0005753603262452665, |
| "loss": 3.7485, |
| "step": 7150 |
| }, |
| { |
| "epoch": 2.0960922485586164, |
| "grad_norm": 0.31956946849823, |
| "learning_rate": 0.0005751855519953393, |
| "loss": 3.7446, |
| "step": 7200 |
| }, |
| { |
| "epoch": 2.1106516801584068, |
| "grad_norm": 0.3483135402202606, |
| "learning_rate": 0.0005750107777454121, |
| "loss": 3.76, |
| "step": 7250 |
| }, |
| { |
| "epoch": 2.125211111758197, |
| "grad_norm": 0.3251873850822449, |
| "learning_rate": 0.0005748360034954849, |
| "loss": 3.7494, |
| "step": 7300 |
| }, |
| { |
| "epoch": 2.1397705433579874, |
| "grad_norm": 0.3456031382083893, |
| "learning_rate": 0.0005746612292455577, |
| "loss": 3.7564, |
| "step": 7350 |
| }, |
| { |
| "epoch": 2.154329974957778, |
| "grad_norm": 0.3253571093082428, |
| "learning_rate": 0.0005744864549956306, |
| "loss": 3.7517, |
| "step": 7400 |
| }, |
| { |
| "epoch": 2.168889406557568, |
| "grad_norm": 0.322238564491272, |
| "learning_rate": 0.0005743116807457034, |
| "loss": 3.7582, |
| "step": 7450 |
| }, |
| { |
| "epoch": 2.1834488381573585, |
| "grad_norm": 0.33640897274017334, |
| "learning_rate": 0.0005741369064957762, |
| "loss": 3.7567, |
| "step": 7500 |
| }, |
| { |
| "epoch": 2.198008269757149, |
| "grad_norm": 0.3346073627471924, |
| "learning_rate": 0.0005739621322458491, |
| "loss": 3.7461, |
| "step": 7550 |
| }, |
| { |
| "epoch": 2.212567701356939, |
| "grad_norm": 0.3327328860759735, |
| "learning_rate": 0.0005737873579959218, |
| "loss": 3.7594, |
| "step": 7600 |
| }, |
| { |
| "epoch": 2.2271271329567295, |
| "grad_norm": 0.3236997723579407, |
| "learning_rate": 0.0005736125837459947, |
| "loss": 3.7726, |
| "step": 7650 |
| }, |
| { |
| "epoch": 2.24168656455652, |
| "grad_norm": 0.33130574226379395, |
| "learning_rate": 0.0005734378094960675, |
| "loss": 3.7486, |
| "step": 7700 |
| }, |
| { |
| "epoch": 2.25624599615631, |
| "grad_norm": 0.34919485449790955, |
| "learning_rate": 0.0005732630352461403, |
| "loss": 3.7578, |
| "step": 7750 |
| }, |
| { |
| "epoch": 2.2708054277561005, |
| "grad_norm": 0.3182968497276306, |
| "learning_rate": 0.0005730882609962132, |
| "loss": 3.7604, |
| "step": 7800 |
| }, |
| { |
| "epoch": 2.285364859355891, |
| "grad_norm": 0.30436646938323975, |
| "learning_rate": 0.0005729134867462859, |
| "loss": 3.7412, |
| "step": 7850 |
| }, |
| { |
| "epoch": 2.299924290955681, |
| "grad_norm": 0.3302886188030243, |
| "learning_rate": 0.0005727387124963588, |
| "loss": 3.7515, |
| "step": 7900 |
| }, |
| { |
| "epoch": 2.3144837225554715, |
| "grad_norm": 0.30620837211608887, |
| "learning_rate": 0.0005725639382464317, |
| "loss": 3.7695, |
| "step": 7950 |
| }, |
| { |
| "epoch": 2.329043154155262, |
| "grad_norm": 0.3169257640838623, |
| "learning_rate": 0.0005723891639965045, |
| "loss": 3.7682, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.329043154155262, |
| "eval_accuracy": 0.34396188967982283, |
| "eval_loss": 3.788954496383667, |
| "eval_runtime": 182.8165, |
| "eval_samples_per_second": 91.037, |
| "eval_steps_per_second": 5.694, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.343602585755052, |
| "grad_norm": 0.3280718922615051, |
| "learning_rate": 0.0005722143897465773, |
| "loss": 3.7452, |
| "step": 8050 |
| }, |
| { |
| "epoch": 2.3581620173548425, |
| "grad_norm": 0.3237084746360779, |
| "learning_rate": 0.0005720396154966502, |
| "loss": 3.762, |
| "step": 8100 |
| }, |
| { |
| "epoch": 2.372721448954633, |
| "grad_norm": 0.31791386008262634, |
| "learning_rate": 0.0005718648412467229, |
| "loss": 3.7504, |
| "step": 8150 |
| }, |
| { |
| "epoch": 2.3872808805544232, |
| "grad_norm": 0.32723358273506165, |
| "learning_rate": 0.0005716900669967958, |
| "loss": 3.7561, |
| "step": 8200 |
| }, |
| { |
| "epoch": 2.4018403121542136, |
| "grad_norm": 0.3216814398765564, |
| "learning_rate": 0.0005715152927468686, |
| "loss": 3.7496, |
| "step": 8250 |
| }, |
| { |
| "epoch": 2.416399743754004, |
| "grad_norm": 0.32928794622421265, |
| "learning_rate": 0.0005713405184969414, |
| "loss": 3.7533, |
| "step": 8300 |
| }, |
| { |
| "epoch": 2.4309591753537942, |
| "grad_norm": 0.3223062753677368, |
| "learning_rate": 0.0005711657442470143, |
| "loss": 3.766, |
| "step": 8350 |
| }, |
| { |
| "epoch": 2.4455186069535846, |
| "grad_norm": 0.3292803168296814, |
| "learning_rate": 0.000570990969997087, |
| "loss": 3.7502, |
| "step": 8400 |
| }, |
| { |
| "epoch": 2.460078038553375, |
| "grad_norm": 0.3402736783027649, |
| "learning_rate": 0.0005708161957471599, |
| "loss": 3.744, |
| "step": 8450 |
| }, |
| { |
| "epoch": 2.4746374701531653, |
| "grad_norm": 0.3164720833301544, |
| "learning_rate": 0.0005706414214972327, |
| "loss": 3.7426, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.4891969017529556, |
| "grad_norm": 0.33465683460235596, |
| "learning_rate": 0.0005704666472473055, |
| "loss": 3.756, |
| "step": 8550 |
| }, |
| { |
| "epoch": 2.503756333352746, |
| "grad_norm": 0.3301171362400055, |
| "learning_rate": 0.0005702918729973784, |
| "loss": 3.7448, |
| "step": 8600 |
| }, |
| { |
| "epoch": 2.5183157649525363, |
| "grad_norm": 0.3436541259288788, |
| "learning_rate": 0.0005701170987474512, |
| "loss": 3.7449, |
| "step": 8650 |
| }, |
| { |
| "epoch": 2.5328751965523266, |
| "grad_norm": 0.3333314061164856, |
| "learning_rate": 0.0005699423244975239, |
| "loss": 3.7381, |
| "step": 8700 |
| }, |
| { |
| "epoch": 2.547434628152117, |
| "grad_norm": 0.3258245885372162, |
| "learning_rate": 0.0005697675502475968, |
| "loss": 3.7338, |
| "step": 8750 |
| }, |
| { |
| "epoch": 2.5619940597519073, |
| "grad_norm": 0.34784647822380066, |
| "learning_rate": 0.0005695927759976696, |
| "loss": 3.734, |
| "step": 8800 |
| }, |
| { |
| "epoch": 2.5765534913516976, |
| "grad_norm": 0.31109482049942017, |
| "learning_rate": 0.0005694180017477425, |
| "loss": 3.7372, |
| "step": 8850 |
| }, |
| { |
| "epoch": 2.591112922951488, |
| "grad_norm": 0.31201112270355225, |
| "learning_rate": 0.0005692432274978153, |
| "loss": 3.7499, |
| "step": 8900 |
| }, |
| { |
| "epoch": 2.6056723545512783, |
| "grad_norm": 0.31193050742149353, |
| "learning_rate": 0.000569068453247888, |
| "loss": 3.7385, |
| "step": 8950 |
| }, |
| { |
| "epoch": 2.6202317861510687, |
| "grad_norm": 0.3446432948112488, |
| "learning_rate": 0.0005688936789979609, |
| "loss": 3.7477, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.6202317861510687, |
| "eval_accuracy": 0.3468738524556142, |
| "eval_loss": 3.757246255874634, |
| "eval_runtime": 182.4423, |
| "eval_samples_per_second": 91.223, |
| "eval_steps_per_second": 5.706, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.634791217750859, |
| "grad_norm": 0.31883829832077026, |
| "learning_rate": 0.0005687189047480337, |
| "loss": 3.7364, |
| "step": 9050 |
| }, |
| { |
| "epoch": 2.6493506493506493, |
| "grad_norm": 0.3273116946220398, |
| "learning_rate": 0.0005685441304981066, |
| "loss": 3.7312, |
| "step": 9100 |
| }, |
| { |
| "epoch": 2.6639100809504397, |
| "grad_norm": 0.3443247973918915, |
| "learning_rate": 0.0005683693562481794, |
| "loss": 3.7366, |
| "step": 9150 |
| }, |
| { |
| "epoch": 2.67846951255023, |
| "grad_norm": 0.30951568484306335, |
| "learning_rate": 0.0005681945819982522, |
| "loss": 3.7425, |
| "step": 9200 |
| }, |
| { |
| "epoch": 2.6930289441500204, |
| "grad_norm": 0.3140866756439209, |
| "learning_rate": 0.000568019807748325, |
| "loss": 3.7396, |
| "step": 9250 |
| }, |
| { |
| "epoch": 2.7075883757498107, |
| "grad_norm": 0.32707467675209045, |
| "learning_rate": 0.0005678450334983978, |
| "loss": 3.7348, |
| "step": 9300 |
| }, |
| { |
| "epoch": 2.722147807349601, |
| "grad_norm": 0.32110151648521423, |
| "learning_rate": 0.0005676702592484707, |
| "loss": 3.7223, |
| "step": 9350 |
| }, |
| { |
| "epoch": 2.7367072389493914, |
| "grad_norm": 0.3235968053340912, |
| "learning_rate": 0.0005674954849985435, |
| "loss": 3.7379, |
| "step": 9400 |
| }, |
| { |
| "epoch": 2.7512666705491817, |
| "grad_norm": 0.34924793243408203, |
| "learning_rate": 0.0005673207107486163, |
| "loss": 3.7503, |
| "step": 9450 |
| }, |
| { |
| "epoch": 2.765826102148972, |
| "grad_norm": 0.32524895668029785, |
| "learning_rate": 0.0005671459364986892, |
| "loss": 3.7302, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.7803855337487624, |
| "grad_norm": 0.3183753490447998, |
| "learning_rate": 0.0005669711622487619, |
| "loss": 3.7229, |
| "step": 9550 |
| }, |
| { |
| "epoch": 2.7949449653485527, |
| "grad_norm": 0.31938815116882324, |
| "learning_rate": 0.0005667963879988348, |
| "loss": 3.7208, |
| "step": 9600 |
| }, |
| { |
| "epoch": 2.809504396948343, |
| "grad_norm": 0.3149973154067993, |
| "learning_rate": 0.0005666216137489076, |
| "loss": 3.7312, |
| "step": 9650 |
| }, |
| { |
| "epoch": 2.8240638285481334, |
| "grad_norm": 0.32664161920547485, |
| "learning_rate": 0.0005664468394989804, |
| "loss": 3.7436, |
| "step": 9700 |
| }, |
| { |
| "epoch": 2.8386232601479238, |
| "grad_norm": 0.31149327754974365, |
| "learning_rate": 0.0005662720652490533, |
| "loss": 3.728, |
| "step": 9750 |
| }, |
| { |
| "epoch": 2.853182691747714, |
| "grad_norm": 0.3289666175842285, |
| "learning_rate": 0.000566097290999126, |
| "loss": 3.7286, |
| "step": 9800 |
| }, |
| { |
| "epoch": 2.8677421233475044, |
| "grad_norm": 0.3204244077205658, |
| "learning_rate": 0.0005659225167491988, |
| "loss": 3.7122, |
| "step": 9850 |
| }, |
| { |
| "epoch": 2.882301554947295, |
| "grad_norm": 0.33363139629364014, |
| "learning_rate": 0.0005657477424992717, |
| "loss": 3.7409, |
| "step": 9900 |
| }, |
| { |
| "epoch": 2.896860986547085, |
| "grad_norm": 0.3554539084434509, |
| "learning_rate": 0.0005655729682493445, |
| "loss": 3.7301, |
| "step": 9950 |
| }, |
| { |
| "epoch": 2.9114204181468755, |
| "grad_norm": 0.306832879781723, |
| "learning_rate": 0.0005653981939994174, |
| "loss": 3.73, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.9114204181468755, |
| "eval_accuracy": 0.3494360034301546, |
| "eval_loss": 3.729952573776245, |
| "eval_runtime": 181.5285, |
| "eval_samples_per_second": 91.683, |
| "eval_steps_per_second": 5.735, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.925979849746666, |
| "grad_norm": 0.31433573365211487, |
| "learning_rate": 0.0005652234197494902, |
| "loss": 3.7247, |
| "step": 10050 |
| }, |
| { |
| "epoch": 2.940539281346456, |
| "grad_norm": 0.3179089426994324, |
| "learning_rate": 0.0005650486454995629, |
| "loss": 3.7153, |
| "step": 10100 |
| }, |
| { |
| "epoch": 2.9550987129462465, |
| "grad_norm": 0.3196451961994171, |
| "learning_rate": 0.0005648738712496358, |
| "loss": 3.7189, |
| "step": 10150 |
| }, |
| { |
| "epoch": 2.969658144546037, |
| "grad_norm": 0.30295759439468384, |
| "learning_rate": 0.0005646990969997086, |
| "loss": 3.7165, |
| "step": 10200 |
| }, |
| { |
| "epoch": 2.984217576145827, |
| "grad_norm": 0.32530921697616577, |
| "learning_rate": 0.0005645243227497815, |
| "loss": 3.715, |
| "step": 10250 |
| }, |
| { |
| "epoch": 2.9987770077456175, |
| "grad_norm": 0.30198994278907776, |
| "learning_rate": 0.0005643495484998543, |
| "loss": 3.7192, |
| "step": 10300 |
| }, |
| { |
| "epoch": 3.0131034884398114, |
| "grad_norm": 0.31793293356895447, |
| "learning_rate": 0.000564174774249927, |
| "loss": 3.6316, |
| "step": 10350 |
| }, |
| { |
| "epoch": 3.0276629200396017, |
| "grad_norm": 0.3131251633167267, |
| "learning_rate": 0.0005639999999999999, |
| "loss": 3.6161, |
| "step": 10400 |
| }, |
| { |
| "epoch": 3.042222351639392, |
| "grad_norm": 0.3221314251422882, |
| "learning_rate": 0.0005638252257500727, |
| "loss": 3.6239, |
| "step": 10450 |
| }, |
| { |
| "epoch": 3.0567817832391824, |
| "grad_norm": 0.3299553096294403, |
| "learning_rate": 0.0005636504515001456, |
| "loss": 3.6255, |
| "step": 10500 |
| }, |
| { |
| "epoch": 3.0713412148389727, |
| "grad_norm": 0.3239217698574066, |
| "learning_rate": 0.0005634756772502185, |
| "loss": 3.6207, |
| "step": 10550 |
| }, |
| { |
| "epoch": 3.085900646438763, |
| "grad_norm": 0.3120846152305603, |
| "learning_rate": 0.0005633009030002913, |
| "loss": 3.6305, |
| "step": 10600 |
| }, |
| { |
| "epoch": 3.1004600780385534, |
| "grad_norm": 0.324990838766098, |
| "learning_rate": 0.000563126128750364, |
| "loss": 3.6298, |
| "step": 10650 |
| }, |
| { |
| "epoch": 3.1150195096383437, |
| "grad_norm": 0.3125215172767639, |
| "learning_rate": 0.0005629513545004369, |
| "loss": 3.617, |
| "step": 10700 |
| }, |
| { |
| "epoch": 3.129578941238134, |
| "grad_norm": 0.3323279917240143, |
| "learning_rate": 0.0005627765802505097, |
| "loss": 3.6235, |
| "step": 10750 |
| }, |
| { |
| "epoch": 3.1441383728379244, |
| "grad_norm": 0.3290170133113861, |
| "learning_rate": 0.0005626018060005826, |
| "loss": 3.6227, |
| "step": 10800 |
| }, |
| { |
| "epoch": 3.1586978044377148, |
| "grad_norm": 0.3450184762477875, |
| "learning_rate": 0.0005624270317506554, |
| "loss": 3.64, |
| "step": 10850 |
| }, |
| { |
| "epoch": 3.173257236037505, |
| "grad_norm": 0.32774847745895386, |
| "learning_rate": 0.0005622522575007282, |
| "loss": 3.646, |
| "step": 10900 |
| }, |
| { |
| "epoch": 3.1878166676372954, |
| "grad_norm": 0.32285189628601074, |
| "learning_rate": 0.000562077483250801, |
| "loss": 3.643, |
| "step": 10950 |
| }, |
| { |
| "epoch": 3.2023760992370858, |
| "grad_norm": 0.3201664686203003, |
| "learning_rate": 0.0005619027090008738, |
| "loss": 3.6397, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.2023760992370858, |
| "eval_accuracy": 0.3513435653971105, |
| "eval_loss": 3.7152557373046875, |
| "eval_runtime": 180.6216, |
| "eval_samples_per_second": 92.143, |
| "eval_steps_per_second": 5.763, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.216935530836876, |
| "grad_norm": 0.32860246300697327, |
| "learning_rate": 0.0005617279347509467, |
| "loss": 3.6478, |
| "step": 11050 |
| }, |
| { |
| "epoch": 3.2314949624366665, |
| "grad_norm": 0.32338783144950867, |
| "learning_rate": 0.0005615531605010195, |
| "loss": 3.6419, |
| "step": 11100 |
| }, |
| { |
| "epoch": 3.246054394036457, |
| "grad_norm": 0.3216056823730469, |
| "learning_rate": 0.0005613783862510923, |
| "loss": 3.6497, |
| "step": 11150 |
| }, |
| { |
| "epoch": 3.260613825636247, |
| "grad_norm": 0.36512988805770874, |
| "learning_rate": 0.0005612036120011652, |
| "loss": 3.6238, |
| "step": 11200 |
| }, |
| { |
| "epoch": 3.2751732572360375, |
| "grad_norm": 0.33006951212882996, |
| "learning_rate": 0.0005610288377512379, |
| "loss": 3.65, |
| "step": 11250 |
| }, |
| { |
| "epoch": 3.289732688835828, |
| "grad_norm": 0.32506290078163147, |
| "learning_rate": 0.0005608540635013107, |
| "loss": 3.6369, |
| "step": 11300 |
| }, |
| { |
| "epoch": 3.304292120435618, |
| "grad_norm": 0.3291010856628418, |
| "learning_rate": 0.0005606792892513836, |
| "loss": 3.644, |
| "step": 11350 |
| }, |
| { |
| "epoch": 3.3188515520354085, |
| "grad_norm": 0.3134164810180664, |
| "learning_rate": 0.0005605045150014564, |
| "loss": 3.6428, |
| "step": 11400 |
| }, |
| { |
| "epoch": 3.333410983635199, |
| "grad_norm": 0.3079008162021637, |
| "learning_rate": 0.0005603297407515293, |
| "loss": 3.638, |
| "step": 11450 |
| }, |
| { |
| "epoch": 3.347970415234989, |
| "grad_norm": 0.2959432899951935, |
| "learning_rate": 0.000560154966501602, |
| "loss": 3.6469, |
| "step": 11500 |
| }, |
| { |
| "epoch": 3.3625298468347795, |
| "grad_norm": 0.3210470676422119, |
| "learning_rate": 0.0005599801922516748, |
| "loss": 3.6441, |
| "step": 11550 |
| }, |
| { |
| "epoch": 3.37708927843457, |
| "grad_norm": 0.3303925395011902, |
| "learning_rate": 0.0005598054180017477, |
| "loss": 3.6448, |
| "step": 11600 |
| }, |
| { |
| "epoch": 3.39164871003436, |
| "grad_norm": 0.3426654040813446, |
| "learning_rate": 0.0005596306437518205, |
| "loss": 3.638, |
| "step": 11650 |
| }, |
| { |
| "epoch": 3.4062081416341505, |
| "grad_norm": 0.35107845067977905, |
| "learning_rate": 0.0005594558695018934, |
| "loss": 3.6483, |
| "step": 11700 |
| }, |
| { |
| "epoch": 3.420767573233941, |
| "grad_norm": 0.3188258111476898, |
| "learning_rate": 0.0005592810952519662, |
| "loss": 3.6422, |
| "step": 11750 |
| }, |
| { |
| "epoch": 3.435327004833731, |
| "grad_norm": 0.33043134212493896, |
| "learning_rate": 0.0005591063210020389, |
| "loss": 3.6448, |
| "step": 11800 |
| }, |
| { |
| "epoch": 3.4498864364335216, |
| "grad_norm": 0.31511127948760986, |
| "learning_rate": 0.0005589315467521118, |
| "loss": 3.648, |
| "step": 11850 |
| }, |
| { |
| "epoch": 3.464445868033312, |
| "grad_norm": 0.3306327164173126, |
| "learning_rate": 0.0005587567725021846, |
| "loss": 3.6258, |
| "step": 11900 |
| }, |
| { |
| "epoch": 3.4790052996331022, |
| "grad_norm": 0.3343588411808014, |
| "learning_rate": 0.0005585819982522575, |
| "loss": 3.646, |
| "step": 11950 |
| }, |
| { |
| "epoch": 3.4935647312328926, |
| "grad_norm": 0.3293665945529938, |
| "learning_rate": 0.0005584072240023303, |
| "loss": 3.6405, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.4935647312328926, |
| "eval_accuracy": 0.35307957260170497, |
| "eval_loss": 3.6972014904022217, |
| "eval_runtime": 181.5639, |
| "eval_samples_per_second": 91.665, |
| "eval_steps_per_second": 5.734, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.508124162832683, |
| "grad_norm": 0.3309422433376312, |
| "learning_rate": 0.000558232449752403, |
| "loss": 3.6445, |
| "step": 12050 |
| }, |
| { |
| "epoch": 3.5226835944324733, |
| "grad_norm": 0.3296276032924652, |
| "learning_rate": 0.0005580576755024759, |
| "loss": 3.6433, |
| "step": 12100 |
| }, |
| { |
| "epoch": 3.5372430260322636, |
| "grad_norm": 0.3203052580356598, |
| "learning_rate": 0.0005578829012525487, |
| "loss": 3.6408, |
| "step": 12150 |
| }, |
| { |
| "epoch": 3.551802457632054, |
| "grad_norm": 0.31741246581077576, |
| "learning_rate": 0.0005577081270026216, |
| "loss": 3.6379, |
| "step": 12200 |
| }, |
| { |
| "epoch": 3.5663618892318443, |
| "grad_norm": 0.32449865341186523, |
| "learning_rate": 0.0005575333527526944, |
| "loss": 3.6515, |
| "step": 12250 |
| }, |
| { |
| "epoch": 3.5809213208316346, |
| "grad_norm": 0.3303356170654297, |
| "learning_rate": 0.0005573585785027672, |
| "loss": 3.6346, |
| "step": 12300 |
| }, |
| { |
| "epoch": 3.595480752431425, |
| "grad_norm": 0.3001437783241272, |
| "learning_rate": 0.00055718380425284, |
| "loss": 3.6476, |
| "step": 12350 |
| }, |
| { |
| "epoch": 3.6100401840312153, |
| "grad_norm": 0.3065738379955292, |
| "learning_rate": 0.0005570090300029128, |
| "loss": 3.6495, |
| "step": 12400 |
| }, |
| { |
| "epoch": 3.6245996156310056, |
| "grad_norm": 0.3155801594257355, |
| "learning_rate": 0.0005568342557529856, |
| "loss": 3.6398, |
| "step": 12450 |
| }, |
| { |
| "epoch": 3.639159047230796, |
| "grad_norm": 0.3072325587272644, |
| "learning_rate": 0.0005566594815030585, |
| "loss": 3.6446, |
| "step": 12500 |
| }, |
| { |
| "epoch": 3.6537184788305863, |
| "grad_norm": 0.331887811422348, |
| "learning_rate": 0.0005564847072531313, |
| "loss": 3.6402, |
| "step": 12550 |
| }, |
| { |
| "epoch": 3.6682779104303767, |
| "grad_norm": 0.30090418457984924, |
| "learning_rate": 0.0005563099330032042, |
| "loss": 3.6303, |
| "step": 12600 |
| }, |
| { |
| "epoch": 3.682837342030167, |
| "grad_norm": 0.3239140808582306, |
| "learning_rate": 0.0005561351587532769, |
| "loss": 3.6552, |
| "step": 12650 |
| }, |
| { |
| "epoch": 3.6973967736299573, |
| "grad_norm": 0.320881724357605, |
| "learning_rate": 0.0005559603845033497, |
| "loss": 3.6356, |
| "step": 12700 |
| }, |
| { |
| "epoch": 3.7119562052297477, |
| "grad_norm": 0.3165138363838196, |
| "learning_rate": 0.0005557856102534226, |
| "loss": 3.6434, |
| "step": 12750 |
| }, |
| { |
| "epoch": 3.726515636829538, |
| "grad_norm": 0.3095230162143707, |
| "learning_rate": 0.0005556108360034954, |
| "loss": 3.6385, |
| "step": 12800 |
| }, |
| { |
| "epoch": 3.7410750684293284, |
| "grad_norm": 0.34694117307662964, |
| "learning_rate": 0.0005554360617535683, |
| "loss": 3.6463, |
| "step": 12850 |
| }, |
| { |
| "epoch": 3.755634500029119, |
| "grad_norm": 0.32559525966644287, |
| "learning_rate": 0.000555261287503641, |
| "loss": 3.6323, |
| "step": 12900 |
| }, |
| { |
| "epoch": 3.770193931628909, |
| "grad_norm": 0.3220575451850891, |
| "learning_rate": 0.0005550865132537138, |
| "loss": 3.6369, |
| "step": 12950 |
| }, |
| { |
| "epoch": 3.7847533632287, |
| "grad_norm": 0.31526488065719604, |
| "learning_rate": 0.0005549117390037867, |
| "loss": 3.6412, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.7847533632287, |
| "eval_accuracy": 0.35453631828429244, |
| "eval_loss": 3.682695150375366, |
| "eval_runtime": 183.1807, |
| "eval_samples_per_second": 90.856, |
| "eval_steps_per_second": 5.683, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.7993127948284897, |
| "grad_norm": 0.3306889832019806, |
| "learning_rate": 0.0005547369647538596, |
| "loss": 3.6618, |
| "step": 13050 |
| }, |
| { |
| "epoch": 3.8138722264282805, |
| "grad_norm": 0.33385586738586426, |
| "learning_rate": 0.0005545621905039324, |
| "loss": 3.6427, |
| "step": 13100 |
| }, |
| { |
| "epoch": 3.8284316580280704, |
| "grad_norm": 0.30829793214797974, |
| "learning_rate": 0.0005543874162540053, |
| "loss": 3.6345, |
| "step": 13150 |
| }, |
| { |
| "epoch": 3.842991089627861, |
| "grad_norm": 0.3245658576488495, |
| "learning_rate": 0.000554212642004078, |
| "loss": 3.6519, |
| "step": 13200 |
| }, |
| { |
| "epoch": 3.857550521227651, |
| "grad_norm": 0.29873931407928467, |
| "learning_rate": 0.0005540378677541508, |
| "loss": 3.639, |
| "step": 13250 |
| }, |
| { |
| "epoch": 3.872109952827442, |
| "grad_norm": 0.3140360414981842, |
| "learning_rate": 0.0005538630935042237, |
| "loss": 3.644, |
| "step": 13300 |
| }, |
| { |
| "epoch": 3.8866693844272318, |
| "grad_norm": 0.31487107276916504, |
| "learning_rate": 0.0005536883192542965, |
| "loss": 3.6451, |
| "step": 13350 |
| }, |
| { |
| "epoch": 3.9012288160270225, |
| "grad_norm": 0.31665652990341187, |
| "learning_rate": 0.0005535135450043694, |
| "loss": 3.63, |
| "step": 13400 |
| }, |
| { |
| "epoch": 3.9157882476268124, |
| "grad_norm": 0.3285450339317322, |
| "learning_rate": 0.0005533387707544422, |
| "loss": 3.6402, |
| "step": 13450 |
| }, |
| { |
| "epoch": 3.930347679226603, |
| "grad_norm": 0.3168368935585022, |
| "learning_rate": 0.0005531639965045149, |
| "loss": 3.6433, |
| "step": 13500 |
| }, |
| { |
| "epoch": 3.944907110826393, |
| "grad_norm": 0.3096484839916229, |
| "learning_rate": 0.0005529892222545878, |
| "loss": 3.6292, |
| "step": 13550 |
| }, |
| { |
| "epoch": 3.959466542426184, |
| "grad_norm": 0.31400060653686523, |
| "learning_rate": 0.0005528144480046606, |
| "loss": 3.6337, |
| "step": 13600 |
| }, |
| { |
| "epoch": 3.974025974025974, |
| "grad_norm": 0.32995402812957764, |
| "learning_rate": 0.0005526396737547335, |
| "loss": 3.644, |
| "step": 13650 |
| }, |
| { |
| "epoch": 3.9885854056257646, |
| "grad_norm": 0.30545228719711304, |
| "learning_rate": 0.0005524648995048063, |
| "loss": 3.6337, |
| "step": 13700 |
| }, |
| { |
| "epoch": 4.002911886319958, |
| "grad_norm": 0.3340036869049072, |
| "learning_rate": 0.000552290125254879, |
| "loss": 3.6049, |
| "step": 13750 |
| }, |
| { |
| "epoch": 4.017471317919749, |
| "grad_norm": 0.3237653076648712, |
| "learning_rate": 0.0005521153510049519, |
| "loss": 3.5263, |
| "step": 13800 |
| }, |
| { |
| "epoch": 4.032030749519539, |
| "grad_norm": 0.33258405327796936, |
| "learning_rate": 0.0005519405767550247, |
| "loss": 3.5231, |
| "step": 13850 |
| }, |
| { |
| "epoch": 4.046590181119329, |
| "grad_norm": 0.33560073375701904, |
| "learning_rate": 0.0005517658025050975, |
| "loss": 3.5422, |
| "step": 13900 |
| }, |
| { |
| "epoch": 4.061149612719119, |
| "grad_norm": 0.32539400458335876, |
| "learning_rate": 0.0005515910282551704, |
| "loss": 3.5393, |
| "step": 13950 |
| }, |
| { |
| "epoch": 4.07570904431891, |
| "grad_norm": 0.3466116786003113, |
| "learning_rate": 0.0005514162540052432, |
| "loss": 3.5371, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.07570904431891, |
| "eval_accuracy": 0.35599906074061566, |
| "eval_loss": 3.6699209213256836, |
| "eval_runtime": 180.5976, |
| "eval_samples_per_second": 92.155, |
| "eval_steps_per_second": 5.764, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.0902684759187, |
| "grad_norm": 0.35234954953193665, |
| "learning_rate": 0.000551241479755316, |
| "loss": 3.5405, |
| "step": 14050 |
| }, |
| { |
| "epoch": 4.104827907518491, |
| "grad_norm": 0.3241097629070282, |
| "learning_rate": 0.0005510667055053888, |
| "loss": 3.5312, |
| "step": 14100 |
| }, |
| { |
| "epoch": 4.119387339118281, |
| "grad_norm": 0.35480767488479614, |
| "learning_rate": 0.0005508919312554616, |
| "loss": 3.541, |
| "step": 14150 |
| }, |
| { |
| "epoch": 4.1339467707180715, |
| "grad_norm": 0.31226274371147156, |
| "learning_rate": 0.0005507171570055345, |
| "loss": 3.5525, |
| "step": 14200 |
| }, |
| { |
| "epoch": 4.148506202317861, |
| "grad_norm": 0.3221980631351471, |
| "learning_rate": 0.0005505423827556073, |
| "loss": 3.5545, |
| "step": 14250 |
| }, |
| { |
| "epoch": 4.163065633917652, |
| "grad_norm": 0.33322617411613464, |
| "learning_rate": 0.0005503676085056802, |
| "loss": 3.5607, |
| "step": 14300 |
| }, |
| { |
| "epoch": 4.177625065517442, |
| "grad_norm": 0.31406116485595703, |
| "learning_rate": 0.0005501928342557529, |
| "loss": 3.5597, |
| "step": 14350 |
| }, |
| { |
| "epoch": 4.192184497117233, |
| "grad_norm": 0.30982154607772827, |
| "learning_rate": 0.0005500180600058257, |
| "loss": 3.5544, |
| "step": 14400 |
| }, |
| { |
| "epoch": 4.206743928717023, |
| "grad_norm": 0.31833505630493164, |
| "learning_rate": 0.0005498432857558986, |
| "loss": 3.5589, |
| "step": 14450 |
| }, |
| { |
| "epoch": 4.2213033603168135, |
| "grad_norm": 0.31112346053123474, |
| "learning_rate": 0.0005496685115059714, |
| "loss": 3.5535, |
| "step": 14500 |
| }, |
| { |
| "epoch": 4.235862791916603, |
| "grad_norm": 0.3102998733520508, |
| "learning_rate": 0.0005494937372560443, |
| "loss": 3.5584, |
| "step": 14550 |
| }, |
| { |
| "epoch": 4.250422223516394, |
| "grad_norm": 0.3442176878452301, |
| "learning_rate": 0.000549318963006117, |
| "loss": 3.5691, |
| "step": 14600 |
| }, |
| { |
| "epoch": 4.264981655116184, |
| "grad_norm": 0.3217466175556183, |
| "learning_rate": 0.0005491441887561898, |
| "loss": 3.5748, |
| "step": 14650 |
| }, |
| { |
| "epoch": 4.279541086715975, |
| "grad_norm": 0.32345715165138245, |
| "learning_rate": 0.0005489694145062627, |
| "loss": 3.5711, |
| "step": 14700 |
| }, |
| { |
| "epoch": 4.294100518315765, |
| "grad_norm": 0.31309959292411804, |
| "learning_rate": 0.0005487946402563355, |
| "loss": 3.5544, |
| "step": 14750 |
| }, |
| { |
| "epoch": 4.308659949915556, |
| "grad_norm": 0.31507858633995056, |
| "learning_rate": 0.0005486198660064084, |
| "loss": 3.5806, |
| "step": 14800 |
| }, |
| { |
| "epoch": 4.3232193815153455, |
| "grad_norm": 0.3113386631011963, |
| "learning_rate": 0.0005484450917564812, |
| "loss": 3.5698, |
| "step": 14850 |
| }, |
| { |
| "epoch": 4.337778813115136, |
| "grad_norm": 0.30662500858306885, |
| "learning_rate": 0.0005482703175065539, |
| "loss": 3.5684, |
| "step": 14900 |
| }, |
| { |
| "epoch": 4.352338244714926, |
| "grad_norm": 0.33159640431404114, |
| "learning_rate": 0.0005480955432566268, |
| "loss": 3.5681, |
| "step": 14950 |
| }, |
| { |
| "epoch": 4.366897676314717, |
| "grad_norm": 0.3497229218482971, |
| "learning_rate": 0.0005479207690066996, |
| "loss": 3.5768, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.366897676314717, |
| "eval_accuracy": 0.35680662627036064, |
| "eval_loss": 3.663057565689087, |
| "eval_runtime": 180.5674, |
| "eval_samples_per_second": 92.171, |
| "eval_steps_per_second": 5.765, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.381457107914507, |
| "grad_norm": 0.3152848184108734, |
| "learning_rate": 0.0005477459947567725, |
| "loss": 3.5651, |
| "step": 15050 |
| }, |
| { |
| "epoch": 4.396016539514298, |
| "grad_norm": 0.31485655903816223, |
| "learning_rate": 0.0005475712205068453, |
| "loss": 3.5724, |
| "step": 15100 |
| }, |
| { |
| "epoch": 4.4105759711140875, |
| "grad_norm": 0.3210237920284271, |
| "learning_rate": 0.000547396446256918, |
| "loss": 3.5743, |
| "step": 15150 |
| }, |
| { |
| "epoch": 4.425135402713878, |
| "grad_norm": 0.31647804379463196, |
| "learning_rate": 0.0005472216720069909, |
| "loss": 3.5643, |
| "step": 15200 |
| }, |
| { |
| "epoch": 4.439694834313668, |
| "grad_norm": 0.3220058083534241, |
| "learning_rate": 0.0005470468977570637, |
| "loss": 3.5777, |
| "step": 15250 |
| }, |
| { |
| "epoch": 4.454254265913459, |
| "grad_norm": 0.31475868821144104, |
| "learning_rate": 0.0005468721235071365, |
| "loss": 3.5759, |
| "step": 15300 |
| }, |
| { |
| "epoch": 4.468813697513249, |
| "grad_norm": 0.31258007884025574, |
| "learning_rate": 0.0005466973492572094, |
| "loss": 3.58, |
| "step": 15350 |
| }, |
| { |
| "epoch": 4.48337312911304, |
| "grad_norm": 0.3323783874511719, |
| "learning_rate": 0.0005465225750072822, |
| "loss": 3.5717, |
| "step": 15400 |
| }, |
| { |
| "epoch": 4.4979325607128295, |
| "grad_norm": 0.31647196412086487, |
| "learning_rate": 0.000546347800757355, |
| "loss": 3.5666, |
| "step": 15450 |
| }, |
| { |
| "epoch": 4.51249199231262, |
| "grad_norm": 0.3166157007217407, |
| "learning_rate": 0.0005461730265074279, |
| "loss": 3.5661, |
| "step": 15500 |
| }, |
| { |
| "epoch": 4.52705142391241, |
| "grad_norm": 0.33359718322753906, |
| "learning_rate": 0.0005459982522575007, |
| "loss": 3.581, |
| "step": 15550 |
| }, |
| { |
| "epoch": 4.541610855512201, |
| "grad_norm": 0.30880287289619446, |
| "learning_rate": 0.0005458234780075735, |
| "loss": 3.5767, |
| "step": 15600 |
| }, |
| { |
| "epoch": 4.556170287111991, |
| "grad_norm": 0.3321440517902374, |
| "learning_rate": 0.0005456487037576464, |
| "loss": 3.5927, |
| "step": 15650 |
| }, |
| { |
| "epoch": 4.570729718711782, |
| "grad_norm": 0.35169097781181335, |
| "learning_rate": 0.0005454739295077192, |
| "loss": 3.5777, |
| "step": 15700 |
| }, |
| { |
| "epoch": 4.585289150311572, |
| "grad_norm": 0.3210912048816681, |
| "learning_rate": 0.000545299155257792, |
| "loss": 3.5641, |
| "step": 15750 |
| }, |
| { |
| "epoch": 4.599848581911362, |
| "grad_norm": 0.3266526460647583, |
| "learning_rate": 0.0005451243810078648, |
| "loss": 3.5624, |
| "step": 15800 |
| }, |
| { |
| "epoch": 4.614408013511152, |
| "grad_norm": 0.3169322609901428, |
| "learning_rate": 0.0005449496067579376, |
| "loss": 3.582, |
| "step": 15850 |
| }, |
| { |
| "epoch": 4.628967445110943, |
| "grad_norm": 0.30979159474372864, |
| "learning_rate": 0.0005447748325080105, |
| "loss": 3.5808, |
| "step": 15900 |
| }, |
| { |
| "epoch": 4.643526876710733, |
| "grad_norm": 0.3104844391345978, |
| "learning_rate": 0.0005446000582580833, |
| "loss": 3.5779, |
| "step": 15950 |
| }, |
| { |
| "epoch": 4.658086308310524, |
| "grad_norm": 0.3167930543422699, |
| "learning_rate": 0.0005444252840081562, |
| "loss": 3.577, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.658086308310524, |
| "eval_accuracy": 0.3581025173162721, |
| "eval_loss": 3.6506025791168213, |
| "eval_runtime": 184.1909, |
| "eval_samples_per_second": 90.357, |
| "eval_steps_per_second": 5.652, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.672645739910314, |
| "grad_norm": 0.3104100823402405, |
| "learning_rate": 0.0005442505097582289, |
| "loss": 3.5755, |
| "step": 16050 |
| }, |
| { |
| "epoch": 4.687205171510104, |
| "grad_norm": 0.32251089811325073, |
| "learning_rate": 0.0005440757355083017, |
| "loss": 3.5785, |
| "step": 16100 |
| }, |
| { |
| "epoch": 4.701764603109894, |
| "grad_norm": 0.30579274892807007, |
| "learning_rate": 0.0005439009612583746, |
| "loss": 3.5736, |
| "step": 16150 |
| }, |
| { |
| "epoch": 4.716324034709685, |
| "grad_norm": 0.32924431562423706, |
| "learning_rate": 0.0005437261870084474, |
| "loss": 3.5859, |
| "step": 16200 |
| }, |
| { |
| "epoch": 4.730883466309475, |
| "grad_norm": 0.32339397072792053, |
| "learning_rate": 0.0005435514127585203, |
| "loss": 3.5714, |
| "step": 16250 |
| }, |
| { |
| "epoch": 4.745442897909266, |
| "grad_norm": 0.3301834762096405, |
| "learning_rate": 0.000543376638508593, |
| "loss": 3.581, |
| "step": 16300 |
| }, |
| { |
| "epoch": 4.760002329509056, |
| "grad_norm": 0.3323529064655304, |
| "learning_rate": 0.0005432018642586658, |
| "loss": 3.5745, |
| "step": 16350 |
| }, |
| { |
| "epoch": 4.7745617611088464, |
| "grad_norm": 0.31460458040237427, |
| "learning_rate": 0.0005430270900087387, |
| "loss": 3.5752, |
| "step": 16400 |
| }, |
| { |
| "epoch": 4.789121192708636, |
| "grad_norm": 0.30962061882019043, |
| "learning_rate": 0.0005428523157588115, |
| "loss": 3.5847, |
| "step": 16450 |
| }, |
| { |
| "epoch": 4.803680624308427, |
| "grad_norm": 0.31121689081192017, |
| "learning_rate": 0.0005426775415088843, |
| "loss": 3.581, |
| "step": 16500 |
| }, |
| { |
| "epoch": 4.818240055908217, |
| "grad_norm": 0.3271123170852661, |
| "learning_rate": 0.0005425027672589572, |
| "loss": 3.5747, |
| "step": 16550 |
| }, |
| { |
| "epoch": 4.832799487508008, |
| "grad_norm": 0.34155216813087463, |
| "learning_rate": 0.0005423279930090299, |
| "loss": 3.5757, |
| "step": 16600 |
| }, |
| { |
| "epoch": 4.847358919107798, |
| "grad_norm": 0.31826114654541016, |
| "learning_rate": 0.0005421532187591028, |
| "loss": 3.5863, |
| "step": 16650 |
| }, |
| { |
| "epoch": 4.8619183507075885, |
| "grad_norm": 0.3213462829589844, |
| "learning_rate": 0.0005419784445091756, |
| "loss": 3.5846, |
| "step": 16700 |
| }, |
| { |
| "epoch": 4.876477782307378, |
| "grad_norm": 0.3335978388786316, |
| "learning_rate": 0.0005418036702592484, |
| "loss": 3.5778, |
| "step": 16750 |
| }, |
| { |
| "epoch": 4.891037213907169, |
| "grad_norm": 0.32565537095069885, |
| "learning_rate": 0.0005416288960093213, |
| "loss": 3.5903, |
| "step": 16800 |
| }, |
| { |
| "epoch": 4.905596645506959, |
| "grad_norm": 0.31601616740226746, |
| "learning_rate": 0.000541454121759394, |
| "loss": 3.581, |
| "step": 16850 |
| }, |
| { |
| "epoch": 4.92015607710675, |
| "grad_norm": 0.3034924268722534, |
| "learning_rate": 0.0005412793475094669, |
| "loss": 3.5731, |
| "step": 16900 |
| }, |
| { |
| "epoch": 4.93471550870654, |
| "grad_norm": 0.30528074502944946, |
| "learning_rate": 0.0005411045732595397, |
| "loss": 3.5775, |
| "step": 16950 |
| }, |
| { |
| "epoch": 4.9492749403063305, |
| "grad_norm": 0.32346123456954956, |
| "learning_rate": 0.0005409297990096125, |
| "loss": 3.5711, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.9492749403063305, |
| "eval_accuracy": 0.3593712465046746, |
| "eval_loss": 3.6345937252044678, |
| "eval_runtime": 183.7217, |
| "eval_samples_per_second": 90.588, |
| "eval_steps_per_second": 5.666, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.96383437190612, |
| "grad_norm": 0.3116399049758911, |
| "learning_rate": 0.0005407550247596854, |
| "loss": 3.5657, |
| "step": 17050 |
| }, |
| { |
| "epoch": 4.978393803505911, |
| "grad_norm": 0.3291073143482208, |
| "learning_rate": 0.0005405802505097582, |
| "loss": 3.5751, |
| "step": 17100 |
| }, |
| { |
| "epoch": 4.992953235105701, |
| "grad_norm": 0.3149360716342926, |
| "learning_rate": 0.000540405476259831, |
| "loss": 3.5743, |
| "step": 17150 |
| }, |
| { |
| "epoch": 5.007279715799895, |
| "grad_norm": 0.3213154971599579, |
| "learning_rate": 0.0005402307020099038, |
| "loss": 3.5347, |
| "step": 17200 |
| }, |
| { |
| "epoch": 5.021839147399685, |
| "grad_norm": 0.3356756567955017, |
| "learning_rate": 0.0005400559277599766, |
| "loss": 3.4594, |
| "step": 17250 |
| }, |
| { |
| "epoch": 5.036398578999476, |
| "grad_norm": 0.3190675973892212, |
| "learning_rate": 0.0005398811535100495, |
| "loss": 3.4737, |
| "step": 17300 |
| }, |
| { |
| "epoch": 5.050958010599266, |
| "grad_norm": 0.30441927909851074, |
| "learning_rate": 0.0005397063792601223, |
| "loss": 3.4683, |
| "step": 17350 |
| }, |
| { |
| "epoch": 5.065517442199057, |
| "grad_norm": 0.3276670277118683, |
| "learning_rate": 0.0005395316050101951, |
| "loss": 3.4779, |
| "step": 17400 |
| }, |
| { |
| "epoch": 5.080076873798847, |
| "grad_norm": 0.3393913805484772, |
| "learning_rate": 0.0005393568307602679, |
| "loss": 3.4886, |
| "step": 17450 |
| }, |
| { |
| "epoch": 5.094636305398637, |
| "grad_norm": 0.33122798800468445, |
| "learning_rate": 0.0005391820565103407, |
| "loss": 3.4829, |
| "step": 17500 |
| }, |
| { |
| "epoch": 5.109195736998427, |
| "grad_norm": 0.32901448011398315, |
| "learning_rate": 0.0005390072822604136, |
| "loss": 3.4872, |
| "step": 17550 |
| }, |
| { |
| "epoch": 5.123755168598218, |
| "grad_norm": 0.3309627175331116, |
| "learning_rate": 0.0005388325080104864, |
| "loss": 3.4831, |
| "step": 17600 |
| }, |
| { |
| "epoch": 5.138314600198008, |
| "grad_norm": 0.32044172286987305, |
| "learning_rate": 0.0005386577337605593, |
| "loss": 3.4888, |
| "step": 17650 |
| }, |
| { |
| "epoch": 5.152874031797799, |
| "grad_norm": 0.3464089334011078, |
| "learning_rate": 0.000538482959510632, |
| "loss": 3.4972, |
| "step": 17700 |
| }, |
| { |
| "epoch": 5.167433463397589, |
| "grad_norm": 0.3171513080596924, |
| "learning_rate": 0.0005383081852607048, |
| "loss": 3.5026, |
| "step": 17750 |
| }, |
| { |
| "epoch": 5.1819928949973795, |
| "grad_norm": 0.3164452612400055, |
| "learning_rate": 0.0005381334110107777, |
| "loss": 3.4926, |
| "step": 17800 |
| }, |
| { |
| "epoch": 5.196552326597169, |
| "grad_norm": 0.32658103108406067, |
| "learning_rate": 0.0005379586367608505, |
| "loss": 3.5046, |
| "step": 17850 |
| }, |
| { |
| "epoch": 5.21111175819696, |
| "grad_norm": 0.32511815428733826, |
| "learning_rate": 0.0005377838625109233, |
| "loss": 3.4953, |
| "step": 17900 |
| }, |
| { |
| "epoch": 5.22567118979675, |
| "grad_norm": 0.343904972076416, |
| "learning_rate": 0.0005376090882609961, |
| "loss": 3.5065, |
| "step": 17950 |
| }, |
| { |
| "epoch": 5.240230621396541, |
| "grad_norm": 0.33408525586128235, |
| "learning_rate": 0.0005374343140110689, |
| "loss": 3.5066, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.240230621396541, |
| "eval_accuracy": 0.35997068871065013, |
| "eval_loss": 3.6377220153808594, |
| "eval_runtime": 181.2212, |
| "eval_samples_per_second": 91.838, |
| "eval_steps_per_second": 5.744, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.254790052996331, |
| "grad_norm": 0.3558831512928009, |
| "learning_rate": 0.0005372595397611418, |
| "loss": 3.5154, |
| "step": 18050 |
| }, |
| { |
| "epoch": 5.2693494845961215, |
| "grad_norm": 0.3240915536880493, |
| "learning_rate": 0.0005370847655112147, |
| "loss": 3.5104, |
| "step": 18100 |
| }, |
| { |
| "epoch": 5.283908916195911, |
| "grad_norm": 0.3641294538974762, |
| "learning_rate": 0.0005369099912612875, |
| "loss": 3.5125, |
| "step": 18150 |
| }, |
| { |
| "epoch": 5.298468347795702, |
| "grad_norm": 0.323595255613327, |
| "learning_rate": 0.0005367352170113603, |
| "loss": 3.5091, |
| "step": 18200 |
| }, |
| { |
| "epoch": 5.313027779395492, |
| "grad_norm": 0.31085318326950073, |
| "learning_rate": 0.0005365604427614331, |
| "loss": 3.5061, |
| "step": 18250 |
| }, |
| { |
| "epoch": 5.327587210995283, |
| "grad_norm": 0.3321459889411926, |
| "learning_rate": 0.0005363856685115059, |
| "loss": 3.5128, |
| "step": 18300 |
| }, |
| { |
| "epoch": 5.342146642595073, |
| "grad_norm": 0.3359740674495697, |
| "learning_rate": 0.0005362108942615788, |
| "loss": 3.5207, |
| "step": 18350 |
| }, |
| { |
| "epoch": 5.3567060741948636, |
| "grad_norm": 0.35164040327072144, |
| "learning_rate": 0.0005360361200116516, |
| "loss": 3.5206, |
| "step": 18400 |
| }, |
| { |
| "epoch": 5.3712655057946534, |
| "grad_norm": 0.33065569400787354, |
| "learning_rate": 0.0005358613457617244, |
| "loss": 3.5137, |
| "step": 18450 |
| }, |
| { |
| "epoch": 5.385824937394444, |
| "grad_norm": 0.31795698404312134, |
| "learning_rate": 0.0005356865715117973, |
| "loss": 3.5181, |
| "step": 18500 |
| }, |
| { |
| "epoch": 5.400384368994234, |
| "grad_norm": 0.3166426718235016, |
| "learning_rate": 0.00053551179726187, |
| "loss": 3.5129, |
| "step": 18550 |
| }, |
| { |
| "epoch": 5.414943800594025, |
| "grad_norm": 0.3113225996494293, |
| "learning_rate": 0.0005353370230119429, |
| "loss": 3.5224, |
| "step": 18600 |
| }, |
| { |
| "epoch": 5.429503232193815, |
| "grad_norm": 0.3037504553794861, |
| "learning_rate": 0.0005351622487620157, |
| "loss": 3.5212, |
| "step": 18650 |
| }, |
| { |
| "epoch": 5.444062663793606, |
| "grad_norm": 0.3170977830886841, |
| "learning_rate": 0.0005349874745120885, |
| "loss": 3.5185, |
| "step": 18700 |
| }, |
| { |
| "epoch": 5.4586220953933955, |
| "grad_norm": 0.3276199698448181, |
| "learning_rate": 0.0005348127002621614, |
| "loss": 3.5143, |
| "step": 18750 |
| }, |
| { |
| "epoch": 5.473181526993186, |
| "grad_norm": 0.35049423575401306, |
| "learning_rate": 0.0005346379260122341, |
| "loss": 3.5178, |
| "step": 18800 |
| }, |
| { |
| "epoch": 5.487740958592976, |
| "grad_norm": 0.3257882595062256, |
| "learning_rate": 0.000534463151762307, |
| "loss": 3.513, |
| "step": 18850 |
| }, |
| { |
| "epoch": 5.502300390192767, |
| "grad_norm": 0.3254280686378479, |
| "learning_rate": 0.0005342883775123798, |
| "loss": 3.5157, |
| "step": 18900 |
| }, |
| { |
| "epoch": 5.516859821792557, |
| "grad_norm": 0.35354653000831604, |
| "learning_rate": 0.0005341136032624526, |
| "loss": 3.5323, |
| "step": 18950 |
| }, |
| { |
| "epoch": 5.531419253392348, |
| "grad_norm": 0.3293665945529938, |
| "learning_rate": 0.0005339388290125255, |
| "loss": 3.5294, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.531419253392348, |
| "eval_accuracy": 0.36105410583223874, |
| "eval_loss": 3.6271042823791504, |
| "eval_runtime": 181.268, |
| "eval_samples_per_second": 91.814, |
| "eval_steps_per_second": 5.743, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.5459786849921375, |
| "grad_norm": 0.32479095458984375, |
| "learning_rate": 0.0005337640547625983, |
| "loss": 3.5257, |
| "step": 19050 |
| }, |
| { |
| "epoch": 5.560538116591928, |
| "grad_norm": 0.30282458662986755, |
| "learning_rate": 0.000533589280512671, |
| "loss": 3.5376, |
| "step": 19100 |
| }, |
| { |
| "epoch": 5.575097548191718, |
| "grad_norm": 0.3051811754703522, |
| "learning_rate": 0.0005334145062627439, |
| "loss": 3.5188, |
| "step": 19150 |
| }, |
| { |
| "epoch": 5.589656979791509, |
| "grad_norm": 0.34127405285835266, |
| "learning_rate": 0.0005332397320128167, |
| "loss": 3.5171, |
| "step": 19200 |
| }, |
| { |
| "epoch": 5.604216411391299, |
| "grad_norm": 0.3210941553115845, |
| "learning_rate": 0.0005330649577628896, |
| "loss": 3.5248, |
| "step": 19250 |
| }, |
| { |
| "epoch": 5.61877584299109, |
| "grad_norm": 0.3192020654678345, |
| "learning_rate": 0.0005328901835129624, |
| "loss": 3.533, |
| "step": 19300 |
| }, |
| { |
| "epoch": 5.6333352745908805, |
| "grad_norm": 0.34110450744628906, |
| "learning_rate": 0.0005327154092630351, |
| "loss": 3.5295, |
| "step": 19350 |
| }, |
| { |
| "epoch": 5.64789470619067, |
| "grad_norm": 0.3144545555114746, |
| "learning_rate": 0.000532540635013108, |
| "loss": 3.5335, |
| "step": 19400 |
| }, |
| { |
| "epoch": 5.66245413779046, |
| "grad_norm": 0.3245835304260254, |
| "learning_rate": 0.0005323658607631808, |
| "loss": 3.5229, |
| "step": 19450 |
| }, |
| { |
| "epoch": 5.677013569390251, |
| "grad_norm": 0.3528177738189697, |
| "learning_rate": 0.0005321910865132537, |
| "loss": 3.5209, |
| "step": 19500 |
| }, |
| { |
| "epoch": 5.691573000990042, |
| "grad_norm": 0.3312878906726837, |
| "learning_rate": 0.0005320163122633265, |
| "loss": 3.5321, |
| "step": 19550 |
| }, |
| { |
| "epoch": 5.706132432589832, |
| "grad_norm": 0.3077809512615204, |
| "learning_rate": 0.0005318415380133993, |
| "loss": 3.5183, |
| "step": 19600 |
| }, |
| { |
| "epoch": 5.720691864189622, |
| "grad_norm": 0.32409968972206116, |
| "learning_rate": 0.0005316667637634721, |
| "loss": 3.5276, |
| "step": 19650 |
| }, |
| { |
| "epoch": 5.735251295789412, |
| "grad_norm": 0.3110126852989197, |
| "learning_rate": 0.0005314919895135449, |
| "loss": 3.5253, |
| "step": 19700 |
| }, |
| { |
| "epoch": 5.749810727389203, |
| "grad_norm": 0.33343297243118286, |
| "learning_rate": 0.0005313172152636178, |
| "loss": 3.5272, |
| "step": 19750 |
| }, |
| { |
| "epoch": 5.764370158988993, |
| "grad_norm": 0.3315747082233429, |
| "learning_rate": 0.0005311424410136906, |
| "loss": 3.534, |
| "step": 19800 |
| }, |
| { |
| "epoch": 5.778929590588783, |
| "grad_norm": 0.3091914653778076, |
| "learning_rate": 0.0005309676667637634, |
| "loss": 3.5276, |
| "step": 19850 |
| }, |
| { |
| "epoch": 5.793489022188574, |
| "grad_norm": 0.30921050906181335, |
| "learning_rate": 0.0005307928925138363, |
| "loss": 3.5219, |
| "step": 19900 |
| }, |
| { |
| "epoch": 5.8080484537883645, |
| "grad_norm": 0.30907315015792847, |
| "learning_rate": 0.000530618118263909, |
| "loss": 3.534, |
| "step": 19950 |
| }, |
| { |
| "epoch": 5.822607885388154, |
| "grad_norm": 0.36628568172454834, |
| "learning_rate": 0.0005304433440139819, |
| "loss": 3.538, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.822607885388154, |
| "eval_accuracy": 0.36194950645964236, |
| "eval_loss": 3.61657452583313, |
| "eval_runtime": 183.6498, |
| "eval_samples_per_second": 90.624, |
| "eval_steps_per_second": 5.668, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.837167316987944, |
| "grad_norm": 0.3185259997844696, |
| "learning_rate": 0.0005302685697640547, |
| "loss": 3.5243, |
| "step": 20050 |
| }, |
| { |
| "epoch": 5.851726748587735, |
| "grad_norm": 0.3328113257884979, |
| "learning_rate": 0.0005300937955141275, |
| "loss": 3.5306, |
| "step": 20100 |
| }, |
| { |
| "epoch": 5.866286180187526, |
| "grad_norm": 0.31715288758277893, |
| "learning_rate": 0.0005299190212642004, |
| "loss": 3.5368, |
| "step": 20150 |
| }, |
| { |
| "epoch": 5.880845611787316, |
| "grad_norm": 0.3114943206310272, |
| "learning_rate": 0.0005297442470142731, |
| "loss": 3.5279, |
| "step": 20200 |
| }, |
| { |
| "epoch": 5.895405043387106, |
| "grad_norm": 0.3375224471092224, |
| "learning_rate": 0.000529569472764346, |
| "loss": 3.5214, |
| "step": 20250 |
| }, |
| { |
| "epoch": 5.9099644749868965, |
| "grad_norm": 0.29627102613449097, |
| "learning_rate": 0.0005293946985144188, |
| "loss": 3.5182, |
| "step": 20300 |
| }, |
| { |
| "epoch": 5.924523906586687, |
| "grad_norm": 0.33964815735816956, |
| "learning_rate": 0.0005292199242644916, |
| "loss": 3.541, |
| "step": 20350 |
| }, |
| { |
| "epoch": 5.939083338186477, |
| "grad_norm": 0.3077552914619446, |
| "learning_rate": 0.0005290451500145645, |
| "loss": 3.5246, |
| "step": 20400 |
| }, |
| { |
| "epoch": 5.953642769786267, |
| "grad_norm": 0.3167116641998291, |
| "learning_rate": 0.0005288703757646373, |
| "loss": 3.5294, |
| "step": 20450 |
| }, |
| { |
| "epoch": 5.968202201386058, |
| "grad_norm": 0.3327026665210724, |
| "learning_rate": 0.00052869560151471, |
| "loss": 3.5322, |
| "step": 20500 |
| }, |
| { |
| "epoch": 5.982761632985849, |
| "grad_norm": 0.3215795159339905, |
| "learning_rate": 0.0005285208272647829, |
| "loss": 3.5378, |
| "step": 20550 |
| }, |
| { |
| "epoch": 5.9973210645856385, |
| "grad_norm": 0.3464929759502411, |
| "learning_rate": 0.0005283460530148558, |
| "loss": 3.544, |
| "step": 20600 |
| }, |
| { |
| "epoch": 6.011647545279832, |
| "grad_norm": 0.37006425857543945, |
| "learning_rate": 0.0005281712787649286, |
| "loss": 3.4569, |
| "step": 20650 |
| }, |
| { |
| "epoch": 6.026206976879623, |
| "grad_norm": 0.32685425877571106, |
| "learning_rate": 0.0005279965045150015, |
| "loss": 3.4143, |
| "step": 20700 |
| }, |
| { |
| "epoch": 6.040766408479413, |
| "grad_norm": 0.31896543502807617, |
| "learning_rate": 0.0005278217302650743, |
| "loss": 3.4284, |
| "step": 20750 |
| }, |
| { |
| "epoch": 6.055325840079203, |
| "grad_norm": 0.3501061499118805, |
| "learning_rate": 0.000527646956015147, |
| "loss": 3.4257, |
| "step": 20800 |
| }, |
| { |
| "epoch": 6.069885271678993, |
| "grad_norm": 0.3293428421020508, |
| "learning_rate": 0.0005274721817652199, |
| "loss": 3.4253, |
| "step": 20850 |
| }, |
| { |
| "epoch": 6.084444703278784, |
| "grad_norm": 0.33916565775871277, |
| "learning_rate": 0.0005272974075152927, |
| "loss": 3.4532, |
| "step": 20900 |
| }, |
| { |
| "epoch": 6.099004134878574, |
| "grad_norm": 0.3229523301124573, |
| "learning_rate": 0.0005271226332653656, |
| "loss": 3.4476, |
| "step": 20950 |
| }, |
| { |
| "epoch": 6.113563566478365, |
| "grad_norm": 0.3364764153957367, |
| "learning_rate": 0.0005269478590154384, |
| "loss": 3.4415, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.113563566478365, |
| "eval_accuracy": 0.3621457538197391, |
| "eval_loss": 3.6198906898498535, |
| "eval_runtime": 183.4856, |
| "eval_samples_per_second": 90.705, |
| "eval_steps_per_second": 5.673, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.128122998078155, |
| "grad_norm": 0.3735044300556183, |
| "learning_rate": 0.0005267730847655111, |
| "loss": 3.4457, |
| "step": 21050 |
| }, |
| { |
| "epoch": 6.142682429677945, |
| "grad_norm": 0.34455105662345886, |
| "learning_rate": 0.000526598310515584, |
| "loss": 3.4528, |
| "step": 21100 |
| }, |
| { |
| "epoch": 6.157241861277735, |
| "grad_norm": 0.33916333317756653, |
| "learning_rate": 0.0005264235362656568, |
| "loss": 3.4609, |
| "step": 21150 |
| }, |
| { |
| "epoch": 6.171801292877526, |
| "grad_norm": 0.3121279180049896, |
| "learning_rate": 0.0005262487620157297, |
| "loss": 3.4478, |
| "step": 21200 |
| }, |
| { |
| "epoch": 6.186360724477316, |
| "grad_norm": 0.30740803480148315, |
| "learning_rate": 0.0005260739877658025, |
| "loss": 3.4448, |
| "step": 21250 |
| }, |
| { |
| "epoch": 6.200920156077107, |
| "grad_norm": 0.3505891263484955, |
| "learning_rate": 0.0005258992135158753, |
| "loss": 3.4585, |
| "step": 21300 |
| }, |
| { |
| "epoch": 6.215479587676897, |
| "grad_norm": 0.33900803327560425, |
| "learning_rate": 0.0005257244392659481, |
| "loss": 3.46, |
| "step": 21350 |
| }, |
| { |
| "epoch": 6.2300390192766875, |
| "grad_norm": 0.3224051892757416, |
| "learning_rate": 0.0005255496650160209, |
| "loss": 3.4557, |
| "step": 21400 |
| }, |
| { |
| "epoch": 6.244598450876477, |
| "grad_norm": 0.35417911410331726, |
| "learning_rate": 0.0005253748907660938, |
| "loss": 3.4583, |
| "step": 21450 |
| }, |
| { |
| "epoch": 6.259157882476268, |
| "grad_norm": 0.34107911586761475, |
| "learning_rate": 0.0005252001165161666, |
| "loss": 3.4659, |
| "step": 21500 |
| }, |
| { |
| "epoch": 6.273717314076059, |
| "grad_norm": 0.32315975427627563, |
| "learning_rate": 0.0005250253422662394, |
| "loss": 3.4613, |
| "step": 21550 |
| }, |
| { |
| "epoch": 6.288276745675849, |
| "grad_norm": 0.3344326615333557, |
| "learning_rate": 0.0005248505680163123, |
| "loss": 3.4729, |
| "step": 21600 |
| }, |
| { |
| "epoch": 6.302836177275639, |
| "grad_norm": 0.34388530254364014, |
| "learning_rate": 0.000524675793766385, |
| "loss": 3.4723, |
| "step": 21650 |
| }, |
| { |
| "epoch": 6.3173956088754295, |
| "grad_norm": 0.34264546632766724, |
| "learning_rate": 0.0005245010195164579, |
| "loss": 3.4751, |
| "step": 21700 |
| }, |
| { |
| "epoch": 6.33195504047522, |
| "grad_norm": 0.32228031754493713, |
| "learning_rate": 0.0005243262452665307, |
| "loss": 3.4586, |
| "step": 21750 |
| }, |
| { |
| "epoch": 6.34651447207501, |
| "grad_norm": 0.34229689836502075, |
| "learning_rate": 0.0005241514710166035, |
| "loss": 3.4657, |
| "step": 21800 |
| }, |
| { |
| "epoch": 6.3610739036748, |
| "grad_norm": 0.3267248570919037, |
| "learning_rate": 0.0005239766967666764, |
| "loss": 3.4747, |
| "step": 21850 |
| }, |
| { |
| "epoch": 6.375633335274591, |
| "grad_norm": 0.3363324999809265, |
| "learning_rate": 0.0005238019225167491, |
| "loss": 3.4736, |
| "step": 21900 |
| }, |
| { |
| "epoch": 6.390192766874382, |
| "grad_norm": 0.32636144757270813, |
| "learning_rate": 0.0005236271482668219, |
| "loss": 3.4731, |
| "step": 21950 |
| }, |
| { |
| "epoch": 6.4047521984741715, |
| "grad_norm": 0.3209141194820404, |
| "learning_rate": 0.0005234523740168948, |
| "loss": 3.475, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.4047521984741715, |
| "eval_accuracy": 0.36269357673806785, |
| "eval_loss": 3.6098952293395996, |
| "eval_runtime": 183.5371, |
| "eval_samples_per_second": 90.679, |
| "eval_steps_per_second": 5.672, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.419311630073962, |
| "grad_norm": 0.3223513066768646, |
| "learning_rate": 0.0005232775997669676, |
| "loss": 3.4759, |
| "step": 22050 |
| }, |
| { |
| "epoch": 6.433871061673752, |
| "grad_norm": 0.3284885585308075, |
| "learning_rate": 0.0005231028255170405, |
| "loss": 3.4796, |
| "step": 22100 |
| }, |
| { |
| "epoch": 6.448430493273543, |
| "grad_norm": 0.32980912923812866, |
| "learning_rate": 0.0005229280512671133, |
| "loss": 3.4839, |
| "step": 22150 |
| }, |
| { |
| "epoch": 6.462989924873333, |
| "grad_norm": 0.33856451511383057, |
| "learning_rate": 0.000522753277017186, |
| "loss": 3.4825, |
| "step": 22200 |
| }, |
| { |
| "epoch": 6.477549356473124, |
| "grad_norm": 0.3303597867488861, |
| "learning_rate": 0.0005225785027672589, |
| "loss": 3.4827, |
| "step": 22250 |
| }, |
| { |
| "epoch": 6.492108788072914, |
| "grad_norm": 0.32675686478614807, |
| "learning_rate": 0.0005224037285173317, |
| "loss": 3.4781, |
| "step": 22300 |
| }, |
| { |
| "epoch": 6.506668219672704, |
| "grad_norm": 0.3315143883228302, |
| "learning_rate": 0.0005222289542674046, |
| "loss": 3.4786, |
| "step": 22350 |
| }, |
| { |
| "epoch": 6.521227651272494, |
| "grad_norm": 0.35115185379981995, |
| "learning_rate": 0.0005220541800174774, |
| "loss": 3.4777, |
| "step": 22400 |
| }, |
| { |
| "epoch": 6.535787082872285, |
| "grad_norm": 0.32922348380088806, |
| "learning_rate": 0.0005218794057675501, |
| "loss": 3.4764, |
| "step": 22450 |
| }, |
| { |
| "epoch": 6.550346514472075, |
| "grad_norm": 0.32848137617111206, |
| "learning_rate": 0.000521704631517623, |
| "loss": 3.4864, |
| "step": 22500 |
| }, |
| { |
| "epoch": 6.564905946071866, |
| "grad_norm": 0.3455169200897217, |
| "learning_rate": 0.0005215298572676958, |
| "loss": 3.4872, |
| "step": 22550 |
| }, |
| { |
| "epoch": 6.579465377671656, |
| "grad_norm": 0.3491528034210205, |
| "learning_rate": 0.0005213550830177687, |
| "loss": 3.4941, |
| "step": 22600 |
| }, |
| { |
| "epoch": 6.594024809271446, |
| "grad_norm": 0.3292933404445648, |
| "learning_rate": 0.0005211803087678415, |
| "loss": 3.4849, |
| "step": 22650 |
| }, |
| { |
| "epoch": 6.608584240871236, |
| "grad_norm": 0.33583250641822815, |
| "learning_rate": 0.0005210055345179143, |
| "loss": 3.4787, |
| "step": 22700 |
| }, |
| { |
| "epoch": 6.623143672471027, |
| "grad_norm": 0.32590252161026, |
| "learning_rate": 0.0005208307602679871, |
| "loss": 3.4819, |
| "step": 22750 |
| }, |
| { |
| "epoch": 6.637703104070817, |
| "grad_norm": 0.34313255548477173, |
| "learning_rate": 0.0005206559860180599, |
| "loss": 3.4679, |
| "step": 22800 |
| }, |
| { |
| "epoch": 6.652262535670608, |
| "grad_norm": 0.3168715238571167, |
| "learning_rate": 0.0005204812117681328, |
| "loss": 3.4812, |
| "step": 22850 |
| }, |
| { |
| "epoch": 6.666821967270398, |
| "grad_norm": 0.33726438879966736, |
| "learning_rate": 0.0005203064375182056, |
| "loss": 3.4791, |
| "step": 22900 |
| }, |
| { |
| "epoch": 6.6813813988701884, |
| "grad_norm": 0.33907851576805115, |
| "learning_rate": 0.0005201316632682784, |
| "loss": 3.4817, |
| "step": 22950 |
| }, |
| { |
| "epoch": 6.695940830469978, |
| "grad_norm": 0.3657963275909424, |
| "learning_rate": 0.0005199568890183513, |
| "loss": 3.4859, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.695940830469978, |
| "eval_accuracy": 0.3636106127844396, |
| "eval_loss": 3.6045539379119873, |
| "eval_runtime": 184.0903, |
| "eval_samples_per_second": 90.407, |
| "eval_steps_per_second": 5.655, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.710500262069769, |
| "grad_norm": 0.3322959840297699, |
| "learning_rate": 0.000519782114768424, |
| "loss": 3.4824, |
| "step": 23050 |
| }, |
| { |
| "epoch": 6.725059693669559, |
| "grad_norm": 0.343662828207016, |
| "learning_rate": 0.0005196073405184969, |
| "loss": 3.4897, |
| "step": 23100 |
| }, |
| { |
| "epoch": 6.73961912526935, |
| "grad_norm": 0.32909801602363586, |
| "learning_rate": 0.0005194325662685697, |
| "loss": 3.4858, |
| "step": 23150 |
| }, |
| { |
| "epoch": 6.75417855686914, |
| "grad_norm": 0.3375694751739502, |
| "learning_rate": 0.0005192577920186426, |
| "loss": 3.4808, |
| "step": 23200 |
| }, |
| { |
| "epoch": 6.7687379884689305, |
| "grad_norm": 0.3138526678085327, |
| "learning_rate": 0.0005190830177687154, |
| "loss": 3.483, |
| "step": 23250 |
| }, |
| { |
| "epoch": 6.78329742006872, |
| "grad_norm": 0.3340669572353363, |
| "learning_rate": 0.0005189082435187883, |
| "loss": 3.4903, |
| "step": 23300 |
| }, |
| { |
| "epoch": 6.797856851668511, |
| "grad_norm": 0.3336253762245178, |
| "learning_rate": 0.000518733469268861, |
| "loss": 3.4864, |
| "step": 23350 |
| }, |
| { |
| "epoch": 6.812416283268301, |
| "grad_norm": 0.3235922157764435, |
| "learning_rate": 0.0005185586950189338, |
| "loss": 3.5037, |
| "step": 23400 |
| }, |
| { |
| "epoch": 6.826975714868092, |
| "grad_norm": 0.3445108234882355, |
| "learning_rate": 0.0005183839207690067, |
| "loss": 3.4908, |
| "step": 23450 |
| }, |
| { |
| "epoch": 6.841535146467882, |
| "grad_norm": 0.3229808211326599, |
| "learning_rate": 0.0005182091465190795, |
| "loss": 3.4906, |
| "step": 23500 |
| }, |
| { |
| "epoch": 6.8560945780676725, |
| "grad_norm": 0.29649391770362854, |
| "learning_rate": 0.0005180343722691524, |
| "loss": 3.4964, |
| "step": 23550 |
| }, |
| { |
| "epoch": 6.870654009667462, |
| "grad_norm": 0.3273935616016388, |
| "learning_rate": 0.0005178595980192251, |
| "loss": 3.493, |
| "step": 23600 |
| }, |
| { |
| "epoch": 6.885213441267253, |
| "grad_norm": 0.33352574706077576, |
| "learning_rate": 0.0005176848237692979, |
| "loss": 3.4915, |
| "step": 23650 |
| }, |
| { |
| "epoch": 6.899772872867043, |
| "grad_norm": 0.3277892768383026, |
| "learning_rate": 0.0005175100495193708, |
| "loss": 3.4986, |
| "step": 23700 |
| }, |
| { |
| "epoch": 6.914332304466834, |
| "grad_norm": 0.3182038366794586, |
| "learning_rate": 0.0005173352752694436, |
| "loss": 3.5041, |
| "step": 23750 |
| }, |
| { |
| "epoch": 6.928891736066624, |
| "grad_norm": 0.3153535723686218, |
| "learning_rate": 0.0005171605010195165, |
| "loss": 3.5035, |
| "step": 23800 |
| }, |
| { |
| "epoch": 6.943451167666415, |
| "grad_norm": 0.34128624200820923, |
| "learning_rate": 0.0005169857267695893, |
| "loss": 3.4943, |
| "step": 23850 |
| }, |
| { |
| "epoch": 6.9580105992662045, |
| "grad_norm": 0.3200225234031677, |
| "learning_rate": 0.000516810952519662, |
| "loss": 3.5004, |
| "step": 23900 |
| }, |
| { |
| "epoch": 6.972570030865995, |
| "grad_norm": 0.35053977370262146, |
| "learning_rate": 0.0005166361782697349, |
| "loss": 3.4938, |
| "step": 23950 |
| }, |
| { |
| "epoch": 6.987129462465785, |
| "grad_norm": 0.35640257596969604, |
| "learning_rate": 0.0005164614040198077, |
| "loss": 3.4951, |
| "step": 24000 |
| }, |
| { |
| "epoch": 6.987129462465785, |
| "eval_accuracy": 0.36452964775538993, |
| "eval_loss": 3.5928568840026855, |
| "eval_runtime": 184.1497, |
| "eval_samples_per_second": 90.378, |
| "eval_steps_per_second": 5.653, |
| "step": 24000 |
| }, |
| { |
| "epoch": 7.001455943159979, |
| "grad_norm": 0.3846636712551117, |
| "learning_rate": 0.0005162866297698806, |
| "loss": 3.4935, |
| "step": 24050 |
| }, |
| { |
| "epoch": 7.016015374759769, |
| "grad_norm": 0.3523205518722534, |
| "learning_rate": 0.0005161118555199534, |
| "loss": 3.3822, |
| "step": 24100 |
| }, |
| { |
| "epoch": 7.03057480635956, |
| "grad_norm": 0.36663973331451416, |
| "learning_rate": 0.0005159370812700261, |
| "loss": 3.3874, |
| "step": 24150 |
| }, |
| { |
| "epoch": 7.04513423795935, |
| "grad_norm": 0.38096940517425537, |
| "learning_rate": 0.000515762307020099, |
| "loss": 3.3899, |
| "step": 24200 |
| }, |
| { |
| "epoch": 7.059693669559141, |
| "grad_norm": 0.35516002774238586, |
| "learning_rate": 0.0005155875327701718, |
| "loss": 3.3847, |
| "step": 24250 |
| }, |
| { |
| "epoch": 7.074253101158931, |
| "grad_norm": 0.3651926815509796, |
| "learning_rate": 0.0005154127585202447, |
| "loss": 3.4049, |
| "step": 24300 |
| }, |
| { |
| "epoch": 7.0888125327587215, |
| "grad_norm": 0.36075493693351746, |
| "learning_rate": 0.0005152379842703175, |
| "loss": 3.3965, |
| "step": 24350 |
| }, |
| { |
| "epoch": 7.103371964358511, |
| "grad_norm": 0.38245540857315063, |
| "learning_rate": 0.0005150632100203903, |
| "loss": 3.4028, |
| "step": 24400 |
| }, |
| { |
| "epoch": 7.117931395958302, |
| "grad_norm": 0.32894188165664673, |
| "learning_rate": 0.0005148884357704631, |
| "loss": 3.3985, |
| "step": 24450 |
| }, |
| { |
| "epoch": 7.132490827558092, |
| "grad_norm": 0.3118518590927124, |
| "learning_rate": 0.0005147136615205359, |
| "loss": 3.4151, |
| "step": 24500 |
| }, |
| { |
| "epoch": 7.147050259157883, |
| "grad_norm": 0.3686443269252777, |
| "learning_rate": 0.0005145388872706087, |
| "loss": 3.4092, |
| "step": 24550 |
| }, |
| { |
| "epoch": 7.161609690757673, |
| "grad_norm": 0.35504400730133057, |
| "learning_rate": 0.0005143641130206816, |
| "loss": 3.4128, |
| "step": 24600 |
| }, |
| { |
| "epoch": 7.1761691223574635, |
| "grad_norm": 0.371929794549942, |
| "learning_rate": 0.0005141893387707544, |
| "loss": 3.4088, |
| "step": 24650 |
| }, |
| { |
| "epoch": 7.190728553957253, |
| "grad_norm": 0.35544171929359436, |
| "learning_rate": 0.0005140145645208272, |
| "loss": 3.4102, |
| "step": 24700 |
| }, |
| { |
| "epoch": 7.205287985557044, |
| "grad_norm": 0.32105565071105957, |
| "learning_rate": 0.0005138397902709, |
| "loss": 3.4146, |
| "step": 24750 |
| }, |
| { |
| "epoch": 7.219847417156834, |
| "grad_norm": 0.3172771632671356, |
| "learning_rate": 0.0005136650160209728, |
| "loss": 3.4218, |
| "step": 24800 |
| }, |
| { |
| "epoch": 7.234406848756625, |
| "grad_norm": 0.3447094261646271, |
| "learning_rate": 0.0005134902417710457, |
| "loss": 3.4251, |
| "step": 24850 |
| }, |
| { |
| "epoch": 7.248966280356415, |
| "grad_norm": 0.3414628505706787, |
| "learning_rate": 0.0005133154675211185, |
| "loss": 3.4205, |
| "step": 24900 |
| }, |
| { |
| "epoch": 7.2635257119562056, |
| "grad_norm": 0.36512497067451477, |
| "learning_rate": 0.0005131406932711914, |
| "loss": 3.4273, |
| "step": 24950 |
| }, |
| { |
| "epoch": 7.2780851435559955, |
| "grad_norm": 0.3672768771648407, |
| "learning_rate": 0.0005129659190212641, |
| "loss": 3.4339, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.2780851435559955, |
| "eval_accuracy": 0.36433104871931843, |
| "eval_loss": 3.5997819900512695, |
| "eval_runtime": 180.8992, |
| "eval_samples_per_second": 92.002, |
| "eval_steps_per_second": 5.755, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.292644575155786, |
| "grad_norm": 0.3394540846347809, |
| "learning_rate": 0.0005127911447713369, |
| "loss": 3.4232, |
| "step": 25050 |
| }, |
| { |
| "epoch": 7.307204006755576, |
| "grad_norm": 0.3045949637889862, |
| "learning_rate": 0.0005126163705214098, |
| "loss": 3.4267, |
| "step": 25100 |
| }, |
| { |
| "epoch": 7.321763438355367, |
| "grad_norm": 0.32903987169265747, |
| "learning_rate": 0.0005124415962714826, |
| "loss": 3.4341, |
| "step": 25150 |
| }, |
| { |
| "epoch": 7.336322869955157, |
| "grad_norm": 0.3628155589103699, |
| "learning_rate": 0.0005122668220215555, |
| "loss": 3.4336, |
| "step": 25200 |
| }, |
| { |
| "epoch": 7.350882301554948, |
| "grad_norm": 0.3750855624675751, |
| "learning_rate": 0.0005120920477716282, |
| "loss": 3.436, |
| "step": 25250 |
| }, |
| { |
| "epoch": 7.3654417331547375, |
| "grad_norm": 0.31662774085998535, |
| "learning_rate": 0.000511917273521701, |
| "loss": 3.4372, |
| "step": 25300 |
| }, |
| { |
| "epoch": 7.380001164754528, |
| "grad_norm": 0.3318006694316864, |
| "learning_rate": 0.0005117424992717739, |
| "loss": 3.4377, |
| "step": 25350 |
| }, |
| { |
| "epoch": 7.394560596354318, |
| "grad_norm": 0.3489433526992798, |
| "learning_rate": 0.0005115677250218467, |
| "loss": 3.4364, |
| "step": 25400 |
| }, |
| { |
| "epoch": 7.409120027954109, |
| "grad_norm": 0.3378850817680359, |
| "learning_rate": 0.0005113929507719196, |
| "loss": 3.4392, |
| "step": 25450 |
| }, |
| { |
| "epoch": 7.423679459553899, |
| "grad_norm": 0.3490906357765198, |
| "learning_rate": 0.0005112181765219924, |
| "loss": 3.4404, |
| "step": 25500 |
| }, |
| { |
| "epoch": 7.43823889115369, |
| "grad_norm": 0.33684709668159485, |
| "learning_rate": 0.0005110434022720651, |
| "loss": 3.4316, |
| "step": 25550 |
| }, |
| { |
| "epoch": 7.4527983227534795, |
| "grad_norm": 0.3533405363559723, |
| "learning_rate": 0.000510868628022138, |
| "loss": 3.4519, |
| "step": 25600 |
| }, |
| { |
| "epoch": 7.46735775435327, |
| "grad_norm": 0.364666610956192, |
| "learning_rate": 0.0005106938537722109, |
| "loss": 3.4428, |
| "step": 25650 |
| }, |
| { |
| "epoch": 7.48191718595306, |
| "grad_norm": 0.3563931882381439, |
| "learning_rate": 0.0005105190795222837, |
| "loss": 3.442, |
| "step": 25700 |
| }, |
| { |
| "epoch": 7.496476617552851, |
| "grad_norm": 0.35002008080482483, |
| "learning_rate": 0.0005103443052723565, |
| "loss": 3.4379, |
| "step": 25750 |
| }, |
| { |
| "epoch": 7.511036049152641, |
| "grad_norm": 0.3543298542499542, |
| "learning_rate": 0.0005101695310224294, |
| "loss": 3.457, |
| "step": 25800 |
| }, |
| { |
| "epoch": 7.525595480752432, |
| "grad_norm": 0.33176884055137634, |
| "learning_rate": 0.0005099947567725021, |
| "loss": 3.4399, |
| "step": 25850 |
| }, |
| { |
| "epoch": 7.540154912352222, |
| "grad_norm": 0.34475451707839966, |
| "learning_rate": 0.000509819982522575, |
| "loss": 3.4443, |
| "step": 25900 |
| }, |
| { |
| "epoch": 7.554714343952012, |
| "grad_norm": 0.33004602789878845, |
| "learning_rate": 0.0005096452082726478, |
| "loss": 3.4584, |
| "step": 25950 |
| }, |
| { |
| "epoch": 7.569273775551802, |
| "grad_norm": 0.3163653016090393, |
| "learning_rate": 0.0005094704340227206, |
| "loss": 3.45, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.569273775551802, |
| "eval_accuracy": 0.36467333515745, |
| "eval_loss": 3.5947012901306152, |
| "eval_runtime": 180.7981, |
| "eval_samples_per_second": 92.053, |
| "eval_steps_per_second": 5.758, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.583833207151593, |
| "grad_norm": 0.3281993567943573, |
| "learning_rate": 0.0005092956597727935, |
| "loss": 3.4512, |
| "step": 26050 |
| }, |
| { |
| "epoch": 7.598392638751383, |
| "grad_norm": 0.31753283739089966, |
| "learning_rate": 0.0005091208855228662, |
| "loss": 3.4646, |
| "step": 26100 |
| }, |
| { |
| "epoch": 7.612952070351174, |
| "grad_norm": 0.3362863063812256, |
| "learning_rate": 0.0005089461112729391, |
| "loss": 3.4544, |
| "step": 26150 |
| }, |
| { |
| "epoch": 7.627511501950964, |
| "grad_norm": 0.34793269634246826, |
| "learning_rate": 0.0005087713370230119, |
| "loss": 3.4481, |
| "step": 26200 |
| }, |
| { |
| "epoch": 7.642070933550754, |
| "grad_norm": 0.3432117700576782, |
| "learning_rate": 0.0005085965627730847, |
| "loss": 3.4591, |
| "step": 26250 |
| }, |
| { |
| "epoch": 7.656630365150544, |
| "grad_norm": 0.3630698323249817, |
| "learning_rate": 0.0005084217885231576, |
| "loss": 3.4618, |
| "step": 26300 |
| }, |
| { |
| "epoch": 7.671189796750335, |
| "grad_norm": 0.3361819088459015, |
| "learning_rate": 0.0005082470142732304, |
| "loss": 3.4646, |
| "step": 26350 |
| }, |
| { |
| "epoch": 7.685749228350125, |
| "grad_norm": 0.3673403263092041, |
| "learning_rate": 0.0005080722400233032, |
| "loss": 3.4539, |
| "step": 26400 |
| }, |
| { |
| "epoch": 7.700308659949916, |
| "grad_norm": 0.33987388014793396, |
| "learning_rate": 0.000507897465773376, |
| "loss": 3.4693, |
| "step": 26450 |
| }, |
| { |
| "epoch": 7.714868091549706, |
| "grad_norm": 0.32190704345703125, |
| "learning_rate": 0.0005077226915234488, |
| "loss": 3.4468, |
| "step": 26500 |
| }, |
| { |
| "epoch": 7.729427523149496, |
| "grad_norm": 0.3864888846874237, |
| "learning_rate": 0.0005075479172735217, |
| "loss": 3.4556, |
| "step": 26550 |
| }, |
| { |
| "epoch": 7.743986954749286, |
| "grad_norm": 0.3400271534919739, |
| "learning_rate": 0.0005073731430235945, |
| "loss": 3.4587, |
| "step": 26600 |
| }, |
| { |
| "epoch": 7.758546386349077, |
| "grad_norm": 0.3375173509120941, |
| "learning_rate": 0.0005071983687736674, |
| "loss": 3.4628, |
| "step": 26650 |
| }, |
| { |
| "epoch": 7.773105817948867, |
| "grad_norm": 0.3561650216579437, |
| "learning_rate": 0.0005070235945237401, |
| "loss": 3.4572, |
| "step": 26700 |
| }, |
| { |
| "epoch": 7.787665249548658, |
| "grad_norm": 0.3330904543399811, |
| "learning_rate": 0.0005068488202738129, |
| "loss": 3.4615, |
| "step": 26750 |
| }, |
| { |
| "epoch": 7.802224681148448, |
| "grad_norm": 0.3155699074268341, |
| "learning_rate": 0.0005066740460238858, |
| "loss": 3.4407, |
| "step": 26800 |
| }, |
| { |
| "epoch": 7.8167841127482385, |
| "grad_norm": 0.3466147780418396, |
| "learning_rate": 0.0005064992717739586, |
| "loss": 3.457, |
| "step": 26850 |
| }, |
| { |
| "epoch": 7.831343544348028, |
| "grad_norm": 0.3634095788002014, |
| "learning_rate": 0.0005063244975240315, |
| "loss": 3.4655, |
| "step": 26900 |
| }, |
| { |
| "epoch": 7.845902975947819, |
| "grad_norm": 0.3383113741874695, |
| "learning_rate": 0.0005061497232741042, |
| "loss": 3.4613, |
| "step": 26950 |
| }, |
| { |
| "epoch": 7.860462407547609, |
| "grad_norm": 0.33195146918296814, |
| "learning_rate": 0.000505974949024177, |
| "loss": 3.4628, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.860462407547609, |
| "eval_accuracy": 0.3657741546812521, |
| "eval_loss": 3.5823051929473877, |
| "eval_runtime": 183.1613, |
| "eval_samples_per_second": 90.865, |
| "eval_steps_per_second": 5.684, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.8750218391474, |
| "grad_norm": 0.35151028633117676, |
| "learning_rate": 0.0005058001747742499, |
| "loss": 3.4647, |
| "step": 27050 |
| }, |
| { |
| "epoch": 7.88958127074719, |
| "grad_norm": 0.35772132873535156, |
| "learning_rate": 0.0005056254005243227, |
| "loss": 3.4699, |
| "step": 27100 |
| }, |
| { |
| "epoch": 7.9041407023469805, |
| "grad_norm": 0.3402451276779175, |
| "learning_rate": 0.0005054506262743955, |
| "loss": 3.4738, |
| "step": 27150 |
| }, |
| { |
| "epoch": 7.91870013394677, |
| "grad_norm": 0.33174848556518555, |
| "learning_rate": 0.0005052758520244684, |
| "loss": 3.4582, |
| "step": 27200 |
| }, |
| { |
| "epoch": 7.933259565546561, |
| "grad_norm": 0.33006104826927185, |
| "learning_rate": 0.0005051010777745411, |
| "loss": 3.4596, |
| "step": 27250 |
| }, |
| { |
| "epoch": 7.947818997146351, |
| "grad_norm": 0.347843199968338, |
| "learning_rate": 0.000504926303524614, |
| "loss": 3.4695, |
| "step": 27300 |
| }, |
| { |
| "epoch": 7.962378428746142, |
| "grad_norm": 0.32010769844055176, |
| "learning_rate": 0.0005047515292746868, |
| "loss": 3.4707, |
| "step": 27350 |
| }, |
| { |
| "epoch": 7.976937860345932, |
| "grad_norm": 0.3584131896495819, |
| "learning_rate": 0.0005045767550247596, |
| "loss": 3.4584, |
| "step": 27400 |
| }, |
| { |
| "epoch": 7.991497291945723, |
| "grad_norm": 0.3257739543914795, |
| "learning_rate": 0.0005044019807748325, |
| "loss": 3.4689, |
| "step": 27450 |
| }, |
| { |
| "epoch": 8.005823772639916, |
| "grad_norm": 0.33740708231925964, |
| "learning_rate": 0.0005042272065249052, |
| "loss": 3.4213, |
| "step": 27500 |
| }, |
| { |
| "epoch": 8.020383204239707, |
| "grad_norm": 0.33311763405799866, |
| "learning_rate": 0.0005040524322749781, |
| "loss": 3.3665, |
| "step": 27550 |
| }, |
| { |
| "epoch": 8.034942635839498, |
| "grad_norm": 0.32844987511634827, |
| "learning_rate": 0.0005038776580250509, |
| "loss": 3.3403, |
| "step": 27600 |
| }, |
| { |
| "epoch": 8.049502067439287, |
| "grad_norm": 0.33761027455329895, |
| "learning_rate": 0.0005037028837751237, |
| "loss": 3.3569, |
| "step": 27650 |
| }, |
| { |
| "epoch": 8.064061499039077, |
| "grad_norm": 0.35406097769737244, |
| "learning_rate": 0.0005035281095251966, |
| "loss": 3.3567, |
| "step": 27700 |
| }, |
| { |
| "epoch": 8.078620930638868, |
| "grad_norm": 0.38495901226997375, |
| "learning_rate": 0.0005033533352752694, |
| "loss": 3.3788, |
| "step": 27750 |
| }, |
| { |
| "epoch": 8.093180362238659, |
| "grad_norm": 0.331709086894989, |
| "learning_rate": 0.0005031785610253422, |
| "loss": 3.3591, |
| "step": 27800 |
| }, |
| { |
| "epoch": 8.107739793838448, |
| "grad_norm": 0.3502473533153534, |
| "learning_rate": 0.000503003786775415, |
| "loss": 3.3805, |
| "step": 27850 |
| }, |
| { |
| "epoch": 8.122299225438239, |
| "grad_norm": 0.3384426534175873, |
| "learning_rate": 0.0005028290125254878, |
| "loss": 3.3839, |
| "step": 27900 |
| }, |
| { |
| "epoch": 8.13685865703803, |
| "grad_norm": 0.36867547035217285, |
| "learning_rate": 0.0005026542382755607, |
| "loss": 3.3739, |
| "step": 27950 |
| }, |
| { |
| "epoch": 8.15141808863782, |
| "grad_norm": 0.342602014541626, |
| "learning_rate": 0.0005024794640256335, |
| "loss": 3.3902, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.15141808863782, |
| "eval_accuracy": 0.3654784314274215, |
| "eval_loss": 3.5930871963500977, |
| "eval_runtime": 181.7974, |
| "eval_samples_per_second": 91.547, |
| "eval_steps_per_second": 5.726, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.16597752023761, |
| "grad_norm": 0.34287703037261963, |
| "learning_rate": 0.0005023046897757064, |
| "loss": 3.3962, |
| "step": 28050 |
| }, |
| { |
| "epoch": 8.1805369518374, |
| "grad_norm": 0.3469769060611725, |
| "learning_rate": 0.0005021299155257791, |
| "loss": 3.3802, |
| "step": 28100 |
| }, |
| { |
| "epoch": 8.19509638343719, |
| "grad_norm": 0.33318281173706055, |
| "learning_rate": 0.000501955141275852, |
| "loss": 3.3841, |
| "step": 28150 |
| }, |
| { |
| "epoch": 8.209655815036982, |
| "grad_norm": 0.3634045720100403, |
| "learning_rate": 0.0005017803670259248, |
| "loss": 3.3948, |
| "step": 28200 |
| }, |
| { |
| "epoch": 8.22421524663677, |
| "grad_norm": 0.33881765604019165, |
| "learning_rate": 0.0005016055927759977, |
| "loss": 3.3884, |
| "step": 28250 |
| }, |
| { |
| "epoch": 8.238774678236561, |
| "grad_norm": 0.3295370638370514, |
| "learning_rate": 0.0005014308185260705, |
| "loss": 3.3949, |
| "step": 28300 |
| }, |
| { |
| "epoch": 8.253334109836352, |
| "grad_norm": 0.32435914874076843, |
| "learning_rate": 0.0005012560442761432, |
| "loss": 3.3864, |
| "step": 28350 |
| }, |
| { |
| "epoch": 8.267893541436143, |
| "grad_norm": 0.3608424663543701, |
| "learning_rate": 0.0005010812700262161, |
| "loss": 3.3904, |
| "step": 28400 |
| }, |
| { |
| "epoch": 8.282452973035932, |
| "grad_norm": 0.34927839040756226, |
| "learning_rate": 0.0005009064957762889, |
| "loss": 3.4077, |
| "step": 28450 |
| }, |
| { |
| "epoch": 8.297012404635723, |
| "grad_norm": 0.37262028455734253, |
| "learning_rate": 0.0005007317215263618, |
| "loss": 3.3987, |
| "step": 28500 |
| }, |
| { |
| "epoch": 8.311571836235514, |
| "grad_norm": 0.335907906293869, |
| "learning_rate": 0.0005005569472764346, |
| "loss": 3.4031, |
| "step": 28550 |
| }, |
| { |
| "epoch": 8.326131267835304, |
| "grad_norm": 0.32725778222084045, |
| "learning_rate": 0.0005003821730265074, |
| "loss": 3.4097, |
| "step": 28600 |
| }, |
| { |
| "epoch": 8.340690699435093, |
| "grad_norm": 0.34939050674438477, |
| "learning_rate": 0.0005002073987765802, |
| "loss": 3.4022, |
| "step": 28650 |
| }, |
| { |
| "epoch": 8.355250131034884, |
| "grad_norm": 0.329519659280777, |
| "learning_rate": 0.000500032624526653, |
| "loss": 3.395, |
| "step": 28700 |
| }, |
| { |
| "epoch": 8.369809562634675, |
| "grad_norm": 0.342352032661438, |
| "learning_rate": 0.0004998578502767259, |
| "loss": 3.4104, |
| "step": 28750 |
| }, |
| { |
| "epoch": 8.384368994234466, |
| "grad_norm": 0.33699142932891846, |
| "learning_rate": 0.0004996830760267987, |
| "loss": 3.4088, |
| "step": 28800 |
| }, |
| { |
| "epoch": 8.398928425834255, |
| "grad_norm": 0.3412262797355652, |
| "learning_rate": 0.0004995083017768715, |
| "loss": 3.4005, |
| "step": 28850 |
| }, |
| { |
| "epoch": 8.413487857434045, |
| "grad_norm": 0.3386961817741394, |
| "learning_rate": 0.0004993335275269444, |
| "loss": 3.4233, |
| "step": 28900 |
| }, |
| { |
| "epoch": 8.428047289033836, |
| "grad_norm": 0.3285767138004303, |
| "learning_rate": 0.0004991587532770171, |
| "loss": 3.4173, |
| "step": 28950 |
| }, |
| { |
| "epoch": 8.442606720633627, |
| "grad_norm": 0.36489197611808777, |
| "learning_rate": 0.00049898397902709, |
| "loss": 3.4052, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.442606720633627, |
| "eval_accuracy": 0.3661564196109552, |
| "eval_loss": 3.5855534076690674, |
| "eval_runtime": 181.6388, |
| "eval_samples_per_second": 91.627, |
| "eval_steps_per_second": 5.731, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.457166152233416, |
| "grad_norm": 0.33611413836479187, |
| "learning_rate": 0.0004988092047771628, |
| "loss": 3.4126, |
| "step": 29050 |
| }, |
| { |
| "epoch": 8.471725583833207, |
| "grad_norm": 0.353683203458786, |
| "learning_rate": 0.0004986344305272356, |
| "loss": 3.411, |
| "step": 29100 |
| }, |
| { |
| "epoch": 8.486285015432998, |
| "grad_norm": 0.3237438201904297, |
| "learning_rate": 0.0004984596562773085, |
| "loss": 3.4116, |
| "step": 29150 |
| }, |
| { |
| "epoch": 8.500844447032788, |
| "grad_norm": 0.3344637155532837, |
| "learning_rate": 0.0004982848820273812, |
| "loss": 3.4252, |
| "step": 29200 |
| }, |
| { |
| "epoch": 8.515403878632577, |
| "grad_norm": 0.36675405502319336, |
| "learning_rate": 0.0004981101077774541, |
| "loss": 3.4157, |
| "step": 29250 |
| }, |
| { |
| "epoch": 8.529963310232368, |
| "grad_norm": 0.3269510269165039, |
| "learning_rate": 0.0004979353335275269, |
| "loss": 3.4108, |
| "step": 29300 |
| }, |
| { |
| "epoch": 8.544522741832159, |
| "grad_norm": 0.32414621114730835, |
| "learning_rate": 0.0004977605592775997, |
| "loss": 3.4189, |
| "step": 29350 |
| }, |
| { |
| "epoch": 8.55908217343195, |
| "grad_norm": 0.34106162190437317, |
| "learning_rate": 0.0004975857850276726, |
| "loss": 3.413, |
| "step": 29400 |
| }, |
| { |
| "epoch": 8.573641605031739, |
| "grad_norm": 0.3581826388835907, |
| "learning_rate": 0.0004974110107777454, |
| "loss": 3.4321, |
| "step": 29450 |
| }, |
| { |
| "epoch": 8.58820103663153, |
| "grad_norm": 0.34445253014564514, |
| "learning_rate": 0.0004972362365278182, |
| "loss": 3.4358, |
| "step": 29500 |
| }, |
| { |
| "epoch": 8.60276046823132, |
| "grad_norm": 0.3163345158100128, |
| "learning_rate": 0.000497061462277891, |
| "loss": 3.4149, |
| "step": 29550 |
| }, |
| { |
| "epoch": 8.617319899831111, |
| "grad_norm": 0.3480691909790039, |
| "learning_rate": 0.0004968866880279638, |
| "loss": 3.4231, |
| "step": 29600 |
| }, |
| { |
| "epoch": 8.6318793314309, |
| "grad_norm": 0.35285916924476624, |
| "learning_rate": 0.0004967119137780367, |
| "loss": 3.438, |
| "step": 29650 |
| }, |
| { |
| "epoch": 8.646438763030691, |
| "grad_norm": 0.3620506823062897, |
| "learning_rate": 0.0004965371395281095, |
| "loss": 3.428, |
| "step": 29700 |
| }, |
| { |
| "epoch": 8.660998194630482, |
| "grad_norm": 0.3322892189025879, |
| "learning_rate": 0.0004963623652781822, |
| "loss": 3.4191, |
| "step": 29750 |
| }, |
| { |
| "epoch": 8.675557626230272, |
| "grad_norm": 0.3197033405303955, |
| "learning_rate": 0.0004961875910282551, |
| "loss": 3.4174, |
| "step": 29800 |
| }, |
| { |
| "epoch": 8.690117057830061, |
| "grad_norm": 0.34963804483413696, |
| "learning_rate": 0.0004960128167783279, |
| "loss": 3.4357, |
| "step": 29850 |
| }, |
| { |
| "epoch": 8.704676489429852, |
| "grad_norm": 0.3247370421886444, |
| "learning_rate": 0.0004958380425284008, |
| "loss": 3.421, |
| "step": 29900 |
| }, |
| { |
| "epoch": 8.719235921029643, |
| "grad_norm": 0.32233262062072754, |
| "learning_rate": 0.0004956632682784736, |
| "loss": 3.4173, |
| "step": 29950 |
| }, |
| { |
| "epoch": 8.733795352629434, |
| "grad_norm": 0.36941850185394287, |
| "learning_rate": 0.0004954884940285464, |
| "loss": 3.4335, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.733795352629434, |
| "eval_accuracy": 0.36665227049024096, |
| "eval_loss": 3.578705310821533, |
| "eval_runtime": 182.0872, |
| "eval_samples_per_second": 91.401, |
| "eval_steps_per_second": 5.717, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.748354784229225, |
| "grad_norm": 0.3275495767593384, |
| "learning_rate": 0.0004953137197786192, |
| "loss": 3.4322, |
| "step": 30050 |
| }, |
| { |
| "epoch": 8.762914215829014, |
| "grad_norm": 0.3445492684841156, |
| "learning_rate": 0.000495138945528692, |
| "loss": 3.4295, |
| "step": 30100 |
| }, |
| { |
| "epoch": 8.777473647428804, |
| "grad_norm": 0.364786297082901, |
| "learning_rate": 0.0004949641712787649, |
| "loss": 3.4453, |
| "step": 30150 |
| }, |
| { |
| "epoch": 8.792033079028595, |
| "grad_norm": 0.3113223612308502, |
| "learning_rate": 0.0004947893970288377, |
| "loss": 3.4291, |
| "step": 30200 |
| }, |
| { |
| "epoch": 8.806592510628384, |
| "grad_norm": 0.3300077021121979, |
| "learning_rate": 0.0004946146227789105, |
| "loss": 3.4139, |
| "step": 30250 |
| }, |
| { |
| "epoch": 8.821151942228175, |
| "grad_norm": 0.3236207067966461, |
| "learning_rate": 0.0004944398485289834, |
| "loss": 3.4379, |
| "step": 30300 |
| }, |
| { |
| "epoch": 8.835711373827966, |
| "grad_norm": 0.34814879298210144, |
| "learning_rate": 0.0004942650742790561, |
| "loss": 3.4444, |
| "step": 30350 |
| }, |
| { |
| "epoch": 8.850270805427757, |
| "grad_norm": 0.35743340849876404, |
| "learning_rate": 0.000494090300029129, |
| "loss": 3.4288, |
| "step": 30400 |
| }, |
| { |
| "epoch": 8.864830237027547, |
| "grad_norm": 0.3275567889213562, |
| "learning_rate": 0.0004939155257792018, |
| "loss": 3.4436, |
| "step": 30450 |
| }, |
| { |
| "epoch": 8.879389668627336, |
| "grad_norm": 0.3158799707889557, |
| "learning_rate": 0.0004937407515292746, |
| "loss": 3.4395, |
| "step": 30500 |
| }, |
| { |
| "epoch": 8.893949100227127, |
| "grad_norm": 0.33471766114234924, |
| "learning_rate": 0.0004935659772793475, |
| "loss": 3.4325, |
| "step": 30550 |
| }, |
| { |
| "epoch": 8.908508531826918, |
| "grad_norm": 0.34228864312171936, |
| "learning_rate": 0.0004933912030294202, |
| "loss": 3.427, |
| "step": 30600 |
| }, |
| { |
| "epoch": 8.923067963426707, |
| "grad_norm": 0.3427802324295044, |
| "learning_rate": 0.0004932164287794931, |
| "loss": 3.4251, |
| "step": 30650 |
| }, |
| { |
| "epoch": 8.937627395026498, |
| "grad_norm": 0.35321247577667236, |
| "learning_rate": 0.000493041654529566, |
| "loss": 3.4431, |
| "step": 30700 |
| }, |
| { |
| "epoch": 8.952186826626289, |
| "grad_norm": 0.3708088994026184, |
| "learning_rate": 0.0004928668802796388, |
| "loss": 3.4538, |
| "step": 30750 |
| }, |
| { |
| "epoch": 8.96674625822608, |
| "grad_norm": 0.34218692779541016, |
| "learning_rate": 0.0004926921060297116, |
| "loss": 3.4313, |
| "step": 30800 |
| }, |
| { |
| "epoch": 8.98130568982587, |
| "grad_norm": 0.35662880539894104, |
| "learning_rate": 0.0004925173317797845, |
| "loss": 3.4317, |
| "step": 30850 |
| }, |
| { |
| "epoch": 8.995865121425659, |
| "grad_norm": 0.3506118655204773, |
| "learning_rate": 0.0004923425575298572, |
| "loss": 3.4502, |
| "step": 30900 |
| }, |
| { |
| "epoch": 9.010191602119853, |
| "grad_norm": 0.34358587861061096, |
| "learning_rate": 0.0004921677832799301, |
| "loss": 3.3637, |
| "step": 30950 |
| }, |
| { |
| "epoch": 9.024751033719644, |
| "grad_norm": 0.3556085526943207, |
| "learning_rate": 0.0004919930090300029, |
| "loss": 3.3189, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.024751033719644, |
| "eval_accuracy": 0.36717398980524946, |
| "eval_loss": 3.5799267292022705, |
| "eval_runtime": 181.8055, |
| "eval_samples_per_second": 91.543, |
| "eval_steps_per_second": 5.726, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.039310465319433, |
| "grad_norm": 0.34578433632850647, |
| "learning_rate": 0.0004918182347800757, |
| "loss": 3.3231, |
| "step": 31050 |
| }, |
| { |
| "epoch": 9.053869896919224, |
| "grad_norm": 0.3525102138519287, |
| "learning_rate": 0.0004916434605301486, |
| "loss": 3.3193, |
| "step": 31100 |
| }, |
| { |
| "epoch": 9.068429328519015, |
| "grad_norm": 0.3447619080543518, |
| "learning_rate": 0.0004914686862802213, |
| "loss": 3.3293, |
| "step": 31150 |
| }, |
| { |
| "epoch": 9.082988760118806, |
| "grad_norm": 0.316193550825119, |
| "learning_rate": 0.0004912939120302941, |
| "loss": 3.35, |
| "step": 31200 |
| }, |
| { |
| "epoch": 9.097548191718595, |
| "grad_norm": 0.3357117772102356, |
| "learning_rate": 0.000491119137780367, |
| "loss": 3.3503, |
| "step": 31250 |
| }, |
| { |
| "epoch": 9.112107623318385, |
| "grad_norm": 0.3565595746040344, |
| "learning_rate": 0.0004909443635304398, |
| "loss": 3.3394, |
| "step": 31300 |
| }, |
| { |
| "epoch": 9.126667054918176, |
| "grad_norm": 0.35598695278167725, |
| "learning_rate": 0.0004907695892805127, |
| "loss": 3.3571, |
| "step": 31350 |
| }, |
| { |
| "epoch": 9.141226486517967, |
| "grad_norm": 0.3496910035610199, |
| "learning_rate": 0.0004905948150305855, |
| "loss": 3.354, |
| "step": 31400 |
| }, |
| { |
| "epoch": 9.155785918117756, |
| "grad_norm": 0.34782034158706665, |
| "learning_rate": 0.0004904200407806582, |
| "loss": 3.3431, |
| "step": 31450 |
| }, |
| { |
| "epoch": 9.170345349717547, |
| "grad_norm": 0.34046244621276855, |
| "learning_rate": 0.0004902452665307311, |
| "loss": 3.3657, |
| "step": 31500 |
| }, |
| { |
| "epoch": 9.184904781317337, |
| "grad_norm": 0.37150949239730835, |
| "learning_rate": 0.0004900704922808039, |
| "loss": 3.3665, |
| "step": 31550 |
| }, |
| { |
| "epoch": 9.199464212917128, |
| "grad_norm": 0.36348044872283936, |
| "learning_rate": 0.0004898957180308768, |
| "loss": 3.3567, |
| "step": 31600 |
| }, |
| { |
| "epoch": 9.214023644516917, |
| "grad_norm": 0.3551836311817169, |
| "learning_rate": 0.0004897209437809496, |
| "loss": 3.3674, |
| "step": 31650 |
| }, |
| { |
| "epoch": 9.228583076116708, |
| "grad_norm": 0.3500552475452423, |
| "learning_rate": 0.0004895461695310223, |
| "loss": 3.3814, |
| "step": 31700 |
| }, |
| { |
| "epoch": 9.243142507716499, |
| "grad_norm": 0.3479650318622589, |
| "learning_rate": 0.0004893713952810952, |
| "loss": 3.3613, |
| "step": 31750 |
| }, |
| { |
| "epoch": 9.25770193931629, |
| "grad_norm": 0.3503901958465576, |
| "learning_rate": 0.000489196621031168, |
| "loss": 3.3602, |
| "step": 31800 |
| }, |
| { |
| "epoch": 9.272261370916079, |
| "grad_norm": 0.33610227704048157, |
| "learning_rate": 0.0004890218467812409, |
| "loss": 3.3631, |
| "step": 31850 |
| }, |
| { |
| "epoch": 9.28682080251587, |
| "grad_norm": 0.3341948091983795, |
| "learning_rate": 0.0004888470725313137, |
| "loss": 3.3609, |
| "step": 31900 |
| }, |
| { |
| "epoch": 9.30138023411566, |
| "grad_norm": 0.3447319567203522, |
| "learning_rate": 0.0004886722982813865, |
| "loss": 3.3727, |
| "step": 31950 |
| }, |
| { |
| "epoch": 9.315939665715451, |
| "grad_norm": 0.32863977551460266, |
| "learning_rate": 0.0004884975240314593, |
| "loss": 3.3782, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.315939665715451, |
| "eval_accuracy": 0.366884968827947, |
| "eval_loss": 3.581573724746704, |
| "eval_runtime": 182.0337, |
| "eval_samples_per_second": 91.428, |
| "eval_steps_per_second": 5.719, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.33049909731524, |
| "grad_norm": 0.3508942127227783, |
| "learning_rate": 0.0004883227497815321, |
| "loss": 3.3778, |
| "step": 32050 |
| }, |
| { |
| "epoch": 9.34505852891503, |
| "grad_norm": 0.3674251437187195, |
| "learning_rate": 0.00048814797553160496, |
| "loss": 3.3807, |
| "step": 32100 |
| }, |
| { |
| "epoch": 9.359617960514822, |
| "grad_norm": 0.3387126922607422, |
| "learning_rate": 0.0004879732012816778, |
| "loss": 3.3823, |
| "step": 32150 |
| }, |
| { |
| "epoch": 9.374177392114612, |
| "grad_norm": 0.3542914390563965, |
| "learning_rate": 0.0004877984270317506, |
| "loss": 3.398, |
| "step": 32200 |
| }, |
| { |
| "epoch": 9.388736823714403, |
| "grad_norm": 0.354044109582901, |
| "learning_rate": 0.0004876236527818234, |
| "loss": 3.3764, |
| "step": 32250 |
| }, |
| { |
| "epoch": 9.403296255314192, |
| "grad_norm": 0.3662169575691223, |
| "learning_rate": 0.00048744887853189624, |
| "loss": 3.3919, |
| "step": 32300 |
| }, |
| { |
| "epoch": 9.417855686913983, |
| "grad_norm": 0.33728882670402527, |
| "learning_rate": 0.00048727410428196907, |
| "loss": 3.383, |
| "step": 32350 |
| }, |
| { |
| "epoch": 9.432415118513774, |
| "grad_norm": 0.32222864031791687, |
| "learning_rate": 0.0004870993300320419, |
| "loss": 3.3877, |
| "step": 32400 |
| }, |
| { |
| "epoch": 9.446974550113563, |
| "grad_norm": 0.3222348988056183, |
| "learning_rate": 0.00048692455578211474, |
| "loss": 3.3822, |
| "step": 32450 |
| }, |
| { |
| "epoch": 9.461533981713353, |
| "grad_norm": 0.3391883671283722, |
| "learning_rate": 0.0004867497815321875, |
| "loss": 3.3887, |
| "step": 32500 |
| }, |
| { |
| "epoch": 9.476093413313144, |
| "grad_norm": 0.3517501652240753, |
| "learning_rate": 0.00048657500728226035, |
| "loss": 3.3825, |
| "step": 32550 |
| }, |
| { |
| "epoch": 9.490652844912935, |
| "grad_norm": 0.3315829932689667, |
| "learning_rate": 0.0004864002330323332, |
| "loss": 3.3849, |
| "step": 32600 |
| }, |
| { |
| "epoch": 9.505212276512726, |
| "grad_norm": 0.33583584427833557, |
| "learning_rate": 0.000486225458782406, |
| "loss": 3.3938, |
| "step": 32650 |
| }, |
| { |
| "epoch": 9.519771708112515, |
| "grad_norm": 0.3496243357658386, |
| "learning_rate": 0.0004860506845324788, |
| "loss": 3.3901, |
| "step": 32700 |
| }, |
| { |
| "epoch": 9.534331139712306, |
| "grad_norm": 0.34915950894355774, |
| "learning_rate": 0.0004858759102825516, |
| "loss": 3.402, |
| "step": 32750 |
| }, |
| { |
| "epoch": 9.548890571312096, |
| "grad_norm": 0.3658216893672943, |
| "learning_rate": 0.00048570113603262446, |
| "loss": 3.391, |
| "step": 32800 |
| }, |
| { |
| "epoch": 9.563450002911885, |
| "grad_norm": 0.3504136800765991, |
| "learning_rate": 0.0004855263617826973, |
| "loss": 3.3906, |
| "step": 32850 |
| }, |
| { |
| "epoch": 9.578009434511676, |
| "grad_norm": 0.33254560828208923, |
| "learning_rate": 0.0004853515875327701, |
| "loss": 3.4056, |
| "step": 32900 |
| }, |
| { |
| "epoch": 9.592568866111467, |
| "grad_norm": 0.34906646609306335, |
| "learning_rate": 0.0004851768132828429, |
| "loss": 3.4075, |
| "step": 32950 |
| }, |
| { |
| "epoch": 9.607128297711258, |
| "grad_norm": 0.34559518098831177, |
| "learning_rate": 0.00048500203903291574, |
| "loss": 3.4026, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.607128297711258, |
| "eval_accuracy": 0.36745666125742, |
| "eval_loss": 3.5726640224456787, |
| "eval_runtime": 181.7757, |
| "eval_samples_per_second": 91.558, |
| "eval_steps_per_second": 5.727, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.621687729311049, |
| "grad_norm": 0.3735829293727875, |
| "learning_rate": 0.00048482726478298857, |
| "loss": 3.4065, |
| "step": 33050 |
| }, |
| { |
| "epoch": 9.636247160910838, |
| "grad_norm": 0.3518868684768677, |
| "learning_rate": 0.0004846524905330614, |
| "loss": 3.4036, |
| "step": 33100 |
| }, |
| { |
| "epoch": 9.650806592510628, |
| "grad_norm": 0.3787810802459717, |
| "learning_rate": 0.00048447771628313424, |
| "loss": 3.4012, |
| "step": 33150 |
| }, |
| { |
| "epoch": 9.66536602411042, |
| "grad_norm": 0.36960500478744507, |
| "learning_rate": 0.0004843029420332071, |
| "loss": 3.408, |
| "step": 33200 |
| }, |
| { |
| "epoch": 9.67992545571021, |
| "grad_norm": 0.34325626492500305, |
| "learning_rate": 0.0004841281677832799, |
| "loss": 3.4017, |
| "step": 33250 |
| }, |
| { |
| "epoch": 9.694484887309999, |
| "grad_norm": 0.3455840051174164, |
| "learning_rate": 0.00048395339353335273, |
| "loss": 3.4139, |
| "step": 33300 |
| }, |
| { |
| "epoch": 9.70904431890979, |
| "grad_norm": 0.35434481501579285, |
| "learning_rate": 0.00048377861928342557, |
| "loss": 3.3996, |
| "step": 33350 |
| }, |
| { |
| "epoch": 9.72360375050958, |
| "grad_norm": 0.33681508898735046, |
| "learning_rate": 0.0004836038450334984, |
| "loss": 3.4125, |
| "step": 33400 |
| }, |
| { |
| "epoch": 9.738163182109371, |
| "grad_norm": 0.35238656401634216, |
| "learning_rate": 0.0004834290707835712, |
| "loss": 3.4157, |
| "step": 33450 |
| }, |
| { |
| "epoch": 9.75272261370916, |
| "grad_norm": 0.37718260288238525, |
| "learning_rate": 0.000483254296533644, |
| "loss": 3.4033, |
| "step": 33500 |
| }, |
| { |
| "epoch": 9.767282045308951, |
| "grad_norm": 0.3434363901615143, |
| "learning_rate": 0.00048307952228371685, |
| "loss": 3.4143, |
| "step": 33550 |
| }, |
| { |
| "epoch": 9.781841476908742, |
| "grad_norm": 0.34627440571784973, |
| "learning_rate": 0.0004829047480337897, |
| "loss": 3.4043, |
| "step": 33600 |
| }, |
| { |
| "epoch": 9.796400908508533, |
| "grad_norm": 0.33534497022628784, |
| "learning_rate": 0.0004827299737838625, |
| "loss": 3.4029, |
| "step": 33650 |
| }, |
| { |
| "epoch": 9.810960340108322, |
| "grad_norm": 0.3508129417896271, |
| "learning_rate": 0.0004825551995339353, |
| "loss": 3.406, |
| "step": 33700 |
| }, |
| { |
| "epoch": 9.825519771708112, |
| "grad_norm": 0.34650343656539917, |
| "learning_rate": 0.0004823804252840081, |
| "loss": 3.404, |
| "step": 33750 |
| }, |
| { |
| "epoch": 9.840079203307903, |
| "grad_norm": 0.33442333340644836, |
| "learning_rate": 0.00048220565103408096, |
| "loss": 3.4015, |
| "step": 33800 |
| }, |
| { |
| "epoch": 9.854638634907694, |
| "grad_norm": 0.3506050407886505, |
| "learning_rate": 0.0004820308767841538, |
| "loss": 3.4156, |
| "step": 33850 |
| }, |
| { |
| "epoch": 9.869198066507483, |
| "grad_norm": 0.341828316450119, |
| "learning_rate": 0.0004818561025342266, |
| "loss": 3.4171, |
| "step": 33900 |
| }, |
| { |
| "epoch": 9.883757498107274, |
| "grad_norm": 0.3377910554409027, |
| "learning_rate": 0.0004816813282842994, |
| "loss": 3.4102, |
| "step": 33950 |
| }, |
| { |
| "epoch": 9.898316929707065, |
| "grad_norm": 0.35400837659835815, |
| "learning_rate": 0.00048150655403437223, |
| "loss": 3.4082, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.898316929707065, |
| "eval_accuracy": 0.36838721944064684, |
| "eval_loss": 3.5640623569488525, |
| "eval_runtime": 182.8947, |
| "eval_samples_per_second": 90.998, |
| "eval_steps_per_second": 5.692, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.912876361306855, |
| "grad_norm": 0.3472040593624115, |
| "learning_rate": 0.00048133177978444507, |
| "loss": 3.4179, |
| "step": 34050 |
| }, |
| { |
| "epoch": 9.927435792906644, |
| "grad_norm": 0.3496232032775879, |
| "learning_rate": 0.0004811570055345179, |
| "loss": 3.4113, |
| "step": 34100 |
| }, |
| { |
| "epoch": 9.941995224506435, |
| "grad_norm": 0.33684638142585754, |
| "learning_rate": 0.0004809822312845907, |
| "loss": 3.4137, |
| "step": 34150 |
| }, |
| { |
| "epoch": 9.956554656106226, |
| "grad_norm": 0.34335857629776, |
| "learning_rate": 0.0004808074570346635, |
| "loss": 3.4172, |
| "step": 34200 |
| }, |
| { |
| "epoch": 9.971114087706017, |
| "grad_norm": 0.34269091486930847, |
| "learning_rate": 0.00048063268278473634, |
| "loss": 3.4183, |
| "step": 34250 |
| }, |
| { |
| "epoch": 9.985673519305806, |
| "grad_norm": 0.3301508128643036, |
| "learning_rate": 0.0004804579085348092, |
| "loss": 3.4135, |
| "step": 34300 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.8190501928329468, |
| "learning_rate": 0.000480283134284882, |
| "loss": 3.4084, |
| "step": 34350 |
| }, |
| { |
| "epoch": 10.01455943159979, |
| "grad_norm": 0.34881967306137085, |
| "learning_rate": 0.0004801083600349548, |
| "loss": 3.3121, |
| "step": 34400 |
| }, |
| { |
| "epoch": 10.029118863199582, |
| "grad_norm": 0.3504365086555481, |
| "learning_rate": 0.0004799335857850276, |
| "loss": 3.3012, |
| "step": 34450 |
| }, |
| { |
| "epoch": 10.04367829479937, |
| "grad_norm": 0.3723757565021515, |
| "learning_rate": 0.00047975881153510046, |
| "loss": 3.3123, |
| "step": 34500 |
| }, |
| { |
| "epoch": 10.058237726399161, |
| "grad_norm": 0.3652939200401306, |
| "learning_rate": 0.0004795840372851733, |
| "loss": 3.3082, |
| "step": 34550 |
| }, |
| { |
| "epoch": 10.072797157998952, |
| "grad_norm": 0.36539286375045776, |
| "learning_rate": 0.00047940926303524607, |
| "loss": 3.3053, |
| "step": 34600 |
| }, |
| { |
| "epoch": 10.087356589598743, |
| "grad_norm": 0.34552112221717834, |
| "learning_rate": 0.0004792344887853189, |
| "loss": 3.3203, |
| "step": 34650 |
| }, |
| { |
| "epoch": 10.101916021198532, |
| "grad_norm": 0.34289079904556274, |
| "learning_rate": 0.00047905971453539173, |
| "loss": 3.3335, |
| "step": 34700 |
| }, |
| { |
| "epoch": 10.116475452798323, |
| "grad_norm": 0.34614643454551697, |
| "learning_rate": 0.00047888494028546457, |
| "loss": 3.3293, |
| "step": 34750 |
| }, |
| { |
| "epoch": 10.131034884398114, |
| "grad_norm": 0.365692675113678, |
| "learning_rate": 0.0004787101660355374, |
| "loss": 3.3347, |
| "step": 34800 |
| }, |
| { |
| "epoch": 10.145594315997904, |
| "grad_norm": 0.3478696644306183, |
| "learning_rate": 0.0004785353917856102, |
| "loss": 3.3419, |
| "step": 34850 |
| }, |
| { |
| "epoch": 10.160153747597693, |
| "grad_norm": 0.345829576253891, |
| "learning_rate": 0.000478360617535683, |
| "loss": 3.3263, |
| "step": 34900 |
| }, |
| { |
| "epoch": 10.174713179197484, |
| "grad_norm": 0.4017032980918884, |
| "learning_rate": 0.00047818584328575584, |
| "loss": 3.3348, |
| "step": 34950 |
| }, |
| { |
| "epoch": 10.189272610797275, |
| "grad_norm": 0.34451884031295776, |
| "learning_rate": 0.0004780110690358287, |
| "loss": 3.3356, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.189272610797275, |
| "eval_accuracy": 0.36771605111744, |
| "eval_loss": 3.5778610706329346, |
| "eval_runtime": 183.2483, |
| "eval_samples_per_second": 90.822, |
| "eval_steps_per_second": 5.681, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.203832042397066, |
| "grad_norm": 0.35025554895401, |
| "learning_rate": 0.0004778362947859015, |
| "loss": 3.3442, |
| "step": 35050 |
| }, |
| { |
| "epoch": 10.218391473996855, |
| "grad_norm": 0.34518471360206604, |
| "learning_rate": 0.0004776615205359743, |
| "loss": 3.3374, |
| "step": 35100 |
| }, |
| { |
| "epoch": 10.232950905596645, |
| "grad_norm": 0.35896578431129456, |
| "learning_rate": 0.0004774867462860471, |
| "loss": 3.3453, |
| "step": 35150 |
| }, |
| { |
| "epoch": 10.247510337196436, |
| "grad_norm": 0.3396795094013214, |
| "learning_rate": 0.00047731197203611995, |
| "loss": 3.3457, |
| "step": 35200 |
| }, |
| { |
| "epoch": 10.262069768796227, |
| "grad_norm": 0.3721248209476471, |
| "learning_rate": 0.0004771371977861928, |
| "loss": 3.3458, |
| "step": 35250 |
| }, |
| { |
| "epoch": 10.276629200396016, |
| "grad_norm": 0.3700907230377197, |
| "learning_rate": 0.00047696242353626557, |
| "loss": 3.3381, |
| "step": 35300 |
| }, |
| { |
| "epoch": 10.291188631995807, |
| "grad_norm": 0.3764047622680664, |
| "learning_rate": 0.0004767876492863384, |
| "loss": 3.3418, |
| "step": 35350 |
| }, |
| { |
| "epoch": 10.305748063595598, |
| "grad_norm": 0.3617747724056244, |
| "learning_rate": 0.00047661287503641123, |
| "loss": 3.347, |
| "step": 35400 |
| }, |
| { |
| "epoch": 10.320307495195388, |
| "grad_norm": 0.34759700298309326, |
| "learning_rate": 0.00047643810078648407, |
| "loss": 3.3512, |
| "step": 35450 |
| }, |
| { |
| "epoch": 10.334866926795177, |
| "grad_norm": 0.35689282417297363, |
| "learning_rate": 0.0004762633265365569, |
| "loss": 3.3663, |
| "step": 35500 |
| }, |
| { |
| "epoch": 10.349426358394968, |
| "grad_norm": 0.32792720198631287, |
| "learning_rate": 0.0004760885522866297, |
| "loss": 3.3568, |
| "step": 35550 |
| }, |
| { |
| "epoch": 10.363985789994759, |
| "grad_norm": 0.3390996754169464, |
| "learning_rate": 0.0004759137780367025, |
| "loss": 3.3689, |
| "step": 35600 |
| }, |
| { |
| "epoch": 10.37854522159455, |
| "grad_norm": 0.35693955421447754, |
| "learning_rate": 0.00047573900378677534, |
| "loss": 3.3575, |
| "step": 35650 |
| }, |
| { |
| "epoch": 10.393104653194339, |
| "grad_norm": 0.3452168405056, |
| "learning_rate": 0.00047556422953684823, |
| "loss": 3.3642, |
| "step": 35700 |
| }, |
| { |
| "epoch": 10.40766408479413, |
| "grad_norm": 0.370328426361084, |
| "learning_rate": 0.00047538945528692106, |
| "loss": 3.3595, |
| "step": 35750 |
| }, |
| { |
| "epoch": 10.42222351639392, |
| "grad_norm": 0.37136757373809814, |
| "learning_rate": 0.0004752146810369939, |
| "loss": 3.346, |
| "step": 35800 |
| }, |
| { |
| "epoch": 10.436782947993711, |
| "grad_norm": 0.3773367702960968, |
| "learning_rate": 0.0004750399067870667, |
| "loss": 3.3645, |
| "step": 35850 |
| }, |
| { |
| "epoch": 10.4513423795935, |
| "grad_norm": 0.3447873592376709, |
| "learning_rate": 0.0004748651325371395, |
| "loss": 3.3598, |
| "step": 35900 |
| }, |
| { |
| "epoch": 10.46590181119329, |
| "grad_norm": 0.355688214302063, |
| "learning_rate": 0.00047469035828721234, |
| "loss": 3.3672, |
| "step": 35950 |
| }, |
| { |
| "epoch": 10.480461242793082, |
| "grad_norm": 0.3678136169910431, |
| "learning_rate": 0.0004745155840372852, |
| "loss": 3.3828, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.480461242793082, |
| "eval_accuracy": 0.36819038416155636, |
| "eval_loss": 3.568837881088257, |
| "eval_runtime": 183.926, |
| "eval_samples_per_second": 90.487, |
| "eval_steps_per_second": 5.66, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.495020674392872, |
| "grad_norm": 0.3659283220767975, |
| "learning_rate": 0.00047434080978735795, |
| "loss": 3.3647, |
| "step": 36050 |
| }, |
| { |
| "epoch": 10.509580105992661, |
| "grad_norm": 0.3798047602176666, |
| "learning_rate": 0.0004741660355374308, |
| "loss": 3.3631, |
| "step": 36100 |
| }, |
| { |
| "epoch": 10.524139537592452, |
| "grad_norm": 0.3466806411743164, |
| "learning_rate": 0.0004739912612875036, |
| "loss": 3.359, |
| "step": 36150 |
| }, |
| { |
| "epoch": 10.538698969192243, |
| "grad_norm": 0.35511037707328796, |
| "learning_rate": 0.00047381648703757645, |
| "loss": 3.3755, |
| "step": 36200 |
| }, |
| { |
| "epoch": 10.553258400792034, |
| "grad_norm": 0.3418614864349365, |
| "learning_rate": 0.0004736417127876493, |
| "loss": 3.3799, |
| "step": 36250 |
| }, |
| { |
| "epoch": 10.567817832391823, |
| "grad_norm": 0.38244953751564026, |
| "learning_rate": 0.00047346693853772206, |
| "loss": 3.382, |
| "step": 36300 |
| }, |
| { |
| "epoch": 10.582377263991614, |
| "grad_norm": 0.3323763310909271, |
| "learning_rate": 0.0004732921642877949, |
| "loss": 3.3828, |
| "step": 36350 |
| }, |
| { |
| "epoch": 10.596936695591404, |
| "grad_norm": 0.3437618315219879, |
| "learning_rate": 0.00047311739003786773, |
| "loss": 3.391, |
| "step": 36400 |
| }, |
| { |
| "epoch": 10.611496127191195, |
| "grad_norm": 0.36182549595832825, |
| "learning_rate": 0.00047294261578794056, |
| "loss": 3.3829, |
| "step": 36450 |
| }, |
| { |
| "epoch": 10.626055558790984, |
| "grad_norm": 0.38253724575042725, |
| "learning_rate": 0.0004727678415380134, |
| "loss": 3.3803, |
| "step": 36500 |
| }, |
| { |
| "epoch": 10.640614990390775, |
| "grad_norm": 0.36465519666671753, |
| "learning_rate": 0.0004725930672880862, |
| "loss": 3.3703, |
| "step": 36550 |
| }, |
| { |
| "epoch": 10.655174421990566, |
| "grad_norm": 0.3479657769203186, |
| "learning_rate": 0.000472418293038159, |
| "loss": 3.3709, |
| "step": 36600 |
| }, |
| { |
| "epoch": 10.669733853590357, |
| "grad_norm": 0.3454592227935791, |
| "learning_rate": 0.00047224351878823184, |
| "loss": 3.3876, |
| "step": 36650 |
| }, |
| { |
| "epoch": 10.684293285190146, |
| "grad_norm": 0.34455588459968567, |
| "learning_rate": 0.0004720687445383047, |
| "loss": 3.3788, |
| "step": 36700 |
| }, |
| { |
| "epoch": 10.698852716789936, |
| "grad_norm": 0.357598215341568, |
| "learning_rate": 0.00047189397028837745, |
| "loss": 3.3759, |
| "step": 36750 |
| }, |
| { |
| "epoch": 10.713412148389727, |
| "grad_norm": 0.36810582876205444, |
| "learning_rate": 0.0004717191960384503, |
| "loss": 3.3817, |
| "step": 36800 |
| }, |
| { |
| "epoch": 10.727971579989518, |
| "grad_norm": 0.37969326972961426, |
| "learning_rate": 0.0004715444217885231, |
| "loss": 3.3845, |
| "step": 36850 |
| }, |
| { |
| "epoch": 10.742531011589307, |
| "grad_norm": 0.362560898065567, |
| "learning_rate": 0.00047136964753859595, |
| "loss": 3.397, |
| "step": 36900 |
| }, |
| { |
| "epoch": 10.757090443189098, |
| "grad_norm": 0.36402398347854614, |
| "learning_rate": 0.0004711948732886688, |
| "loss": 3.3797, |
| "step": 36950 |
| }, |
| { |
| "epoch": 10.771649874788888, |
| "grad_norm": 0.3478822410106659, |
| "learning_rate": 0.00047102009903874156, |
| "loss": 3.3911, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.771649874788888, |
| "eval_accuracy": 0.368904117819907, |
| "eval_loss": 3.5626118183135986, |
| "eval_runtime": 183.8574, |
| "eval_samples_per_second": 90.521, |
| "eval_steps_per_second": 5.662, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.78620930638868, |
| "grad_norm": 0.34672781825065613, |
| "learning_rate": 0.0004708453247888144, |
| "loss": 3.3796, |
| "step": 37050 |
| }, |
| { |
| "epoch": 10.800768737988468, |
| "grad_norm": 0.35510483384132385, |
| "learning_rate": 0.00047067055053888723, |
| "loss": 3.3921, |
| "step": 37100 |
| }, |
| { |
| "epoch": 10.815328169588259, |
| "grad_norm": 0.3330132067203522, |
| "learning_rate": 0.00047049577628896006, |
| "loss": 3.3707, |
| "step": 37150 |
| }, |
| { |
| "epoch": 10.82988760118805, |
| "grad_norm": 0.35041606426239014, |
| "learning_rate": 0.0004703210020390329, |
| "loss": 3.3993, |
| "step": 37200 |
| }, |
| { |
| "epoch": 10.84444703278784, |
| "grad_norm": 0.34748944640159607, |
| "learning_rate": 0.0004701462277891057, |
| "loss": 3.3854, |
| "step": 37250 |
| }, |
| { |
| "epoch": 10.85900646438763, |
| "grad_norm": 0.3505236506462097, |
| "learning_rate": 0.0004699714535391785, |
| "loss": 3.3933, |
| "step": 37300 |
| }, |
| { |
| "epoch": 10.87356589598742, |
| "grad_norm": 0.3472146689891815, |
| "learning_rate": 0.00046979667928925134, |
| "loss": 3.3877, |
| "step": 37350 |
| }, |
| { |
| "epoch": 10.888125327587211, |
| "grad_norm": 0.33038902282714844, |
| "learning_rate": 0.0004696219050393242, |
| "loss": 3.3872, |
| "step": 37400 |
| }, |
| { |
| "epoch": 10.902684759187002, |
| "grad_norm": 0.33716917037963867, |
| "learning_rate": 0.00046944713078939695, |
| "loss": 3.3962, |
| "step": 37450 |
| }, |
| { |
| "epoch": 10.917244190786791, |
| "grad_norm": 0.3526748716831207, |
| "learning_rate": 0.0004692723565394698, |
| "loss": 3.3928, |
| "step": 37500 |
| }, |
| { |
| "epoch": 10.931803622386582, |
| "grad_norm": 0.36475178599357605, |
| "learning_rate": 0.0004690975822895426, |
| "loss": 3.3842, |
| "step": 37550 |
| }, |
| { |
| "epoch": 10.946363053986373, |
| "grad_norm": 0.36359477043151855, |
| "learning_rate": 0.00046892280803961545, |
| "loss": 3.401, |
| "step": 37600 |
| }, |
| { |
| "epoch": 10.960922485586163, |
| "grad_norm": 0.35189494490623474, |
| "learning_rate": 0.0004687480337896883, |
| "loss": 3.3937, |
| "step": 37650 |
| }, |
| { |
| "epoch": 10.975481917185952, |
| "grad_norm": 0.3400118350982666, |
| "learning_rate": 0.00046857325953976106, |
| "loss": 3.3934, |
| "step": 37700 |
| }, |
| { |
| "epoch": 10.990041348785743, |
| "grad_norm": 0.3473895490169525, |
| "learning_rate": 0.0004683984852898339, |
| "loss": 3.3902, |
| "step": 37750 |
| }, |
| { |
| "epoch": 11.004367829479937, |
| "grad_norm": 0.3693157732486725, |
| "learning_rate": 0.00046822371103990673, |
| "loss": 3.3584, |
| "step": 37800 |
| }, |
| { |
| "epoch": 11.018927261079728, |
| "grad_norm": 0.34884193539619446, |
| "learning_rate": 0.00046804893678997956, |
| "loss": 3.2712, |
| "step": 37850 |
| }, |
| { |
| "epoch": 11.033486692679517, |
| "grad_norm": 0.331039696931839, |
| "learning_rate": 0.00046787416254005234, |
| "loss": 3.2856, |
| "step": 37900 |
| }, |
| { |
| "epoch": 11.048046124279308, |
| "grad_norm": 0.34825077652931213, |
| "learning_rate": 0.0004676993882901252, |
| "loss": 3.2941, |
| "step": 37950 |
| }, |
| { |
| "epoch": 11.062605555879099, |
| "grad_norm": 0.3396894633769989, |
| "learning_rate": 0.000467524614040198, |
| "loss": 3.2861, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.062605555879099, |
| "eval_accuracy": 0.36855336534826616, |
| "eval_loss": 3.5719475746154785, |
| "eval_runtime": 180.4716, |
| "eval_samples_per_second": 92.219, |
| "eval_steps_per_second": 5.768, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.07716498747889, |
| "grad_norm": 0.41541653871536255, |
| "learning_rate": 0.00046734983979027084, |
| "loss": 3.2967, |
| "step": 38050 |
| }, |
| { |
| "epoch": 11.091724419078679, |
| "grad_norm": 0.34760013222694397, |
| "learning_rate": 0.00046717506554034367, |
| "loss": 3.2988, |
| "step": 38100 |
| }, |
| { |
| "epoch": 11.10628385067847, |
| "grad_norm": 0.3493053913116455, |
| "learning_rate": 0.00046700029129041645, |
| "loss": 3.2994, |
| "step": 38150 |
| }, |
| { |
| "epoch": 11.12084328227826, |
| "grad_norm": 0.35706987977027893, |
| "learning_rate": 0.0004668255170404893, |
| "loss": 3.315, |
| "step": 38200 |
| }, |
| { |
| "epoch": 11.135402713878051, |
| "grad_norm": 0.3363507390022278, |
| "learning_rate": 0.00046665074279056217, |
| "loss": 3.3071, |
| "step": 38250 |
| }, |
| { |
| "epoch": 11.14996214547784, |
| "grad_norm": 0.3618837296962738, |
| "learning_rate": 0.000466475968540635, |
| "loss": 3.3106, |
| "step": 38300 |
| }, |
| { |
| "epoch": 11.16452157707763, |
| "grad_norm": 0.33892515301704407, |
| "learning_rate": 0.00046630119429070784, |
| "loss": 3.3133, |
| "step": 38350 |
| }, |
| { |
| "epoch": 11.179081008677421, |
| "grad_norm": 0.33202266693115234, |
| "learning_rate": 0.00046612642004078067, |
| "loss": 3.3235, |
| "step": 38400 |
| }, |
| { |
| "epoch": 11.193640440277212, |
| "grad_norm": 0.3930901288986206, |
| "learning_rate": 0.00046595164579085345, |
| "loss": 3.3299, |
| "step": 38450 |
| }, |
| { |
| "epoch": 11.208199871877001, |
| "grad_norm": 0.4052780568599701, |
| "learning_rate": 0.0004657768715409263, |
| "loss": 3.3197, |
| "step": 38500 |
| }, |
| { |
| "epoch": 11.222759303476792, |
| "grad_norm": 0.3582177460193634, |
| "learning_rate": 0.0004656020972909991, |
| "loss": 3.3247, |
| "step": 38550 |
| }, |
| { |
| "epoch": 11.237318735076583, |
| "grad_norm": 0.3405052423477173, |
| "learning_rate": 0.00046542732304107195, |
| "loss": 3.3235, |
| "step": 38600 |
| }, |
| { |
| "epoch": 11.251878166676374, |
| "grad_norm": 0.32738906145095825, |
| "learning_rate": 0.0004652525487911447, |
| "loss": 3.3191, |
| "step": 38650 |
| }, |
| { |
| "epoch": 11.266437598276163, |
| "grad_norm": 0.36800041794776917, |
| "learning_rate": 0.00046507777454121756, |
| "loss": 3.3328, |
| "step": 38700 |
| }, |
| { |
| "epoch": 11.280997029875953, |
| "grad_norm": 0.37207457423210144, |
| "learning_rate": 0.0004649030002912904, |
| "loss": 3.3304, |
| "step": 38750 |
| }, |
| { |
| "epoch": 11.295556461475744, |
| "grad_norm": 0.36415359377861023, |
| "learning_rate": 0.0004647282260413632, |
| "loss": 3.3409, |
| "step": 38800 |
| }, |
| { |
| "epoch": 11.310115893075535, |
| "grad_norm": 0.3438774049282074, |
| "learning_rate": 0.00046455345179143606, |
| "loss": 3.3288, |
| "step": 38850 |
| }, |
| { |
| "epoch": 11.324675324675324, |
| "grad_norm": 0.3514200448989868, |
| "learning_rate": 0.00046437867754150884, |
| "loss": 3.331, |
| "step": 38900 |
| }, |
| { |
| "epoch": 11.339234756275115, |
| "grad_norm": 0.34444525837898254, |
| "learning_rate": 0.00046420390329158167, |
| "loss": 3.3253, |
| "step": 38950 |
| }, |
| { |
| "epoch": 11.353794187874906, |
| "grad_norm": 0.34927886724472046, |
| "learning_rate": 0.0004640291290416545, |
| "loss": 3.3361, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.353794187874906, |
| "eval_accuracy": 0.36882192674458786, |
| "eval_loss": 3.568171739578247, |
| "eval_runtime": 180.5055, |
| "eval_samples_per_second": 92.202, |
| "eval_steps_per_second": 5.767, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.368353619474696, |
| "grad_norm": 0.34765294194221497, |
| "learning_rate": 0.00046385435479172734, |
| "loss": 3.3385, |
| "step": 39050 |
| }, |
| { |
| "epoch": 11.382913051074485, |
| "grad_norm": 0.37567201256752014, |
| "learning_rate": 0.00046367958054180017, |
| "loss": 3.3372, |
| "step": 39100 |
| }, |
| { |
| "epoch": 11.397472482674276, |
| "grad_norm": 0.3298972547054291, |
| "learning_rate": 0.00046350480629187295, |
| "loss": 3.3449, |
| "step": 39150 |
| }, |
| { |
| "epoch": 11.412031914274067, |
| "grad_norm": 0.3385719656944275, |
| "learning_rate": 0.0004633300320419458, |
| "loss": 3.3381, |
| "step": 39200 |
| }, |
| { |
| "epoch": 11.426591345873858, |
| "grad_norm": 0.3834417164325714, |
| "learning_rate": 0.0004631552577920186, |
| "loss": 3.3453, |
| "step": 39250 |
| }, |
| { |
| "epoch": 11.441150777473647, |
| "grad_norm": 0.36645200848579407, |
| "learning_rate": 0.00046298048354209145, |
| "loss": 3.3585, |
| "step": 39300 |
| }, |
| { |
| "epoch": 11.455710209073438, |
| "grad_norm": 0.3596128523349762, |
| "learning_rate": 0.0004628057092921642, |
| "loss": 3.3505, |
| "step": 39350 |
| }, |
| { |
| "epoch": 11.470269640673228, |
| "grad_norm": 0.37306201457977295, |
| "learning_rate": 0.00046263093504223706, |
| "loss": 3.3489, |
| "step": 39400 |
| }, |
| { |
| "epoch": 11.484829072273019, |
| "grad_norm": 0.40729859471321106, |
| "learning_rate": 0.0004624561607923099, |
| "loss": 3.3555, |
| "step": 39450 |
| }, |
| { |
| "epoch": 11.499388503872808, |
| "grad_norm": 0.35100769996643066, |
| "learning_rate": 0.0004622813865423827, |
| "loss": 3.3443, |
| "step": 39500 |
| }, |
| { |
| "epoch": 11.513947935472599, |
| "grad_norm": 0.347989559173584, |
| "learning_rate": 0.00046210661229245556, |
| "loss": 3.3647, |
| "step": 39550 |
| }, |
| { |
| "epoch": 11.52850736707239, |
| "grad_norm": 0.35340970754623413, |
| "learning_rate": 0.00046193183804252834, |
| "loss": 3.346, |
| "step": 39600 |
| }, |
| { |
| "epoch": 11.54306679867218, |
| "grad_norm": 0.3439280092716217, |
| "learning_rate": 0.00046175706379260117, |
| "loss": 3.3613, |
| "step": 39650 |
| }, |
| { |
| "epoch": 11.55762623027197, |
| "grad_norm": 0.34520137310028076, |
| "learning_rate": 0.000461582289542674, |
| "loss": 3.347, |
| "step": 39700 |
| }, |
| { |
| "epoch": 11.57218566187176, |
| "grad_norm": 0.3320297598838806, |
| "learning_rate": 0.00046140751529274684, |
| "loss": 3.3489, |
| "step": 39750 |
| }, |
| { |
| "epoch": 11.586745093471551, |
| "grad_norm": 0.35040003061294556, |
| "learning_rate": 0.00046123274104281967, |
| "loss": 3.3462, |
| "step": 39800 |
| }, |
| { |
| "epoch": 11.601304525071342, |
| "grad_norm": 0.3691483438014984, |
| "learning_rate": 0.00046105796679289245, |
| "loss": 3.3593, |
| "step": 39850 |
| }, |
| { |
| "epoch": 11.61586395667113, |
| "grad_norm": 0.3896438777446747, |
| "learning_rate": 0.0004608831925429653, |
| "loss": 3.3616, |
| "step": 39900 |
| }, |
| { |
| "epoch": 11.630423388270922, |
| "grad_norm": 0.36567434668540955, |
| "learning_rate": 0.0004607084182930381, |
| "loss": 3.3553, |
| "step": 39950 |
| }, |
| { |
| "epoch": 11.644982819870712, |
| "grad_norm": 0.343128502368927, |
| "learning_rate": 0.00046053364404311095, |
| "loss": 3.361, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.644982819870712, |
| "eval_accuracy": 0.36932248097582326, |
| "eval_loss": 3.5595271587371826, |
| "eval_runtime": 180.4161, |
| "eval_samples_per_second": 92.248, |
| "eval_steps_per_second": 5.77, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.659542251470503, |
| "grad_norm": 0.35191434621810913, |
| "learning_rate": 0.0004603588697931837, |
| "loss": 3.3577, |
| "step": 40050 |
| }, |
| { |
| "epoch": 11.674101683070292, |
| "grad_norm": 0.36230576038360596, |
| "learning_rate": 0.00046018409554325656, |
| "loss": 3.357, |
| "step": 40100 |
| }, |
| { |
| "epoch": 11.688661114670083, |
| "grad_norm": 0.3622187077999115, |
| "learning_rate": 0.0004600093212933294, |
| "loss": 3.3582, |
| "step": 40150 |
| }, |
| { |
| "epoch": 11.703220546269874, |
| "grad_norm": 0.3318762183189392, |
| "learning_rate": 0.0004598345470434022, |
| "loss": 3.3514, |
| "step": 40200 |
| }, |
| { |
| "epoch": 11.717779977869665, |
| "grad_norm": 0.39714378118515015, |
| "learning_rate": 0.00045965977279347506, |
| "loss": 3.3712, |
| "step": 40250 |
| }, |
| { |
| "epoch": 11.732339409469454, |
| "grad_norm": 0.3500266969203949, |
| "learning_rate": 0.00045948499854354784, |
| "loss": 3.3497, |
| "step": 40300 |
| }, |
| { |
| "epoch": 11.746898841069244, |
| "grad_norm": 0.3604873716831207, |
| "learning_rate": 0.00045931022429362067, |
| "loss": 3.3741, |
| "step": 40350 |
| }, |
| { |
| "epoch": 11.761458272669035, |
| "grad_norm": 0.3486907482147217, |
| "learning_rate": 0.0004591354500436935, |
| "loss": 3.3652, |
| "step": 40400 |
| }, |
| { |
| "epoch": 11.776017704268826, |
| "grad_norm": 0.3572530448436737, |
| "learning_rate": 0.00045896067579376634, |
| "loss": 3.3718, |
| "step": 40450 |
| }, |
| { |
| "epoch": 11.790577135868615, |
| "grad_norm": 0.3657876253128052, |
| "learning_rate": 0.0004587859015438391, |
| "loss": 3.3635, |
| "step": 40500 |
| }, |
| { |
| "epoch": 11.805136567468406, |
| "grad_norm": 0.3681361675262451, |
| "learning_rate": 0.00045861112729391195, |
| "loss": 3.3774, |
| "step": 40550 |
| }, |
| { |
| "epoch": 11.819695999068196, |
| "grad_norm": 0.3658032715320587, |
| "learning_rate": 0.0004584363530439848, |
| "loss": 3.3633, |
| "step": 40600 |
| }, |
| { |
| "epoch": 11.834255430667987, |
| "grad_norm": 0.35775232315063477, |
| "learning_rate": 0.0004582615787940576, |
| "loss": 3.3666, |
| "step": 40650 |
| }, |
| { |
| "epoch": 11.848814862267776, |
| "grad_norm": 0.3474526107311249, |
| "learning_rate": 0.00045808680454413045, |
| "loss": 3.3651, |
| "step": 40700 |
| }, |
| { |
| "epoch": 11.863374293867567, |
| "grad_norm": 0.332225501537323, |
| "learning_rate": 0.00045791203029420333, |
| "loss": 3.3805, |
| "step": 40750 |
| }, |
| { |
| "epoch": 11.877933725467358, |
| "grad_norm": 0.3563697338104248, |
| "learning_rate": 0.0004577372560442761, |
| "loss": 3.3677, |
| "step": 40800 |
| }, |
| { |
| "epoch": 11.892493157067149, |
| "grad_norm": 0.35877200961112976, |
| "learning_rate": 0.00045756248179434894, |
| "loss": 3.36, |
| "step": 40850 |
| }, |
| { |
| "epoch": 11.90705258866694, |
| "grad_norm": 0.3673311471939087, |
| "learning_rate": 0.0004573877075444218, |
| "loss": 3.3754, |
| "step": 40900 |
| }, |
| { |
| "epoch": 11.921612020266728, |
| "grad_norm": 0.3623284697532654, |
| "learning_rate": 0.0004572129332944946, |
| "loss": 3.3718, |
| "step": 40950 |
| }, |
| { |
| "epoch": 11.93617145186652, |
| "grad_norm": 0.3350276052951813, |
| "learning_rate": 0.00045703815904456744, |
| "loss": 3.382, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.93617145186652, |
| "eval_accuracy": 0.3699698973716846, |
| "eval_loss": 3.5505945682525635, |
| "eval_runtime": 180.4963, |
| "eval_samples_per_second": 92.207, |
| "eval_steps_per_second": 5.767, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.95073088346631, |
| "grad_norm": 0.38111695647239685, |
| "learning_rate": 0.0004568633847946402, |
| "loss": 3.3643, |
| "step": 41050 |
| }, |
| { |
| "epoch": 11.965290315066099, |
| "grad_norm": 0.3372560739517212, |
| "learning_rate": 0.00045668861054471306, |
| "loss": 3.3782, |
| "step": 41100 |
| }, |
| { |
| "epoch": 11.97984974666589, |
| "grad_norm": 0.3638279139995575, |
| "learning_rate": 0.0004565138362947859, |
| "loss": 3.3817, |
| "step": 41150 |
| }, |
| { |
| "epoch": 11.99440917826568, |
| "grad_norm": 0.3317911922931671, |
| "learning_rate": 0.0004563390620448587, |
| "loss": 3.3756, |
| "step": 41200 |
| }, |
| { |
| "epoch": 12.008735658959875, |
| "grad_norm": 0.36474958062171936, |
| "learning_rate": 0.0004561642877949315, |
| "loss": 3.3129, |
| "step": 41250 |
| }, |
| { |
| "epoch": 12.023295090559664, |
| "grad_norm": 0.3482862412929535, |
| "learning_rate": 0.00045598951354500433, |
| "loss": 3.2551, |
| "step": 41300 |
| }, |
| { |
| "epoch": 12.037854522159455, |
| "grad_norm": 0.3640674948692322, |
| "learning_rate": 0.00045581473929507717, |
| "loss": 3.2729, |
| "step": 41350 |
| }, |
| { |
| "epoch": 12.052413953759245, |
| "grad_norm": 0.3480179011821747, |
| "learning_rate": 0.00045563996504515, |
| "loss": 3.2839, |
| "step": 41400 |
| }, |
| { |
| "epoch": 12.066973385359036, |
| "grad_norm": 0.36980733275413513, |
| "learning_rate": 0.00045546519079522283, |
| "loss": 3.2803, |
| "step": 41450 |
| }, |
| { |
| "epoch": 12.081532816958825, |
| "grad_norm": 0.3632776141166687, |
| "learning_rate": 0.0004552904165452956, |
| "loss": 3.276, |
| "step": 41500 |
| }, |
| { |
| "epoch": 12.096092248558616, |
| "grad_norm": 0.3438667058944702, |
| "learning_rate": 0.00045511564229536844, |
| "loss": 3.2754, |
| "step": 41550 |
| }, |
| { |
| "epoch": 12.110651680158407, |
| "grad_norm": 0.37722644209861755, |
| "learning_rate": 0.0004549408680454413, |
| "loss": 3.2876, |
| "step": 41600 |
| }, |
| { |
| "epoch": 12.125211111758198, |
| "grad_norm": 0.3496084213256836, |
| "learning_rate": 0.0004547660937955141, |
| "loss": 3.2876, |
| "step": 41650 |
| }, |
| { |
| "epoch": 12.139770543357987, |
| "grad_norm": 0.35013580322265625, |
| "learning_rate": 0.00045459131954558694, |
| "loss": 3.2979, |
| "step": 41700 |
| }, |
| { |
| "epoch": 12.154329974957777, |
| "grad_norm": 0.36948785185813904, |
| "learning_rate": 0.0004544165452956597, |
| "loss": 3.2904, |
| "step": 41750 |
| }, |
| { |
| "epoch": 12.168889406557568, |
| "grad_norm": 0.3557933568954468, |
| "learning_rate": 0.00045424177104573255, |
| "loss": 3.3091, |
| "step": 41800 |
| }, |
| { |
| "epoch": 12.183448838157359, |
| "grad_norm": 0.3801754415035248, |
| "learning_rate": 0.0004540669967958054, |
| "loss": 3.3023, |
| "step": 41850 |
| }, |
| { |
| "epoch": 12.198008269757148, |
| "grad_norm": 0.3558266758918762, |
| "learning_rate": 0.0004538922225458782, |
| "loss": 3.3032, |
| "step": 41900 |
| }, |
| { |
| "epoch": 12.212567701356939, |
| "grad_norm": 0.347100168466568, |
| "learning_rate": 0.000453717448295951, |
| "loss": 3.3132, |
| "step": 41950 |
| }, |
| { |
| "epoch": 12.22712713295673, |
| "grad_norm": 0.38097837567329407, |
| "learning_rate": 0.00045354267404602383, |
| "loss": 3.3023, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.22712713295673, |
| "eval_accuracy": 0.3693688089925267, |
| "eval_loss": 3.5664913654327393, |
| "eval_runtime": 180.5022, |
| "eval_samples_per_second": 92.204, |
| "eval_steps_per_second": 5.767, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.24168656455652, |
| "grad_norm": 0.3856671154499054, |
| "learning_rate": 0.00045336789979609667, |
| "loss": 3.3158, |
| "step": 42050 |
| }, |
| { |
| "epoch": 12.25624599615631, |
| "grad_norm": 0.38062354922294617, |
| "learning_rate": 0.0004531931255461695, |
| "loss": 3.3048, |
| "step": 42100 |
| }, |
| { |
| "epoch": 12.2708054277561, |
| "grad_norm": 0.36241772770881653, |
| "learning_rate": 0.00045301835129624233, |
| "loss": 3.3038, |
| "step": 42150 |
| }, |
| { |
| "epoch": 12.28536485935589, |
| "grad_norm": 0.3613075315952301, |
| "learning_rate": 0.0004528435770463151, |
| "loss": 3.3044, |
| "step": 42200 |
| }, |
| { |
| "epoch": 12.299924290955682, |
| "grad_norm": 0.3558962941169739, |
| "learning_rate": 0.00045266880279638794, |
| "loss": 3.319, |
| "step": 42250 |
| }, |
| { |
| "epoch": 12.31448372255547, |
| "grad_norm": 0.3771170973777771, |
| "learning_rate": 0.0004524940285464608, |
| "loss": 3.3102, |
| "step": 42300 |
| }, |
| { |
| "epoch": 12.329043154155261, |
| "grad_norm": 0.3604891896247864, |
| "learning_rate": 0.0004523192542965336, |
| "loss": 3.3279, |
| "step": 42350 |
| }, |
| { |
| "epoch": 12.343602585755052, |
| "grad_norm": 0.3826010525226593, |
| "learning_rate": 0.00045214448004660644, |
| "loss": 3.3174, |
| "step": 42400 |
| }, |
| { |
| "epoch": 12.358162017354843, |
| "grad_norm": 0.36643317341804504, |
| "learning_rate": 0.0004519697057966792, |
| "loss": 3.3172, |
| "step": 42450 |
| }, |
| { |
| "epoch": 12.372721448954632, |
| "grad_norm": 0.3626962900161743, |
| "learning_rate": 0.00045179493154675205, |
| "loss": 3.3235, |
| "step": 42500 |
| }, |
| { |
| "epoch": 12.387280880554423, |
| "grad_norm": 0.3473532199859619, |
| "learning_rate": 0.0004516201572968249, |
| "loss": 3.328, |
| "step": 42550 |
| }, |
| { |
| "epoch": 12.401840312154214, |
| "grad_norm": 0.3678642213344574, |
| "learning_rate": 0.0004514453830468977, |
| "loss": 3.328, |
| "step": 42600 |
| }, |
| { |
| "epoch": 12.416399743754004, |
| "grad_norm": 0.3755843937397003, |
| "learning_rate": 0.0004512706087969705, |
| "loss": 3.3249, |
| "step": 42650 |
| }, |
| { |
| "epoch": 12.430959175353793, |
| "grad_norm": 0.36632055044174194, |
| "learning_rate": 0.00045109583454704333, |
| "loss": 3.3178, |
| "step": 42700 |
| }, |
| { |
| "epoch": 12.445518606953584, |
| "grad_norm": 0.348257839679718, |
| "learning_rate": 0.00045092106029711616, |
| "loss": 3.3205, |
| "step": 42750 |
| }, |
| { |
| "epoch": 12.460078038553375, |
| "grad_norm": 0.39049002528190613, |
| "learning_rate": 0.000450746286047189, |
| "loss": 3.3324, |
| "step": 42800 |
| }, |
| { |
| "epoch": 12.474637470153166, |
| "grad_norm": 0.35907331109046936, |
| "learning_rate": 0.00045057151179726183, |
| "loss": 3.3413, |
| "step": 42850 |
| }, |
| { |
| "epoch": 12.489196901752955, |
| "grad_norm": 0.3372901380062103, |
| "learning_rate": 0.0004503967375473346, |
| "loss": 3.3362, |
| "step": 42900 |
| }, |
| { |
| "epoch": 12.503756333352746, |
| "grad_norm": 0.3593348562717438, |
| "learning_rate": 0.00045022196329740744, |
| "loss": 3.3363, |
| "step": 42950 |
| }, |
| { |
| "epoch": 12.518315764952536, |
| "grad_norm": 0.3299802541732788, |
| "learning_rate": 0.0004500471890474803, |
| "loss": 3.3451, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.518315764952536, |
| "eval_accuracy": 0.37018825048594445, |
| "eval_loss": 3.559201955795288, |
| "eval_runtime": 180.5771, |
| "eval_samples_per_second": 92.166, |
| "eval_steps_per_second": 5.765, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.532875196552327, |
| "grad_norm": 0.390259712934494, |
| "learning_rate": 0.0004498724147975531, |
| "loss": 3.3272, |
| "step": 43050 |
| }, |
| { |
| "epoch": 12.547434628152118, |
| "grad_norm": 0.3740730583667755, |
| "learning_rate": 0.00044969764054762594, |
| "loss": 3.3344, |
| "step": 43100 |
| }, |
| { |
| "epoch": 12.561994059751907, |
| "grad_norm": 0.3831499516963959, |
| "learning_rate": 0.0004495228662976987, |
| "loss": 3.3283, |
| "step": 43150 |
| }, |
| { |
| "epoch": 12.576553491351698, |
| "grad_norm": 0.3770614266395569, |
| "learning_rate": 0.00044934809204777155, |
| "loss": 3.3304, |
| "step": 43200 |
| }, |
| { |
| "epoch": 12.591112922951488, |
| "grad_norm": 0.3610830008983612, |
| "learning_rate": 0.0004491733177978444, |
| "loss": 3.3299, |
| "step": 43250 |
| }, |
| { |
| "epoch": 12.605672354551277, |
| "grad_norm": 0.36223649978637695, |
| "learning_rate": 0.0004489985435479173, |
| "loss": 3.338, |
| "step": 43300 |
| }, |
| { |
| "epoch": 12.620231786151068, |
| "grad_norm": 0.3683512210845947, |
| "learning_rate": 0.0004488237692979901, |
| "loss": 3.3381, |
| "step": 43350 |
| }, |
| { |
| "epoch": 12.634791217750859, |
| "grad_norm": 0.3539344370365143, |
| "learning_rate": 0.0004486489950480629, |
| "loss": 3.3481, |
| "step": 43400 |
| }, |
| { |
| "epoch": 12.64935064935065, |
| "grad_norm": 0.35910946130752563, |
| "learning_rate": 0.0004484742207981357, |
| "loss": 3.3372, |
| "step": 43450 |
| }, |
| { |
| "epoch": 12.66391008095044, |
| "grad_norm": 0.354937881231308, |
| "learning_rate": 0.00044829944654820855, |
| "loss": 3.3418, |
| "step": 43500 |
| }, |
| { |
| "epoch": 12.67846951255023, |
| "grad_norm": 0.3593963384628296, |
| "learning_rate": 0.0004481246722982814, |
| "loss": 3.3433, |
| "step": 43550 |
| }, |
| { |
| "epoch": 12.69302894415002, |
| "grad_norm": 0.3790937066078186, |
| "learning_rate": 0.0004479498980483542, |
| "loss": 3.344, |
| "step": 43600 |
| }, |
| { |
| "epoch": 12.707588375749811, |
| "grad_norm": 0.37113437056541443, |
| "learning_rate": 0.000447775123798427, |
| "loss": 3.3403, |
| "step": 43650 |
| }, |
| { |
| "epoch": 12.7221478073496, |
| "grad_norm": 0.3723011612892151, |
| "learning_rate": 0.00044760034954849983, |
| "loss": 3.3454, |
| "step": 43700 |
| }, |
| { |
| "epoch": 12.736707238949391, |
| "grad_norm": 0.42552995681762695, |
| "learning_rate": 0.00044742557529857266, |
| "loss": 3.3587, |
| "step": 43750 |
| }, |
| { |
| "epoch": 12.751266670549182, |
| "grad_norm": 0.37783387303352356, |
| "learning_rate": 0.0004472508010486455, |
| "loss": 3.3452, |
| "step": 43800 |
| }, |
| { |
| "epoch": 12.765826102148973, |
| "grad_norm": 0.36067041754722595, |
| "learning_rate": 0.0004470760267987183, |
| "loss": 3.3381, |
| "step": 43850 |
| }, |
| { |
| "epoch": 12.780385533748763, |
| "grad_norm": 0.34738510847091675, |
| "learning_rate": 0.0004469012525487911, |
| "loss": 3.3478, |
| "step": 43900 |
| }, |
| { |
| "epoch": 12.794944965348552, |
| "grad_norm": 0.37115344405174255, |
| "learning_rate": 0.00044672647829886394, |
| "loss": 3.3477, |
| "step": 43950 |
| }, |
| { |
| "epoch": 12.809504396948343, |
| "grad_norm": 0.3720683753490448, |
| "learning_rate": 0.00044655170404893677, |
| "loss": 3.3421, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.809504396948343, |
| "eval_accuracy": 0.3703436962678785, |
| "eval_loss": 3.550240993499756, |
| "eval_runtime": 180.6249, |
| "eval_samples_per_second": 92.141, |
| "eval_steps_per_second": 5.763, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.824063828548134, |
| "grad_norm": 0.38867267966270447, |
| "learning_rate": 0.0004463769297990096, |
| "loss": 3.3632, |
| "step": 44050 |
| }, |
| { |
| "epoch": 12.838623260147925, |
| "grad_norm": 0.3558189868927002, |
| "learning_rate": 0.0004462021555490824, |
| "loss": 3.3466, |
| "step": 44100 |
| }, |
| { |
| "epoch": 12.853182691747714, |
| "grad_norm": 0.37346190214157104, |
| "learning_rate": 0.0004460273812991552, |
| "loss": 3.3497, |
| "step": 44150 |
| }, |
| { |
| "epoch": 12.867742123347504, |
| "grad_norm": 0.3574129641056061, |
| "learning_rate": 0.00044585260704922805, |
| "loss": 3.3586, |
| "step": 44200 |
| }, |
| { |
| "epoch": 12.882301554947295, |
| "grad_norm": 0.3636460602283478, |
| "learning_rate": 0.0004456778327993009, |
| "loss": 3.3511, |
| "step": 44250 |
| }, |
| { |
| "epoch": 12.896860986547086, |
| "grad_norm": 0.38483962416648865, |
| "learning_rate": 0.0004455030585493737, |
| "loss": 3.3554, |
| "step": 44300 |
| }, |
| { |
| "epoch": 12.911420418146875, |
| "grad_norm": 0.3610190749168396, |
| "learning_rate": 0.0004453282842994465, |
| "loss": 3.3622, |
| "step": 44350 |
| }, |
| { |
| "epoch": 12.925979849746666, |
| "grad_norm": 0.3633497357368469, |
| "learning_rate": 0.00044515351004951933, |
| "loss": 3.3642, |
| "step": 44400 |
| }, |
| { |
| "epoch": 12.940539281346457, |
| "grad_norm": 0.37331637740135193, |
| "learning_rate": 0.00044497873579959216, |
| "loss": 3.3451, |
| "step": 44450 |
| }, |
| { |
| "epoch": 12.955098712946247, |
| "grad_norm": 0.3372592031955719, |
| "learning_rate": 0.000444803961549665, |
| "loss": 3.3385, |
| "step": 44500 |
| }, |
| { |
| "epoch": 12.969658144546036, |
| "grad_norm": 0.38096725940704346, |
| "learning_rate": 0.0004446291872997378, |
| "loss": 3.3511, |
| "step": 44550 |
| }, |
| { |
| "epoch": 12.984217576145827, |
| "grad_norm": 0.3771226704120636, |
| "learning_rate": 0.0004444544130498106, |
| "loss": 3.3651, |
| "step": 44600 |
| }, |
| { |
| "epoch": 12.998777007745618, |
| "grad_norm": 0.37394359707832336, |
| "learning_rate": 0.00044427963879988344, |
| "loss": 3.3551, |
| "step": 44650 |
| }, |
| { |
| "epoch": 13.01310348843981, |
| "grad_norm": 0.3792758882045746, |
| "learning_rate": 0.00044410486454995627, |
| "loss": 3.2584, |
| "step": 44700 |
| }, |
| { |
| "epoch": 13.027662920039601, |
| "grad_norm": 0.3561805188655853, |
| "learning_rate": 0.0004439300903000291, |
| "loss": 3.2468, |
| "step": 44750 |
| }, |
| { |
| "epoch": 13.042222351639392, |
| "grad_norm": 0.3759787678718567, |
| "learning_rate": 0.0004437553160501019, |
| "loss": 3.2549, |
| "step": 44800 |
| }, |
| { |
| "epoch": 13.056781783239183, |
| "grad_norm": 0.3348606824874878, |
| "learning_rate": 0.0004435805418001747, |
| "loss": 3.2564, |
| "step": 44850 |
| }, |
| { |
| "epoch": 13.071341214838972, |
| "grad_norm": 0.36148425936698914, |
| "learning_rate": 0.00044340576755024755, |
| "loss": 3.2635, |
| "step": 44900 |
| }, |
| { |
| "epoch": 13.085900646438763, |
| "grad_norm": 0.37378937005996704, |
| "learning_rate": 0.0004432309933003204, |
| "loss": 3.2627, |
| "step": 44950 |
| }, |
| { |
| "epoch": 13.100460078038553, |
| "grad_norm": 0.39774009585380554, |
| "learning_rate": 0.0004430562190503932, |
| "loss": 3.278, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.100460078038553, |
| "eval_accuracy": 0.36968699075191663, |
| "eval_loss": 3.564668893814087, |
| "eval_runtime": 180.6228, |
| "eval_samples_per_second": 92.142, |
| "eval_steps_per_second": 5.763, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.115019509638344, |
| "grad_norm": 0.3558865785598755, |
| "learning_rate": 0.000442881444800466, |
| "loss": 3.2663, |
| "step": 45050 |
| }, |
| { |
| "epoch": 13.129578941238133, |
| "grad_norm": 0.3488845229148865, |
| "learning_rate": 0.00044270667055053883, |
| "loss": 3.2783, |
| "step": 45100 |
| }, |
| { |
| "epoch": 13.144138372837924, |
| "grad_norm": 0.3732559382915497, |
| "learning_rate": 0.00044253189630061166, |
| "loss": 3.2773, |
| "step": 45150 |
| }, |
| { |
| "epoch": 13.158697804437715, |
| "grad_norm": 0.3805554211139679, |
| "learning_rate": 0.0004423571220506845, |
| "loss": 3.285, |
| "step": 45200 |
| }, |
| { |
| "epoch": 13.173257236037506, |
| "grad_norm": 0.3527976870536804, |
| "learning_rate": 0.00044218234780075727, |
| "loss": 3.2736, |
| "step": 45250 |
| }, |
| { |
| "epoch": 13.187816667637296, |
| "grad_norm": 0.36648017168045044, |
| "learning_rate": 0.0004420075735508301, |
| "loss": 3.2835, |
| "step": 45300 |
| }, |
| { |
| "epoch": 13.202376099237085, |
| "grad_norm": 0.3752287030220032, |
| "learning_rate": 0.00044183279930090294, |
| "loss": 3.2882, |
| "step": 45350 |
| }, |
| { |
| "epoch": 13.216935530836876, |
| "grad_norm": 0.3686719536781311, |
| "learning_rate": 0.00044165802505097577, |
| "loss": 3.2853, |
| "step": 45400 |
| }, |
| { |
| "epoch": 13.231494962436667, |
| "grad_norm": 0.37559449672698975, |
| "learning_rate": 0.0004414832508010486, |
| "loss": 3.2975, |
| "step": 45450 |
| }, |
| { |
| "epoch": 13.246054394036458, |
| "grad_norm": 0.37365761399269104, |
| "learning_rate": 0.0004413084765511214, |
| "loss": 3.2917, |
| "step": 45500 |
| }, |
| { |
| "epoch": 13.260613825636247, |
| "grad_norm": 0.3821715712547302, |
| "learning_rate": 0.0004411337023011942, |
| "loss": 3.3, |
| "step": 45550 |
| }, |
| { |
| "epoch": 13.275173257236037, |
| "grad_norm": 0.38011810183525085, |
| "learning_rate": 0.00044095892805126705, |
| "loss": 3.2926, |
| "step": 45600 |
| }, |
| { |
| "epoch": 13.289732688835828, |
| "grad_norm": 0.3801783621311188, |
| "learning_rate": 0.0004407841538013399, |
| "loss": 3.2993, |
| "step": 45650 |
| }, |
| { |
| "epoch": 13.304292120435619, |
| "grad_norm": 0.3595413267612457, |
| "learning_rate": 0.0004406093795514127, |
| "loss": 3.3049, |
| "step": 45700 |
| }, |
| { |
| "epoch": 13.318851552035408, |
| "grad_norm": 0.3675829768180847, |
| "learning_rate": 0.0004404346053014855, |
| "loss": 3.296, |
| "step": 45750 |
| }, |
| { |
| "epoch": 13.333410983635199, |
| "grad_norm": 0.37648913264274597, |
| "learning_rate": 0.0004402598310515584, |
| "loss": 3.3018, |
| "step": 45800 |
| }, |
| { |
| "epoch": 13.34797041523499, |
| "grad_norm": 0.3896103799343109, |
| "learning_rate": 0.0004400850568016312, |
| "loss": 3.3021, |
| "step": 45850 |
| }, |
| { |
| "epoch": 13.36252984683478, |
| "grad_norm": 0.39437761902809143, |
| "learning_rate": 0.00043991028255170405, |
| "loss": 3.3033, |
| "step": 45900 |
| }, |
| { |
| "epoch": 13.37708927843457, |
| "grad_norm": 0.3523409962654114, |
| "learning_rate": 0.0004397355083017769, |
| "loss": 3.3159, |
| "step": 45950 |
| }, |
| { |
| "epoch": 13.39164871003436, |
| "grad_norm": 0.3909803628921509, |
| "learning_rate": 0.00043956073405184966, |
| "loss": 3.3183, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.39164871003436, |
| "eval_accuracy": 0.3703245301086839, |
| "eval_loss": 3.55873703956604, |
| "eval_runtime": 180.5898, |
| "eval_samples_per_second": 92.159, |
| "eval_steps_per_second": 5.764, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.406208141634151, |
| "grad_norm": 0.3750564455986023, |
| "learning_rate": 0.0004393859598019225, |
| "loss": 3.3159, |
| "step": 46050 |
| }, |
| { |
| "epoch": 13.420767573233942, |
| "grad_norm": 0.36859869956970215, |
| "learning_rate": 0.0004392111855519953, |
| "loss": 3.3034, |
| "step": 46100 |
| }, |
| { |
| "epoch": 13.43532700483373, |
| "grad_norm": 0.3968806564807892, |
| "learning_rate": 0.00043903641130206816, |
| "loss": 3.3098, |
| "step": 46150 |
| }, |
| { |
| "epoch": 13.449886436433522, |
| "grad_norm": 0.34583473205566406, |
| "learning_rate": 0.000438861637052141, |
| "loss": 3.3178, |
| "step": 46200 |
| }, |
| { |
| "epoch": 13.464445868033312, |
| "grad_norm": 0.37164896726608276, |
| "learning_rate": 0.00043868686280221377, |
| "loss": 3.3134, |
| "step": 46250 |
| }, |
| { |
| "epoch": 13.479005299633103, |
| "grad_norm": 0.367662638425827, |
| "learning_rate": 0.0004385120885522866, |
| "loss": 3.3195, |
| "step": 46300 |
| }, |
| { |
| "epoch": 13.493564731232892, |
| "grad_norm": 0.38434478640556335, |
| "learning_rate": 0.00043833731430235944, |
| "loss": 3.3219, |
| "step": 46350 |
| }, |
| { |
| "epoch": 13.508124162832683, |
| "grad_norm": 0.3725048303604126, |
| "learning_rate": 0.00043816254005243227, |
| "loss": 3.3191, |
| "step": 46400 |
| }, |
| { |
| "epoch": 13.522683594432474, |
| "grad_norm": 0.38499268889427185, |
| "learning_rate": 0.0004379877658025051, |
| "loss": 3.3106, |
| "step": 46450 |
| }, |
| { |
| "epoch": 13.537243026032264, |
| "grad_norm": 0.36700862646102905, |
| "learning_rate": 0.0004378129915525779, |
| "loss": 3.3155, |
| "step": 46500 |
| }, |
| { |
| "epoch": 13.551802457632053, |
| "grad_norm": 0.3642922639846802, |
| "learning_rate": 0.0004376382173026507, |
| "loss": 3.3128, |
| "step": 46550 |
| }, |
| { |
| "epoch": 13.566361889231844, |
| "grad_norm": 0.38024237751960754, |
| "learning_rate": 0.00043746344305272355, |
| "loss": 3.3268, |
| "step": 46600 |
| }, |
| { |
| "epoch": 13.580921320831635, |
| "grad_norm": 0.3899970054626465, |
| "learning_rate": 0.0004372886688027964, |
| "loss": 3.3127, |
| "step": 46650 |
| }, |
| { |
| "epoch": 13.595480752431426, |
| "grad_norm": 0.3449365198612213, |
| "learning_rate": 0.00043711389455286916, |
| "loss": 3.3182, |
| "step": 46700 |
| }, |
| { |
| "epoch": 13.610040184031215, |
| "grad_norm": 0.36648425459861755, |
| "learning_rate": 0.000436939120302942, |
| "loss": 3.3191, |
| "step": 46750 |
| }, |
| { |
| "epoch": 13.624599615631006, |
| "grad_norm": 0.3479945957660675, |
| "learning_rate": 0.0004367643460530148, |
| "loss": 3.3269, |
| "step": 46800 |
| }, |
| { |
| "epoch": 13.639159047230796, |
| "grad_norm": 0.3739745616912842, |
| "learning_rate": 0.00043658957180308766, |
| "loss": 3.3166, |
| "step": 46850 |
| }, |
| { |
| "epoch": 13.653718478830587, |
| "grad_norm": 0.37553584575653076, |
| "learning_rate": 0.0004364147975531605, |
| "loss": 3.3376, |
| "step": 46900 |
| }, |
| { |
| "epoch": 13.668277910430376, |
| "grad_norm": 0.3680751621723175, |
| "learning_rate": 0.00043624002330323327, |
| "loss": 3.3333, |
| "step": 46950 |
| }, |
| { |
| "epoch": 13.682837342030167, |
| "grad_norm": 0.41123461723327637, |
| "learning_rate": 0.0004360652490533061, |
| "loss": 3.3367, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.682837342030167, |
| "eval_accuracy": 0.37088411140688665, |
| "eval_loss": 3.5522854328155518, |
| "eval_runtime": 180.6331, |
| "eval_samples_per_second": 92.137, |
| "eval_steps_per_second": 5.763, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.697396773629958, |
| "grad_norm": 0.37972718477249146, |
| "learning_rate": 0.00043589047480337893, |
| "loss": 3.3264, |
| "step": 47050 |
| }, |
| { |
| "epoch": 13.711956205229749, |
| "grad_norm": 0.3651711642742157, |
| "learning_rate": 0.00043571570055345177, |
| "loss": 3.3072, |
| "step": 47100 |
| }, |
| { |
| "epoch": 13.726515636829538, |
| "grad_norm": 0.38708093762397766, |
| "learning_rate": 0.00043554092630352455, |
| "loss": 3.3205, |
| "step": 47150 |
| }, |
| { |
| "epoch": 13.741075068429328, |
| "grad_norm": 0.39110127091407776, |
| "learning_rate": 0.0004353661520535974, |
| "loss": 3.3218, |
| "step": 47200 |
| }, |
| { |
| "epoch": 13.75563450002912, |
| "grad_norm": 0.37536707520484924, |
| "learning_rate": 0.0004351913778036702, |
| "loss": 3.3211, |
| "step": 47250 |
| }, |
| { |
| "epoch": 13.77019393162891, |
| "grad_norm": 0.359542578458786, |
| "learning_rate": 0.00043501660355374305, |
| "loss": 3.3267, |
| "step": 47300 |
| }, |
| { |
| "epoch": 13.784753363228699, |
| "grad_norm": 0.3808390200138092, |
| "learning_rate": 0.0004348418293038159, |
| "loss": 3.3377, |
| "step": 47350 |
| }, |
| { |
| "epoch": 13.79931279482849, |
| "grad_norm": 0.3661261200904846, |
| "learning_rate": 0.00043466705505388866, |
| "loss": 3.3298, |
| "step": 47400 |
| }, |
| { |
| "epoch": 13.81387222642828, |
| "grad_norm": 0.3553769588470459, |
| "learning_rate": 0.0004344922808039615, |
| "loss": 3.3201, |
| "step": 47450 |
| }, |
| { |
| "epoch": 13.828431658028071, |
| "grad_norm": 0.3497569262981415, |
| "learning_rate": 0.0004343175065540343, |
| "loss": 3.3404, |
| "step": 47500 |
| }, |
| { |
| "epoch": 13.84299108962786, |
| "grad_norm": 0.37346166372299194, |
| "learning_rate": 0.00043414273230410716, |
| "loss": 3.3399, |
| "step": 47550 |
| }, |
| { |
| "epoch": 13.857550521227651, |
| "grad_norm": 0.3590134382247925, |
| "learning_rate": 0.00043396795805418, |
| "loss": 3.3308, |
| "step": 47600 |
| }, |
| { |
| "epoch": 13.872109952827442, |
| "grad_norm": 0.3460633158683777, |
| "learning_rate": 0.00043379318380425277, |
| "loss": 3.3433, |
| "step": 47650 |
| }, |
| { |
| "epoch": 13.886669384427233, |
| "grad_norm": 0.38021934032440186, |
| "learning_rate": 0.0004336184095543256, |
| "loss": 3.3404, |
| "step": 47700 |
| }, |
| { |
| "epoch": 13.901228816027022, |
| "grad_norm": 0.3488508462905884, |
| "learning_rate": 0.00043344363530439843, |
| "loss": 3.3313, |
| "step": 47750 |
| }, |
| { |
| "epoch": 13.915788247626812, |
| "grad_norm": 0.3584929406642914, |
| "learning_rate": 0.00043326886105447127, |
| "loss": 3.3392, |
| "step": 47800 |
| }, |
| { |
| "epoch": 13.930347679226603, |
| "grad_norm": 0.35224011540412903, |
| "learning_rate": 0.00043309408680454405, |
| "loss": 3.3349, |
| "step": 47850 |
| }, |
| { |
| "epoch": 13.944907110826394, |
| "grad_norm": 0.4024561643600464, |
| "learning_rate": 0.0004329193125546169, |
| "loss": 3.3364, |
| "step": 47900 |
| }, |
| { |
| "epoch": 13.959466542426183, |
| "grad_norm": 0.397568941116333, |
| "learning_rate": 0.0004327445383046897, |
| "loss": 3.3406, |
| "step": 47950 |
| }, |
| { |
| "epoch": 13.974025974025974, |
| "grad_norm": 0.3521833121776581, |
| "learning_rate": 0.00043256976405476255, |
| "loss": 3.3425, |
| "step": 48000 |
| }, |
| { |
| "epoch": 13.974025974025974, |
| "eval_accuracy": 0.3711820687528933, |
| "eval_loss": 3.5451555252075195, |
| "eval_runtime": 180.6693, |
| "eval_samples_per_second": 92.119, |
| "eval_steps_per_second": 5.762, |
| "step": 48000 |
| }, |
| { |
| "epoch": 13.988585405625765, |
| "grad_norm": 0.37055131793022156, |
| "learning_rate": 0.0004323949898048354, |
| "loss": 3.3417, |
| "step": 48050 |
| }, |
| { |
| "epoch": 14.002911886319959, |
| "grad_norm": 0.3706069588661194, |
| "learning_rate": 0.00043222021555490816, |
| "loss": 3.3111, |
| "step": 48100 |
| }, |
| { |
| "epoch": 14.017471317919748, |
| "grad_norm": 0.36993443965911865, |
| "learning_rate": 0.000432045441304981, |
| "loss": 3.242, |
| "step": 48150 |
| }, |
| { |
| "epoch": 14.032030749519539, |
| "grad_norm": 0.3860079348087311, |
| "learning_rate": 0.0004318706670550538, |
| "loss": 3.2285, |
| "step": 48200 |
| }, |
| { |
| "epoch": 14.04659018111933, |
| "grad_norm": 0.36225396394729614, |
| "learning_rate": 0.00043169589280512666, |
| "loss": 3.2359, |
| "step": 48250 |
| }, |
| { |
| "epoch": 14.06114961271912, |
| "grad_norm": 0.3634689450263977, |
| "learning_rate": 0.0004315211185551995, |
| "loss": 3.241, |
| "step": 48300 |
| }, |
| { |
| "epoch": 14.07570904431891, |
| "grad_norm": 0.40568917989730835, |
| "learning_rate": 0.0004313463443052724, |
| "loss": 3.2463, |
| "step": 48350 |
| }, |
| { |
| "epoch": 14.0902684759187, |
| "grad_norm": 0.3718388080596924, |
| "learning_rate": 0.00043117157005534515, |
| "loss": 3.2494, |
| "step": 48400 |
| }, |
| { |
| "epoch": 14.10482790751849, |
| "grad_norm": 0.3734684884548187, |
| "learning_rate": 0.000430996795805418, |
| "loss": 3.2557, |
| "step": 48450 |
| }, |
| { |
| "epoch": 14.119387339118282, |
| "grad_norm": 0.35955342650413513, |
| "learning_rate": 0.0004308220215554908, |
| "loss": 3.2482, |
| "step": 48500 |
| }, |
| { |
| "epoch": 14.13394677071807, |
| "grad_norm": 0.36027783155441284, |
| "learning_rate": 0.00043064724730556365, |
| "loss": 3.2626, |
| "step": 48550 |
| }, |
| { |
| "epoch": 14.148506202317861, |
| "grad_norm": 0.3621423542499542, |
| "learning_rate": 0.00043047247305563643, |
| "loss": 3.267, |
| "step": 48600 |
| }, |
| { |
| "epoch": 14.163065633917652, |
| "grad_norm": 0.37086912989616394, |
| "learning_rate": 0.00043029769880570927, |
| "loss": 3.2634, |
| "step": 48650 |
| }, |
| { |
| "epoch": 14.177625065517443, |
| "grad_norm": 0.3684757351875305, |
| "learning_rate": 0.0004301229245557821, |
| "loss": 3.2605, |
| "step": 48700 |
| }, |
| { |
| "epoch": 14.192184497117232, |
| "grad_norm": 0.37380942702293396, |
| "learning_rate": 0.00042994815030585493, |
| "loss": 3.2687, |
| "step": 48750 |
| }, |
| { |
| "epoch": 14.206743928717023, |
| "grad_norm": 0.39025264978408813, |
| "learning_rate": 0.00042977337605592776, |
| "loss": 3.2659, |
| "step": 48800 |
| }, |
| { |
| "epoch": 14.221303360316814, |
| "grad_norm": 0.38189178705215454, |
| "learning_rate": 0.00042959860180600054, |
| "loss": 3.2656, |
| "step": 48850 |
| }, |
| { |
| "epoch": 14.235862791916604, |
| "grad_norm": 0.3802640438079834, |
| "learning_rate": 0.0004294238275560734, |
| "loss": 3.2824, |
| "step": 48900 |
| }, |
| { |
| "epoch": 14.250422223516393, |
| "grad_norm": 0.3753884732723236, |
| "learning_rate": 0.0004292490533061462, |
| "loss": 3.2765, |
| "step": 48950 |
| }, |
| { |
| "epoch": 14.264981655116184, |
| "grad_norm": 0.35979166626930237, |
| "learning_rate": 0.00042907427905621904, |
| "loss": 3.2799, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.264981655116184, |
| "eval_accuracy": 0.37069092122555714, |
| "eval_loss": 3.5586068630218506, |
| "eval_runtime": 180.5727, |
| "eval_samples_per_second": 92.168, |
| "eval_steps_per_second": 5.765, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.279541086715975, |
| "grad_norm": 0.3511539101600647, |
| "learning_rate": 0.0004288995048062919, |
| "loss": 3.2865, |
| "step": 49050 |
| }, |
| { |
| "epoch": 14.294100518315766, |
| "grad_norm": 0.37331297993659973, |
| "learning_rate": 0.00042872473055636465, |
| "loss": 3.2907, |
| "step": 49100 |
| }, |
| { |
| "epoch": 14.308659949915555, |
| "grad_norm": 0.402055948972702, |
| "learning_rate": 0.0004285499563064375, |
| "loss": 3.287, |
| "step": 49150 |
| }, |
| { |
| "epoch": 14.323219381515345, |
| "grad_norm": 0.37558865547180176, |
| "learning_rate": 0.0004283751820565103, |
| "loss": 3.2928, |
| "step": 49200 |
| }, |
| { |
| "epoch": 14.337778813115136, |
| "grad_norm": 0.36927327513694763, |
| "learning_rate": 0.00042820040780658315, |
| "loss": 3.2923, |
| "step": 49250 |
| }, |
| { |
| "epoch": 14.352338244714927, |
| "grad_norm": 0.37961676716804504, |
| "learning_rate": 0.00042802563355665593, |
| "loss": 3.2827, |
| "step": 49300 |
| }, |
| { |
| "epoch": 14.366897676314716, |
| "grad_norm": 0.3793201744556427, |
| "learning_rate": 0.00042785085930672876, |
| "loss": 3.2895, |
| "step": 49350 |
| }, |
| { |
| "epoch": 14.381457107914507, |
| "grad_norm": 0.38360291719436646, |
| "learning_rate": 0.0004276760850568016, |
| "loss": 3.2897, |
| "step": 49400 |
| }, |
| { |
| "epoch": 14.396016539514298, |
| "grad_norm": 0.37893158197402954, |
| "learning_rate": 0.00042750131080687443, |
| "loss": 3.3067, |
| "step": 49450 |
| }, |
| { |
| "epoch": 14.410575971114088, |
| "grad_norm": 0.3840549886226654, |
| "learning_rate": 0.00042732653655694726, |
| "loss": 3.2867, |
| "step": 49500 |
| }, |
| { |
| "epoch": 14.425135402713877, |
| "grad_norm": 0.37985333800315857, |
| "learning_rate": 0.00042715176230702004, |
| "loss": 3.2966, |
| "step": 49550 |
| }, |
| { |
| "epoch": 14.439694834313668, |
| "grad_norm": 0.3771022856235504, |
| "learning_rate": 0.0004269769880570929, |
| "loss": 3.287, |
| "step": 49600 |
| }, |
| { |
| "epoch": 14.454254265913459, |
| "grad_norm": 0.38164374232292175, |
| "learning_rate": 0.0004268022138071657, |
| "loss": 3.3017, |
| "step": 49650 |
| }, |
| { |
| "epoch": 14.46881369751325, |
| "grad_norm": 0.38309139013290405, |
| "learning_rate": 0.00042662743955723854, |
| "loss": 3.298, |
| "step": 49700 |
| }, |
| { |
| "epoch": 14.483373129113039, |
| "grad_norm": 0.3698599338531494, |
| "learning_rate": 0.0004264526653073114, |
| "loss": 3.3014, |
| "step": 49750 |
| }, |
| { |
| "epoch": 14.49793256071283, |
| "grad_norm": 0.3559224605560303, |
| "learning_rate": 0.00042627789105738415, |
| "loss": 3.2961, |
| "step": 49800 |
| }, |
| { |
| "epoch": 14.51249199231262, |
| "grad_norm": 0.38171273469924927, |
| "learning_rate": 0.000426103116807457, |
| "loss": 3.297, |
| "step": 49850 |
| }, |
| { |
| "epoch": 14.527051423912411, |
| "grad_norm": 0.3736954629421234, |
| "learning_rate": 0.0004259283425575298, |
| "loss": 3.3043, |
| "step": 49900 |
| }, |
| { |
| "epoch": 14.5416108555122, |
| "grad_norm": 0.36771056056022644, |
| "learning_rate": 0.00042575356830760265, |
| "loss": 3.3111, |
| "step": 49950 |
| }, |
| { |
| "epoch": 14.556170287111991, |
| "grad_norm": 0.39156320691108704, |
| "learning_rate": 0.00042557879405767543, |
| "loss": 3.3193, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.556170287111991, |
| "eval_accuracy": 0.37085107035944076, |
| "eval_loss": 3.5553669929504395, |
| "eval_runtime": 180.9457, |
| "eval_samples_per_second": 91.978, |
| "eval_steps_per_second": 5.753, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.570729718711782, |
| "grad_norm": 0.39517736434936523, |
| "learning_rate": 0.00042540401980774826, |
| "loss": 3.3082, |
| "step": 50050 |
| }, |
| { |
| "epoch": 14.585289150311572, |
| "grad_norm": 0.35409751534461975, |
| "learning_rate": 0.0004252292455578211, |
| "loss": 3.2987, |
| "step": 50100 |
| }, |
| { |
| "epoch": 14.599848581911361, |
| "grad_norm": 0.40323737263679504, |
| "learning_rate": 0.00042505447130789393, |
| "loss": 3.3129, |
| "step": 50150 |
| }, |
| { |
| "epoch": 14.614408013511152, |
| "grad_norm": 0.3909080922603607, |
| "learning_rate": 0.00042487969705796676, |
| "loss": 3.3088, |
| "step": 50200 |
| }, |
| { |
| "epoch": 14.628967445110943, |
| "grad_norm": 0.36540573835372925, |
| "learning_rate": 0.00042470492280803954, |
| "loss": 3.3142, |
| "step": 50250 |
| }, |
| { |
| "epoch": 14.643526876710734, |
| "grad_norm": 0.3602832555770874, |
| "learning_rate": 0.0004245301485581124, |
| "loss": 3.3221, |
| "step": 50300 |
| }, |
| { |
| "epoch": 14.658086308310523, |
| "grad_norm": 0.3897080421447754, |
| "learning_rate": 0.0004243553743081852, |
| "loss": 3.2985, |
| "step": 50350 |
| }, |
| { |
| "epoch": 14.672645739910314, |
| "grad_norm": 0.36867547035217285, |
| "learning_rate": 0.00042418060005825804, |
| "loss": 3.3072, |
| "step": 50400 |
| }, |
| { |
| "epoch": 14.687205171510104, |
| "grad_norm": 0.3675730228424072, |
| "learning_rate": 0.0004240058258083308, |
| "loss": 3.3038, |
| "step": 50450 |
| }, |
| { |
| "epoch": 14.701764603109895, |
| "grad_norm": 0.37027379870414734, |
| "learning_rate": 0.00042383105155840365, |
| "loss": 3.3026, |
| "step": 50500 |
| }, |
| { |
| "epoch": 14.716324034709684, |
| "grad_norm": 0.359173446893692, |
| "learning_rate": 0.0004236562773084765, |
| "loss": 3.3026, |
| "step": 50550 |
| }, |
| { |
| "epoch": 14.730883466309475, |
| "grad_norm": 0.35587379336357117, |
| "learning_rate": 0.0004234815030585493, |
| "loss": 3.3067, |
| "step": 50600 |
| }, |
| { |
| "epoch": 14.745442897909266, |
| "grad_norm": 0.3435940146446228, |
| "learning_rate": 0.00042330672880862215, |
| "loss": 3.3207, |
| "step": 50650 |
| }, |
| { |
| "epoch": 14.760002329509057, |
| "grad_norm": 0.35753193497657776, |
| "learning_rate": 0.00042313195455869493, |
| "loss": 3.3199, |
| "step": 50700 |
| }, |
| { |
| "epoch": 14.774561761108846, |
| "grad_norm": 0.377048134803772, |
| "learning_rate": 0.00042295718030876776, |
| "loss": 3.3216, |
| "step": 50750 |
| }, |
| { |
| "epoch": 14.789121192708636, |
| "grad_norm": 0.35497966408729553, |
| "learning_rate": 0.0004227824060588406, |
| "loss": 3.3176, |
| "step": 50800 |
| }, |
| { |
| "epoch": 14.803680624308427, |
| "grad_norm": 0.3708688020706177, |
| "learning_rate": 0.0004226076318089135, |
| "loss": 3.3151, |
| "step": 50850 |
| }, |
| { |
| "epoch": 14.818240055908218, |
| "grad_norm": 0.40276867151260376, |
| "learning_rate": 0.0004224328575589863, |
| "loss": 3.3208, |
| "step": 50900 |
| }, |
| { |
| "epoch": 14.832799487508007, |
| "grad_norm": 0.37389078736305237, |
| "learning_rate": 0.00042225808330905915, |
| "loss": 3.3148, |
| "step": 50950 |
| }, |
| { |
| "epoch": 14.847358919107798, |
| "grad_norm": 0.3689541220664978, |
| "learning_rate": 0.00042208330905913193, |
| "loss": 3.3317, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.847358919107798, |
| "eval_accuracy": 0.3714391069369385, |
| "eval_loss": 3.5418295860290527, |
| "eval_runtime": 194.6343, |
| "eval_samples_per_second": 85.509, |
| "eval_steps_per_second": 5.348, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.861918350707588, |
| "grad_norm": 0.3774998188018799, |
| "learning_rate": 0.00042190853480920476, |
| "loss": 3.3296, |
| "step": 51050 |
| }, |
| { |
| "epoch": 14.87647778230738, |
| "grad_norm": 0.37848007678985596, |
| "learning_rate": 0.0004217337605592776, |
| "loss": 3.3077, |
| "step": 51100 |
| }, |
| { |
| "epoch": 14.891037213907168, |
| "grad_norm": 0.37494605779647827, |
| "learning_rate": 0.00042155898630935043, |
| "loss": 3.3271, |
| "step": 51150 |
| }, |
| { |
| "epoch": 14.905596645506959, |
| "grad_norm": 0.3562757074832916, |
| "learning_rate": 0.0004213842120594232, |
| "loss": 3.3193, |
| "step": 51200 |
| }, |
| { |
| "epoch": 14.92015607710675, |
| "grad_norm": 0.3716096878051758, |
| "learning_rate": 0.00042120943780949604, |
| "loss": 3.3134, |
| "step": 51250 |
| }, |
| { |
| "epoch": 14.93471550870654, |
| "grad_norm": 0.36975473165512085, |
| "learning_rate": 0.00042103466355956887, |
| "loss": 3.3207, |
| "step": 51300 |
| }, |
| { |
| "epoch": 14.94927494030633, |
| "grad_norm": 0.37558260560035706, |
| "learning_rate": 0.0004208598893096417, |
| "loss": 3.3307, |
| "step": 51350 |
| }, |
| { |
| "epoch": 14.96383437190612, |
| "grad_norm": 0.36577916145324707, |
| "learning_rate": 0.00042068511505971454, |
| "loss": 3.3233, |
| "step": 51400 |
| }, |
| { |
| "epoch": 14.978393803505911, |
| "grad_norm": 0.3728995621204376, |
| "learning_rate": 0.0004205103408097873, |
| "loss": 3.3257, |
| "step": 51450 |
| }, |
| { |
| "epoch": 14.992953235105702, |
| "grad_norm": 0.41513046622276306, |
| "learning_rate": 0.00042033556655986015, |
| "loss": 3.3362, |
| "step": 51500 |
| }, |
| { |
| "epoch": 15.007279715799895, |
| "grad_norm": 0.38033053278923035, |
| "learning_rate": 0.000420160792309933, |
| "loss": 3.2579, |
| "step": 51550 |
| }, |
| { |
| "epoch": 15.021839147399685, |
| "grad_norm": 0.4050993323326111, |
| "learning_rate": 0.0004199860180600058, |
| "loss": 3.2234, |
| "step": 51600 |
| }, |
| { |
| "epoch": 15.036398578999476, |
| "grad_norm": 0.3728967607021332, |
| "learning_rate": 0.00041981124381007865, |
| "loss": 3.2089, |
| "step": 51650 |
| }, |
| { |
| "epoch": 15.050958010599267, |
| "grad_norm": 0.3688093423843384, |
| "learning_rate": 0.00041963646956015143, |
| "loss": 3.2277, |
| "step": 51700 |
| }, |
| { |
| "epoch": 15.065517442199056, |
| "grad_norm": 0.38589268922805786, |
| "learning_rate": 0.00041946169531022426, |
| "loss": 3.2312, |
| "step": 51750 |
| }, |
| { |
| "epoch": 15.080076873798847, |
| "grad_norm": 0.37138667702674866, |
| "learning_rate": 0.0004192869210602971, |
| "loss": 3.2244, |
| "step": 51800 |
| }, |
| { |
| "epoch": 15.094636305398637, |
| "grad_norm": 0.3603217303752899, |
| "learning_rate": 0.0004191121468103699, |
| "loss": 3.2385, |
| "step": 51850 |
| }, |
| { |
| "epoch": 15.109195736998428, |
| "grad_norm": 0.3568740487098694, |
| "learning_rate": 0.0004189373725604427, |
| "loss": 3.2404, |
| "step": 51900 |
| }, |
| { |
| "epoch": 15.123755168598217, |
| "grad_norm": 0.3630964457988739, |
| "learning_rate": 0.00041876259831051554, |
| "loss": 3.2507, |
| "step": 51950 |
| }, |
| { |
| "epoch": 15.138314600198008, |
| "grad_norm": 0.37862637639045715, |
| "learning_rate": 0.00041858782406058837, |
| "loss": 3.2427, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.138314600198008, |
| "eval_accuracy": 0.37097041791516167, |
| "eval_loss": 3.5578880310058594, |
| "eval_runtime": 180.9393, |
| "eval_samples_per_second": 91.981, |
| "eval_steps_per_second": 5.753, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.152874031797799, |
| "grad_norm": 0.37787380814552307, |
| "learning_rate": 0.0004184130498106612, |
| "loss": 3.2461, |
| "step": 52050 |
| }, |
| { |
| "epoch": 15.16743346339759, |
| "grad_norm": 0.38548505306243896, |
| "learning_rate": 0.00041823827556073404, |
| "loss": 3.2523, |
| "step": 52100 |
| }, |
| { |
| "epoch": 15.181992894997379, |
| "grad_norm": 0.4000053107738495, |
| "learning_rate": 0.0004180635013108068, |
| "loss": 3.2496, |
| "step": 52150 |
| }, |
| { |
| "epoch": 15.19655232659717, |
| "grad_norm": 0.3886640667915344, |
| "learning_rate": 0.00041788872706087965, |
| "loss": 3.2533, |
| "step": 52200 |
| }, |
| { |
| "epoch": 15.21111175819696, |
| "grad_norm": 0.38245636224746704, |
| "learning_rate": 0.0004177139528109525, |
| "loss": 3.2632, |
| "step": 52250 |
| }, |
| { |
| "epoch": 15.225671189796751, |
| "grad_norm": 0.36337175965309143, |
| "learning_rate": 0.0004175391785610253, |
| "loss": 3.2547, |
| "step": 52300 |
| }, |
| { |
| "epoch": 15.24023062139654, |
| "grad_norm": 0.4020264446735382, |
| "learning_rate": 0.00041736440431109815, |
| "loss": 3.2694, |
| "step": 52350 |
| }, |
| { |
| "epoch": 15.25479005299633, |
| "grad_norm": 0.41589444875717163, |
| "learning_rate": 0.0004171896300611709, |
| "loss": 3.2538, |
| "step": 52400 |
| }, |
| { |
| "epoch": 15.269349484596122, |
| "grad_norm": 0.3860560655593872, |
| "learning_rate": 0.00041701485581124376, |
| "loss": 3.2743, |
| "step": 52450 |
| }, |
| { |
| "epoch": 15.283908916195912, |
| "grad_norm": 0.3931313157081604, |
| "learning_rate": 0.0004168400815613166, |
| "loss": 3.2647, |
| "step": 52500 |
| }, |
| { |
| "epoch": 15.298468347795701, |
| "grad_norm": 0.38691258430480957, |
| "learning_rate": 0.0004166653073113894, |
| "loss": 3.2563, |
| "step": 52550 |
| }, |
| { |
| "epoch": 15.313027779395492, |
| "grad_norm": 0.34192565083503723, |
| "learning_rate": 0.0004164905330614622, |
| "loss": 3.2569, |
| "step": 52600 |
| }, |
| { |
| "epoch": 15.327587210995283, |
| "grad_norm": 0.3795337975025177, |
| "learning_rate": 0.00041631575881153504, |
| "loss": 3.267, |
| "step": 52650 |
| }, |
| { |
| "epoch": 15.342146642595074, |
| "grad_norm": 0.3903842270374298, |
| "learning_rate": 0.00041614098456160787, |
| "loss": 3.2731, |
| "step": 52700 |
| }, |
| { |
| "epoch": 15.356706074194863, |
| "grad_norm": 0.36304110288619995, |
| "learning_rate": 0.0004159662103116807, |
| "loss": 3.2816, |
| "step": 52750 |
| }, |
| { |
| "epoch": 15.371265505794653, |
| "grad_norm": 0.3842661678791046, |
| "learning_rate": 0.00041579143606175354, |
| "loss": 3.2652, |
| "step": 52800 |
| }, |
| { |
| "epoch": 15.385824937394444, |
| "grad_norm": 0.37199848890304565, |
| "learning_rate": 0.0004156166618118263, |
| "loss": 3.2733, |
| "step": 52850 |
| }, |
| { |
| "epoch": 15.400384368994235, |
| "grad_norm": 0.38361623883247375, |
| "learning_rate": 0.00041544188756189915, |
| "loss": 3.2763, |
| "step": 52900 |
| }, |
| { |
| "epoch": 15.414943800594024, |
| "grad_norm": 0.41524621844291687, |
| "learning_rate": 0.000415267113311972, |
| "loss": 3.2739, |
| "step": 52950 |
| }, |
| { |
| "epoch": 15.429503232193815, |
| "grad_norm": 0.3668960630893707, |
| "learning_rate": 0.0004150923390620448, |
| "loss": 3.2801, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.429503232193815, |
| "eval_accuracy": 0.371453804911781, |
| "eval_loss": 3.5519752502441406, |
| "eval_runtime": 180.7835, |
| "eval_samples_per_second": 92.06, |
| "eval_steps_per_second": 5.758, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.444062663793606, |
| "grad_norm": 0.36112311482429504, |
| "learning_rate": 0.0004149175648121176, |
| "loss": 3.2864, |
| "step": 53050 |
| }, |
| { |
| "epoch": 15.458622095393396, |
| "grad_norm": 0.40427324175834656, |
| "learning_rate": 0.0004147427905621904, |
| "loss": 3.2863, |
| "step": 53100 |
| }, |
| { |
| "epoch": 15.473181526993185, |
| "grad_norm": 0.3597639203071594, |
| "learning_rate": 0.00041456801631226326, |
| "loss": 3.2863, |
| "step": 53150 |
| }, |
| { |
| "epoch": 15.487740958592976, |
| "grad_norm": 0.3552030920982361, |
| "learning_rate": 0.0004143932420623361, |
| "loss": 3.2777, |
| "step": 53200 |
| }, |
| { |
| "epoch": 15.502300390192767, |
| "grad_norm": 0.3815247714519501, |
| "learning_rate": 0.0004142184678124089, |
| "loss": 3.2863, |
| "step": 53250 |
| }, |
| { |
| "epoch": 15.516859821792558, |
| "grad_norm": 0.3695749342441559, |
| "learning_rate": 0.0004140436935624817, |
| "loss": 3.2861, |
| "step": 53300 |
| }, |
| { |
| "epoch": 15.531419253392347, |
| "grad_norm": 0.39430856704711914, |
| "learning_rate": 0.00041386891931255454, |
| "loss": 3.2831, |
| "step": 53350 |
| }, |
| { |
| "epoch": 15.545978684992138, |
| "grad_norm": 0.38121476769447327, |
| "learning_rate": 0.0004136941450626274, |
| "loss": 3.2911, |
| "step": 53400 |
| }, |
| { |
| "epoch": 15.560538116591928, |
| "grad_norm": 0.4004911482334137, |
| "learning_rate": 0.00041351937081270026, |
| "loss": 3.2937, |
| "step": 53450 |
| }, |
| { |
| "epoch": 15.575097548191719, |
| "grad_norm": 0.3881937563419342, |
| "learning_rate": 0.0004133445965627731, |
| "loss": 3.301, |
| "step": 53500 |
| }, |
| { |
| "epoch": 15.58965697979151, |
| "grad_norm": 0.35038501024246216, |
| "learning_rate": 0.0004131698223128459, |
| "loss": 3.2991, |
| "step": 53550 |
| }, |
| { |
| "epoch": 15.604216411391299, |
| "grad_norm": 0.3707396686077118, |
| "learning_rate": 0.0004129950480629187, |
| "loss": 3.2934, |
| "step": 53600 |
| }, |
| { |
| "epoch": 15.61877584299109, |
| "grad_norm": 0.40238645672798157, |
| "learning_rate": 0.00041282027381299153, |
| "loss": 3.3011, |
| "step": 53650 |
| }, |
| { |
| "epoch": 15.63333527459088, |
| "grad_norm": 0.4219394624233246, |
| "learning_rate": 0.00041264549956306437, |
| "loss": 3.296, |
| "step": 53700 |
| }, |
| { |
| "epoch": 15.64789470619067, |
| "grad_norm": 0.4156340956687927, |
| "learning_rate": 0.0004124707253131372, |
| "loss": 3.3031, |
| "step": 53750 |
| }, |
| { |
| "epoch": 15.66245413779046, |
| "grad_norm": 0.3960428237915039, |
| "learning_rate": 0.00041229595106321, |
| "loss": 3.298, |
| "step": 53800 |
| }, |
| { |
| "epoch": 15.677013569390251, |
| "grad_norm": 0.3877508044242859, |
| "learning_rate": 0.0004121211768132828, |
| "loss": 3.295, |
| "step": 53850 |
| }, |
| { |
| "epoch": 15.691573000990042, |
| "grad_norm": 0.3831869661808014, |
| "learning_rate": 0.00041194640256335565, |
| "loss": 3.3089, |
| "step": 53900 |
| }, |
| { |
| "epoch": 15.706132432589833, |
| "grad_norm": 0.37973588705062866, |
| "learning_rate": 0.0004117716283134285, |
| "loss": 3.2996, |
| "step": 53950 |
| }, |
| { |
| "epoch": 15.720691864189622, |
| "grad_norm": 0.36781612038612366, |
| "learning_rate": 0.0004115968540635013, |
| "loss": 3.3063, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.720691864189622, |
| "eval_accuracy": 0.371678389967374, |
| "eval_loss": 3.5436747074127197, |
| "eval_runtime": 194.0997, |
| "eval_samples_per_second": 85.745, |
| "eval_steps_per_second": 5.363, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.735251295789412, |
| "grad_norm": 0.38517138361930847, |
| "learning_rate": 0.0004114220798135741, |
| "loss": 3.2983, |
| "step": 54050 |
| }, |
| { |
| "epoch": 15.749810727389203, |
| "grad_norm": 0.38750118017196655, |
| "learning_rate": 0.0004112473055636469, |
| "loss": 3.287, |
| "step": 54100 |
| }, |
| { |
| "epoch": 15.764370158988992, |
| "grad_norm": 0.36446696519851685, |
| "learning_rate": 0.00041107253131371976, |
| "loss": 3.3055, |
| "step": 54150 |
| }, |
| { |
| "epoch": 15.778929590588783, |
| "grad_norm": 0.36559680104255676, |
| "learning_rate": 0.0004108977570637926, |
| "loss": 3.3041, |
| "step": 54200 |
| }, |
| { |
| "epoch": 15.793489022188574, |
| "grad_norm": 0.3546500504016876, |
| "learning_rate": 0.0004107229828138654, |
| "loss": 3.3086, |
| "step": 54250 |
| }, |
| { |
| "epoch": 15.808048453788365, |
| "grad_norm": 0.3691169023513794, |
| "learning_rate": 0.0004105482085639382, |
| "loss": 3.3098, |
| "step": 54300 |
| }, |
| { |
| "epoch": 15.822607885388155, |
| "grad_norm": 0.37200814485549927, |
| "learning_rate": 0.00041037343431401103, |
| "loss": 3.3147, |
| "step": 54350 |
| }, |
| { |
| "epoch": 15.837167316987944, |
| "grad_norm": 0.3752872943878174, |
| "learning_rate": 0.00041019866006408387, |
| "loss": 3.3032, |
| "step": 54400 |
| }, |
| { |
| "epoch": 15.851726748587735, |
| "grad_norm": 0.3832313120365143, |
| "learning_rate": 0.0004100238858141567, |
| "loss": 3.3188, |
| "step": 54450 |
| }, |
| { |
| "epoch": 15.866286180187526, |
| "grad_norm": 0.3642217218875885, |
| "learning_rate": 0.0004098491115642295, |
| "loss": 3.3026, |
| "step": 54500 |
| }, |
| { |
| "epoch": 15.880845611787315, |
| "grad_norm": 0.3717597723007202, |
| "learning_rate": 0.0004096743373143023, |
| "loss": 3.3106, |
| "step": 54550 |
| }, |
| { |
| "epoch": 15.895405043387106, |
| "grad_norm": 0.39715245366096497, |
| "learning_rate": 0.00040949956306437514, |
| "loss": 3.3218, |
| "step": 54600 |
| }, |
| { |
| "epoch": 15.909964474986896, |
| "grad_norm": 0.37992674112319946, |
| "learning_rate": 0.000409324788814448, |
| "loss": 3.3185, |
| "step": 54650 |
| }, |
| { |
| "epoch": 15.924523906586687, |
| "grad_norm": 0.38378632068634033, |
| "learning_rate": 0.0004091500145645208, |
| "loss": 3.3142, |
| "step": 54700 |
| }, |
| { |
| "epoch": 15.939083338186478, |
| "grad_norm": 0.3747106194496155, |
| "learning_rate": 0.0004089752403145936, |
| "loss": 3.3099, |
| "step": 54750 |
| }, |
| { |
| "epoch": 15.953642769786267, |
| "grad_norm": 0.3706667721271515, |
| "learning_rate": 0.0004088004660646664, |
| "loss": 3.3069, |
| "step": 54800 |
| }, |
| { |
| "epoch": 15.968202201386058, |
| "grad_norm": 0.4007405936717987, |
| "learning_rate": 0.00040862569181473926, |
| "loss": 3.3156, |
| "step": 54850 |
| }, |
| { |
| "epoch": 15.982761632985849, |
| "grad_norm": 0.3907228410243988, |
| "learning_rate": 0.0004084509175648121, |
| "loss": 3.3087, |
| "step": 54900 |
| }, |
| { |
| "epoch": 15.99732106458564, |
| "grad_norm": 0.37928932905197144, |
| "learning_rate": 0.0004082761433148849, |
| "loss": 3.3079, |
| "step": 54950 |
| }, |
| { |
| "epoch": 16.011647545279832, |
| "grad_norm": 0.36409544944763184, |
| "learning_rate": 0.0004081013690649577, |
| "loss": 3.2192, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.011647545279832, |
| "eval_accuracy": 0.37157150629431956, |
| "eval_loss": 3.5547502040863037, |
| "eval_runtime": 218.6088, |
| "eval_samples_per_second": 76.131, |
| "eval_steps_per_second": 4.762, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.02620697687962, |
| "grad_norm": 0.3841230273246765, |
| "learning_rate": 0.00040792659481503053, |
| "loss": 3.2109, |
| "step": 55050 |
| }, |
| { |
| "epoch": 16.040766408479413, |
| "grad_norm": 0.38151082396507263, |
| "learning_rate": 0.00040775182056510337, |
| "loss": 3.2051, |
| "step": 55100 |
| }, |
| { |
| "epoch": 16.055325840079202, |
| "grad_norm": 0.39493605494499207, |
| "learning_rate": 0.0004075770463151762, |
| "loss": 3.2216, |
| "step": 55150 |
| }, |
| { |
| "epoch": 16.069885271678995, |
| "grad_norm": 0.3837626278400421, |
| "learning_rate": 0.000407402272065249, |
| "loss": 3.2114, |
| "step": 55200 |
| }, |
| { |
| "epoch": 16.084444703278784, |
| "grad_norm": 0.42783597111701965, |
| "learning_rate": 0.0004072274978153218, |
| "loss": 3.2084, |
| "step": 55250 |
| }, |
| { |
| "epoch": 16.099004134878573, |
| "grad_norm": 0.41208311915397644, |
| "learning_rate": 0.00040705272356539464, |
| "loss": 3.2166, |
| "step": 55300 |
| }, |
| { |
| "epoch": 16.113563566478366, |
| "grad_norm": 0.4104818105697632, |
| "learning_rate": 0.0004068779493154675, |
| "loss": 3.2429, |
| "step": 55350 |
| }, |
| { |
| "epoch": 16.128122998078155, |
| "grad_norm": 0.3684764802455902, |
| "learning_rate": 0.0004067031750655403, |
| "loss": 3.2416, |
| "step": 55400 |
| }, |
| { |
| "epoch": 16.142682429677944, |
| "grad_norm": 0.38913047313690186, |
| "learning_rate": 0.0004065284008156131, |
| "loss": 3.2386, |
| "step": 55450 |
| }, |
| { |
| "epoch": 16.157241861277736, |
| "grad_norm": 0.3729836344718933, |
| "learning_rate": 0.0004063536265656859, |
| "loss": 3.2345, |
| "step": 55500 |
| }, |
| { |
| "epoch": 16.171801292877525, |
| "grad_norm": 0.3831164240837097, |
| "learning_rate": 0.00040617885231575876, |
| "loss": 3.2205, |
| "step": 55550 |
| }, |
| { |
| "epoch": 16.186360724477318, |
| "grad_norm": 0.370057612657547, |
| "learning_rate": 0.0004060040780658316, |
| "loss": 3.2449, |
| "step": 55600 |
| }, |
| { |
| "epoch": 16.200920156077107, |
| "grad_norm": 0.4026546776294708, |
| "learning_rate": 0.0004058293038159044, |
| "loss": 3.2473, |
| "step": 55650 |
| }, |
| { |
| "epoch": 16.215479587676896, |
| "grad_norm": 0.3730154037475586, |
| "learning_rate": 0.0004056545295659772, |
| "loss": 3.2499, |
| "step": 55700 |
| }, |
| { |
| "epoch": 16.23003901927669, |
| "grad_norm": 0.40726903080940247, |
| "learning_rate": 0.00040547975531605003, |
| "loss": 3.2402, |
| "step": 55750 |
| }, |
| { |
| "epoch": 16.244598450876477, |
| "grad_norm": 0.36538970470428467, |
| "learning_rate": 0.00040530498106612287, |
| "loss": 3.2381, |
| "step": 55800 |
| }, |
| { |
| "epoch": 16.259157882476266, |
| "grad_norm": 0.4038563668727875, |
| "learning_rate": 0.0004051302068161957, |
| "loss": 3.2464, |
| "step": 55850 |
| }, |
| { |
| "epoch": 16.27371731407606, |
| "grad_norm": 0.3808690905570984, |
| "learning_rate": 0.0004049554325662686, |
| "loss": 3.2503, |
| "step": 55900 |
| }, |
| { |
| "epoch": 16.288276745675848, |
| "grad_norm": 0.39795416593551636, |
| "learning_rate": 0.00040478065831634136, |
| "loss": 3.2531, |
| "step": 55950 |
| }, |
| { |
| "epoch": 16.30283617727564, |
| "grad_norm": 0.3833180367946625, |
| "learning_rate": 0.0004046058840664142, |
| "loss": 3.2589, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.30283617727564, |
| "eval_accuracy": 0.3717569359449322, |
| "eval_loss": 3.5537829399108887, |
| "eval_runtime": 180.812, |
| "eval_samples_per_second": 92.046, |
| "eval_steps_per_second": 5.757, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.31739560887543, |
| "grad_norm": 0.38893625140190125, |
| "learning_rate": 0.00040443110981648703, |
| "loss": 3.2673, |
| "step": 56050 |
| }, |
| { |
| "epoch": 16.33195504047522, |
| "grad_norm": 0.3620428144931793, |
| "learning_rate": 0.00040425633556655986, |
| "loss": 3.2631, |
| "step": 56100 |
| }, |
| { |
| "epoch": 16.34651447207501, |
| "grad_norm": 0.3953818082809448, |
| "learning_rate": 0.0004040815613166327, |
| "loss": 3.253, |
| "step": 56150 |
| }, |
| { |
| "epoch": 16.3610739036748, |
| "grad_norm": 0.42537441849708557, |
| "learning_rate": 0.0004039067870667055, |
| "loss": 3.2594, |
| "step": 56200 |
| }, |
| { |
| "epoch": 16.375633335274593, |
| "grad_norm": 0.39338961243629456, |
| "learning_rate": 0.0004037320128167783, |
| "loss": 3.266, |
| "step": 56250 |
| }, |
| { |
| "epoch": 16.39019276687438, |
| "grad_norm": 0.4023808538913727, |
| "learning_rate": 0.00040355723856685114, |
| "loss": 3.2705, |
| "step": 56300 |
| }, |
| { |
| "epoch": 16.40475219847417, |
| "grad_norm": 0.4043920934200287, |
| "learning_rate": 0.000403382464316924, |
| "loss": 3.2644, |
| "step": 56350 |
| }, |
| { |
| "epoch": 16.419311630073963, |
| "grad_norm": 0.3746441602706909, |
| "learning_rate": 0.00040320769006699675, |
| "loss": 3.2603, |
| "step": 56400 |
| }, |
| { |
| "epoch": 16.433871061673752, |
| "grad_norm": 0.4176045060157776, |
| "learning_rate": 0.0004030329158170696, |
| "loss": 3.2748, |
| "step": 56450 |
| }, |
| { |
| "epoch": 16.44843049327354, |
| "grad_norm": 0.3769119381904602, |
| "learning_rate": 0.0004028581415671424, |
| "loss": 3.2631, |
| "step": 56500 |
| }, |
| { |
| "epoch": 16.462989924873334, |
| "grad_norm": 0.4049900770187378, |
| "learning_rate": 0.00040268336731721525, |
| "loss": 3.2749, |
| "step": 56550 |
| }, |
| { |
| "epoch": 16.477549356473123, |
| "grad_norm": 0.3694433569908142, |
| "learning_rate": 0.0004025085930672881, |
| "loss": 3.2737, |
| "step": 56600 |
| }, |
| { |
| "epoch": 16.492108788072915, |
| "grad_norm": 0.38329896330833435, |
| "learning_rate": 0.00040233381881736086, |
| "loss": 3.2764, |
| "step": 56650 |
| }, |
| { |
| "epoch": 16.506668219672704, |
| "grad_norm": 0.3678717613220215, |
| "learning_rate": 0.0004021590445674337, |
| "loss": 3.2785, |
| "step": 56700 |
| }, |
| { |
| "epoch": 16.521227651272493, |
| "grad_norm": 0.42633742094039917, |
| "learning_rate": 0.00040198427031750653, |
| "loss": 3.2742, |
| "step": 56750 |
| }, |
| { |
| "epoch": 16.535787082872286, |
| "grad_norm": 0.38539767265319824, |
| "learning_rate": 0.00040180949606757936, |
| "loss": 3.2687, |
| "step": 56800 |
| }, |
| { |
| "epoch": 16.550346514472075, |
| "grad_norm": 0.39155086874961853, |
| "learning_rate": 0.0004016347218176522, |
| "loss": 3.2782, |
| "step": 56850 |
| }, |
| { |
| "epoch": 16.564905946071864, |
| "grad_norm": 0.4056413471698761, |
| "learning_rate": 0.000401459947567725, |
| "loss": 3.2825, |
| "step": 56900 |
| }, |
| { |
| "epoch": 16.579465377671657, |
| "grad_norm": 0.3961426019668579, |
| "learning_rate": 0.0004012851733177978, |
| "loss": 3.2809, |
| "step": 56950 |
| }, |
| { |
| "epoch": 16.594024809271446, |
| "grad_norm": 0.37948471307754517, |
| "learning_rate": 0.00040111039906787064, |
| "loss": 3.2819, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.594024809271446, |
| "eval_accuracy": 0.3720454865870397, |
| "eval_loss": 3.5462021827697754, |
| "eval_runtime": 180.5198, |
| "eval_samples_per_second": 92.195, |
| "eval_steps_per_second": 5.767, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.608584240871238, |
| "grad_norm": 0.3997423052787781, |
| "learning_rate": 0.0004009356248179435, |
| "loss": 3.2757, |
| "step": 57050 |
| }, |
| { |
| "epoch": 16.623143672471027, |
| "grad_norm": 0.38000285625457764, |
| "learning_rate": 0.00040076085056801625, |
| "loss": 3.2857, |
| "step": 57100 |
| }, |
| { |
| "epoch": 16.637703104070816, |
| "grad_norm": 0.36798590421676636, |
| "learning_rate": 0.0004005860763180891, |
| "loss": 3.2861, |
| "step": 57150 |
| }, |
| { |
| "epoch": 16.65226253567061, |
| "grad_norm": 0.38152462244033813, |
| "learning_rate": 0.0004004113020681619, |
| "loss": 3.2859, |
| "step": 57200 |
| }, |
| { |
| "epoch": 16.666821967270398, |
| "grad_norm": 0.4101053774356842, |
| "learning_rate": 0.00040023652781823475, |
| "loss": 3.2847, |
| "step": 57250 |
| }, |
| { |
| "epoch": 16.681381398870187, |
| "grad_norm": 0.37895187735557556, |
| "learning_rate": 0.0004000617535683076, |
| "loss": 3.286, |
| "step": 57300 |
| }, |
| { |
| "epoch": 16.69594083046998, |
| "grad_norm": 0.4000397324562073, |
| "learning_rate": 0.00039988697931838036, |
| "loss": 3.2801, |
| "step": 57350 |
| }, |
| { |
| "epoch": 16.71050026206977, |
| "grad_norm": 0.3596523404121399, |
| "learning_rate": 0.0003997122050684532, |
| "loss": 3.2806, |
| "step": 57400 |
| }, |
| { |
| "epoch": 16.72505969366956, |
| "grad_norm": 0.40595123171806335, |
| "learning_rate": 0.00039953743081852603, |
| "loss": 3.2842, |
| "step": 57450 |
| }, |
| { |
| "epoch": 16.73961912526935, |
| "grad_norm": 0.3860524594783783, |
| "learning_rate": 0.00039936265656859886, |
| "loss": 3.284, |
| "step": 57500 |
| }, |
| { |
| "epoch": 16.75417855686914, |
| "grad_norm": 0.3636539876461029, |
| "learning_rate": 0.0003991878823186717, |
| "loss": 3.2895, |
| "step": 57550 |
| }, |
| { |
| "epoch": 16.76873798846893, |
| "grad_norm": 0.3696141242980957, |
| "learning_rate": 0.0003990131080687445, |
| "loss": 3.2896, |
| "step": 57600 |
| }, |
| { |
| "epoch": 16.78329742006872, |
| "grad_norm": 0.3895752429962158, |
| "learning_rate": 0.0003988383338188173, |
| "loss": 3.287, |
| "step": 57650 |
| }, |
| { |
| "epoch": 16.79785685166851, |
| "grad_norm": 0.393362820148468, |
| "learning_rate": 0.00039866355956889014, |
| "loss": 3.3037, |
| "step": 57700 |
| }, |
| { |
| "epoch": 16.812416283268302, |
| "grad_norm": 0.379912406206131, |
| "learning_rate": 0.000398488785318963, |
| "loss": 3.2958, |
| "step": 57750 |
| }, |
| { |
| "epoch": 16.82697571486809, |
| "grad_norm": 0.37024620175361633, |
| "learning_rate": 0.00039831401106903575, |
| "loss": 3.2925, |
| "step": 57800 |
| }, |
| { |
| "epoch": 16.841535146467884, |
| "grad_norm": 0.38401541113853455, |
| "learning_rate": 0.0003981392368191086, |
| "loss": 3.296, |
| "step": 57850 |
| }, |
| { |
| "epoch": 16.856094578067673, |
| "grad_norm": 0.3762282729148865, |
| "learning_rate": 0.0003979644625691814, |
| "loss": 3.2997, |
| "step": 57900 |
| }, |
| { |
| "epoch": 16.87065400966746, |
| "grad_norm": 0.38887494802474976, |
| "learning_rate": 0.00039778968831925425, |
| "loss": 3.295, |
| "step": 57950 |
| }, |
| { |
| "epoch": 16.885213441267254, |
| "grad_norm": 0.3960239589214325, |
| "learning_rate": 0.0003976149140693271, |
| "loss": 3.2991, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.885213441267254, |
| "eval_accuracy": 0.37233003937999004, |
| "eval_loss": 3.538398265838623, |
| "eval_runtime": 180.9439, |
| "eval_samples_per_second": 91.979, |
| "eval_steps_per_second": 5.753, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.899772872867043, |
| "grad_norm": 0.35838446021080017, |
| "learning_rate": 0.00039744013981939986, |
| "loss": 3.2993, |
| "step": 58050 |
| }, |
| { |
| "epoch": 16.914332304466832, |
| "grad_norm": 0.3769555985927582, |
| "learning_rate": 0.0003972653655694727, |
| "loss": 3.2954, |
| "step": 58100 |
| }, |
| { |
| "epoch": 16.928891736066625, |
| "grad_norm": 0.3769146800041199, |
| "learning_rate": 0.00039709059131954553, |
| "loss": 3.3055, |
| "step": 58150 |
| }, |
| { |
| "epoch": 16.943451167666414, |
| "grad_norm": 0.37942448258399963, |
| "learning_rate": 0.00039691581706961836, |
| "loss": 3.3018, |
| "step": 58200 |
| }, |
| { |
| "epoch": 16.958010599266206, |
| "grad_norm": 0.3871458172798157, |
| "learning_rate": 0.0003967410428196912, |
| "loss": 3.3066, |
| "step": 58250 |
| }, |
| { |
| "epoch": 16.972570030865995, |
| "grad_norm": 0.37447428703308105, |
| "learning_rate": 0.000396566268569764, |
| "loss": 3.2896, |
| "step": 58300 |
| }, |
| { |
| "epoch": 16.987129462465784, |
| "grad_norm": 0.39451682567596436, |
| "learning_rate": 0.0003963914943198368, |
| "loss": 3.294, |
| "step": 58350 |
| }, |
| { |
| "epoch": 17.00145594315998, |
| "grad_norm": 0.37564557790756226, |
| "learning_rate": 0.00039621672006990964, |
| "loss": 3.2853, |
| "step": 58400 |
| }, |
| { |
| "epoch": 17.01601537475977, |
| "grad_norm": 0.3866485357284546, |
| "learning_rate": 0.0003960419458199825, |
| "loss": 3.1838, |
| "step": 58450 |
| }, |
| { |
| "epoch": 17.03057480635956, |
| "grad_norm": 0.39509961009025574, |
| "learning_rate": 0.00039586717157005536, |
| "loss": 3.1882, |
| "step": 58500 |
| }, |
| { |
| "epoch": 17.04513423795935, |
| "grad_norm": 0.40553566813468933, |
| "learning_rate": 0.00039569239732012814, |
| "loss": 3.2081, |
| "step": 58550 |
| }, |
| { |
| "epoch": 17.05969366955914, |
| "grad_norm": 0.3974907100200653, |
| "learning_rate": 0.00039551762307020097, |
| "loss": 3.201, |
| "step": 58600 |
| }, |
| { |
| "epoch": 17.07425310115893, |
| "grad_norm": 0.4323204457759857, |
| "learning_rate": 0.0003953428488202738, |
| "loss": 3.2201, |
| "step": 58650 |
| }, |
| { |
| "epoch": 17.08881253275872, |
| "grad_norm": 0.39381760358810425, |
| "learning_rate": 0.00039516807457034664, |
| "loss": 3.2085, |
| "step": 58700 |
| }, |
| { |
| "epoch": 17.103371964358512, |
| "grad_norm": 0.3874780237674713, |
| "learning_rate": 0.00039499330032041947, |
| "loss": 3.2036, |
| "step": 58750 |
| }, |
| { |
| "epoch": 17.1179313959583, |
| "grad_norm": 0.3998047411441803, |
| "learning_rate": 0.00039481852607049225, |
| "loss": 3.2124, |
| "step": 58800 |
| }, |
| { |
| "epoch": 17.132490827558094, |
| "grad_norm": 0.37455350160598755, |
| "learning_rate": 0.0003946437518205651, |
| "loss": 3.2184, |
| "step": 58850 |
| }, |
| { |
| "epoch": 17.147050259157883, |
| "grad_norm": 0.4187104403972626, |
| "learning_rate": 0.0003944689775706379, |
| "loss": 3.2301, |
| "step": 58900 |
| }, |
| { |
| "epoch": 17.161609690757672, |
| "grad_norm": 0.3866749107837677, |
| "learning_rate": 0.00039429420332071075, |
| "loss": 3.2204, |
| "step": 58950 |
| }, |
| { |
| "epoch": 17.176169122357464, |
| "grad_norm": 0.39651066064834595, |
| "learning_rate": 0.0003941194290707836, |
| "loss": 3.2153, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.176169122357464, |
| "eval_accuracy": 0.3716996726349459, |
| "eval_loss": 3.555577278137207, |
| "eval_runtime": 181.0557, |
| "eval_samples_per_second": 91.922, |
| "eval_steps_per_second": 5.75, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.190728553957253, |
| "grad_norm": 0.41101446747779846, |
| "learning_rate": 0.00039394465482085636, |
| "loss": 3.2375, |
| "step": 59050 |
| }, |
| { |
| "epoch": 17.205287985557042, |
| "grad_norm": 0.400997132062912, |
| "learning_rate": 0.0003937698805709292, |
| "loss": 3.2352, |
| "step": 59100 |
| }, |
| { |
| "epoch": 17.219847417156835, |
| "grad_norm": 0.4068007171154022, |
| "learning_rate": 0.000393595106321002, |
| "loss": 3.2375, |
| "step": 59150 |
| }, |
| { |
| "epoch": 17.234406848756624, |
| "grad_norm": 0.38392940163612366, |
| "learning_rate": 0.00039342033207107486, |
| "loss": 3.2337, |
| "step": 59200 |
| }, |
| { |
| "epoch": 17.248966280356417, |
| "grad_norm": 0.38920333981513977, |
| "learning_rate": 0.00039324555782114764, |
| "loss": 3.2361, |
| "step": 59250 |
| }, |
| { |
| "epoch": 17.263525711956206, |
| "grad_norm": 0.408083438873291, |
| "learning_rate": 0.00039307078357122047, |
| "loss": 3.2247, |
| "step": 59300 |
| }, |
| { |
| "epoch": 17.278085143555995, |
| "grad_norm": 0.43184590339660645, |
| "learning_rate": 0.0003928960093212933, |
| "loss": 3.245, |
| "step": 59350 |
| }, |
| { |
| "epoch": 17.292644575155787, |
| "grad_norm": 0.39720863103866577, |
| "learning_rate": 0.00039272123507136614, |
| "loss": 3.2417, |
| "step": 59400 |
| }, |
| { |
| "epoch": 17.307204006755576, |
| "grad_norm": 0.38785409927368164, |
| "learning_rate": 0.00039254646082143897, |
| "loss": 3.2473, |
| "step": 59450 |
| }, |
| { |
| "epoch": 17.321763438355365, |
| "grad_norm": 0.37579146027565, |
| "learning_rate": 0.00039237168657151175, |
| "loss": 3.2439, |
| "step": 59500 |
| }, |
| { |
| "epoch": 17.336322869955158, |
| "grad_norm": 0.4311056435108185, |
| "learning_rate": 0.0003921969123215846, |
| "loss": 3.2639, |
| "step": 59550 |
| }, |
| { |
| "epoch": 17.350882301554947, |
| "grad_norm": 0.40039804577827454, |
| "learning_rate": 0.0003920221380716574, |
| "loss": 3.2553, |
| "step": 59600 |
| }, |
| { |
| "epoch": 17.36544173315474, |
| "grad_norm": 0.37928736209869385, |
| "learning_rate": 0.00039184736382173025, |
| "loss": 3.2459, |
| "step": 59650 |
| }, |
| { |
| "epoch": 17.38000116475453, |
| "grad_norm": 0.3739273250102997, |
| "learning_rate": 0.000391672589571803, |
| "loss": 3.2557, |
| "step": 59700 |
| }, |
| { |
| "epoch": 17.394560596354317, |
| "grad_norm": 0.453727662563324, |
| "learning_rate": 0.00039149781532187586, |
| "loss": 3.2579, |
| "step": 59750 |
| }, |
| { |
| "epoch": 17.40912002795411, |
| "grad_norm": 0.37308287620544434, |
| "learning_rate": 0.0003913230410719487, |
| "loss": 3.249, |
| "step": 59800 |
| }, |
| { |
| "epoch": 17.4236794595539, |
| "grad_norm": 0.40976881980895996, |
| "learning_rate": 0.0003911482668220215, |
| "loss": 3.2503, |
| "step": 59850 |
| }, |
| { |
| "epoch": 17.438238891153688, |
| "grad_norm": 0.4195505380630493, |
| "learning_rate": 0.00039097349257209436, |
| "loss": 3.245, |
| "step": 59900 |
| }, |
| { |
| "epoch": 17.45279832275348, |
| "grad_norm": 0.4036107361316681, |
| "learning_rate": 0.00039079871832216714, |
| "loss": 3.2565, |
| "step": 59950 |
| }, |
| { |
| "epoch": 17.46735775435327, |
| "grad_norm": 0.374733030796051, |
| "learning_rate": 0.00039062394407223997, |
| "loss": 3.2642, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.46735775435327, |
| "eval_accuracy": 0.3723462659442161, |
| "eval_loss": 3.5491600036621094, |
| "eval_runtime": 180.875, |
| "eval_samples_per_second": 92.014, |
| "eval_steps_per_second": 5.755, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.481917185953062, |
| "grad_norm": 0.38882243633270264, |
| "learning_rate": 0.0003904491698223128, |
| "loss": 3.2628, |
| "step": 60050 |
| }, |
| { |
| "epoch": 17.49647661755285, |
| "grad_norm": 0.37586531043052673, |
| "learning_rate": 0.00039027439557238564, |
| "loss": 3.2443, |
| "step": 60100 |
| }, |
| { |
| "epoch": 17.51103604915264, |
| "grad_norm": 0.379884272813797, |
| "learning_rate": 0.00039009962132245847, |
| "loss": 3.2743, |
| "step": 60150 |
| }, |
| { |
| "epoch": 17.525595480752433, |
| "grad_norm": 0.411455363035202, |
| "learning_rate": 0.00038992484707253125, |
| "loss": 3.265, |
| "step": 60200 |
| }, |
| { |
| "epoch": 17.54015491235222, |
| "grad_norm": 0.4151667058467865, |
| "learning_rate": 0.0003897500728226041, |
| "loss": 3.2711, |
| "step": 60250 |
| }, |
| { |
| "epoch": 17.55471434395201, |
| "grad_norm": 0.37580248713493347, |
| "learning_rate": 0.0003895752985726769, |
| "loss": 3.2486, |
| "step": 60300 |
| }, |
| { |
| "epoch": 17.569273775551803, |
| "grad_norm": 0.41014158725738525, |
| "learning_rate": 0.00038940052432274975, |
| "loss": 3.2664, |
| "step": 60350 |
| }, |
| { |
| "epoch": 17.583833207151592, |
| "grad_norm": 0.3746136724948883, |
| "learning_rate": 0.0003892257500728225, |
| "loss": 3.2613, |
| "step": 60400 |
| }, |
| { |
| "epoch": 17.598392638751385, |
| "grad_norm": 0.41069844365119934, |
| "learning_rate": 0.00038905097582289536, |
| "loss": 3.2718, |
| "step": 60450 |
| }, |
| { |
| "epoch": 17.612952070351174, |
| "grad_norm": 0.39283448457717896, |
| "learning_rate": 0.0003888762015729682, |
| "loss": 3.2655, |
| "step": 60500 |
| }, |
| { |
| "epoch": 17.627511501950963, |
| "grad_norm": 0.40771397948265076, |
| "learning_rate": 0.000388701427323041, |
| "loss": 3.2694, |
| "step": 60550 |
| }, |
| { |
| "epoch": 17.642070933550755, |
| "grad_norm": 0.42583101987838745, |
| "learning_rate": 0.00038852665307311386, |
| "loss": 3.2694, |
| "step": 60600 |
| }, |
| { |
| "epoch": 17.656630365150544, |
| "grad_norm": 0.38941916823387146, |
| "learning_rate": 0.00038835187882318664, |
| "loss": 3.2727, |
| "step": 60650 |
| }, |
| { |
| "epoch": 17.671189796750333, |
| "grad_norm": 0.4130733907222748, |
| "learning_rate": 0.00038817710457325947, |
| "loss": 3.271, |
| "step": 60700 |
| }, |
| { |
| "epoch": 17.685749228350126, |
| "grad_norm": 0.3779431879520416, |
| "learning_rate": 0.0003880023303233323, |
| "loss": 3.2729, |
| "step": 60750 |
| }, |
| { |
| "epoch": 17.700308659949915, |
| "grad_norm": 0.3987460434436798, |
| "learning_rate": 0.00038782755607340514, |
| "loss": 3.281, |
| "step": 60800 |
| }, |
| { |
| "epoch": 17.714868091549707, |
| "grad_norm": 0.41353845596313477, |
| "learning_rate": 0.00038765278182347797, |
| "loss": 3.2826, |
| "step": 60850 |
| }, |
| { |
| "epoch": 17.729427523149496, |
| "grad_norm": 0.3690580129623413, |
| "learning_rate": 0.00038747800757355075, |
| "loss": 3.2909, |
| "step": 60900 |
| }, |
| { |
| "epoch": 17.743986954749285, |
| "grad_norm": 0.39012983441352844, |
| "learning_rate": 0.00038730323332362363, |
| "loss": 3.289, |
| "step": 60950 |
| }, |
| { |
| "epoch": 17.758546386349078, |
| "grad_norm": 0.3791520893573761, |
| "learning_rate": 0.00038712845907369647, |
| "loss": 3.2743, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.758546386349078, |
| "eval_accuracy": 0.37214919549752823, |
| "eval_loss": 3.54073166847229, |
| "eval_runtime": 181.4366, |
| "eval_samples_per_second": 91.729, |
| "eval_steps_per_second": 5.738, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.773105817948867, |
| "grad_norm": 0.42223140597343445, |
| "learning_rate": 0.0003869536848237693, |
| "loss": 3.2712, |
| "step": 61050 |
| }, |
| { |
| "epoch": 17.787665249548656, |
| "grad_norm": 0.39836451411247253, |
| "learning_rate": 0.00038677891057384213, |
| "loss": 3.2741, |
| "step": 61100 |
| }, |
| { |
| "epoch": 17.80222468114845, |
| "grad_norm": 0.3820720314979553, |
| "learning_rate": 0.0003866041363239149, |
| "loss": 3.2734, |
| "step": 61150 |
| }, |
| { |
| "epoch": 17.816784112748238, |
| "grad_norm": 0.3743707239627838, |
| "learning_rate": 0.00038642936207398774, |
| "loss": 3.2878, |
| "step": 61200 |
| }, |
| { |
| "epoch": 17.83134354434803, |
| "grad_norm": 0.40424010157585144, |
| "learning_rate": 0.0003862545878240606, |
| "loss": 3.2707, |
| "step": 61250 |
| }, |
| { |
| "epoch": 17.84590297594782, |
| "grad_norm": 0.3789885640144348, |
| "learning_rate": 0.0003860798135741334, |
| "loss": 3.2847, |
| "step": 61300 |
| }, |
| { |
| "epoch": 17.860462407547608, |
| "grad_norm": 0.3790980875492096, |
| "learning_rate": 0.00038590503932420624, |
| "loss": 3.2678, |
| "step": 61350 |
| }, |
| { |
| "epoch": 17.8750218391474, |
| "grad_norm": 0.3928091526031494, |
| "learning_rate": 0.000385730265074279, |
| "loss": 3.2864, |
| "step": 61400 |
| }, |
| { |
| "epoch": 17.88958127074719, |
| "grad_norm": 0.43650928139686584, |
| "learning_rate": 0.00038555549082435186, |
| "loss": 3.2733, |
| "step": 61450 |
| }, |
| { |
| "epoch": 17.90414070234698, |
| "grad_norm": 0.3985980153083801, |
| "learning_rate": 0.0003853807165744247, |
| "loss": 3.2887, |
| "step": 61500 |
| }, |
| { |
| "epoch": 17.91870013394677, |
| "grad_norm": 0.38238638639450073, |
| "learning_rate": 0.0003852059423244975, |
| "loss": 3.2964, |
| "step": 61550 |
| }, |
| { |
| "epoch": 17.93325956554656, |
| "grad_norm": 0.4149417281150818, |
| "learning_rate": 0.00038503116807457035, |
| "loss": 3.2798, |
| "step": 61600 |
| }, |
| { |
| "epoch": 17.947818997146353, |
| "grad_norm": 0.3858490288257599, |
| "learning_rate": 0.00038485639382464313, |
| "loss": 3.286, |
| "step": 61650 |
| }, |
| { |
| "epoch": 17.962378428746142, |
| "grad_norm": 0.41513580083847046, |
| "learning_rate": 0.00038468161957471597, |
| "loss": 3.2882, |
| "step": 61700 |
| }, |
| { |
| "epoch": 17.97693786034593, |
| "grad_norm": 0.3717535436153412, |
| "learning_rate": 0.0003845068453247888, |
| "loss": 3.286, |
| "step": 61750 |
| }, |
| { |
| "epoch": 17.991497291945723, |
| "grad_norm": 0.4167464077472687, |
| "learning_rate": 0.00038433207107486163, |
| "loss": 3.2957, |
| "step": 61800 |
| }, |
| { |
| "epoch": 18.005823772639918, |
| "grad_norm": 0.39267608523368835, |
| "learning_rate": 0.0003841572968249344, |
| "loss": 3.2381, |
| "step": 61850 |
| }, |
| { |
| "epoch": 18.020383204239707, |
| "grad_norm": 0.3998408019542694, |
| "learning_rate": 0.00038398252257500724, |
| "loss": 3.1779, |
| "step": 61900 |
| }, |
| { |
| "epoch": 18.034942635839496, |
| "grad_norm": 0.38134950399398804, |
| "learning_rate": 0.0003838077483250801, |
| "loss": 3.1746, |
| "step": 61950 |
| }, |
| { |
| "epoch": 18.04950206743929, |
| "grad_norm": 0.38700148463249207, |
| "learning_rate": 0.0003836329740751529, |
| "loss": 3.1953, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.04950206743929, |
| "eval_accuracy": 0.3720806441428629, |
| "eval_loss": 3.5562655925750732, |
| "eval_runtime": 181.185, |
| "eval_samples_per_second": 91.856, |
| "eval_steps_per_second": 5.746, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.064061499039077, |
| "grad_norm": 0.4167320132255554, |
| "learning_rate": 0.00038345819982522574, |
| "loss": 3.1947, |
| "step": 62050 |
| }, |
| { |
| "epoch": 18.078620930638866, |
| "grad_norm": 0.4142136573791504, |
| "learning_rate": 0.0003832834255752985, |
| "loss": 3.1952, |
| "step": 62100 |
| }, |
| { |
| "epoch": 18.09318036223866, |
| "grad_norm": 0.3881548345088959, |
| "learning_rate": 0.00038310865132537135, |
| "loss": 3.2028, |
| "step": 62150 |
| }, |
| { |
| "epoch": 18.107739793838448, |
| "grad_norm": 0.41077175736427307, |
| "learning_rate": 0.0003829338770754442, |
| "loss": 3.2023, |
| "step": 62200 |
| }, |
| { |
| "epoch": 18.12229922543824, |
| "grad_norm": 0.38895899057388306, |
| "learning_rate": 0.000382759102825517, |
| "loss": 3.1963, |
| "step": 62250 |
| }, |
| { |
| "epoch": 18.13685865703803, |
| "grad_norm": 0.40407344698905945, |
| "learning_rate": 0.00038258432857558985, |
| "loss": 3.2109, |
| "step": 62300 |
| }, |
| { |
| "epoch": 18.15141808863782, |
| "grad_norm": 0.38854244351387024, |
| "learning_rate": 0.00038240955432566263, |
| "loss": 3.2107, |
| "step": 62350 |
| }, |
| { |
| "epoch": 18.16597752023761, |
| "grad_norm": 0.3986676037311554, |
| "learning_rate": 0.00038223478007573547, |
| "loss": 3.2017, |
| "step": 62400 |
| }, |
| { |
| "epoch": 18.1805369518374, |
| "grad_norm": 0.38370537757873535, |
| "learning_rate": 0.0003820600058258083, |
| "loss": 3.212, |
| "step": 62450 |
| }, |
| { |
| "epoch": 18.19509638343719, |
| "grad_norm": 0.3775344491004944, |
| "learning_rate": 0.00038188523157588113, |
| "loss": 3.2255, |
| "step": 62500 |
| }, |
| { |
| "epoch": 18.20965581503698, |
| "grad_norm": 0.4089907705783844, |
| "learning_rate": 0.0003817104573259539, |
| "loss": 3.2281, |
| "step": 62550 |
| }, |
| { |
| "epoch": 18.22421524663677, |
| "grad_norm": 0.4235895574092865, |
| "learning_rate": 0.00038153568307602674, |
| "loss": 3.2173, |
| "step": 62600 |
| }, |
| { |
| "epoch": 18.238774678236563, |
| "grad_norm": 0.41684481501579285, |
| "learning_rate": 0.0003813609088260996, |
| "loss": 3.2157, |
| "step": 62650 |
| }, |
| { |
| "epoch": 18.253334109836352, |
| "grad_norm": 0.3882802724838257, |
| "learning_rate": 0.0003811861345761724, |
| "loss": 3.2313, |
| "step": 62700 |
| }, |
| { |
| "epoch": 18.26789354143614, |
| "grad_norm": 0.4060615599155426, |
| "learning_rate": 0.00038101136032624524, |
| "loss": 3.2251, |
| "step": 62750 |
| }, |
| { |
| "epoch": 18.282452973035934, |
| "grad_norm": 0.40829920768737793, |
| "learning_rate": 0.000380836586076318, |
| "loss": 3.2242, |
| "step": 62800 |
| }, |
| { |
| "epoch": 18.297012404635723, |
| "grad_norm": 0.39912256598472595, |
| "learning_rate": 0.00038066181182639085, |
| "loss": 3.2388, |
| "step": 62850 |
| }, |
| { |
| "epoch": 18.31157183623551, |
| "grad_norm": 0.41347721219062805, |
| "learning_rate": 0.0003804870375764637, |
| "loss": 3.2414, |
| "step": 62900 |
| }, |
| { |
| "epoch": 18.326131267835304, |
| "grad_norm": 0.37032246589660645, |
| "learning_rate": 0.0003803122633265365, |
| "loss": 3.2287, |
| "step": 62950 |
| }, |
| { |
| "epoch": 18.340690699435093, |
| "grad_norm": 0.38557493686676025, |
| "learning_rate": 0.0003801374890766093, |
| "loss": 3.2446, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.340690699435093, |
| "eval_accuracy": 0.3720570097993162, |
| "eval_loss": 3.5540997982025146, |
| "eval_runtime": 181.3354, |
| "eval_samples_per_second": 91.78, |
| "eval_steps_per_second": 5.741, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.355250131034886, |
| "grad_norm": 0.4208027422428131, |
| "learning_rate": 0.00037996271482668213, |
| "loss": 3.2347, |
| "step": 63050 |
| }, |
| { |
| "epoch": 18.369809562634675, |
| "grad_norm": 0.42822667956352234, |
| "learning_rate": 0.00037978794057675497, |
| "loss": 3.2171, |
| "step": 63100 |
| }, |
| { |
| "epoch": 18.384368994234464, |
| "grad_norm": 0.39856502413749695, |
| "learning_rate": 0.0003796131663268278, |
| "loss": 3.2528, |
| "step": 63150 |
| }, |
| { |
| "epoch": 18.398928425834256, |
| "grad_norm": 0.38449880480766296, |
| "learning_rate": 0.00037943839207690063, |
| "loss": 3.2357, |
| "step": 63200 |
| }, |
| { |
| "epoch": 18.413487857434045, |
| "grad_norm": 0.3990757167339325, |
| "learning_rate": 0.0003792636178269734, |
| "loss": 3.2576, |
| "step": 63250 |
| }, |
| { |
| "epoch": 18.428047289033834, |
| "grad_norm": 0.4236275255680084, |
| "learning_rate": 0.00037908884357704624, |
| "loss": 3.2358, |
| "step": 63300 |
| }, |
| { |
| "epoch": 18.442606720633627, |
| "grad_norm": 0.39217713475227356, |
| "learning_rate": 0.0003789140693271191, |
| "loss": 3.24, |
| "step": 63350 |
| }, |
| { |
| "epoch": 18.457166152233416, |
| "grad_norm": 0.4095120131969452, |
| "learning_rate": 0.0003787392950771919, |
| "loss": 3.2689, |
| "step": 63400 |
| }, |
| { |
| "epoch": 18.47172558383321, |
| "grad_norm": 0.4244193136692047, |
| "learning_rate": 0.00037856452082726474, |
| "loss": 3.2418, |
| "step": 63450 |
| }, |
| { |
| "epoch": 18.486285015432998, |
| "grad_norm": 0.3923884332180023, |
| "learning_rate": 0.00037838974657733763, |
| "loss": 3.2559, |
| "step": 63500 |
| }, |
| { |
| "epoch": 18.500844447032787, |
| "grad_norm": 0.3991324007511139, |
| "learning_rate": 0.0003782149723274104, |
| "loss": 3.2467, |
| "step": 63550 |
| }, |
| { |
| "epoch": 18.51540387863258, |
| "grad_norm": 0.37614545226097107, |
| "learning_rate": 0.00037804019807748324, |
| "loss": 3.2485, |
| "step": 63600 |
| }, |
| { |
| "epoch": 18.529963310232368, |
| "grad_norm": 0.40092289447784424, |
| "learning_rate": 0.0003778654238275561, |
| "loss": 3.2604, |
| "step": 63650 |
| }, |
| { |
| "epoch": 18.544522741832157, |
| "grad_norm": 0.3537710905075073, |
| "learning_rate": 0.0003776906495776289, |
| "loss": 3.2492, |
| "step": 63700 |
| }, |
| { |
| "epoch": 18.55908217343195, |
| "grad_norm": 0.3787946105003357, |
| "learning_rate": 0.0003775158753277017, |
| "loss": 3.2538, |
| "step": 63750 |
| }, |
| { |
| "epoch": 18.57364160503174, |
| "grad_norm": 0.38140779733657837, |
| "learning_rate": 0.0003773411010777745, |
| "loss": 3.2461, |
| "step": 63800 |
| }, |
| { |
| "epoch": 18.58820103663153, |
| "grad_norm": 0.3862013518810272, |
| "learning_rate": 0.00037716632682784735, |
| "loss": 3.2435, |
| "step": 63850 |
| }, |
| { |
| "epoch": 18.60276046823132, |
| "grad_norm": 0.41837623715400696, |
| "learning_rate": 0.0003769915525779202, |
| "loss": 3.2602, |
| "step": 63900 |
| }, |
| { |
| "epoch": 18.61731989983111, |
| "grad_norm": 0.42126792669296265, |
| "learning_rate": 0.000376816778327993, |
| "loss": 3.2575, |
| "step": 63950 |
| }, |
| { |
| "epoch": 18.631879331430902, |
| "grad_norm": 0.38360005617141724, |
| "learning_rate": 0.0003766420040780658, |
| "loss": 3.263, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.631879331430902, |
| "eval_accuracy": 0.3726539827455182, |
| "eval_loss": 3.5427675247192383, |
| "eval_runtime": 180.7526, |
| "eval_samples_per_second": 92.076, |
| "eval_steps_per_second": 5.759, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.64643876303069, |
| "grad_norm": 0.38449525833129883, |
| "learning_rate": 0.00037646722982813863, |
| "loss": 3.2636, |
| "step": 64050 |
| }, |
| { |
| "epoch": 18.66099819463048, |
| "grad_norm": 0.39993739128112793, |
| "learning_rate": 0.00037629245557821146, |
| "loss": 3.2687, |
| "step": 64100 |
| }, |
| { |
| "epoch": 18.675557626230272, |
| "grad_norm": 0.44774314761161804, |
| "learning_rate": 0.0003761176813282843, |
| "loss": 3.264, |
| "step": 64150 |
| }, |
| { |
| "epoch": 18.69011705783006, |
| "grad_norm": 0.4117605686187744, |
| "learning_rate": 0.00037594290707835713, |
| "loss": 3.2523, |
| "step": 64200 |
| }, |
| { |
| "epoch": 18.704676489429854, |
| "grad_norm": 0.41900861263275146, |
| "learning_rate": 0.0003757681328284299, |
| "loss": 3.2652, |
| "step": 64250 |
| }, |
| { |
| "epoch": 18.719235921029643, |
| "grad_norm": 0.3894200921058655, |
| "learning_rate": 0.00037559335857850274, |
| "loss": 3.2627, |
| "step": 64300 |
| }, |
| { |
| "epoch": 18.733795352629432, |
| "grad_norm": 0.3939887583255768, |
| "learning_rate": 0.0003754185843285756, |
| "loss": 3.2714, |
| "step": 64350 |
| }, |
| { |
| "epoch": 18.748354784229225, |
| "grad_norm": 0.3855105936527252, |
| "learning_rate": 0.0003752438100786484, |
| "loss": 3.2576, |
| "step": 64400 |
| }, |
| { |
| "epoch": 18.762914215829014, |
| "grad_norm": 0.40547657012939453, |
| "learning_rate": 0.0003750690358287212, |
| "loss": 3.2643, |
| "step": 64450 |
| }, |
| { |
| "epoch": 18.777473647428806, |
| "grad_norm": 0.3899041414260864, |
| "learning_rate": 0.000374894261578794, |
| "loss": 3.2565, |
| "step": 64500 |
| }, |
| { |
| "epoch": 18.792033079028595, |
| "grad_norm": 0.3877187669277191, |
| "learning_rate": 0.00037471948732886685, |
| "loss": 3.2734, |
| "step": 64550 |
| }, |
| { |
| "epoch": 18.806592510628384, |
| "grad_norm": 0.4046708047389984, |
| "learning_rate": 0.0003745447130789397, |
| "loss": 3.2639, |
| "step": 64600 |
| }, |
| { |
| "epoch": 18.821151942228177, |
| "grad_norm": 0.3916832208633423, |
| "learning_rate": 0.0003743699388290125, |
| "loss": 3.2698, |
| "step": 64650 |
| }, |
| { |
| "epoch": 18.835711373827966, |
| "grad_norm": 0.3999544382095337, |
| "learning_rate": 0.0003741951645790853, |
| "loss": 3.2824, |
| "step": 64700 |
| }, |
| { |
| "epoch": 18.850270805427755, |
| "grad_norm": 0.4189528524875641, |
| "learning_rate": 0.00037402039032915813, |
| "loss": 3.2592, |
| "step": 64750 |
| }, |
| { |
| "epoch": 18.864830237027547, |
| "grad_norm": 0.39608845114707947, |
| "learning_rate": 0.00037384561607923096, |
| "loss": 3.2647, |
| "step": 64800 |
| }, |
| { |
| "epoch": 18.879389668627336, |
| "grad_norm": 0.36444324254989624, |
| "learning_rate": 0.0003736708418293038, |
| "loss": 3.2628, |
| "step": 64850 |
| }, |
| { |
| "epoch": 18.893949100227125, |
| "grad_norm": 0.39453035593032837, |
| "learning_rate": 0.00037349606757937663, |
| "loss": 3.2773, |
| "step": 64900 |
| }, |
| { |
| "epoch": 18.908508531826918, |
| "grad_norm": 0.4110398590564728, |
| "learning_rate": 0.0003733212933294494, |
| "loss": 3.2737, |
| "step": 64950 |
| }, |
| { |
| "epoch": 18.923067963426707, |
| "grad_norm": 0.37432223558425903, |
| "learning_rate": 0.00037314651907952224, |
| "loss": 3.2807, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.923067963426707, |
| "eval_accuracy": 0.3731833450074448, |
| "eval_loss": 3.5371851921081543, |
| "eval_runtime": 181.3913, |
| "eval_samples_per_second": 91.752, |
| "eval_steps_per_second": 5.739, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.9376273950265, |
| "grad_norm": 0.38230255246162415, |
| "learning_rate": 0.00037297174482959507, |
| "loss": 3.2801, |
| "step": 65050 |
| }, |
| { |
| "epoch": 18.95218682662629, |
| "grad_norm": 0.4243530035018921, |
| "learning_rate": 0.0003727969705796679, |
| "loss": 3.2761, |
| "step": 65100 |
| }, |
| { |
| "epoch": 18.966746258226078, |
| "grad_norm": 0.4133412539958954, |
| "learning_rate": 0.0003726221963297407, |
| "loss": 3.2716, |
| "step": 65150 |
| }, |
| { |
| "epoch": 18.98130568982587, |
| "grad_norm": 0.3846762180328369, |
| "learning_rate": 0.0003724474220798135, |
| "loss": 3.2876, |
| "step": 65200 |
| }, |
| { |
| "epoch": 18.99586512142566, |
| "grad_norm": 0.422029972076416, |
| "learning_rate": 0.00037227264782988635, |
| "loss": 3.275, |
| "step": 65250 |
| }, |
| { |
| "epoch": 19.010191602119853, |
| "grad_norm": 0.40092238783836365, |
| "learning_rate": 0.0003720978735799592, |
| "loss": 3.2038, |
| "step": 65300 |
| }, |
| { |
| "epoch": 19.024751033719642, |
| "grad_norm": 0.356789767742157, |
| "learning_rate": 0.000371923099330032, |
| "loss": 3.1628, |
| "step": 65350 |
| }, |
| { |
| "epoch": 19.039310465319435, |
| "grad_norm": 0.3983159363269806, |
| "learning_rate": 0.0003717483250801048, |
| "loss": 3.1639, |
| "step": 65400 |
| }, |
| { |
| "epoch": 19.053869896919224, |
| "grad_norm": 0.4232736825942993, |
| "learning_rate": 0.00037157355083017763, |
| "loss": 3.1838, |
| "step": 65450 |
| }, |
| { |
| "epoch": 19.068429328519013, |
| "grad_norm": 0.41093751788139343, |
| "learning_rate": 0.00037139877658025046, |
| "loss": 3.1817, |
| "step": 65500 |
| }, |
| { |
| "epoch": 19.082988760118806, |
| "grad_norm": 0.40463700890541077, |
| "learning_rate": 0.0003712240023303233, |
| "loss": 3.1925, |
| "step": 65550 |
| }, |
| { |
| "epoch": 19.097548191718595, |
| "grad_norm": 0.403666228055954, |
| "learning_rate": 0.00037104922808039607, |
| "loss": 3.1855, |
| "step": 65600 |
| }, |
| { |
| "epoch": 19.112107623318387, |
| "grad_norm": 0.4537275731563568, |
| "learning_rate": 0.0003708744538304689, |
| "loss": 3.1927, |
| "step": 65650 |
| }, |
| { |
| "epoch": 19.126667054918176, |
| "grad_norm": 0.401798814535141, |
| "learning_rate": 0.00037069967958054174, |
| "loss": 3.1889, |
| "step": 65700 |
| }, |
| { |
| "epoch": 19.141226486517965, |
| "grad_norm": 0.40314981341362, |
| "learning_rate": 0.00037052490533061457, |
| "loss": 3.1879, |
| "step": 65750 |
| }, |
| { |
| "epoch": 19.155785918117758, |
| "grad_norm": 0.3906761705875397, |
| "learning_rate": 0.0003703501310806874, |
| "loss": 3.2167, |
| "step": 65800 |
| }, |
| { |
| "epoch": 19.170345349717547, |
| "grad_norm": 0.39637866616249084, |
| "learning_rate": 0.0003701753568307602, |
| "loss": 3.1943, |
| "step": 65850 |
| }, |
| { |
| "epoch": 19.184904781317336, |
| "grad_norm": 0.40415269136428833, |
| "learning_rate": 0.000370000582580833, |
| "loss": 3.1988, |
| "step": 65900 |
| }, |
| { |
| "epoch": 19.19946421291713, |
| "grad_norm": 0.41272252798080444, |
| "learning_rate": 0.00036982580833090585, |
| "loss": 3.1979, |
| "step": 65950 |
| }, |
| { |
| "epoch": 19.214023644516917, |
| "grad_norm": 0.4298579692840576, |
| "learning_rate": 0.00036965103408097874, |
| "loss": 3.2138, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.214023644516917, |
| "eval_accuracy": 0.3722166885980049, |
| "eval_loss": 3.5523126125335693, |
| "eval_runtime": 180.988, |
| "eval_samples_per_second": 91.956, |
| "eval_steps_per_second": 5.752, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.22858307611671, |
| "grad_norm": 0.3859882652759552, |
| "learning_rate": 0.00036947625983105157, |
| "loss": 3.2164, |
| "step": 66050 |
| }, |
| { |
| "epoch": 19.2431425077165, |
| "grad_norm": 0.38444507122039795, |
| "learning_rate": 0.0003693014855811244, |
| "loss": 3.222, |
| "step": 66100 |
| }, |
| { |
| "epoch": 19.257701939316288, |
| "grad_norm": 0.3962699770927429, |
| "learning_rate": 0.0003691267113311972, |
| "loss": 3.2082, |
| "step": 66150 |
| }, |
| { |
| "epoch": 19.27226137091608, |
| "grad_norm": 0.389569491147995, |
| "learning_rate": 0.00036895193708127, |
| "loss": 3.2159, |
| "step": 66200 |
| }, |
| { |
| "epoch": 19.28682080251587, |
| "grad_norm": 0.4205169081687927, |
| "learning_rate": 0.00036877716283134285, |
| "loss": 3.2215, |
| "step": 66250 |
| }, |
| { |
| "epoch": 19.30138023411566, |
| "grad_norm": 0.4000534415245056, |
| "learning_rate": 0.0003686023885814157, |
| "loss": 3.2277, |
| "step": 66300 |
| }, |
| { |
| "epoch": 19.31593966571545, |
| "grad_norm": 0.40057796239852905, |
| "learning_rate": 0.00036842761433148846, |
| "loss": 3.2363, |
| "step": 66350 |
| }, |
| { |
| "epoch": 19.33049909731524, |
| "grad_norm": 0.44507649540901184, |
| "learning_rate": 0.0003682528400815613, |
| "loss": 3.2307, |
| "step": 66400 |
| }, |
| { |
| "epoch": 19.345058528915033, |
| "grad_norm": 0.4065307378768921, |
| "learning_rate": 0.0003680780658316341, |
| "loss": 3.2169, |
| "step": 66450 |
| }, |
| { |
| "epoch": 19.35961796051482, |
| "grad_norm": 0.41337087750434875, |
| "learning_rate": 0.00036790329158170696, |
| "loss": 3.2245, |
| "step": 66500 |
| }, |
| { |
| "epoch": 19.37417739211461, |
| "grad_norm": 0.4373385012149811, |
| "learning_rate": 0.0003677285173317798, |
| "loss": 3.2265, |
| "step": 66550 |
| }, |
| { |
| "epoch": 19.388736823714403, |
| "grad_norm": 0.41589170694351196, |
| "learning_rate": 0.00036755374308185257, |
| "loss": 3.2428, |
| "step": 66600 |
| }, |
| { |
| "epoch": 19.403296255314192, |
| "grad_norm": 0.4082939922809601, |
| "learning_rate": 0.0003673789688319254, |
| "loss": 3.2366, |
| "step": 66650 |
| }, |
| { |
| "epoch": 19.41785568691398, |
| "grad_norm": 0.3798679709434509, |
| "learning_rate": 0.00036720419458199824, |
| "loss": 3.2216, |
| "step": 66700 |
| }, |
| { |
| "epoch": 19.432415118513774, |
| "grad_norm": 0.3959912359714508, |
| "learning_rate": 0.00036702942033207107, |
| "loss": 3.2231, |
| "step": 66750 |
| }, |
| { |
| "epoch": 19.446974550113563, |
| "grad_norm": 0.42029306292533875, |
| "learning_rate": 0.0003668546460821439, |
| "loss": 3.2401, |
| "step": 66800 |
| }, |
| { |
| "epoch": 19.461533981713355, |
| "grad_norm": 0.400526225566864, |
| "learning_rate": 0.0003666798718322167, |
| "loss": 3.2324, |
| "step": 66850 |
| }, |
| { |
| "epoch": 19.476093413313144, |
| "grad_norm": 0.4720441997051239, |
| "learning_rate": 0.0003665050975822895, |
| "loss": 3.2414, |
| "step": 66900 |
| }, |
| { |
| "epoch": 19.490652844912933, |
| "grad_norm": 0.40877315402030945, |
| "learning_rate": 0.00036633032333236235, |
| "loss": 3.2441, |
| "step": 66950 |
| }, |
| { |
| "epoch": 19.505212276512726, |
| "grad_norm": 0.3751513659954071, |
| "learning_rate": 0.0003661555490824352, |
| "loss": 3.242, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.505212276512726, |
| "eval_accuracy": 0.37279508330400596, |
| "eval_loss": 3.547092914581299, |
| "eval_runtime": 181.1392, |
| "eval_samples_per_second": 91.88, |
| "eval_steps_per_second": 5.747, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.519771708112515, |
| "grad_norm": 0.39515799283981323, |
| "learning_rate": 0.00036598077483250796, |
| "loss": 3.2452, |
| "step": 67050 |
| }, |
| { |
| "epoch": 19.534331139712307, |
| "grad_norm": 0.41457104682922363, |
| "learning_rate": 0.0003658060005825808, |
| "loss": 3.2361, |
| "step": 67100 |
| }, |
| { |
| "epoch": 19.548890571312096, |
| "grad_norm": 0.43827950954437256, |
| "learning_rate": 0.0003656312263326536, |
| "loss": 3.2423, |
| "step": 67150 |
| }, |
| { |
| "epoch": 19.563450002911885, |
| "grad_norm": 0.3718095123767853, |
| "learning_rate": 0.00036545645208272646, |
| "loss": 3.2491, |
| "step": 67200 |
| }, |
| { |
| "epoch": 19.578009434511678, |
| "grad_norm": 0.3950149714946747, |
| "learning_rate": 0.0003652816778327993, |
| "loss": 3.2372, |
| "step": 67250 |
| }, |
| { |
| "epoch": 19.592568866111467, |
| "grad_norm": 0.39129117131233215, |
| "learning_rate": 0.00036510690358287207, |
| "loss": 3.2336, |
| "step": 67300 |
| }, |
| { |
| "epoch": 19.607128297711256, |
| "grad_norm": 0.39655396342277527, |
| "learning_rate": 0.0003649321293329449, |
| "loss": 3.2283, |
| "step": 67350 |
| }, |
| { |
| "epoch": 19.62168772931105, |
| "grad_norm": 0.43846869468688965, |
| "learning_rate": 0.00036475735508301774, |
| "loss": 3.2469, |
| "step": 67400 |
| }, |
| { |
| "epoch": 19.636247160910838, |
| "grad_norm": 0.4126003682613373, |
| "learning_rate": 0.00036458258083309057, |
| "loss": 3.2552, |
| "step": 67450 |
| }, |
| { |
| "epoch": 19.650806592510627, |
| "grad_norm": 0.4312933385372162, |
| "learning_rate": 0.0003644078065831634, |
| "loss": 3.2403, |
| "step": 67500 |
| }, |
| { |
| "epoch": 19.66536602411042, |
| "grad_norm": 0.4025894105434418, |
| "learning_rate": 0.0003642330323332362, |
| "loss": 3.2417, |
| "step": 67550 |
| }, |
| { |
| "epoch": 19.679925455710208, |
| "grad_norm": 0.3867190480232239, |
| "learning_rate": 0.000364058258083309, |
| "loss": 3.2577, |
| "step": 67600 |
| }, |
| { |
| "epoch": 19.69448488731, |
| "grad_norm": 0.3977561593055725, |
| "learning_rate": 0.00036388348383338185, |
| "loss": 3.2423, |
| "step": 67650 |
| }, |
| { |
| "epoch": 19.70904431890979, |
| "grad_norm": 0.4209028482437134, |
| "learning_rate": 0.0003637087095834547, |
| "loss": 3.2664, |
| "step": 67700 |
| }, |
| { |
| "epoch": 19.72360375050958, |
| "grad_norm": 0.39911961555480957, |
| "learning_rate": 0.00036353393533352746, |
| "loss": 3.2596, |
| "step": 67750 |
| }, |
| { |
| "epoch": 19.73816318210937, |
| "grad_norm": 0.43913912773132324, |
| "learning_rate": 0.0003633591610836003, |
| "loss": 3.2618, |
| "step": 67800 |
| }, |
| { |
| "epoch": 19.75272261370916, |
| "grad_norm": 0.40178295969963074, |
| "learning_rate": 0.0003631843868336731, |
| "loss": 3.2504, |
| "step": 67850 |
| }, |
| { |
| "epoch": 19.767282045308953, |
| "grad_norm": 0.3968295454978943, |
| "learning_rate": 0.00036300961258374596, |
| "loss": 3.2515, |
| "step": 67900 |
| }, |
| { |
| "epoch": 19.781841476908742, |
| "grad_norm": 0.41022300720214844, |
| "learning_rate": 0.0003628348383338188, |
| "loss": 3.2553, |
| "step": 67950 |
| }, |
| { |
| "epoch": 19.79640090850853, |
| "grad_norm": 0.40481844544410706, |
| "learning_rate": 0.00036266006408389157, |
| "loss": 3.253, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.79640090850853, |
| "eval_accuracy": 0.37316241509126913, |
| "eval_loss": 3.537109613418579, |
| "eval_runtime": 181.0978, |
| "eval_samples_per_second": 91.901, |
| "eval_steps_per_second": 5.748, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.810960340108323, |
| "grad_norm": 0.384703665971756, |
| "learning_rate": 0.0003624852898339644, |
| "loss": 3.256, |
| "step": 68050 |
| }, |
| { |
| "epoch": 19.825519771708112, |
| "grad_norm": 0.3775125741958618, |
| "learning_rate": 0.00036231051558403723, |
| "loss": 3.2636, |
| "step": 68100 |
| }, |
| { |
| "epoch": 19.8400792033079, |
| "grad_norm": 0.41662731766700745, |
| "learning_rate": 0.00036213574133411007, |
| "loss": 3.2597, |
| "step": 68150 |
| }, |
| { |
| "epoch": 19.854638634907694, |
| "grad_norm": 0.40957972407341003, |
| "learning_rate": 0.0003619609670841829, |
| "loss": 3.2596, |
| "step": 68200 |
| }, |
| { |
| "epoch": 19.869198066507483, |
| "grad_norm": 0.38087132573127747, |
| "learning_rate": 0.0003617861928342557, |
| "loss": 3.2742, |
| "step": 68250 |
| }, |
| { |
| "epoch": 19.883757498107276, |
| "grad_norm": 0.4036879241466522, |
| "learning_rate": 0.0003616114185843285, |
| "loss": 3.2549, |
| "step": 68300 |
| }, |
| { |
| "epoch": 19.898316929707065, |
| "grad_norm": 0.4047390818595886, |
| "learning_rate": 0.00036143664433440135, |
| "loss": 3.2638, |
| "step": 68350 |
| }, |
| { |
| "epoch": 19.912876361306854, |
| "grad_norm": 0.41273418068885803, |
| "learning_rate": 0.0003612618700844742, |
| "loss": 3.2695, |
| "step": 68400 |
| }, |
| { |
| "epoch": 19.927435792906646, |
| "grad_norm": 0.40376052260398865, |
| "learning_rate": 0.00036108709583454696, |
| "loss": 3.277, |
| "step": 68450 |
| }, |
| { |
| "epoch": 19.941995224506435, |
| "grad_norm": 0.372547447681427, |
| "learning_rate": 0.0003609123215846198, |
| "loss": 3.2634, |
| "step": 68500 |
| }, |
| { |
| "epoch": 19.956554656106224, |
| "grad_norm": 0.41053515672683716, |
| "learning_rate": 0.0003607375473346927, |
| "loss": 3.2729, |
| "step": 68550 |
| }, |
| { |
| "epoch": 19.971114087706017, |
| "grad_norm": 0.40540847182273865, |
| "learning_rate": 0.0003605627730847655, |
| "loss": 3.2581, |
| "step": 68600 |
| }, |
| { |
| "epoch": 19.985673519305806, |
| "grad_norm": 0.412577360868454, |
| "learning_rate": 0.00036038799883483834, |
| "loss": 3.2543, |
| "step": 68650 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 1.0419994592666626, |
| "learning_rate": 0.0003602132245849112, |
| "loss": 3.2561, |
| "step": 68700 |
| }, |
| { |
| "epoch": 20.01455943159979, |
| "grad_norm": 0.44616925716400146, |
| "learning_rate": 0.00036003845033498395, |
| "loss": 3.1465, |
| "step": 68750 |
| }, |
| { |
| "epoch": 20.02911886319958, |
| "grad_norm": 0.4104091227054596, |
| "learning_rate": 0.0003598636760850568, |
| "loss": 3.173, |
| "step": 68800 |
| }, |
| { |
| "epoch": 20.04367829479937, |
| "grad_norm": 0.39892300963401794, |
| "learning_rate": 0.0003596889018351296, |
| "loss": 3.1733, |
| "step": 68850 |
| }, |
| { |
| "epoch": 20.058237726399163, |
| "grad_norm": 0.39944881200790405, |
| "learning_rate": 0.00035951412758520245, |
| "loss": 3.1739, |
| "step": 68900 |
| }, |
| { |
| "epoch": 20.072797157998952, |
| "grad_norm": 0.4186191260814667, |
| "learning_rate": 0.00035933935333527523, |
| "loss": 3.1701, |
| "step": 68950 |
| }, |
| { |
| "epoch": 20.08735658959874, |
| "grad_norm": 0.43529069423675537, |
| "learning_rate": 0.00035916457908534807, |
| "loss": 3.17, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.08735658959874, |
| "eval_accuracy": 0.37242316574859197, |
| "eval_loss": 3.556870698928833, |
| "eval_runtime": 180.4494, |
| "eval_samples_per_second": 92.231, |
| "eval_steps_per_second": 5.769, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.101916021198534, |
| "grad_norm": 0.43885257840156555, |
| "learning_rate": 0.0003589898048354209, |
| "loss": 3.1713, |
| "step": 69050 |
| }, |
| { |
| "epoch": 20.116475452798323, |
| "grad_norm": 0.3850330412387848, |
| "learning_rate": 0.00035881503058549373, |
| "loss": 3.1862, |
| "step": 69100 |
| }, |
| { |
| "epoch": 20.13103488439811, |
| "grad_norm": 0.38994720578193665, |
| "learning_rate": 0.00035864025633556656, |
| "loss": 3.185, |
| "step": 69150 |
| }, |
| { |
| "epoch": 20.145594315997904, |
| "grad_norm": 0.3972725570201874, |
| "learning_rate": 0.00035846548208563934, |
| "loss": 3.1977, |
| "step": 69200 |
| }, |
| { |
| "epoch": 20.160153747597693, |
| "grad_norm": 0.41154587268829346, |
| "learning_rate": 0.0003582907078357122, |
| "loss": 3.1837, |
| "step": 69250 |
| }, |
| { |
| "epoch": 20.174713179197486, |
| "grad_norm": 0.41088539361953735, |
| "learning_rate": 0.000358115933585785, |
| "loss": 3.1911, |
| "step": 69300 |
| }, |
| { |
| "epoch": 20.189272610797275, |
| "grad_norm": 0.4196443259716034, |
| "learning_rate": 0.00035794115933585784, |
| "loss": 3.1966, |
| "step": 69350 |
| }, |
| { |
| "epoch": 20.203832042397064, |
| "grad_norm": 0.40109729766845703, |
| "learning_rate": 0.0003577663850859307, |
| "loss": 3.2108, |
| "step": 69400 |
| }, |
| { |
| "epoch": 20.218391473996856, |
| "grad_norm": 0.3926057815551758, |
| "learning_rate": 0.00035759161083600345, |
| "loss": 3.1926, |
| "step": 69450 |
| }, |
| { |
| "epoch": 20.232950905596645, |
| "grad_norm": 0.4119376540184021, |
| "learning_rate": 0.0003574168365860763, |
| "loss": 3.2071, |
| "step": 69500 |
| }, |
| { |
| "epoch": 20.247510337196434, |
| "grad_norm": 0.39600470662117004, |
| "learning_rate": 0.0003572420623361491, |
| "loss": 3.2095, |
| "step": 69550 |
| }, |
| { |
| "epoch": 20.262069768796227, |
| "grad_norm": 0.40082696080207825, |
| "learning_rate": 0.00035706728808622195, |
| "loss": 3.1961, |
| "step": 69600 |
| }, |
| { |
| "epoch": 20.276629200396016, |
| "grad_norm": 0.40221554040908813, |
| "learning_rate": 0.00035689251383629473, |
| "loss": 3.2103, |
| "step": 69650 |
| }, |
| { |
| "epoch": 20.29118863199581, |
| "grad_norm": 0.38792362809181213, |
| "learning_rate": 0.00035671773958636757, |
| "loss": 3.2052, |
| "step": 69700 |
| }, |
| { |
| "epoch": 20.305748063595598, |
| "grad_norm": 0.3968353569507599, |
| "learning_rate": 0.0003565429653364404, |
| "loss": 3.2091, |
| "step": 69750 |
| }, |
| { |
| "epoch": 20.320307495195387, |
| "grad_norm": 0.3859781324863434, |
| "learning_rate": 0.00035636819108651323, |
| "loss": 3.2032, |
| "step": 69800 |
| }, |
| { |
| "epoch": 20.33486692679518, |
| "grad_norm": 0.4044977128505707, |
| "learning_rate": 0.00035619341683658606, |
| "loss": 3.2157, |
| "step": 69850 |
| }, |
| { |
| "epoch": 20.349426358394968, |
| "grad_norm": 0.41051003336906433, |
| "learning_rate": 0.00035601864258665884, |
| "loss": 3.2151, |
| "step": 69900 |
| }, |
| { |
| "epoch": 20.363985789994757, |
| "grad_norm": 0.4241284728050232, |
| "learning_rate": 0.0003558438683367317, |
| "loss": 3.2137, |
| "step": 69950 |
| }, |
| { |
| "epoch": 20.37854522159455, |
| "grad_norm": 0.40410953760147095, |
| "learning_rate": 0.0003556690940868045, |
| "loss": 3.2225, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.37854522159455, |
| "eval_accuracy": 0.37306611396010125, |
| "eval_loss": 3.548208713531494, |
| "eval_runtime": 180.5894, |
| "eval_samples_per_second": 92.159, |
| "eval_steps_per_second": 5.764, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.39310465319434, |
| "grad_norm": 0.3968863785266876, |
| "learning_rate": 0.00035549431983687734, |
| "loss": 3.2168, |
| "step": 70050 |
| }, |
| { |
| "epoch": 20.40766408479413, |
| "grad_norm": 0.4126570224761963, |
| "learning_rate": 0.0003553195455869502, |
| "loss": 3.2173, |
| "step": 70100 |
| }, |
| { |
| "epoch": 20.42222351639392, |
| "grad_norm": 0.388510137796402, |
| "learning_rate": 0.00035514477133702295, |
| "loss": 3.2307, |
| "step": 70150 |
| }, |
| { |
| "epoch": 20.43678294799371, |
| "grad_norm": 0.38976356387138367, |
| "learning_rate": 0.0003549699970870958, |
| "loss": 3.224, |
| "step": 70200 |
| }, |
| { |
| "epoch": 20.451342379593502, |
| "grad_norm": 0.4209003448486328, |
| "learning_rate": 0.0003547952228371686, |
| "loss": 3.2257, |
| "step": 70250 |
| }, |
| { |
| "epoch": 20.46590181119329, |
| "grad_norm": 0.40154707431793213, |
| "learning_rate": 0.00035462044858724145, |
| "loss": 3.2186, |
| "step": 70300 |
| }, |
| { |
| "epoch": 20.48046124279308, |
| "grad_norm": 0.3838447332382202, |
| "learning_rate": 0.00035444567433731423, |
| "loss": 3.2274, |
| "step": 70350 |
| }, |
| { |
| "epoch": 20.495020674392872, |
| "grad_norm": 0.39228400588035583, |
| "learning_rate": 0.00035427090008738706, |
| "loss": 3.2203, |
| "step": 70400 |
| }, |
| { |
| "epoch": 20.50958010599266, |
| "grad_norm": 0.41162481904029846, |
| "learning_rate": 0.0003540961258374599, |
| "loss": 3.2273, |
| "step": 70450 |
| }, |
| { |
| "epoch": 20.524139537592454, |
| "grad_norm": 0.4010622501373291, |
| "learning_rate": 0.00035392135158753273, |
| "loss": 3.2287, |
| "step": 70500 |
| }, |
| { |
| "epoch": 20.538698969192243, |
| "grad_norm": 0.3961891233921051, |
| "learning_rate": 0.00035374657733760556, |
| "loss": 3.2322, |
| "step": 70550 |
| }, |
| { |
| "epoch": 20.553258400792032, |
| "grad_norm": 0.42891672253608704, |
| "learning_rate": 0.00035357180308767834, |
| "loss": 3.232, |
| "step": 70600 |
| }, |
| { |
| "epoch": 20.567817832391825, |
| "grad_norm": 0.41287800669670105, |
| "learning_rate": 0.0003533970288377512, |
| "loss": 3.2326, |
| "step": 70650 |
| }, |
| { |
| "epoch": 20.582377263991614, |
| "grad_norm": 0.4244118928909302, |
| "learning_rate": 0.000353222254587824, |
| "loss": 3.2308, |
| "step": 70700 |
| }, |
| { |
| "epoch": 20.596936695591403, |
| "grad_norm": 0.4150402247905731, |
| "learning_rate": 0.00035304748033789684, |
| "loss": 3.2365, |
| "step": 70750 |
| }, |
| { |
| "epoch": 20.611496127191195, |
| "grad_norm": 0.41076770424842834, |
| "learning_rate": 0.0003528727060879697, |
| "loss": 3.2409, |
| "step": 70800 |
| }, |
| { |
| "epoch": 20.626055558790984, |
| "grad_norm": 0.4489847719669342, |
| "learning_rate": 0.00035269793183804245, |
| "loss": 3.2417, |
| "step": 70850 |
| }, |
| { |
| "epoch": 20.640614990390777, |
| "grad_norm": 0.41337257623672485, |
| "learning_rate": 0.0003525231575881153, |
| "loss": 3.2453, |
| "step": 70900 |
| }, |
| { |
| "epoch": 20.655174421990566, |
| "grad_norm": 0.3830353319644928, |
| "learning_rate": 0.0003523483833381881, |
| "loss": 3.2388, |
| "step": 70950 |
| }, |
| { |
| "epoch": 20.669733853590355, |
| "grad_norm": 0.4028426706790924, |
| "learning_rate": 0.00035217360908826095, |
| "loss": 3.237, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.669733853590355, |
| "eval_accuracy": 0.3736510933588318, |
| "eval_loss": 3.543719530105591, |
| "eval_runtime": 180.4495, |
| "eval_samples_per_second": 92.231, |
| "eval_steps_per_second": 5.769, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.684293285190147, |
| "grad_norm": 0.4202480614185333, |
| "learning_rate": 0.00035199883483833384, |
| "loss": 3.2331, |
| "step": 71050 |
| }, |
| { |
| "epoch": 20.698852716789936, |
| "grad_norm": 0.4162975549697876, |
| "learning_rate": 0.0003518240605884066, |
| "loss": 3.242, |
| "step": 71100 |
| }, |
| { |
| "epoch": 20.713412148389725, |
| "grad_norm": 0.3857875466346741, |
| "learning_rate": 0.00035164928633847945, |
| "loss": 3.2475, |
| "step": 71150 |
| }, |
| { |
| "epoch": 20.727971579989518, |
| "grad_norm": 0.3922126591205597, |
| "learning_rate": 0.0003514745120885523, |
| "loss": 3.2386, |
| "step": 71200 |
| }, |
| { |
| "epoch": 20.742531011589307, |
| "grad_norm": 0.4351789653301239, |
| "learning_rate": 0.0003512997378386251, |
| "loss": 3.2494, |
| "step": 71250 |
| }, |
| { |
| "epoch": 20.7570904431891, |
| "grad_norm": 0.40401214361190796, |
| "learning_rate": 0.00035112496358869795, |
| "loss": 3.2468, |
| "step": 71300 |
| }, |
| { |
| "epoch": 20.77164987478889, |
| "grad_norm": 0.4370771050453186, |
| "learning_rate": 0.00035095018933877073, |
| "loss": 3.2306, |
| "step": 71350 |
| }, |
| { |
| "epoch": 20.786209306388677, |
| "grad_norm": 0.42607468366622925, |
| "learning_rate": 0.00035077541508884356, |
| "loss": 3.2454, |
| "step": 71400 |
| }, |
| { |
| "epoch": 20.80076873798847, |
| "grad_norm": 0.42872190475463867, |
| "learning_rate": 0.0003506006408389164, |
| "loss": 3.2379, |
| "step": 71450 |
| }, |
| { |
| "epoch": 20.81532816958826, |
| "grad_norm": 0.4074784815311432, |
| "learning_rate": 0.00035042586658898923, |
| "loss": 3.2438, |
| "step": 71500 |
| }, |
| { |
| "epoch": 20.829887601188048, |
| "grad_norm": 0.4053134322166443, |
| "learning_rate": 0.00035025109233906206, |
| "loss": 3.2373, |
| "step": 71550 |
| }, |
| { |
| "epoch": 20.84444703278784, |
| "grad_norm": 0.4389936923980713, |
| "learning_rate": 0.00035007631808913484, |
| "loss": 3.2509, |
| "step": 71600 |
| }, |
| { |
| "epoch": 20.85900646438763, |
| "grad_norm": 0.3987066149711609, |
| "learning_rate": 0.00034990154383920767, |
| "loss": 3.2476, |
| "step": 71650 |
| }, |
| { |
| "epoch": 20.873565895987422, |
| "grad_norm": 0.403385728597641, |
| "learning_rate": 0.0003497267695892805, |
| "loss": 3.2615, |
| "step": 71700 |
| }, |
| { |
| "epoch": 20.88812532758721, |
| "grad_norm": 0.3917509913444519, |
| "learning_rate": 0.00034955199533935334, |
| "loss": 3.2535, |
| "step": 71750 |
| }, |
| { |
| "epoch": 20.902684759187, |
| "grad_norm": 0.3859056234359741, |
| "learning_rate": 0.0003493772210894261, |
| "loss": 3.2504, |
| "step": 71800 |
| }, |
| { |
| "epoch": 20.917244190786793, |
| "grad_norm": 0.381082683801651, |
| "learning_rate": 0.00034920244683949895, |
| "loss": 3.2605, |
| "step": 71850 |
| }, |
| { |
| "epoch": 20.93180362238658, |
| "grad_norm": 0.4057201147079468, |
| "learning_rate": 0.0003490276725895718, |
| "loss": 3.2611, |
| "step": 71900 |
| }, |
| { |
| "epoch": 20.94636305398637, |
| "grad_norm": 0.4282720983028412, |
| "learning_rate": 0.0003488528983396446, |
| "loss": 3.2509, |
| "step": 71950 |
| }, |
| { |
| "epoch": 20.960922485586163, |
| "grad_norm": 0.43336808681488037, |
| "learning_rate": 0.00034867812408971745, |
| "loss": 3.263, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.960922485586163, |
| "eval_accuracy": 0.37352304460200414, |
| "eval_loss": 3.5321924686431885, |
| "eval_runtime": 181.1295, |
| "eval_samples_per_second": 91.885, |
| "eval_steps_per_second": 5.747, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.975481917185952, |
| "grad_norm": 0.38790565729141235, |
| "learning_rate": 0.00034850334983979023, |
| "loss": 3.2522, |
| "step": 72050 |
| }, |
| { |
| "epoch": 20.990041348785745, |
| "grad_norm": 0.3824382424354553, |
| "learning_rate": 0.00034832857558986306, |
| "loss": 3.2437, |
| "step": 72100 |
| }, |
| { |
| "epoch": 21.004367829479936, |
| "grad_norm": 0.4122096598148346, |
| "learning_rate": 0.0003481538013399359, |
| "loss": 3.2138, |
| "step": 72150 |
| }, |
| { |
| "epoch": 21.018927261079728, |
| "grad_norm": 0.39322277903556824, |
| "learning_rate": 0.0003479790270900087, |
| "loss": 3.1639, |
| "step": 72200 |
| }, |
| { |
| "epoch": 21.033486692679517, |
| "grad_norm": 0.417479544878006, |
| "learning_rate": 0.0003478042528400815, |
| "loss": 3.1547, |
| "step": 72250 |
| }, |
| { |
| "epoch": 21.04804612427931, |
| "grad_norm": 0.40138930082321167, |
| "learning_rate": 0.00034762947859015434, |
| "loss": 3.1497, |
| "step": 72300 |
| }, |
| { |
| "epoch": 21.0626055558791, |
| "grad_norm": 0.4107111394405365, |
| "learning_rate": 0.00034745470434022717, |
| "loss": 3.1665, |
| "step": 72350 |
| }, |
| { |
| "epoch": 21.077164987478888, |
| "grad_norm": 0.4117753803730011, |
| "learning_rate": 0.0003472799300903, |
| "loss": 3.1677, |
| "step": 72400 |
| }, |
| { |
| "epoch": 21.09172441907868, |
| "grad_norm": 0.4315739870071411, |
| "learning_rate": 0.00034710515584037284, |
| "loss": 3.1559, |
| "step": 72450 |
| }, |
| { |
| "epoch": 21.10628385067847, |
| "grad_norm": 0.4121326506137848, |
| "learning_rate": 0.0003469303815904456, |
| "loss": 3.1717, |
| "step": 72500 |
| }, |
| { |
| "epoch": 21.12084328227826, |
| "grad_norm": 0.3968290388584137, |
| "learning_rate": 0.00034675560734051845, |
| "loss": 3.1652, |
| "step": 72550 |
| }, |
| { |
| "epoch": 21.13540271387805, |
| "grad_norm": 0.4244532287120819, |
| "learning_rate": 0.0003465808330905913, |
| "loss": 3.1783, |
| "step": 72600 |
| }, |
| { |
| "epoch": 21.14996214547784, |
| "grad_norm": 0.4135434329509735, |
| "learning_rate": 0.0003464060588406641, |
| "loss": 3.1797, |
| "step": 72650 |
| }, |
| { |
| "epoch": 21.164521577077632, |
| "grad_norm": 0.38270533084869385, |
| "learning_rate": 0.00034623128459073695, |
| "loss": 3.1792, |
| "step": 72700 |
| }, |
| { |
| "epoch": 21.17908100867742, |
| "grad_norm": 0.433463990688324, |
| "learning_rate": 0.00034605651034080973, |
| "loss": 3.17, |
| "step": 72750 |
| }, |
| { |
| "epoch": 21.19364044027721, |
| "grad_norm": 0.3898848295211792, |
| "learning_rate": 0.00034588173609088256, |
| "loss": 3.1825, |
| "step": 72800 |
| }, |
| { |
| "epoch": 21.208199871877003, |
| "grad_norm": 0.4255882203578949, |
| "learning_rate": 0.0003457069618409554, |
| "loss": 3.1903, |
| "step": 72850 |
| }, |
| { |
| "epoch": 21.222759303476792, |
| "grad_norm": 0.47647958993911743, |
| "learning_rate": 0.0003455321875910282, |
| "loss": 3.188, |
| "step": 72900 |
| }, |
| { |
| "epoch": 21.23731873507658, |
| "grad_norm": 0.4151340126991272, |
| "learning_rate": 0.000345357413341101, |
| "loss": 3.1946, |
| "step": 72950 |
| }, |
| { |
| "epoch": 21.251878166676374, |
| "grad_norm": 0.4168964922428131, |
| "learning_rate": 0.00034518263909117384, |
| "loss": 3.2052, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.251878166676374, |
| "eval_accuracy": 0.37267949842984477, |
| "eval_loss": 3.5532169342041016, |
| "eval_runtime": 181.4222, |
| "eval_samples_per_second": 91.736, |
| "eval_steps_per_second": 5.738, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.266437598276163, |
| "grad_norm": 0.39896926283836365, |
| "learning_rate": 0.00034500786484124667, |
| "loss": 3.194, |
| "step": 73050 |
| }, |
| { |
| "epoch": 21.280997029875955, |
| "grad_norm": 0.4467354416847229, |
| "learning_rate": 0.0003448330905913195, |
| "loss": 3.1996, |
| "step": 73100 |
| }, |
| { |
| "epoch": 21.295556461475744, |
| "grad_norm": 0.40804705023765564, |
| "learning_rate": 0.00034465831634139234, |
| "loss": 3.212, |
| "step": 73150 |
| }, |
| { |
| "epoch": 21.310115893075533, |
| "grad_norm": 0.41930943727493286, |
| "learning_rate": 0.0003444835420914651, |
| "loss": 3.2012, |
| "step": 73200 |
| }, |
| { |
| "epoch": 21.324675324675326, |
| "grad_norm": 0.4978606700897217, |
| "learning_rate": 0.00034430876784153795, |
| "loss": 3.2046, |
| "step": 73250 |
| }, |
| { |
| "epoch": 21.339234756275115, |
| "grad_norm": 0.4318115711212158, |
| "learning_rate": 0.0003441339935916108, |
| "loss": 3.1995, |
| "step": 73300 |
| }, |
| { |
| "epoch": 21.353794187874904, |
| "grad_norm": 0.4151814579963684, |
| "learning_rate": 0.0003439592193416836, |
| "loss": 3.2036, |
| "step": 73350 |
| }, |
| { |
| "epoch": 21.368353619474696, |
| "grad_norm": 0.4480758309364319, |
| "learning_rate": 0.00034378444509175645, |
| "loss": 3.2069, |
| "step": 73400 |
| }, |
| { |
| "epoch": 21.382913051074485, |
| "grad_norm": 0.40307602286338806, |
| "learning_rate": 0.0003436096708418292, |
| "loss": 3.1965, |
| "step": 73450 |
| }, |
| { |
| "epoch": 21.397472482674278, |
| "grad_norm": 0.4136466979980469, |
| "learning_rate": 0.00034343489659190206, |
| "loss": 3.2067, |
| "step": 73500 |
| }, |
| { |
| "epoch": 21.412031914274067, |
| "grad_norm": 0.41154733300209045, |
| "learning_rate": 0.0003432601223419749, |
| "loss": 3.1987, |
| "step": 73550 |
| }, |
| { |
| "epoch": 21.426591345873856, |
| "grad_norm": 0.3847964107990265, |
| "learning_rate": 0.0003430853480920478, |
| "loss": 3.2141, |
| "step": 73600 |
| }, |
| { |
| "epoch": 21.44115077747365, |
| "grad_norm": 0.3878454566001892, |
| "learning_rate": 0.0003429105738421206, |
| "loss": 3.2038, |
| "step": 73650 |
| }, |
| { |
| "epoch": 21.455710209073438, |
| "grad_norm": 0.4452696442604065, |
| "learning_rate": 0.0003427357995921934, |
| "loss": 3.2162, |
| "step": 73700 |
| }, |
| { |
| "epoch": 21.470269640673227, |
| "grad_norm": 0.39309459924697876, |
| "learning_rate": 0.0003425610253422662, |
| "loss": 3.2198, |
| "step": 73750 |
| }, |
| { |
| "epoch": 21.48482907227302, |
| "grad_norm": 0.4251159429550171, |
| "learning_rate": 0.00034238625109233906, |
| "loss": 3.2216, |
| "step": 73800 |
| }, |
| { |
| "epoch": 21.499388503872808, |
| "grad_norm": 0.41249531507492065, |
| "learning_rate": 0.0003422114768424119, |
| "loss": 3.2121, |
| "step": 73850 |
| }, |
| { |
| "epoch": 21.5139479354726, |
| "grad_norm": 0.43736204504966736, |
| "learning_rate": 0.0003420367025924847, |
| "loss": 3.2238, |
| "step": 73900 |
| }, |
| { |
| "epoch": 21.52850736707239, |
| "grad_norm": 0.4327532649040222, |
| "learning_rate": 0.0003418619283425575, |
| "loss": 3.2167, |
| "step": 73950 |
| }, |
| { |
| "epoch": 21.54306679867218, |
| "grad_norm": 0.4144987165927887, |
| "learning_rate": 0.00034168715409263034, |
| "loss": 3.2214, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.54306679867218, |
| "eval_accuracy": 0.37350446636180323, |
| "eval_loss": 3.544901132583618, |
| "eval_runtime": 181.1526, |
| "eval_samples_per_second": 91.873, |
| "eval_steps_per_second": 5.747, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.55762623027197, |
| "grad_norm": 0.4094686210155487, |
| "learning_rate": 0.00034151237984270317, |
| "loss": 3.2336, |
| "step": 74050 |
| }, |
| { |
| "epoch": 21.57218566187176, |
| "grad_norm": 0.3899657726287842, |
| "learning_rate": 0.000341337605592776, |
| "loss": 3.23, |
| "step": 74100 |
| }, |
| { |
| "epoch": 21.58674509347155, |
| "grad_norm": 0.38108712434768677, |
| "learning_rate": 0.00034116283134284883, |
| "loss": 3.2234, |
| "step": 74150 |
| }, |
| { |
| "epoch": 21.601304525071342, |
| "grad_norm": 0.4188117980957031, |
| "learning_rate": 0.0003409880570929216, |
| "loss": 3.2367, |
| "step": 74200 |
| }, |
| { |
| "epoch": 21.61586395667113, |
| "grad_norm": 0.4578975439071655, |
| "learning_rate": 0.00034081328284299445, |
| "loss": 3.2332, |
| "step": 74250 |
| }, |
| { |
| "epoch": 21.630423388270923, |
| "grad_norm": 0.411504864692688, |
| "learning_rate": 0.0003406385085930673, |
| "loss": 3.2302, |
| "step": 74300 |
| }, |
| { |
| "epoch": 21.644982819870712, |
| "grad_norm": 0.3843030035495758, |
| "learning_rate": 0.0003404637343431401, |
| "loss": 3.2405, |
| "step": 74350 |
| }, |
| { |
| "epoch": 21.6595422514705, |
| "grad_norm": 0.40367045998573303, |
| "learning_rate": 0.0003402889600932129, |
| "loss": 3.2326, |
| "step": 74400 |
| }, |
| { |
| "epoch": 21.674101683070294, |
| "grad_norm": 0.41419780254364014, |
| "learning_rate": 0.0003401141858432857, |
| "loss": 3.226, |
| "step": 74450 |
| }, |
| { |
| "epoch": 21.688661114670083, |
| "grad_norm": 0.4172533452510834, |
| "learning_rate": 0.00033993941159335856, |
| "loss": 3.2313, |
| "step": 74500 |
| }, |
| { |
| "epoch": 21.703220546269872, |
| "grad_norm": 0.42541390657424927, |
| "learning_rate": 0.0003397646373434314, |
| "loss": 3.2203, |
| "step": 74550 |
| }, |
| { |
| "epoch": 21.717779977869665, |
| "grad_norm": 0.430610716342926, |
| "learning_rate": 0.0003395898630935042, |
| "loss": 3.2308, |
| "step": 74600 |
| }, |
| { |
| "epoch": 21.732339409469454, |
| "grad_norm": 0.39513736963272095, |
| "learning_rate": 0.000339415088843577, |
| "loss": 3.231, |
| "step": 74650 |
| }, |
| { |
| "epoch": 21.746898841069246, |
| "grad_norm": 0.43501347303390503, |
| "learning_rate": 0.00033924031459364983, |
| "loss": 3.2268, |
| "step": 74700 |
| }, |
| { |
| "epoch": 21.761458272669035, |
| "grad_norm": 0.4237409830093384, |
| "learning_rate": 0.00033906554034372267, |
| "loss": 3.2324, |
| "step": 74750 |
| }, |
| { |
| "epoch": 21.776017704268824, |
| "grad_norm": 0.41927024722099304, |
| "learning_rate": 0.0003388907660937955, |
| "loss": 3.2298, |
| "step": 74800 |
| }, |
| { |
| "epoch": 21.790577135868617, |
| "grad_norm": 0.4185371696949005, |
| "learning_rate": 0.0003387159918438683, |
| "loss": 3.2384, |
| "step": 74850 |
| }, |
| { |
| "epoch": 21.805136567468406, |
| "grad_norm": 0.4015279710292816, |
| "learning_rate": 0.0003385412175939411, |
| "loss": 3.2271, |
| "step": 74900 |
| }, |
| { |
| "epoch": 21.819695999068195, |
| "grad_norm": 0.4096405804157257, |
| "learning_rate": 0.00033836644334401395, |
| "loss": 3.2573, |
| "step": 74950 |
| }, |
| { |
| "epoch": 21.834255430667987, |
| "grad_norm": 0.40210607647895813, |
| "learning_rate": 0.0003381916690940868, |
| "loss": 3.2313, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.834255430667987, |
| "eval_accuracy": 0.37395798707354266, |
| "eval_loss": 3.53704833984375, |
| "eval_runtime": 181.2122, |
| "eval_samples_per_second": 91.843, |
| "eval_steps_per_second": 5.745, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.848814862267776, |
| "grad_norm": 0.39823204278945923, |
| "learning_rate": 0.0003380168948441596, |
| "loss": 3.2451, |
| "step": 75050 |
| }, |
| { |
| "epoch": 21.86337429386757, |
| "grad_norm": 0.3969886600971222, |
| "learning_rate": 0.0003378421205942324, |
| "loss": 3.2378, |
| "step": 75100 |
| }, |
| { |
| "epoch": 21.877933725467358, |
| "grad_norm": 0.4100249409675598, |
| "learning_rate": 0.0003376673463443052, |
| "loss": 3.2505, |
| "step": 75150 |
| }, |
| { |
| "epoch": 21.892493157067147, |
| "grad_norm": 0.405699223279953, |
| "learning_rate": 0.00033749257209437806, |
| "loss": 3.2278, |
| "step": 75200 |
| }, |
| { |
| "epoch": 21.90705258866694, |
| "grad_norm": 0.39252954721450806, |
| "learning_rate": 0.0003373177978444509, |
| "loss": 3.2441, |
| "step": 75250 |
| }, |
| { |
| "epoch": 21.92161202026673, |
| "grad_norm": 0.38878968358039856, |
| "learning_rate": 0.0003371430235945237, |
| "loss": 3.2341, |
| "step": 75300 |
| }, |
| { |
| "epoch": 21.93617145186652, |
| "grad_norm": 0.3854546844959259, |
| "learning_rate": 0.0003369682493445965, |
| "loss": 3.2396, |
| "step": 75350 |
| }, |
| { |
| "epoch": 21.95073088346631, |
| "grad_norm": 0.421974241733551, |
| "learning_rate": 0.00033679347509466933, |
| "loss": 3.2496, |
| "step": 75400 |
| }, |
| { |
| "epoch": 21.9652903150661, |
| "grad_norm": 0.3947569727897644, |
| "learning_rate": 0.00033661870084474217, |
| "loss": 3.2349, |
| "step": 75450 |
| }, |
| { |
| "epoch": 21.97984974666589, |
| "grad_norm": 0.40448886156082153, |
| "learning_rate": 0.000336443926594815, |
| "loss": 3.2366, |
| "step": 75500 |
| }, |
| { |
| "epoch": 21.99440917826568, |
| "grad_norm": 0.40597161650657654, |
| "learning_rate": 0.0003362691523448878, |
| "loss": 3.2422, |
| "step": 75550 |
| }, |
| { |
| "epoch": 22.008735658959875, |
| "grad_norm": 0.38081154227256775, |
| "learning_rate": 0.0003360943780949606, |
| "loss": 3.1834, |
| "step": 75600 |
| }, |
| { |
| "epoch": 22.023295090559664, |
| "grad_norm": 0.4210168719291687, |
| "learning_rate": 0.00033591960384503344, |
| "loss": 3.1409, |
| "step": 75650 |
| }, |
| { |
| "epoch": 22.037854522159456, |
| "grad_norm": 0.40493515133857727, |
| "learning_rate": 0.0003357448295951063, |
| "loss": 3.1449, |
| "step": 75700 |
| }, |
| { |
| "epoch": 22.052413953759245, |
| "grad_norm": 0.4119669795036316, |
| "learning_rate": 0.0003355700553451791, |
| "loss": 3.1488, |
| "step": 75750 |
| }, |
| { |
| "epoch": 22.066973385359034, |
| "grad_norm": 0.44360145926475525, |
| "learning_rate": 0.0003353952810952519, |
| "loss": 3.1563, |
| "step": 75800 |
| }, |
| { |
| "epoch": 22.081532816958827, |
| "grad_norm": 0.4260116517543793, |
| "learning_rate": 0.0003352205068453247, |
| "loss": 3.1691, |
| "step": 75850 |
| }, |
| { |
| "epoch": 22.096092248558616, |
| "grad_norm": 0.42039230465888977, |
| "learning_rate": 0.00033504573259539756, |
| "loss": 3.1466, |
| "step": 75900 |
| }, |
| { |
| "epoch": 22.110651680158405, |
| "grad_norm": 0.4209480285644531, |
| "learning_rate": 0.0003348709583454704, |
| "loss": 3.1731, |
| "step": 75950 |
| }, |
| { |
| "epoch": 22.125211111758198, |
| "grad_norm": 0.4045847952365875, |
| "learning_rate": 0.0003346961840955432, |
| "loss": 3.1508, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.125211111758198, |
| "eval_accuracy": 0.37304953464447893, |
| "eval_loss": 3.552255868911743, |
| "eval_runtime": 181.5419, |
| "eval_samples_per_second": 91.676, |
| "eval_steps_per_second": 5.734, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.139770543357987, |
| "grad_norm": 0.42066532373428345, |
| "learning_rate": 0.000334521409845616, |
| "loss": 3.18, |
| "step": 76050 |
| }, |
| { |
| "epoch": 22.15432997495778, |
| "grad_norm": 0.40776318311691284, |
| "learning_rate": 0.0003343466355956889, |
| "loss": 3.1774, |
| "step": 76100 |
| }, |
| { |
| "epoch": 22.168889406557568, |
| "grad_norm": 0.40850281715393066, |
| "learning_rate": 0.0003341718613457617, |
| "loss": 3.1786, |
| "step": 76150 |
| }, |
| { |
| "epoch": 22.183448838157357, |
| "grad_norm": 0.42042505741119385, |
| "learning_rate": 0.00033399708709583455, |
| "loss": 3.1611, |
| "step": 76200 |
| }, |
| { |
| "epoch": 22.19800826975715, |
| "grad_norm": 0.43354547023773193, |
| "learning_rate": 0.0003338223128459074, |
| "loss": 3.162, |
| "step": 76250 |
| }, |
| { |
| "epoch": 22.21256770135694, |
| "grad_norm": 0.4136315882205963, |
| "learning_rate": 0.00033364753859598016, |
| "loss": 3.1812, |
| "step": 76300 |
| }, |
| { |
| "epoch": 22.227127132956728, |
| "grad_norm": 0.4253956377506256, |
| "learning_rate": 0.000333472764346053, |
| "loss": 3.1806, |
| "step": 76350 |
| }, |
| { |
| "epoch": 22.24168656455652, |
| "grad_norm": 0.41711297631263733, |
| "learning_rate": 0.00033329799009612583, |
| "loss": 3.1771, |
| "step": 76400 |
| }, |
| { |
| "epoch": 22.25624599615631, |
| "grad_norm": 0.40799766778945923, |
| "learning_rate": 0.00033312321584619866, |
| "loss": 3.1944, |
| "step": 76450 |
| }, |
| { |
| "epoch": 22.270805427756102, |
| "grad_norm": 0.4232812821865082, |
| "learning_rate": 0.0003329484415962715, |
| "loss": 3.1952, |
| "step": 76500 |
| }, |
| { |
| "epoch": 22.28536485935589, |
| "grad_norm": 0.40353062748908997, |
| "learning_rate": 0.0003327736673463443, |
| "loss": 3.171, |
| "step": 76550 |
| }, |
| { |
| "epoch": 22.29992429095568, |
| "grad_norm": 0.39604508876800537, |
| "learning_rate": 0.0003325988930964171, |
| "loss": 3.1883, |
| "step": 76600 |
| }, |
| { |
| "epoch": 22.314483722555472, |
| "grad_norm": 0.4719644784927368, |
| "learning_rate": 0.00033242411884648994, |
| "loss": 3.1888, |
| "step": 76650 |
| }, |
| { |
| "epoch": 22.32904315415526, |
| "grad_norm": 0.45007991790771484, |
| "learning_rate": 0.0003322493445965628, |
| "loss": 3.178, |
| "step": 76700 |
| }, |
| { |
| "epoch": 22.34360258575505, |
| "grad_norm": 0.40224868059158325, |
| "learning_rate": 0.0003320745703466356, |
| "loss": 3.1944, |
| "step": 76750 |
| }, |
| { |
| "epoch": 22.358162017354843, |
| "grad_norm": 0.4655930995941162, |
| "learning_rate": 0.0003318997960967084, |
| "loss": 3.1932, |
| "step": 76800 |
| }, |
| { |
| "epoch": 22.372721448954632, |
| "grad_norm": 0.4105687141418457, |
| "learning_rate": 0.0003317250218467812, |
| "loss": 3.2026, |
| "step": 76850 |
| }, |
| { |
| "epoch": 22.387280880554425, |
| "grad_norm": 0.42557498812675476, |
| "learning_rate": 0.00033155024759685405, |
| "loss": 3.193, |
| "step": 76900 |
| }, |
| { |
| "epoch": 22.401840312154214, |
| "grad_norm": 0.4060722589492798, |
| "learning_rate": 0.0003313754733469269, |
| "loss": 3.1982, |
| "step": 76950 |
| }, |
| { |
| "epoch": 22.416399743754003, |
| "grad_norm": 0.4282877445220947, |
| "learning_rate": 0.00033120069909699966, |
| "loss": 3.2084, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.416399743754003, |
| "eval_accuracy": 0.3734410886942825, |
| "eval_loss": 3.547260046005249, |
| "eval_runtime": 181.4848, |
| "eval_samples_per_second": 91.705, |
| "eval_steps_per_second": 5.736, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.430959175353795, |
| "grad_norm": 0.40111586451530457, |
| "learning_rate": 0.0003310259248470725, |
| "loss": 3.2089, |
| "step": 77050 |
| }, |
| { |
| "epoch": 22.445518606953584, |
| "grad_norm": 0.4090210199356079, |
| "learning_rate": 0.00033085115059714533, |
| "loss": 3.2076, |
| "step": 77100 |
| }, |
| { |
| "epoch": 22.460078038553373, |
| "grad_norm": 0.40906432271003723, |
| "learning_rate": 0.00033067637634721816, |
| "loss": 3.2039, |
| "step": 77150 |
| }, |
| { |
| "epoch": 22.474637470153166, |
| "grad_norm": 0.40148457884788513, |
| "learning_rate": 0.000330501602097291, |
| "loss": 3.2014, |
| "step": 77200 |
| }, |
| { |
| "epoch": 22.489196901752955, |
| "grad_norm": 0.427418053150177, |
| "learning_rate": 0.0003303268278473638, |
| "loss": 3.2069, |
| "step": 77250 |
| }, |
| { |
| "epoch": 22.503756333352747, |
| "grad_norm": 0.4312450587749481, |
| "learning_rate": 0.0003301520535974366, |
| "loss": 3.2027, |
| "step": 77300 |
| }, |
| { |
| "epoch": 22.518315764952536, |
| "grad_norm": 0.40587109327316284, |
| "learning_rate": 0.00032997727934750944, |
| "loss": 3.2128, |
| "step": 77350 |
| }, |
| { |
| "epoch": 22.532875196552325, |
| "grad_norm": 0.4101792871952057, |
| "learning_rate": 0.0003298025050975823, |
| "loss": 3.2068, |
| "step": 77400 |
| }, |
| { |
| "epoch": 22.547434628152118, |
| "grad_norm": 0.40882381796836853, |
| "learning_rate": 0.0003296277308476551, |
| "loss": 3.2115, |
| "step": 77450 |
| }, |
| { |
| "epoch": 22.561994059751907, |
| "grad_norm": 0.44296374917030334, |
| "learning_rate": 0.0003294529565977279, |
| "loss": 3.208, |
| "step": 77500 |
| }, |
| { |
| "epoch": 22.576553491351696, |
| "grad_norm": 0.40894755721092224, |
| "learning_rate": 0.0003292781823478007, |
| "loss": 3.209, |
| "step": 77550 |
| }, |
| { |
| "epoch": 22.59111292295149, |
| "grad_norm": 0.4192773997783661, |
| "learning_rate": 0.00032910340809787355, |
| "loss": 3.2099, |
| "step": 77600 |
| }, |
| { |
| "epoch": 22.605672354551277, |
| "grad_norm": 0.40448158979415894, |
| "learning_rate": 0.0003289286338479464, |
| "loss": 3.2154, |
| "step": 77650 |
| }, |
| { |
| "epoch": 22.62023178615107, |
| "grad_norm": 0.4075862765312195, |
| "learning_rate": 0.00032875385959801916, |
| "loss": 3.2135, |
| "step": 77700 |
| }, |
| { |
| "epoch": 22.63479121775086, |
| "grad_norm": 0.41879263520240784, |
| "learning_rate": 0.000328579085348092, |
| "loss": 3.2297, |
| "step": 77750 |
| }, |
| { |
| "epoch": 22.649350649350648, |
| "grad_norm": 0.3986618220806122, |
| "learning_rate": 0.00032840431109816483, |
| "loss": 3.2217, |
| "step": 77800 |
| }, |
| { |
| "epoch": 22.66391008095044, |
| "grad_norm": 0.42791303992271423, |
| "learning_rate": 0.00032822953684823766, |
| "loss": 3.2292, |
| "step": 77850 |
| }, |
| { |
| "epoch": 22.67846951255023, |
| "grad_norm": 0.3977803885936737, |
| "learning_rate": 0.0003280547625983105, |
| "loss": 3.2199, |
| "step": 77900 |
| }, |
| { |
| "epoch": 22.693028944150022, |
| "grad_norm": 0.402538537979126, |
| "learning_rate": 0.0003278799883483833, |
| "loss": 3.2187, |
| "step": 77950 |
| }, |
| { |
| "epoch": 22.70758837574981, |
| "grad_norm": 0.4115932285785675, |
| "learning_rate": 0.0003277052140984561, |
| "loss": 3.2192, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.70758837574981, |
| "eval_accuracy": 0.37358524643153745, |
| "eval_loss": 3.5417346954345703, |
| "eval_runtime": 180.8954, |
| "eval_samples_per_second": 92.003, |
| "eval_steps_per_second": 5.755, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.7221478073496, |
| "grad_norm": 0.42461565136909485, |
| "learning_rate": 0.00032753043984852894, |
| "loss": 3.2215, |
| "step": 78050 |
| }, |
| { |
| "epoch": 22.736707238949393, |
| "grad_norm": 0.3989030420780182, |
| "learning_rate": 0.0003273556655986018, |
| "loss": 3.2121, |
| "step": 78100 |
| }, |
| { |
| "epoch": 22.75126667054918, |
| "grad_norm": 0.40380460023880005, |
| "learning_rate": 0.00032718089134867455, |
| "loss": 3.2228, |
| "step": 78150 |
| }, |
| { |
| "epoch": 22.76582610214897, |
| "grad_norm": 0.4479510486125946, |
| "learning_rate": 0.0003270061170987474, |
| "loss": 3.219, |
| "step": 78200 |
| }, |
| { |
| "epoch": 22.780385533748763, |
| "grad_norm": 0.4035521149635315, |
| "learning_rate": 0.0003268313428488202, |
| "loss": 3.2273, |
| "step": 78250 |
| }, |
| { |
| "epoch": 22.794944965348552, |
| "grad_norm": 0.41515037417411804, |
| "learning_rate": 0.00032665656859889305, |
| "loss": 3.2283, |
| "step": 78300 |
| }, |
| { |
| "epoch": 22.80950439694834, |
| "grad_norm": 0.39006373286247253, |
| "learning_rate": 0.0003264817943489659, |
| "loss": 3.2261, |
| "step": 78350 |
| }, |
| { |
| "epoch": 22.824063828548134, |
| "grad_norm": 0.44352978467941284, |
| "learning_rate": 0.00032630702009903866, |
| "loss": 3.2247, |
| "step": 78400 |
| }, |
| { |
| "epoch": 22.838623260147923, |
| "grad_norm": 0.3941769599914551, |
| "learning_rate": 0.0003261322458491115, |
| "loss": 3.2325, |
| "step": 78450 |
| }, |
| { |
| "epoch": 22.853182691747715, |
| "grad_norm": 0.41656285524368286, |
| "learning_rate": 0.00032595747159918433, |
| "loss": 3.2328, |
| "step": 78500 |
| }, |
| { |
| "epoch": 22.867742123347504, |
| "grad_norm": 0.43540436029434204, |
| "learning_rate": 0.00032578269734925716, |
| "loss": 3.2362, |
| "step": 78550 |
| }, |
| { |
| "epoch": 22.882301554947293, |
| "grad_norm": 0.42130741477012634, |
| "learning_rate": 0.00032560792309933, |
| "loss": 3.231, |
| "step": 78600 |
| }, |
| { |
| "epoch": 22.896860986547086, |
| "grad_norm": 0.39623120427131653, |
| "learning_rate": 0.0003254331488494029, |
| "loss": 3.2289, |
| "step": 78650 |
| }, |
| { |
| "epoch": 22.911420418146875, |
| "grad_norm": 0.41021716594696045, |
| "learning_rate": 0.00032525837459947566, |
| "loss": 3.2312, |
| "step": 78700 |
| }, |
| { |
| "epoch": 22.925979849746668, |
| "grad_norm": 0.3942141532897949, |
| "learning_rate": 0.0003250836003495485, |
| "loss": 3.2271, |
| "step": 78750 |
| }, |
| { |
| "epoch": 22.940539281346457, |
| "grad_norm": 0.4345264136791229, |
| "learning_rate": 0.0003249088260996213, |
| "loss": 3.2332, |
| "step": 78800 |
| }, |
| { |
| "epoch": 22.955098712946246, |
| "grad_norm": 0.423775315284729, |
| "learning_rate": 0.00032473405184969416, |
| "loss": 3.2217, |
| "step": 78850 |
| }, |
| { |
| "epoch": 22.969658144546038, |
| "grad_norm": 0.4158564805984497, |
| "learning_rate": 0.00032455927759976694, |
| "loss": 3.2264, |
| "step": 78900 |
| }, |
| { |
| "epoch": 22.984217576145827, |
| "grad_norm": 0.4096801280975342, |
| "learning_rate": 0.00032438450334983977, |
| "loss": 3.2465, |
| "step": 78950 |
| }, |
| { |
| "epoch": 22.998777007745616, |
| "grad_norm": 0.38529083132743835, |
| "learning_rate": 0.0003242097290999126, |
| "loss": 3.2454, |
| "step": 79000 |
| }, |
| { |
| "epoch": 22.998777007745616, |
| "eval_accuracy": 0.3740442935818177, |
| "eval_loss": 3.533736228942871, |
| "eval_runtime": 181.0451, |
| "eval_samples_per_second": 91.927, |
| "eval_steps_per_second": 5.75, |
| "step": 79000 |
| }, |
| { |
| "epoch": 23.01310348843981, |
| "grad_norm": 0.4034730792045593, |
| "learning_rate": 0.00032403495484998544, |
| "loss": 3.1405, |
| "step": 79050 |
| }, |
| { |
| "epoch": 23.027662920039603, |
| "grad_norm": 0.4115341007709503, |
| "learning_rate": 0.00032386018060005827, |
| "loss": 3.1426, |
| "step": 79100 |
| }, |
| { |
| "epoch": 23.042222351639392, |
| "grad_norm": 0.4165702760219574, |
| "learning_rate": 0.00032368540635013105, |
| "loss": 3.1417, |
| "step": 79150 |
| }, |
| { |
| "epoch": 23.05678178323918, |
| "grad_norm": 0.42881783843040466, |
| "learning_rate": 0.0003235106321002039, |
| "loss": 3.1477, |
| "step": 79200 |
| }, |
| { |
| "epoch": 23.071341214838974, |
| "grad_norm": 0.4214617609977722, |
| "learning_rate": 0.0003233358578502767, |
| "loss": 3.137, |
| "step": 79250 |
| }, |
| { |
| "epoch": 23.085900646438763, |
| "grad_norm": 0.4396951198577881, |
| "learning_rate": 0.00032316108360034955, |
| "loss": 3.1507, |
| "step": 79300 |
| }, |
| { |
| "epoch": 23.10046007803855, |
| "grad_norm": 0.42560476064682007, |
| "learning_rate": 0.0003229863093504224, |
| "loss": 3.1489, |
| "step": 79350 |
| }, |
| { |
| "epoch": 23.115019509638344, |
| "grad_norm": 0.4523085653781891, |
| "learning_rate": 0.00032281153510049516, |
| "loss": 3.1484, |
| "step": 79400 |
| }, |
| { |
| "epoch": 23.129578941238133, |
| "grad_norm": 0.4005928337574005, |
| "learning_rate": 0.000322636760850568, |
| "loss": 3.162, |
| "step": 79450 |
| }, |
| { |
| "epoch": 23.144138372837926, |
| "grad_norm": 0.4236561357975006, |
| "learning_rate": 0.0003224619866006408, |
| "loss": 3.167, |
| "step": 79500 |
| }, |
| { |
| "epoch": 23.158697804437715, |
| "grad_norm": 0.4328435957431793, |
| "learning_rate": 0.00032228721235071366, |
| "loss": 3.1655, |
| "step": 79550 |
| }, |
| { |
| "epoch": 23.173257236037504, |
| "grad_norm": 0.41501981019973755, |
| "learning_rate": 0.00032211243810078644, |
| "loss": 3.1654, |
| "step": 79600 |
| }, |
| { |
| "epoch": 23.187816667637296, |
| "grad_norm": 0.43546444177627563, |
| "learning_rate": 0.00032193766385085927, |
| "loss": 3.1676, |
| "step": 79650 |
| }, |
| { |
| "epoch": 23.202376099237085, |
| "grad_norm": 0.45414286851882935, |
| "learning_rate": 0.0003217628896009321, |
| "loss": 3.1616, |
| "step": 79700 |
| }, |
| { |
| "epoch": 23.216935530836878, |
| "grad_norm": 0.4444766044616699, |
| "learning_rate": 0.00032158811535100494, |
| "loss": 3.1677, |
| "step": 79750 |
| }, |
| { |
| "epoch": 23.231494962436667, |
| "grad_norm": 0.43765148520469666, |
| "learning_rate": 0.00032141334110107777, |
| "loss": 3.165, |
| "step": 79800 |
| }, |
| { |
| "epoch": 23.246054394036456, |
| "grad_norm": 0.4241009056568146, |
| "learning_rate": 0.00032123856685115055, |
| "loss": 3.165, |
| "step": 79850 |
| }, |
| { |
| "epoch": 23.26061382563625, |
| "grad_norm": 0.41852983832359314, |
| "learning_rate": 0.0003210637926012234, |
| "loss": 3.1684, |
| "step": 79900 |
| }, |
| { |
| "epoch": 23.275173257236037, |
| "grad_norm": 0.4211058020591736, |
| "learning_rate": 0.0003208890183512962, |
| "loss": 3.1892, |
| "step": 79950 |
| }, |
| { |
| "epoch": 23.289732688835826, |
| "grad_norm": 0.41997700929641724, |
| "learning_rate": 0.00032071424410136905, |
| "loss": 3.1837, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.289732688835826, |
| "eval_accuracy": 0.3734963530796902, |
| "eval_loss": 3.5481793880462646, |
| "eval_runtime": 180.9399, |
| "eval_samples_per_second": 91.981, |
| "eval_steps_per_second": 5.753, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.30429212043562, |
| "grad_norm": 0.41827014088630676, |
| "learning_rate": 0.0003205394698514419, |
| "loss": 3.1311, |
| "step": 80050 |
| }, |
| { |
| "epoch": 23.318851552035408, |
| "grad_norm": 0.41703981161117554, |
| "learning_rate": 0.00032036469560151466, |
| "loss": 3.1356, |
| "step": 80100 |
| }, |
| { |
| "epoch": 23.3334109836352, |
| "grad_norm": 0.4520207941532135, |
| "learning_rate": 0.0003201899213515875, |
| "loss": 3.143, |
| "step": 80150 |
| }, |
| { |
| "epoch": 23.34797041523499, |
| "grad_norm": 0.43337830901145935, |
| "learning_rate": 0.0003200151471016603, |
| "loss": 3.1458, |
| "step": 80200 |
| }, |
| { |
| "epoch": 23.36252984683478, |
| "grad_norm": 0.4231213331222534, |
| "learning_rate": 0.00031984037285173316, |
| "loss": 3.1387, |
| "step": 80250 |
| }, |
| { |
| "epoch": 23.37708927843457, |
| "grad_norm": 0.44809332489967346, |
| "learning_rate": 0.00031966559860180594, |
| "loss": 3.1528, |
| "step": 80300 |
| }, |
| { |
| "epoch": 23.39164871003436, |
| "grad_norm": 0.43833646178245544, |
| "learning_rate": 0.00031949082435187877, |
| "loss": 3.1466, |
| "step": 80350 |
| }, |
| { |
| "epoch": 23.40620814163415, |
| "grad_norm": 0.4350813329219818, |
| "learning_rate": 0.0003193160501019516, |
| "loss": 3.1661, |
| "step": 80400 |
| }, |
| { |
| "epoch": 23.42076757323394, |
| "grad_norm": 0.4260636270046234, |
| "learning_rate": 0.00031914127585202444, |
| "loss": 3.1666, |
| "step": 80450 |
| }, |
| { |
| "epoch": 23.43532700483373, |
| "grad_norm": 0.44178247451782227, |
| "learning_rate": 0.00031896650160209727, |
| "loss": 3.1526, |
| "step": 80500 |
| }, |
| { |
| "epoch": 23.449886436433523, |
| "grad_norm": 0.4201173484325409, |
| "learning_rate": 0.00031879172735217005, |
| "loss": 3.1709, |
| "step": 80550 |
| }, |
| { |
| "epoch": 23.464445868033312, |
| "grad_norm": 0.4208093285560608, |
| "learning_rate": 0.0003186169531022429, |
| "loss": 3.1733, |
| "step": 80600 |
| }, |
| { |
| "epoch": 23.4790052996331, |
| "grad_norm": 0.44302788376808167, |
| "learning_rate": 0.0003184421788523157, |
| "loss": 3.1758, |
| "step": 80650 |
| }, |
| { |
| "epoch": 23.493564731232894, |
| "grad_norm": 0.42564231157302856, |
| "learning_rate": 0.00031826740460238855, |
| "loss": 3.1739, |
| "step": 80700 |
| }, |
| { |
| "epoch": 23.508124162832683, |
| "grad_norm": 0.4493538439273834, |
| "learning_rate": 0.0003180926303524614, |
| "loss": 3.1696, |
| "step": 80750 |
| }, |
| { |
| "epoch": 23.522683594432472, |
| "grad_norm": 0.445277601480484, |
| "learning_rate": 0.00031791785610253416, |
| "loss": 3.1706, |
| "step": 80800 |
| }, |
| { |
| "epoch": 23.537243026032264, |
| "grad_norm": 0.4273313581943512, |
| "learning_rate": 0.000317743081852607, |
| "loss": 3.1791, |
| "step": 80850 |
| }, |
| { |
| "epoch": 23.551802457632053, |
| "grad_norm": 0.4198790490627289, |
| "learning_rate": 0.0003175683076026798, |
| "loss": 3.1725, |
| "step": 80900 |
| }, |
| { |
| "epoch": 23.566361889231846, |
| "grad_norm": 0.4127906858921051, |
| "learning_rate": 0.00031739353335275266, |
| "loss": 3.1838, |
| "step": 80950 |
| }, |
| { |
| "epoch": 23.580921320831635, |
| "grad_norm": 0.4176872968673706, |
| "learning_rate": 0.00031721875910282544, |
| "loss": 3.1719, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.580921320831635, |
| "eval_accuracy": 0.3729034955664441, |
| "eval_loss": 3.5576372146606445, |
| "eval_runtime": 180.9809, |
| "eval_samples_per_second": 91.96, |
| "eval_steps_per_second": 5.752, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.595480752431424, |
| "grad_norm": 0.44739457964897156, |
| "learning_rate": 0.00031704398485289827, |
| "loss": 3.1871, |
| "step": 81050 |
| }, |
| { |
| "epoch": 23.610040184031217, |
| "grad_norm": 0.4258241355419159, |
| "learning_rate": 0.0003168692106029711, |
| "loss": 3.1846, |
| "step": 81100 |
| }, |
| { |
| "epoch": 23.624599615631006, |
| "grad_norm": 0.44633159041404724, |
| "learning_rate": 0.000316694436353044, |
| "loss": 3.1742, |
| "step": 81150 |
| }, |
| { |
| "epoch": 23.639159047230795, |
| "grad_norm": 0.4098469913005829, |
| "learning_rate": 0.0003165196621031168, |
| "loss": 3.1868, |
| "step": 81200 |
| }, |
| { |
| "epoch": 23.653718478830587, |
| "grad_norm": 0.42541512846946716, |
| "learning_rate": 0.00031634488785318966, |
| "loss": 3.1906, |
| "step": 81250 |
| }, |
| { |
| "epoch": 23.668277910430376, |
| "grad_norm": 0.4811411499977112, |
| "learning_rate": 0.00031617011360326243, |
| "loss": 3.1903, |
| "step": 81300 |
| }, |
| { |
| "epoch": 23.68283734203017, |
| "grad_norm": 0.42175766825675964, |
| "learning_rate": 0.00031599533935333527, |
| "loss": 3.1967, |
| "step": 81350 |
| }, |
| { |
| "epoch": 23.697396773629958, |
| "grad_norm": 0.4402812421321869, |
| "learning_rate": 0.0003158205651034081, |
| "loss": 3.1909, |
| "step": 81400 |
| }, |
| { |
| "epoch": 23.711956205229747, |
| "grad_norm": 0.4189620018005371, |
| "learning_rate": 0.00031564579085348093, |
| "loss": 3.1839, |
| "step": 81450 |
| }, |
| { |
| "epoch": 23.72651563682954, |
| "grad_norm": 0.4007689654827118, |
| "learning_rate": 0.0003154710166035537, |
| "loss": 3.1906, |
| "step": 81500 |
| }, |
| { |
| "epoch": 23.74107506842933, |
| "grad_norm": 0.42000943422317505, |
| "learning_rate": 0.00031529624235362655, |
| "loss": 3.2023, |
| "step": 81550 |
| }, |
| { |
| "epoch": 23.755634500029117, |
| "grad_norm": 0.4228629469871521, |
| "learning_rate": 0.0003151214681036994, |
| "loss": 3.1989, |
| "step": 81600 |
| }, |
| { |
| "epoch": 23.77019393162891, |
| "grad_norm": 0.4203532338142395, |
| "learning_rate": 0.0003149466938537722, |
| "loss": 3.1811, |
| "step": 81650 |
| }, |
| { |
| "epoch": 23.7847533632287, |
| "grad_norm": 0.4034475088119507, |
| "learning_rate": 0.00031477191960384504, |
| "loss": 3.1947, |
| "step": 81700 |
| }, |
| { |
| "epoch": 23.79931279482849, |
| "grad_norm": 0.4441761374473572, |
| "learning_rate": 0.0003145971453539178, |
| "loss": 3.1996, |
| "step": 81750 |
| }, |
| { |
| "epoch": 23.81387222642828, |
| "grad_norm": 0.40938514471054077, |
| "learning_rate": 0.00031442237110399066, |
| "loss": 3.1999, |
| "step": 81800 |
| }, |
| { |
| "epoch": 23.82843165802807, |
| "grad_norm": 0.42533165216445923, |
| "learning_rate": 0.0003142475968540635, |
| "loss": 3.2039, |
| "step": 81850 |
| }, |
| { |
| "epoch": 23.842991089627862, |
| "grad_norm": 0.40785083174705505, |
| "learning_rate": 0.0003140728226041363, |
| "loss": 3.1983, |
| "step": 81900 |
| }, |
| { |
| "epoch": 23.85755052122765, |
| "grad_norm": 0.40363097190856934, |
| "learning_rate": 0.00031389804835420915, |
| "loss": 3.206, |
| "step": 81950 |
| }, |
| { |
| "epoch": 23.87210995282744, |
| "grad_norm": 0.43303757905960083, |
| "learning_rate": 0.00031372327410428193, |
| "loss": 3.2133, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.87210995282744, |
| "eval_accuracy": 0.37346883847078505, |
| "eval_loss": 3.5454885959625244, |
| "eval_runtime": 179.9471, |
| "eval_samples_per_second": 92.488, |
| "eval_steps_per_second": 5.785, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.886669384427233, |
| "grad_norm": 0.42279261350631714, |
| "learning_rate": 0.00031354849985435477, |
| "loss": 3.2198, |
| "step": 82050 |
| }, |
| { |
| "epoch": 23.90122881602702, |
| "grad_norm": 0.3982255160808563, |
| "learning_rate": 0.0003133737256044276, |
| "loss": 3.1981, |
| "step": 82100 |
| }, |
| { |
| "epoch": 23.915788247626814, |
| "grad_norm": 0.4409744441509247, |
| "learning_rate": 0.00031319895135450043, |
| "loss": 3.2056, |
| "step": 82150 |
| }, |
| { |
| "epoch": 23.930347679226603, |
| "grad_norm": 0.4399181604385376, |
| "learning_rate": 0.0003130241771045732, |
| "loss": 3.1991, |
| "step": 82200 |
| }, |
| { |
| "epoch": 23.944907110826392, |
| "grad_norm": 0.4288428723812103, |
| "learning_rate": 0.00031284940285464604, |
| "loss": 3.1975, |
| "step": 82250 |
| }, |
| { |
| "epoch": 23.959466542426185, |
| "grad_norm": 0.4355453550815582, |
| "learning_rate": 0.0003126746286047189, |
| "loss": 3.2068, |
| "step": 82300 |
| }, |
| { |
| "epoch": 23.974025974025974, |
| "grad_norm": 0.4542391300201416, |
| "learning_rate": 0.0003124998543547917, |
| "loss": 3.2008, |
| "step": 82350 |
| }, |
| { |
| "epoch": 23.988585405625763, |
| "grad_norm": 0.41459161043167114, |
| "learning_rate": 0.00031232508010486454, |
| "loss": 3.2094, |
| "step": 82400 |
| }, |
| { |
| "epoch": 24.003203074951955, |
| "grad_norm": 0.4521249830722809, |
| "learning_rate": 0.0003121503058549373, |
| "loss": 3.2476, |
| "step": 82450 |
| }, |
| { |
| "epoch": 24.017762506551744, |
| "grad_norm": 0.43049049377441406, |
| "learning_rate": 0.00031197553160501016, |
| "loss": 3.1265, |
| "step": 82500 |
| }, |
| { |
| "epoch": 24.032321938151533, |
| "grad_norm": 0.45564672350883484, |
| "learning_rate": 0.000311800757355083, |
| "loss": 3.1277, |
| "step": 82550 |
| }, |
| { |
| "epoch": 24.046881369751326, |
| "grad_norm": 0.4392612874507904, |
| "learning_rate": 0.0003116259831051558, |
| "loss": 3.1344, |
| "step": 82600 |
| }, |
| { |
| "epoch": 24.061440801351115, |
| "grad_norm": 0.4402536153793335, |
| "learning_rate": 0.00031145120885522865, |
| "loss": 3.1388, |
| "step": 82650 |
| }, |
| { |
| "epoch": 24.076000232950907, |
| "grad_norm": 0.428521990776062, |
| "learning_rate": 0.00031127643460530143, |
| "loss": 3.1424, |
| "step": 82700 |
| }, |
| { |
| "epoch": 24.090559664550696, |
| "grad_norm": 0.42514893412590027, |
| "learning_rate": 0.00031110166035537427, |
| "loss": 3.1492, |
| "step": 82750 |
| }, |
| { |
| "epoch": 24.105119096150485, |
| "grad_norm": 0.44392284750938416, |
| "learning_rate": 0.0003109268861054471, |
| "loss": 3.154, |
| "step": 82800 |
| }, |
| { |
| "epoch": 24.119678527750278, |
| "grad_norm": 0.41279974579811096, |
| "learning_rate": 0.00031075211185551993, |
| "loss": 3.1598, |
| "step": 82850 |
| }, |
| { |
| "epoch": 24.134237959350067, |
| "grad_norm": 0.46087998151779175, |
| "learning_rate": 0.0003105773376055927, |
| "loss": 3.1509, |
| "step": 82900 |
| }, |
| { |
| "epoch": 24.148797390949856, |
| "grad_norm": 0.4077779948711395, |
| "learning_rate": 0.00031040256335566554, |
| "loss": 3.1598, |
| "step": 82950 |
| }, |
| { |
| "epoch": 24.16335682254965, |
| "grad_norm": 0.404313862323761, |
| "learning_rate": 0.0003102277891057384, |
| "loss": 3.1573, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.16335682254965, |
| "eval_accuracy": 0.37331809604080063, |
| "eval_loss": 3.554750919342041, |
| "eval_runtime": 179.8077, |
| "eval_samples_per_second": 92.56, |
| "eval_steps_per_second": 5.79, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.177916254149437, |
| "grad_norm": 0.4314119219779968, |
| "learning_rate": 0.0003100530148558112, |
| "loss": 3.1623, |
| "step": 83050 |
| }, |
| { |
| "epoch": 24.19247568574923, |
| "grad_norm": 0.44017013907432556, |
| "learning_rate": 0.00030987824060588404, |
| "loss": 3.1604, |
| "step": 83100 |
| }, |
| { |
| "epoch": 24.20703511734902, |
| "grad_norm": 0.4304588735103607, |
| "learning_rate": 0.0003097034663559568, |
| "loss": 3.1786, |
| "step": 83150 |
| }, |
| { |
| "epoch": 24.221594548948808, |
| "grad_norm": 0.4203026294708252, |
| "learning_rate": 0.00030952869210602965, |
| "loss": 3.1765, |
| "step": 83200 |
| }, |
| { |
| "epoch": 24.2361539805486, |
| "grad_norm": 0.4360384941101074, |
| "learning_rate": 0.0003093539178561025, |
| "loss": 3.1594, |
| "step": 83250 |
| }, |
| { |
| "epoch": 24.25071341214839, |
| "grad_norm": 0.419367253780365, |
| "learning_rate": 0.0003091791436061753, |
| "loss": 3.1581, |
| "step": 83300 |
| }, |
| { |
| "epoch": 24.26527284374818, |
| "grad_norm": 0.43426862359046936, |
| "learning_rate": 0.00030900436935624815, |
| "loss": 3.1767, |
| "step": 83350 |
| }, |
| { |
| "epoch": 24.27983227534797, |
| "grad_norm": 0.4190501868724823, |
| "learning_rate": 0.00030882959510632093, |
| "loss": 3.1773, |
| "step": 83400 |
| }, |
| { |
| "epoch": 24.29439170694776, |
| "grad_norm": 0.4230547845363617, |
| "learning_rate": 0.00030865482085639377, |
| "loss": 3.1788, |
| "step": 83450 |
| }, |
| { |
| "epoch": 24.308951138547553, |
| "grad_norm": 0.4331873059272766, |
| "learning_rate": 0.0003084800466064666, |
| "loss": 3.1688, |
| "step": 83500 |
| }, |
| { |
| "epoch": 24.32351057014734, |
| "grad_norm": 0.4186467230319977, |
| "learning_rate": 0.00030830527235653943, |
| "loss": 3.1707, |
| "step": 83550 |
| }, |
| { |
| "epoch": 24.33807000174713, |
| "grad_norm": 0.41275304555892944, |
| "learning_rate": 0.0003081304981066122, |
| "loss": 3.1758, |
| "step": 83600 |
| }, |
| { |
| "epoch": 24.352629433346923, |
| "grad_norm": 0.5706424713134766, |
| "learning_rate": 0.00030795572385668504, |
| "loss": 3.1845, |
| "step": 83650 |
| }, |
| { |
| "epoch": 24.367188864946712, |
| "grad_norm": 0.43165168166160583, |
| "learning_rate": 0.00030778094960675793, |
| "loss": 3.1623, |
| "step": 83700 |
| }, |
| { |
| "epoch": 24.3817482965465, |
| "grad_norm": 0.44549575448036194, |
| "learning_rate": 0.00030760617535683076, |
| "loss": 3.1842, |
| "step": 83750 |
| }, |
| { |
| "epoch": 24.396307728146294, |
| "grad_norm": 0.42982882261276245, |
| "learning_rate": 0.0003074314011069036, |
| "loss": 3.1825, |
| "step": 83800 |
| }, |
| { |
| "epoch": 24.410867159746083, |
| "grad_norm": 0.42767763137817383, |
| "learning_rate": 0.00030725662685697643, |
| "loss": 3.1915, |
| "step": 83850 |
| }, |
| { |
| "epoch": 24.425426591345875, |
| "grad_norm": 0.42826566100120544, |
| "learning_rate": 0.0003070818526070492, |
| "loss": 3.1861, |
| "step": 83900 |
| }, |
| { |
| "epoch": 24.439986022945664, |
| "grad_norm": 0.4173426926136017, |
| "learning_rate": 0.00030690707835712204, |
| "loss": 3.1829, |
| "step": 83950 |
| }, |
| { |
| "epoch": 24.454545454545453, |
| "grad_norm": 0.4142906069755554, |
| "learning_rate": 0.0003067323041071949, |
| "loss": 3.1941, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.454545454545453, |
| "eval_accuracy": 0.37343744359652153, |
| "eval_loss": 3.5522103309631348, |
| "eval_runtime": 179.9407, |
| "eval_samples_per_second": 92.492, |
| "eval_steps_per_second": 5.785, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.469104886145246, |
| "grad_norm": 0.42911434173583984, |
| "learning_rate": 0.0003065575298572677, |
| "loss": 3.1824, |
| "step": 84050 |
| }, |
| { |
| "epoch": 24.483664317745035, |
| "grad_norm": 0.4673752784729004, |
| "learning_rate": 0.00030638275560734054, |
| "loss": 3.1878, |
| "step": 84100 |
| }, |
| { |
| "epoch": 24.498223749344824, |
| "grad_norm": 0.4118911325931549, |
| "learning_rate": 0.0003062079813574133, |
| "loss": 3.1945, |
| "step": 84150 |
| }, |
| { |
| "epoch": 24.512783180944616, |
| "grad_norm": 0.46239766478538513, |
| "learning_rate": 0.00030603320710748615, |
| "loss": 3.1837, |
| "step": 84200 |
| }, |
| { |
| "epoch": 24.527342612544405, |
| "grad_norm": 0.4571888744831085, |
| "learning_rate": 0.000305858432857559, |
| "loss": 3.1924, |
| "step": 84250 |
| }, |
| { |
| "epoch": 24.541902044144198, |
| "grad_norm": 0.4163052439689636, |
| "learning_rate": 0.0003056836586076318, |
| "loss": 3.1966, |
| "step": 84300 |
| }, |
| { |
| "epoch": 24.556461475743987, |
| "grad_norm": 0.4218531847000122, |
| "learning_rate": 0.0003055088843577046, |
| "loss": 3.2015, |
| "step": 84350 |
| }, |
| { |
| "epoch": 24.571020907343776, |
| "grad_norm": 0.4395730197429657, |
| "learning_rate": 0.00030533411010777743, |
| "loss": 3.2011, |
| "step": 84400 |
| }, |
| { |
| "epoch": 24.58558033894357, |
| "grad_norm": 0.43827715516090393, |
| "learning_rate": 0.00030515933585785026, |
| "loss": 3.2071, |
| "step": 84450 |
| }, |
| { |
| "epoch": 24.600139770543358, |
| "grad_norm": 0.4391777813434601, |
| "learning_rate": 0.0003049845616079231, |
| "loss": 3.199, |
| "step": 84500 |
| }, |
| { |
| "epoch": 24.61469920214315, |
| "grad_norm": 0.45976823568344116, |
| "learning_rate": 0.00030480978735799593, |
| "loss": 3.2007, |
| "step": 84550 |
| }, |
| { |
| "epoch": 24.62925863374294, |
| "grad_norm": 0.43706169724464417, |
| "learning_rate": 0.0003046350131080687, |
| "loss": 3.2013, |
| "step": 84600 |
| }, |
| { |
| "epoch": 24.643818065342728, |
| "grad_norm": 0.43872877955436707, |
| "learning_rate": 0.00030446023885814154, |
| "loss": 3.2041, |
| "step": 84650 |
| }, |
| { |
| "epoch": 24.65837749694252, |
| "grad_norm": 0.42261114716529846, |
| "learning_rate": 0.0003042854646082144, |
| "loss": 3.2004, |
| "step": 84700 |
| }, |
| { |
| "epoch": 24.67293692854231, |
| "grad_norm": 0.43338093161582947, |
| "learning_rate": 0.0003041106903582872, |
| "loss": 3.1977, |
| "step": 84750 |
| }, |
| { |
| "epoch": 24.6874963601421, |
| "grad_norm": 0.41846537590026855, |
| "learning_rate": 0.00030393591610836, |
| "loss": 3.1972, |
| "step": 84800 |
| }, |
| { |
| "epoch": 24.70205579174189, |
| "grad_norm": 0.42820459604263306, |
| "learning_rate": 0.0003037611418584328, |
| "loss": 3.2049, |
| "step": 84850 |
| }, |
| { |
| "epoch": 24.71661522334168, |
| "grad_norm": 0.43307816982269287, |
| "learning_rate": 0.00030358636760850565, |
| "loss": 3.2179, |
| "step": 84900 |
| }, |
| { |
| "epoch": 24.73117465494147, |
| "grad_norm": 0.4248763918876648, |
| "learning_rate": 0.0003034115933585785, |
| "loss": 3.1926, |
| "step": 84950 |
| }, |
| { |
| "epoch": 24.745734086541262, |
| "grad_norm": 0.41839364171028137, |
| "learning_rate": 0.0003032368191086513, |
| "loss": 3.2039, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.745734086541262, |
| "eval_accuracy": 0.37404652767399377, |
| "eval_loss": 3.5435619354248047, |
| "eval_runtime": 177.4415, |
| "eval_samples_per_second": 93.794, |
| "eval_steps_per_second": 5.867, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.76029351814105, |
| "grad_norm": 0.4315228760242462, |
| "learning_rate": 0.0003030620448587241, |
| "loss": 3.1948, |
| "step": 85050 |
| }, |
| { |
| "epoch": 24.774852949740843, |
| "grad_norm": 0.43137073516845703, |
| "learning_rate": 0.00030288727060879693, |
| "loss": 3.2095, |
| "step": 85100 |
| }, |
| { |
| "epoch": 24.789412381340632, |
| "grad_norm": 0.43723025918006897, |
| "learning_rate": 0.00030271249635886976, |
| "loss": 3.209, |
| "step": 85150 |
| }, |
| { |
| "epoch": 24.80397181294042, |
| "grad_norm": 0.4241926372051239, |
| "learning_rate": 0.0003025377221089426, |
| "loss": 3.214, |
| "step": 85200 |
| }, |
| { |
| "epoch": 24.818531244540214, |
| "grad_norm": 0.41172343492507935, |
| "learning_rate": 0.00030236294785901543, |
| "loss": 3.2183, |
| "step": 85250 |
| }, |
| { |
| "epoch": 24.833090676140003, |
| "grad_norm": 0.4208565652370453, |
| "learning_rate": 0.0003021881736090882, |
| "loss": 3.2044, |
| "step": 85300 |
| }, |
| { |
| "epoch": 24.847650107739796, |
| "grad_norm": 0.41848379373550415, |
| "learning_rate": 0.00030201339935916104, |
| "loss": 3.2094, |
| "step": 85350 |
| }, |
| { |
| "epoch": 24.862209539339585, |
| "grad_norm": 0.41701391339302063, |
| "learning_rate": 0.00030183862510923387, |
| "loss": 3.1972, |
| "step": 85400 |
| }, |
| { |
| "epoch": 24.876768970939374, |
| "grad_norm": 0.4314718246459961, |
| "learning_rate": 0.0003016638508593067, |
| "loss": 3.205, |
| "step": 85450 |
| }, |
| { |
| "epoch": 24.891328402539166, |
| "grad_norm": 0.41032907366752625, |
| "learning_rate": 0.0003014890766093795, |
| "loss": 3.2128, |
| "step": 85500 |
| }, |
| { |
| "epoch": 24.905887834138955, |
| "grad_norm": 0.4259736239910126, |
| "learning_rate": 0.0003013143023594523, |
| "loss": 3.2088, |
| "step": 85550 |
| }, |
| { |
| "epoch": 24.920447265738744, |
| "grad_norm": 0.39667725563049316, |
| "learning_rate": 0.00030113952810952515, |
| "loss": 3.2289, |
| "step": 85600 |
| }, |
| { |
| "epoch": 24.935006697338537, |
| "grad_norm": 0.41058072447776794, |
| "learning_rate": 0.000300964753859598, |
| "loss": 3.2142, |
| "step": 85650 |
| }, |
| { |
| "epoch": 24.949566128938326, |
| "grad_norm": 0.42639005184173584, |
| "learning_rate": 0.0003007899796096708, |
| "loss": 3.2211, |
| "step": 85700 |
| }, |
| { |
| "epoch": 24.96412556053812, |
| "grad_norm": 0.41749000549316406, |
| "learning_rate": 0.0003006152053597436, |
| "loss": 3.213, |
| "step": 85750 |
| }, |
| { |
| "epoch": 24.978684992137907, |
| "grad_norm": 0.4088965058326721, |
| "learning_rate": 0.00030044043110981643, |
| "loss": 3.2202, |
| "step": 85800 |
| }, |
| { |
| "epoch": 24.993244423737696, |
| "grad_norm": 0.4255065619945526, |
| "learning_rate": 0.00030026565685988926, |
| "loss": 3.2187, |
| "step": 85850 |
| }, |
| { |
| "epoch": 25.00757090443189, |
| "grad_norm": 0.4710450768470764, |
| "learning_rate": 0.0003000908826099621, |
| "loss": 3.1597, |
| "step": 85900 |
| }, |
| { |
| "epoch": 25.02213033603168, |
| "grad_norm": 0.43516671657562256, |
| "learning_rate": 0.0002999161083600349, |
| "loss": 3.1244, |
| "step": 85950 |
| }, |
| { |
| "epoch": 25.036689767631472, |
| "grad_norm": 0.46017566323280334, |
| "learning_rate": 0.00029974133411010776, |
| "loss": 3.1082, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.036689767631472, |
| "eval_accuracy": 0.3735376249930479, |
| "eval_loss": 3.5517325401306152, |
| "eval_runtime": 178.0523, |
| "eval_samples_per_second": 93.473, |
| "eval_steps_per_second": 5.847, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.05124919923126, |
| "grad_norm": 0.44637131690979004, |
| "learning_rate": 0.0002995665598601806, |
| "loss": 3.1258, |
| "step": 86050 |
| }, |
| { |
| "epoch": 25.065808630831054, |
| "grad_norm": 0.39747855067253113, |
| "learning_rate": 0.00029939178561025337, |
| "loss": 3.1318, |
| "step": 86100 |
| }, |
| { |
| "epoch": 25.080368062430843, |
| "grad_norm": 0.4319270849227905, |
| "learning_rate": 0.0002992170113603262, |
| "loss": 3.1203, |
| "step": 86150 |
| }, |
| { |
| "epoch": 25.09492749403063, |
| "grad_norm": 0.41497504711151123, |
| "learning_rate": 0.00029904223711039904, |
| "loss": 3.1417, |
| "step": 86200 |
| }, |
| { |
| "epoch": 25.109486925630424, |
| "grad_norm": 0.4178447127342224, |
| "learning_rate": 0.00029886746286047187, |
| "loss": 3.1361, |
| "step": 86250 |
| }, |
| { |
| "epoch": 25.124046357230213, |
| "grad_norm": 0.41532012820243835, |
| "learning_rate": 0.0002986926886105447, |
| "loss": 3.1451, |
| "step": 86300 |
| }, |
| { |
| "epoch": 25.138605788830006, |
| "grad_norm": 0.4553651809692383, |
| "learning_rate": 0.0002985179143606175, |
| "loss": 3.1575, |
| "step": 86350 |
| }, |
| { |
| "epoch": 25.153165220429795, |
| "grad_norm": 0.4206736087799072, |
| "learning_rate": 0.0002983431401106903, |
| "loss": 3.1446, |
| "step": 86400 |
| }, |
| { |
| "epoch": 25.167724652029584, |
| "grad_norm": 0.46819040179252625, |
| "learning_rate": 0.00029816836586076315, |
| "loss": 3.1393, |
| "step": 86450 |
| }, |
| { |
| "epoch": 25.182284083629376, |
| "grad_norm": 0.4540737271308899, |
| "learning_rate": 0.000297993591610836, |
| "loss": 3.1469, |
| "step": 86500 |
| }, |
| { |
| "epoch": 25.196843515229165, |
| "grad_norm": 0.44796428084373474, |
| "learning_rate": 0.0002978188173609088, |
| "loss": 3.1496, |
| "step": 86550 |
| }, |
| { |
| "epoch": 25.211402946828954, |
| "grad_norm": 0.4612496495246887, |
| "learning_rate": 0.0002976440431109816, |
| "loss": 3.1554, |
| "step": 86600 |
| }, |
| { |
| "epoch": 25.225962378428747, |
| "grad_norm": 0.44751015305519104, |
| "learning_rate": 0.0002974692688610544, |
| "loss": 3.1544, |
| "step": 86650 |
| }, |
| { |
| "epoch": 25.240521810028536, |
| "grad_norm": 0.452826589345932, |
| "learning_rate": 0.00029729449461112726, |
| "loss": 3.1547, |
| "step": 86700 |
| }, |
| { |
| "epoch": 25.25508124162833, |
| "grad_norm": 0.4215683937072754, |
| "learning_rate": 0.0002971197203612001, |
| "loss": 3.1616, |
| "step": 86750 |
| }, |
| { |
| "epoch": 25.269640673228118, |
| "grad_norm": 0.484609454870224, |
| "learning_rate": 0.00029694494611127287, |
| "loss": 3.1572, |
| "step": 86800 |
| }, |
| { |
| "epoch": 25.284200104827907, |
| "grad_norm": 0.420391321182251, |
| "learning_rate": 0.00029677017186134576, |
| "loss": 3.165, |
| "step": 86850 |
| }, |
| { |
| "epoch": 25.2987595364277, |
| "grad_norm": 0.42065128684043884, |
| "learning_rate": 0.0002965953976114186, |
| "loss": 3.1636, |
| "step": 86900 |
| }, |
| { |
| "epoch": 25.313318968027488, |
| "grad_norm": 0.43046483397483826, |
| "learning_rate": 0.00029642062336149137, |
| "loss": 3.1632, |
| "step": 86950 |
| }, |
| { |
| "epoch": 25.327878399627277, |
| "grad_norm": 0.4139987528324127, |
| "learning_rate": 0.0002962458491115642, |
| "loss": 3.1654, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.327878399627277, |
| "eval_accuracy": 0.3735500888757143, |
| "eval_loss": 3.551439046859741, |
| "eval_runtime": 178.5097, |
| "eval_samples_per_second": 93.233, |
| "eval_steps_per_second": 5.832, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.34243783122707, |
| "grad_norm": 0.42929497361183167, |
| "learning_rate": 0.00029607107486163704, |
| "loss": 3.1601, |
| "step": 87050 |
| }, |
| { |
| "epoch": 25.35699726282686, |
| "grad_norm": 0.45584776997566223, |
| "learning_rate": 0.00029589630061170987, |
| "loss": 3.1801, |
| "step": 87100 |
| }, |
| { |
| "epoch": 25.37155669442665, |
| "grad_norm": 0.43226608633995056, |
| "learning_rate": 0.0002957215263617827, |
| "loss": 3.1721, |
| "step": 87150 |
| }, |
| { |
| "epoch": 25.38611612602644, |
| "grad_norm": 0.4360572099685669, |
| "learning_rate": 0.0002955467521118555, |
| "loss": 3.1776, |
| "step": 87200 |
| }, |
| { |
| "epoch": 25.40067555762623, |
| "grad_norm": 0.44590285420417786, |
| "learning_rate": 0.0002953719778619283, |
| "loss": 3.1612, |
| "step": 87250 |
| }, |
| { |
| "epoch": 25.415234989226022, |
| "grad_norm": 0.4651472866535187, |
| "learning_rate": 0.00029519720361200115, |
| "loss": 3.1681, |
| "step": 87300 |
| }, |
| { |
| "epoch": 25.42979442082581, |
| "grad_norm": 0.4429469704627991, |
| "learning_rate": 0.000295022429362074, |
| "loss": 3.1682, |
| "step": 87350 |
| }, |
| { |
| "epoch": 25.4443538524256, |
| "grad_norm": 0.4391014575958252, |
| "learning_rate": 0.00029484765511214676, |
| "loss": 3.1772, |
| "step": 87400 |
| }, |
| { |
| "epoch": 25.458913284025392, |
| "grad_norm": 0.4634419083595276, |
| "learning_rate": 0.0002946728808622196, |
| "loss": 3.1747, |
| "step": 87450 |
| }, |
| { |
| "epoch": 25.47347271562518, |
| "grad_norm": 0.43895062804222107, |
| "learning_rate": 0.0002944981066122924, |
| "loss": 3.1781, |
| "step": 87500 |
| }, |
| { |
| "epoch": 25.488032147224974, |
| "grad_norm": 0.46622321009635925, |
| "learning_rate": 0.00029432333236236526, |
| "loss": 3.1714, |
| "step": 87550 |
| }, |
| { |
| "epoch": 25.502591578824763, |
| "grad_norm": 0.4347987771034241, |
| "learning_rate": 0.0002941485581124381, |
| "loss": 3.1755, |
| "step": 87600 |
| }, |
| { |
| "epoch": 25.517151010424552, |
| "grad_norm": 0.4216921925544739, |
| "learning_rate": 0.00029397378386251087, |
| "loss": 3.1928, |
| "step": 87650 |
| }, |
| { |
| "epoch": 25.531710442024345, |
| "grad_norm": 0.426127552986145, |
| "learning_rate": 0.0002937990096125837, |
| "loss": 3.1887, |
| "step": 87700 |
| }, |
| { |
| "epoch": 25.546269873624134, |
| "grad_norm": 0.4356168508529663, |
| "learning_rate": 0.00029362423536265654, |
| "loss": 3.1794, |
| "step": 87750 |
| }, |
| { |
| "epoch": 25.560829305223923, |
| "grad_norm": 0.43473145365715027, |
| "learning_rate": 0.00029344946111272937, |
| "loss": 3.1756, |
| "step": 87800 |
| }, |
| { |
| "epoch": 25.575388736823715, |
| "grad_norm": 0.42265087366104126, |
| "learning_rate": 0.0002932746868628022, |
| "loss": 3.1773, |
| "step": 87850 |
| }, |
| { |
| "epoch": 25.589948168423504, |
| "grad_norm": 0.46903398633003235, |
| "learning_rate": 0.000293099912612875, |
| "loss": 3.1875, |
| "step": 87900 |
| }, |
| { |
| "epoch": 25.604507600023297, |
| "grad_norm": 0.46841877698898315, |
| "learning_rate": 0.0002929251383629478, |
| "loss": 3.1823, |
| "step": 87950 |
| }, |
| { |
| "epoch": 25.619067031623086, |
| "grad_norm": 0.41188380122184753, |
| "learning_rate": 0.00029275036411302065, |
| "loss": 3.1816, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.619067031623086, |
| "eval_accuracy": 0.3739879709422213, |
| "eval_loss": 3.542097806930542, |
| "eval_runtime": 176.7868, |
| "eval_samples_per_second": 94.142, |
| "eval_steps_per_second": 5.888, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.633626463222875, |
| "grad_norm": 0.4317525625228882, |
| "learning_rate": 0.0002925755898630935, |
| "loss": 3.1857, |
| "step": 88050 |
| }, |
| { |
| "epoch": 25.648185894822667, |
| "grad_norm": 0.43668872117996216, |
| "learning_rate": 0.0002924008156131663, |
| "loss": 3.2019, |
| "step": 88100 |
| }, |
| { |
| "epoch": 25.662745326422456, |
| "grad_norm": 0.5344383120536804, |
| "learning_rate": 0.00029222604136323915, |
| "loss": 3.1902, |
| "step": 88150 |
| }, |
| { |
| "epoch": 25.677304758022245, |
| "grad_norm": 0.44829392433166504, |
| "learning_rate": 0.000292051267113312, |
| "loss": 3.1828, |
| "step": 88200 |
| }, |
| { |
| "epoch": 25.691864189622038, |
| "grad_norm": 0.45893147587776184, |
| "learning_rate": 0.00029187649286338476, |
| "loss": 3.1951, |
| "step": 88250 |
| }, |
| { |
| "epoch": 25.706423621221827, |
| "grad_norm": 0.4264863133430481, |
| "learning_rate": 0.0002917017186134576, |
| "loss": 3.186, |
| "step": 88300 |
| }, |
| { |
| "epoch": 25.72098305282162, |
| "grad_norm": 0.4173523187637329, |
| "learning_rate": 0.0002915269443635304, |
| "loss": 3.2085, |
| "step": 88350 |
| }, |
| { |
| "epoch": 25.73554248442141, |
| "grad_norm": 0.4248258173465729, |
| "learning_rate": 0.00029135217011360326, |
| "loss": 3.2001, |
| "step": 88400 |
| }, |
| { |
| "epoch": 25.750101916021197, |
| "grad_norm": 0.41881099343299866, |
| "learning_rate": 0.0002911773958636761, |
| "loss": 3.1997, |
| "step": 88450 |
| }, |
| { |
| "epoch": 25.76466134762099, |
| "grad_norm": 0.4275151789188385, |
| "learning_rate": 0.00029100262161374887, |
| "loss": 3.2037, |
| "step": 88500 |
| }, |
| { |
| "epoch": 25.77922077922078, |
| "grad_norm": 0.473474383354187, |
| "learning_rate": 0.0002908278473638217, |
| "loss": 3.2004, |
| "step": 88550 |
| }, |
| { |
| "epoch": 25.793780210820568, |
| "grad_norm": 0.41812214255332947, |
| "learning_rate": 0.00029065307311389453, |
| "loss": 3.1992, |
| "step": 88600 |
| }, |
| { |
| "epoch": 25.80833964242036, |
| "grad_norm": 0.4513116180896759, |
| "learning_rate": 0.00029047829886396737, |
| "loss": 3.2088, |
| "step": 88650 |
| }, |
| { |
| "epoch": 25.82289907402015, |
| "grad_norm": 0.4494040012359619, |
| "learning_rate": 0.0002903035246140402, |
| "loss": 3.2008, |
| "step": 88700 |
| }, |
| { |
| "epoch": 25.837458505619942, |
| "grad_norm": 0.4608217477798462, |
| "learning_rate": 0.000290128750364113, |
| "loss": 3.2038, |
| "step": 88750 |
| }, |
| { |
| "epoch": 25.85201793721973, |
| "grad_norm": 0.4516802132129669, |
| "learning_rate": 0.0002899539761141858, |
| "loss": 3.2057, |
| "step": 88800 |
| }, |
| { |
| "epoch": 25.86657736881952, |
| "grad_norm": 0.41412249207496643, |
| "learning_rate": 0.00028977920186425864, |
| "loss": 3.204, |
| "step": 88850 |
| }, |
| { |
| "epoch": 25.881136800419313, |
| "grad_norm": 0.44088810682296753, |
| "learning_rate": 0.0002896044276143315, |
| "loss": 3.2054, |
| "step": 88900 |
| }, |
| { |
| "epoch": 25.8956962320191, |
| "grad_norm": 0.42800846695899963, |
| "learning_rate": 0.00028942965336440426, |
| "loss": 3.1936, |
| "step": 88950 |
| }, |
| { |
| "epoch": 25.91025566361889, |
| "grad_norm": 0.47048747539520264, |
| "learning_rate": 0.0002892548791144771, |
| "loss": 3.2067, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.91025566361889, |
| "eval_accuracy": 0.3746011704526494, |
| "eval_loss": 3.5390212535858154, |
| "eval_runtime": 176.1021, |
| "eval_samples_per_second": 94.508, |
| "eval_steps_per_second": 5.911, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.924815095218683, |
| "grad_norm": 0.4377363324165344, |
| "learning_rate": 0.0002890801048645499, |
| "loss": 3.2089, |
| "step": 89050 |
| }, |
| { |
| "epoch": 25.939374526818472, |
| "grad_norm": 0.43033215403556824, |
| "learning_rate": 0.00028890533061462276, |
| "loss": 3.2125, |
| "step": 89100 |
| }, |
| { |
| "epoch": 25.953933958418265, |
| "grad_norm": 0.4349314570426941, |
| "learning_rate": 0.0002887305563646956, |
| "loss": 3.201, |
| "step": 89150 |
| }, |
| { |
| "epoch": 25.968493390018054, |
| "grad_norm": 0.45417773723602295, |
| "learning_rate": 0.00028855578211476837, |
| "loss": 3.2129, |
| "step": 89200 |
| }, |
| { |
| "epoch": 25.983052821617843, |
| "grad_norm": 0.4514182507991791, |
| "learning_rate": 0.0002883810078648412, |
| "loss": 3.2026, |
| "step": 89250 |
| }, |
| { |
| "epoch": 25.997612253217635, |
| "grad_norm": 0.4432578980922699, |
| "learning_rate": 0.00028820623361491403, |
| "loss": 3.2073, |
| "step": 89300 |
| }, |
| { |
| "epoch": 26.01193873391183, |
| "grad_norm": 0.43464499711990356, |
| "learning_rate": 0.00028803145936498687, |
| "loss": 3.1196, |
| "step": 89350 |
| }, |
| { |
| "epoch": 26.02649816551162, |
| "grad_norm": 0.427076518535614, |
| "learning_rate": 0.0002878566851150597, |
| "loss": 3.1105, |
| "step": 89400 |
| }, |
| { |
| "epoch": 26.041057597111408, |
| "grad_norm": 0.44585657119750977, |
| "learning_rate": 0.00028768191086513253, |
| "loss": 3.115, |
| "step": 89450 |
| }, |
| { |
| "epoch": 26.0556170287112, |
| "grad_norm": 0.45118430256843567, |
| "learning_rate": 0.00028750713661520536, |
| "loss": 3.1275, |
| "step": 89500 |
| }, |
| { |
| "epoch": 26.07017646031099, |
| "grad_norm": 0.41694751381874084, |
| "learning_rate": 0.00028733236236527814, |
| "loss": 3.1038, |
| "step": 89550 |
| }, |
| { |
| "epoch": 26.08473589191078, |
| "grad_norm": 0.4492938816547394, |
| "learning_rate": 0.000287157588115351, |
| "loss": 3.1343, |
| "step": 89600 |
| }, |
| { |
| "epoch": 26.09929532351057, |
| "grad_norm": 0.44321227073669434, |
| "learning_rate": 0.0002869828138654238, |
| "loss": 3.1221, |
| "step": 89650 |
| }, |
| { |
| "epoch": 26.11385475511036, |
| "grad_norm": 0.42474403977394104, |
| "learning_rate": 0.00028680803961549664, |
| "loss": 3.1323, |
| "step": 89700 |
| }, |
| { |
| "epoch": 26.128414186710152, |
| "grad_norm": 0.42992469668388367, |
| "learning_rate": 0.0002866332653655695, |
| "loss": 3.1243, |
| "step": 89750 |
| }, |
| { |
| "epoch": 26.14297361830994, |
| "grad_norm": 0.4463481903076172, |
| "learning_rate": 0.00028645849111564225, |
| "loss": 3.1386, |
| "step": 89800 |
| }, |
| { |
| "epoch": 26.15753304990973, |
| "grad_norm": 0.42387306690216064, |
| "learning_rate": 0.0002862837168657151, |
| "loss": 3.1402, |
| "step": 89850 |
| }, |
| { |
| "epoch": 26.172092481509523, |
| "grad_norm": 0.44750088453292847, |
| "learning_rate": 0.0002861089426157879, |
| "loss": 3.1307, |
| "step": 89900 |
| }, |
| { |
| "epoch": 26.186651913109312, |
| "grad_norm": 0.49076640605926514, |
| "learning_rate": 0.00028593416836586075, |
| "loss": 3.1315, |
| "step": 89950 |
| }, |
| { |
| "epoch": 26.2012113447091, |
| "grad_norm": 0.43750905990600586, |
| "learning_rate": 0.0002857593941159336, |
| "loss": 3.1412, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.2012113447091, |
| "eval_accuracy": 0.37368295856829026, |
| "eval_loss": 3.552704095840454, |
| "eval_runtime": 175.9971, |
| "eval_samples_per_second": 94.564, |
| "eval_steps_per_second": 5.915, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.215770776308894, |
| "grad_norm": 0.43243998289108276, |
| "learning_rate": 0.00028558461986600637, |
| "loss": 3.1477, |
| "step": 90050 |
| }, |
| { |
| "epoch": 26.230330207908683, |
| "grad_norm": 0.44490525126457214, |
| "learning_rate": 0.0002854098456160792, |
| "loss": 3.1395, |
| "step": 90100 |
| }, |
| { |
| "epoch": 26.244889639508475, |
| "grad_norm": 0.4202319383621216, |
| "learning_rate": 0.00028523507136615203, |
| "loss": 3.1557, |
| "step": 90150 |
| }, |
| { |
| "epoch": 26.259449071108264, |
| "grad_norm": 0.4608379006385803, |
| "learning_rate": 0.00028506029711622486, |
| "loss": 3.1526, |
| "step": 90200 |
| }, |
| { |
| "epoch": 26.274008502708053, |
| "grad_norm": 0.473849356174469, |
| "learning_rate": 0.00028488552286629764, |
| "loss": 3.1539, |
| "step": 90250 |
| }, |
| { |
| "epoch": 26.288567934307846, |
| "grad_norm": 0.42520883679389954, |
| "learning_rate": 0.0002847107486163705, |
| "loss": 3.1494, |
| "step": 90300 |
| }, |
| { |
| "epoch": 26.303127365907635, |
| "grad_norm": 0.4529514014720917, |
| "learning_rate": 0.0002845359743664433, |
| "loss": 3.1534, |
| "step": 90350 |
| }, |
| { |
| "epoch": 26.317686797507424, |
| "grad_norm": 0.4225722551345825, |
| "learning_rate": 0.00028436120011651614, |
| "loss": 3.1542, |
| "step": 90400 |
| }, |
| { |
| "epoch": 26.332246229107216, |
| "grad_norm": 0.4789310693740845, |
| "learning_rate": 0.000284186425866589, |
| "loss": 3.1621, |
| "step": 90450 |
| }, |
| { |
| "epoch": 26.346805660707005, |
| "grad_norm": 0.42410004138946533, |
| "learning_rate": 0.00028401165161666175, |
| "loss": 3.1612, |
| "step": 90500 |
| }, |
| { |
| "epoch": 26.361365092306798, |
| "grad_norm": 0.43350374698638916, |
| "learning_rate": 0.0002838368773667346, |
| "loss": 3.1668, |
| "step": 90550 |
| }, |
| { |
| "epoch": 26.375924523906587, |
| "grad_norm": 0.4327101409435272, |
| "learning_rate": 0.0002836621031168075, |
| "loss": 3.1691, |
| "step": 90600 |
| }, |
| { |
| "epoch": 26.390483955506376, |
| "grad_norm": 0.4356554448604584, |
| "learning_rate": 0.00028348732886688025, |
| "loss": 3.162, |
| "step": 90650 |
| }, |
| { |
| "epoch": 26.40504338710617, |
| "grad_norm": 0.44769665598869324, |
| "learning_rate": 0.0002833125546169531, |
| "loss": 3.1721, |
| "step": 90700 |
| }, |
| { |
| "epoch": 26.419602818705958, |
| "grad_norm": 0.43828025460243225, |
| "learning_rate": 0.0002831377803670259, |
| "loss": 3.1689, |
| "step": 90750 |
| }, |
| { |
| "epoch": 26.434162250305747, |
| "grad_norm": 0.4498804807662964, |
| "learning_rate": 0.00028296300611709875, |
| "loss": 3.1753, |
| "step": 90800 |
| }, |
| { |
| "epoch": 26.44872168190554, |
| "grad_norm": 0.4299849569797516, |
| "learning_rate": 0.00028278823186717153, |
| "loss": 3.1675, |
| "step": 90850 |
| }, |
| { |
| "epoch": 26.463281113505328, |
| "grad_norm": 0.42671430110931396, |
| "learning_rate": 0.00028261345761724436, |
| "loss": 3.1616, |
| "step": 90900 |
| }, |
| { |
| "epoch": 26.47784054510512, |
| "grad_norm": 0.4571177661418915, |
| "learning_rate": 0.0002824386833673172, |
| "loss": 3.1643, |
| "step": 90950 |
| }, |
| { |
| "epoch": 26.49239997670491, |
| "grad_norm": 0.4367692172527313, |
| "learning_rate": 0.00028226390911739003, |
| "loss": 3.1852, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.49239997670491, |
| "eval_accuracy": 0.3740624014868236, |
| "eval_loss": 3.545224666595459, |
| "eval_runtime": 176.1401, |
| "eval_samples_per_second": 94.487, |
| "eval_steps_per_second": 5.91, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.5069594083047, |
| "grad_norm": 0.45861831307411194, |
| "learning_rate": 0.00028208913486746286, |
| "loss": 3.1676, |
| "step": 91050 |
| }, |
| { |
| "epoch": 26.52151883990449, |
| "grad_norm": 0.42132991552352905, |
| "learning_rate": 0.00028191436061753564, |
| "loss": 3.1729, |
| "step": 91100 |
| }, |
| { |
| "epoch": 26.53607827150428, |
| "grad_norm": 0.4799935817718506, |
| "learning_rate": 0.0002817395863676085, |
| "loss": 3.1639, |
| "step": 91150 |
| }, |
| { |
| "epoch": 26.55063770310407, |
| "grad_norm": 0.467133492231369, |
| "learning_rate": 0.0002815648121176813, |
| "loss": 3.1814, |
| "step": 91200 |
| }, |
| { |
| "epoch": 26.565197134703862, |
| "grad_norm": 0.40786775946617126, |
| "learning_rate": 0.00028139003786775414, |
| "loss": 3.1825, |
| "step": 91250 |
| }, |
| { |
| "epoch": 26.57975656630365, |
| "grad_norm": 0.46661651134490967, |
| "learning_rate": 0.000281215263617827, |
| "loss": 3.1724, |
| "step": 91300 |
| }, |
| { |
| "epoch": 26.594315997903443, |
| "grad_norm": 0.43742454051971436, |
| "learning_rate": 0.00028104048936789975, |
| "loss": 3.1886, |
| "step": 91350 |
| }, |
| { |
| "epoch": 26.608875429503232, |
| "grad_norm": 0.43313705921173096, |
| "learning_rate": 0.0002808657151179726, |
| "loss": 3.1689, |
| "step": 91400 |
| }, |
| { |
| "epoch": 26.62343486110302, |
| "grad_norm": 0.42333483695983887, |
| "learning_rate": 0.0002806909408680454, |
| "loss": 3.1738, |
| "step": 91450 |
| }, |
| { |
| "epoch": 26.637994292702814, |
| "grad_norm": 0.42646193504333496, |
| "learning_rate": 0.00028051616661811825, |
| "loss": 3.1797, |
| "step": 91500 |
| }, |
| { |
| "epoch": 26.652553724302603, |
| "grad_norm": 0.4323969781398773, |
| "learning_rate": 0.00028034139236819103, |
| "loss": 3.1817, |
| "step": 91550 |
| }, |
| { |
| "epoch": 26.667113155902392, |
| "grad_norm": 0.4830801784992218, |
| "learning_rate": 0.00028016661811826386, |
| "loss": 3.169, |
| "step": 91600 |
| }, |
| { |
| "epoch": 26.681672587502185, |
| "grad_norm": 0.44306960701942444, |
| "learning_rate": 0.0002799918438683367, |
| "loss": 3.1853, |
| "step": 91650 |
| }, |
| { |
| "epoch": 26.696232019101974, |
| "grad_norm": 0.438679963350296, |
| "learning_rate": 0.00027981706961840953, |
| "loss": 3.1856, |
| "step": 91700 |
| }, |
| { |
| "epoch": 26.710791450701766, |
| "grad_norm": 0.4450984299182892, |
| "learning_rate": 0.00027964229536848236, |
| "loss": 3.1767, |
| "step": 91750 |
| }, |
| { |
| "epoch": 26.725350882301555, |
| "grad_norm": 0.46220827102661133, |
| "learning_rate": 0.00027946752111855514, |
| "loss": 3.1854, |
| "step": 91800 |
| }, |
| { |
| "epoch": 26.739910313901344, |
| "grad_norm": 0.4325886070728302, |
| "learning_rate": 0.000279292746868628, |
| "loss": 3.1755, |
| "step": 91850 |
| }, |
| { |
| "epoch": 26.754469745501137, |
| "grad_norm": 0.45203420519828796, |
| "learning_rate": 0.00027911797261870086, |
| "loss": 3.1833, |
| "step": 91900 |
| }, |
| { |
| "epoch": 26.769029177100926, |
| "grad_norm": 0.4569106698036194, |
| "learning_rate": 0.00027894319836877364, |
| "loss": 3.1961, |
| "step": 91950 |
| }, |
| { |
| "epoch": 26.783588608700715, |
| "grad_norm": 0.4249524772167206, |
| "learning_rate": 0.00027876842411884647, |
| "loss": 3.1961, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.783588608700715, |
| "eval_accuracy": 0.3745718920867632, |
| "eval_loss": 3.5404043197631836, |
| "eval_runtime": 176.1461, |
| "eval_samples_per_second": 94.484, |
| "eval_steps_per_second": 5.91, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.798148040300507, |
| "grad_norm": 0.4360384941101074, |
| "learning_rate": 0.0002785936498689193, |
| "loss": 3.1865, |
| "step": 92050 |
| }, |
| { |
| "epoch": 26.812707471900296, |
| "grad_norm": 0.4332277178764343, |
| "learning_rate": 0.00027841887561899214, |
| "loss": 3.1902, |
| "step": 92100 |
| }, |
| { |
| "epoch": 26.82726690350009, |
| "grad_norm": 0.46646252274513245, |
| "learning_rate": 0.0002782441013690649, |
| "loss": 3.1949, |
| "step": 92150 |
| }, |
| { |
| "epoch": 26.841826335099878, |
| "grad_norm": 0.4422335922718048, |
| "learning_rate": 0.00027806932711913775, |
| "loss": 3.1888, |
| "step": 92200 |
| }, |
| { |
| "epoch": 26.856385766699667, |
| "grad_norm": 0.4528057873249054, |
| "learning_rate": 0.0002778945528692106, |
| "loss": 3.2017, |
| "step": 92250 |
| }, |
| { |
| "epoch": 26.87094519829946, |
| "grad_norm": 0.4345687925815582, |
| "learning_rate": 0.0002777197786192834, |
| "loss": 3.1882, |
| "step": 92300 |
| }, |
| { |
| "epoch": 26.88550462989925, |
| "grad_norm": 0.4311511218547821, |
| "learning_rate": 0.00027754500436935625, |
| "loss": 3.1851, |
| "step": 92350 |
| }, |
| { |
| "epoch": 26.900064061499037, |
| "grad_norm": 0.44769158959388733, |
| "learning_rate": 0.00027737023011942903, |
| "loss": 3.1828, |
| "step": 92400 |
| }, |
| { |
| "epoch": 26.91462349309883, |
| "grad_norm": 0.45371246337890625, |
| "learning_rate": 0.00027719545586950186, |
| "loss": 3.1854, |
| "step": 92450 |
| }, |
| { |
| "epoch": 26.92918292469862, |
| "grad_norm": 0.4307236671447754, |
| "learning_rate": 0.0002770206816195747, |
| "loss": 3.2048, |
| "step": 92500 |
| }, |
| { |
| "epoch": 26.94374235629841, |
| "grad_norm": 0.5101444125175476, |
| "learning_rate": 0.0002768459073696475, |
| "loss": 3.1934, |
| "step": 92550 |
| }, |
| { |
| "epoch": 26.9583017878982, |
| "grad_norm": 0.4513118863105774, |
| "learning_rate": 0.00027667113311972036, |
| "loss": 3.1901, |
| "step": 92600 |
| }, |
| { |
| "epoch": 26.97286121949799, |
| "grad_norm": 0.4673958420753479, |
| "learning_rate": 0.00027649635886979314, |
| "loss": 3.2089, |
| "step": 92650 |
| }, |
| { |
| "epoch": 26.987420651097782, |
| "grad_norm": 0.4448801279067993, |
| "learning_rate": 0.00027632158461986597, |
| "loss": 3.2034, |
| "step": 92700 |
| }, |
| { |
| "epoch": 27.001747131791976, |
| "grad_norm": 0.46219342947006226, |
| "learning_rate": 0.0002761468103699388, |
| "loss": 3.1836, |
| "step": 92750 |
| }, |
| { |
| "epoch": 27.016306563391765, |
| "grad_norm": 0.431754469871521, |
| "learning_rate": 0.00027597203612001164, |
| "loss": 3.0981, |
| "step": 92800 |
| }, |
| { |
| "epoch": 27.030865994991554, |
| "grad_norm": 0.4268021583557129, |
| "learning_rate": 0.0002757972618700844, |
| "loss": 3.0964, |
| "step": 92850 |
| }, |
| { |
| "epoch": 27.045425426591347, |
| "grad_norm": 0.44790345430374146, |
| "learning_rate": 0.00027562248762015725, |
| "loss": 3.094, |
| "step": 92900 |
| }, |
| { |
| "epoch": 27.059984858191136, |
| "grad_norm": 0.43892034888267517, |
| "learning_rate": 0.0002754477133702301, |
| "loss": 3.1152, |
| "step": 92950 |
| }, |
| { |
| "epoch": 27.074544289790925, |
| "grad_norm": 0.446199893951416, |
| "learning_rate": 0.0002752729391203029, |
| "loss": 3.1188, |
| "step": 93000 |
| }, |
| { |
| "epoch": 27.074544289790925, |
| "eval_accuracy": 0.3737672661519867, |
| "eval_loss": 3.5553925037384033, |
| "eval_runtime": 176.0476, |
| "eval_samples_per_second": 94.537, |
| "eval_steps_per_second": 5.913, |
| "step": 93000 |
| }, |
| { |
| "epoch": 27.089103721390718, |
| "grad_norm": 0.4196743667125702, |
| "learning_rate": 0.00027509816487037575, |
| "loss": 3.1054, |
| "step": 93050 |
| }, |
| { |
| "epoch": 27.103663152990507, |
| "grad_norm": 0.43984171748161316, |
| "learning_rate": 0.00027492339062044853, |
| "loss": 3.1183, |
| "step": 93100 |
| }, |
| { |
| "epoch": 27.1182225845903, |
| "grad_norm": 0.44343408942222595, |
| "learning_rate": 0.0002747486163705214, |
| "loss": 3.1244, |
| "step": 93150 |
| }, |
| { |
| "epoch": 27.132782016190088, |
| "grad_norm": 0.4573042690753937, |
| "learning_rate": 0.00027457384212059425, |
| "loss": 3.1338, |
| "step": 93200 |
| }, |
| { |
| "epoch": 27.147341447789877, |
| "grad_norm": 0.435369074344635, |
| "learning_rate": 0.000274399067870667, |
| "loss": 3.1289, |
| "step": 93250 |
| }, |
| { |
| "epoch": 27.16190087938967, |
| "grad_norm": 0.4890010356903076, |
| "learning_rate": 0.00027422429362073986, |
| "loss": 3.1225, |
| "step": 93300 |
| }, |
| { |
| "epoch": 27.17646031098946, |
| "grad_norm": 0.44470691680908203, |
| "learning_rate": 0.0002740495193708127, |
| "loss": 3.1252, |
| "step": 93350 |
| }, |
| { |
| "epoch": 27.191019742589248, |
| "grad_norm": 0.4353671371936798, |
| "learning_rate": 0.0002738747451208855, |
| "loss": 3.1254, |
| "step": 93400 |
| }, |
| { |
| "epoch": 27.20557917418904, |
| "grad_norm": 0.44027820229530334, |
| "learning_rate": 0.0002736999708709583, |
| "loss": 3.1343, |
| "step": 93450 |
| }, |
| { |
| "epoch": 27.22013860578883, |
| "grad_norm": 0.44768866896629333, |
| "learning_rate": 0.00027352519662103114, |
| "loss": 3.1497, |
| "step": 93500 |
| }, |
| { |
| "epoch": 27.234698037388622, |
| "grad_norm": 0.4610934853553772, |
| "learning_rate": 0.00027335042237110397, |
| "loss": 3.1332, |
| "step": 93550 |
| }, |
| { |
| "epoch": 27.24925746898841, |
| "grad_norm": 0.4198959171772003, |
| "learning_rate": 0.0002731756481211768, |
| "loss": 3.1289, |
| "step": 93600 |
| }, |
| { |
| "epoch": 27.2638169005882, |
| "grad_norm": 0.4309977889060974, |
| "learning_rate": 0.00027300087387124964, |
| "loss": 3.1357, |
| "step": 93650 |
| }, |
| { |
| "epoch": 27.278376332187992, |
| "grad_norm": 0.4930909276008606, |
| "learning_rate": 0.0002728260996213224, |
| "loss": 3.1572, |
| "step": 93700 |
| }, |
| { |
| "epoch": 27.29293576378778, |
| "grad_norm": 0.4195786118507385, |
| "learning_rate": 0.00027265132537139525, |
| "loss": 3.1518, |
| "step": 93750 |
| }, |
| { |
| "epoch": 27.30749519538757, |
| "grad_norm": 0.4539899230003357, |
| "learning_rate": 0.0002724765511214681, |
| "loss": 3.1462, |
| "step": 93800 |
| }, |
| { |
| "epoch": 27.322054626987363, |
| "grad_norm": 0.4597534239292145, |
| "learning_rate": 0.0002723017768715409, |
| "loss": 3.1491, |
| "step": 93850 |
| }, |
| { |
| "epoch": 27.336614058587152, |
| "grad_norm": 0.42478206753730774, |
| "learning_rate": 0.00027212700262161375, |
| "loss": 3.1528, |
| "step": 93900 |
| }, |
| { |
| "epoch": 27.351173490186945, |
| "grad_norm": 0.42217180132865906, |
| "learning_rate": 0.0002719522283716865, |
| "loss": 3.1603, |
| "step": 93950 |
| }, |
| { |
| "epoch": 27.365732921786734, |
| "grad_norm": 0.4504246711730957, |
| "learning_rate": 0.00027177745412175936, |
| "loss": 3.1557, |
| "step": 94000 |
| }, |
| { |
| "epoch": 27.365732921786734, |
| "eval_accuracy": 0.3741429463889604, |
| "eval_loss": 3.549989938735962, |
| "eval_runtime": 176.2106, |
| "eval_samples_per_second": 94.449, |
| "eval_steps_per_second": 5.908, |
| "step": 94000 |
| }, |
| { |
| "epoch": 27.380292353386523, |
| "grad_norm": 0.43801188468933105, |
| "learning_rate": 0.0002716026798718322, |
| "loss": 3.147, |
| "step": 94050 |
| }, |
| { |
| "epoch": 27.394851784986315, |
| "grad_norm": 0.4779060482978821, |
| "learning_rate": 0.000271427905621905, |
| "loss": 3.1564, |
| "step": 94100 |
| }, |
| { |
| "epoch": 27.409411216586104, |
| "grad_norm": 0.4386543035507202, |
| "learning_rate": 0.0002712531313719778, |
| "loss": 3.172, |
| "step": 94150 |
| }, |
| { |
| "epoch": 27.423970648185893, |
| "grad_norm": 0.4766581058502197, |
| "learning_rate": 0.00027107835712205064, |
| "loss": 3.1584, |
| "step": 94200 |
| }, |
| { |
| "epoch": 27.438530079785686, |
| "grad_norm": 0.4249516427516937, |
| "learning_rate": 0.00027090358287212347, |
| "loss": 3.1512, |
| "step": 94250 |
| }, |
| { |
| "epoch": 27.453089511385475, |
| "grad_norm": 0.44262126088142395, |
| "learning_rate": 0.0002707288086221963, |
| "loss": 3.1583, |
| "step": 94300 |
| }, |
| { |
| "epoch": 27.467648942985267, |
| "grad_norm": 0.4553981125354767, |
| "learning_rate": 0.00027055403437226914, |
| "loss": 3.1597, |
| "step": 94350 |
| }, |
| { |
| "epoch": 27.482208374585056, |
| "grad_norm": 0.4854367673397064, |
| "learning_rate": 0.00027037926012234197, |
| "loss": 3.1701, |
| "step": 94400 |
| }, |
| { |
| "epoch": 27.496767806184845, |
| "grad_norm": 0.4336088001728058, |
| "learning_rate": 0.0002702044858724148, |
| "loss": 3.16, |
| "step": 94450 |
| }, |
| { |
| "epoch": 27.511327237784638, |
| "grad_norm": 0.4473203718662262, |
| "learning_rate": 0.00027002971162248763, |
| "loss": 3.1578, |
| "step": 94500 |
| }, |
| { |
| "epoch": 27.525886669384427, |
| "grad_norm": 0.43575260043144226, |
| "learning_rate": 0.0002698549373725604, |
| "loss": 3.1591, |
| "step": 94550 |
| }, |
| { |
| "epoch": 27.540446100984216, |
| "grad_norm": 0.45486170053482056, |
| "learning_rate": 0.00026968016312263325, |
| "loss": 3.1661, |
| "step": 94600 |
| }, |
| { |
| "epoch": 27.55500553258401, |
| "grad_norm": 0.4264790713787079, |
| "learning_rate": 0.0002695053888727061, |
| "loss": 3.168, |
| "step": 94650 |
| }, |
| { |
| "epoch": 27.569564964183797, |
| "grad_norm": 0.41783830523490906, |
| "learning_rate": 0.0002693306146227789, |
| "loss": 3.1697, |
| "step": 94700 |
| }, |
| { |
| "epoch": 27.58412439578359, |
| "grad_norm": 0.46509838104248047, |
| "learning_rate": 0.0002691558403728517, |
| "loss": 3.1784, |
| "step": 94750 |
| }, |
| { |
| "epoch": 27.59868382738338, |
| "grad_norm": 0.46188145875930786, |
| "learning_rate": 0.0002689810661229245, |
| "loss": 3.1765, |
| "step": 94800 |
| }, |
| { |
| "epoch": 27.613243258983168, |
| "grad_norm": 0.43464067578315735, |
| "learning_rate": 0.00026880629187299736, |
| "loss": 3.1751, |
| "step": 94850 |
| }, |
| { |
| "epoch": 27.62780269058296, |
| "grad_norm": 0.4667948782444, |
| "learning_rate": 0.0002686315176230702, |
| "loss": 3.1575, |
| "step": 94900 |
| }, |
| { |
| "epoch": 27.64236212218275, |
| "grad_norm": 0.4603780210018158, |
| "learning_rate": 0.000268456743373143, |
| "loss": 3.1652, |
| "step": 94950 |
| }, |
| { |
| "epoch": 27.65692155378254, |
| "grad_norm": 0.4413023591041565, |
| "learning_rate": 0.0002682819691232158, |
| "loss": 3.1792, |
| "step": 95000 |
| }, |
| { |
| "epoch": 27.65692155378254, |
| "eval_accuracy": 0.37462045419564277, |
| "eval_loss": 3.538667917251587, |
| "eval_runtime": 176.0826, |
| "eval_samples_per_second": 94.518, |
| "eval_steps_per_second": 5.912, |
| "step": 95000 |
| }, |
| { |
| "epoch": 27.67148098538233, |
| "grad_norm": 0.48090237379074097, |
| "learning_rate": 0.00026810719487328863, |
| "loss": 3.1765, |
| "step": 95050 |
| }, |
| { |
| "epoch": 27.68604041698212, |
| "grad_norm": 0.43271276354789734, |
| "learning_rate": 0.00026793242062336147, |
| "loss": 3.172, |
| "step": 95100 |
| }, |
| { |
| "epoch": 27.700599848581913, |
| "grad_norm": 0.45895567536354065, |
| "learning_rate": 0.0002677576463734343, |
| "loss": 3.1673, |
| "step": 95150 |
| }, |
| { |
| "epoch": 27.7151592801817, |
| "grad_norm": 0.4838683605194092, |
| "learning_rate": 0.00026758287212350713, |
| "loss": 3.1804, |
| "step": 95200 |
| }, |
| { |
| "epoch": 27.72971871178149, |
| "grad_norm": 0.4482438266277313, |
| "learning_rate": 0.0002674080978735799, |
| "loss": 3.177, |
| "step": 95250 |
| }, |
| { |
| "epoch": 27.744278143381283, |
| "grad_norm": 0.4664458930492401, |
| "learning_rate": 0.00026723332362365275, |
| "loss": 3.1773, |
| "step": 95300 |
| }, |
| { |
| "epoch": 27.758837574981072, |
| "grad_norm": 0.44228246808052063, |
| "learning_rate": 0.0002670585493737256, |
| "loss": 3.1871, |
| "step": 95350 |
| }, |
| { |
| "epoch": 27.773397006580865, |
| "grad_norm": 0.4452652633190155, |
| "learning_rate": 0.0002668837751237984, |
| "loss": 3.1735, |
| "step": 95400 |
| }, |
| { |
| "epoch": 27.787956438180654, |
| "grad_norm": 0.4514998495578766, |
| "learning_rate": 0.0002667090008738712, |
| "loss": 3.1836, |
| "step": 95450 |
| }, |
| { |
| "epoch": 27.802515869780443, |
| "grad_norm": 0.45780277252197266, |
| "learning_rate": 0.000266534226623944, |
| "loss": 3.1752, |
| "step": 95500 |
| }, |
| { |
| "epoch": 27.817075301380235, |
| "grad_norm": 0.42713457345962524, |
| "learning_rate": 0.00026635945237401686, |
| "loss": 3.1747, |
| "step": 95550 |
| }, |
| { |
| "epoch": 27.831634732980024, |
| "grad_norm": 0.4621380865573883, |
| "learning_rate": 0.0002661846781240897, |
| "loss": 3.1898, |
| "step": 95600 |
| }, |
| { |
| "epoch": 27.846194164579813, |
| "grad_norm": 0.44072896242141724, |
| "learning_rate": 0.0002660099038741625, |
| "loss": 3.1854, |
| "step": 95650 |
| }, |
| { |
| "epoch": 27.860753596179606, |
| "grad_norm": 0.4288334548473358, |
| "learning_rate": 0.00026583512962423536, |
| "loss": 3.1805, |
| "step": 95700 |
| }, |
| { |
| "epoch": 27.875313027779395, |
| "grad_norm": 0.42848989367485046, |
| "learning_rate": 0.0002656603553743082, |
| "loss": 3.1719, |
| "step": 95750 |
| }, |
| { |
| "epoch": 27.889872459379184, |
| "grad_norm": 0.4722457230091095, |
| "learning_rate": 0.000265485581124381, |
| "loss": 3.1861, |
| "step": 95800 |
| }, |
| { |
| "epoch": 27.904431890978977, |
| "grad_norm": 0.4696759283542633, |
| "learning_rate": 0.0002653108068744538, |
| "loss": 3.187, |
| "step": 95850 |
| }, |
| { |
| "epoch": 27.918991322578766, |
| "grad_norm": 0.4808979034423828, |
| "learning_rate": 0.00026513603262452663, |
| "loss": 3.1933, |
| "step": 95900 |
| }, |
| { |
| "epoch": 27.933550754178558, |
| "grad_norm": 0.4903891682624817, |
| "learning_rate": 0.00026496125837459947, |
| "loss": 3.1746, |
| "step": 95950 |
| }, |
| { |
| "epoch": 27.948110185778347, |
| "grad_norm": 0.44483309984207153, |
| "learning_rate": 0.0002647864841246723, |
| "loss": 3.1917, |
| "step": 96000 |
| }, |
| { |
| "epoch": 27.948110185778347, |
| "eval_accuracy": 0.374729689544672, |
| "eval_loss": 3.535334348678589, |
| "eval_runtime": 175.9673, |
| "eval_samples_per_second": 94.58, |
| "eval_steps_per_second": 5.916, |
| "step": 96000 |
| }, |
| { |
| "epoch": 27.962669617378136, |
| "grad_norm": 0.4571549892425537, |
| "learning_rate": 0.0002646117098747451, |
| "loss": 3.1916, |
| "step": 96050 |
| }, |
| { |
| "epoch": 27.97722904897793, |
| "grad_norm": 0.44937238097190857, |
| "learning_rate": 0.0002644369356248179, |
| "loss": 3.176, |
| "step": 96100 |
| }, |
| { |
| "epoch": 27.991788480577718, |
| "grad_norm": 0.4279572367668152, |
| "learning_rate": 0.00026426216137489074, |
| "loss": 3.1906, |
| "step": 96150 |
| }, |
| { |
| "epoch": 28.006114961271912, |
| "grad_norm": 0.4721224904060364, |
| "learning_rate": 0.0002640873871249636, |
| "loss": 3.1421, |
| "step": 96200 |
| }, |
| { |
| "epoch": 28.0206743928717, |
| "grad_norm": 0.4491647779941559, |
| "learning_rate": 0.0002639126128750364, |
| "loss": 3.0955, |
| "step": 96250 |
| }, |
| { |
| "epoch": 28.035233824471494, |
| "grad_norm": 0.436516672372818, |
| "learning_rate": 0.0002637378386251092, |
| "loss": 3.0938, |
| "step": 96300 |
| }, |
| { |
| "epoch": 28.049793256071283, |
| "grad_norm": 0.4377647936344147, |
| "learning_rate": 0.000263563064375182, |
| "loss": 3.1034, |
| "step": 96350 |
| }, |
| { |
| "epoch": 28.06435268767107, |
| "grad_norm": 0.4661620557308197, |
| "learning_rate": 0.00026338829012525485, |
| "loss": 3.1043, |
| "step": 96400 |
| }, |
| { |
| "epoch": 28.078912119270864, |
| "grad_norm": 0.4748687744140625, |
| "learning_rate": 0.0002632135158753277, |
| "loss": 3.1055, |
| "step": 96450 |
| }, |
| { |
| "epoch": 28.093471550870653, |
| "grad_norm": 0.4351736903190613, |
| "learning_rate": 0.0002630387416254005, |
| "loss": 3.1103, |
| "step": 96500 |
| }, |
| { |
| "epoch": 28.108030982470446, |
| "grad_norm": 0.4778978228569031, |
| "learning_rate": 0.0002628639673754733, |
| "loss": 3.1122, |
| "step": 96550 |
| }, |
| { |
| "epoch": 28.122590414070235, |
| "grad_norm": 0.47012126445770264, |
| "learning_rate": 0.00026268919312554613, |
| "loss": 3.1148, |
| "step": 96600 |
| }, |
| { |
| "epoch": 28.137149845670024, |
| "grad_norm": 0.4676751494407654, |
| "learning_rate": 0.00026251441887561897, |
| "loss": 3.1213, |
| "step": 96650 |
| }, |
| { |
| "epoch": 28.151709277269816, |
| "grad_norm": 0.46864810585975647, |
| "learning_rate": 0.0002623396446256918, |
| "loss": 3.1236, |
| "step": 96700 |
| }, |
| { |
| "epoch": 28.166268708869605, |
| "grad_norm": 0.4691295921802521, |
| "learning_rate": 0.0002621648703757646, |
| "loss": 3.1346, |
| "step": 96750 |
| }, |
| { |
| "epoch": 28.180828140469394, |
| "grad_norm": 0.4796832501888275, |
| "learning_rate": 0.0002619900961258374, |
| "loss": 3.1204, |
| "step": 96800 |
| }, |
| { |
| "epoch": 28.195387572069187, |
| "grad_norm": 0.47000518441200256, |
| "learning_rate": 0.00026181532187591024, |
| "loss": 3.1227, |
| "step": 96850 |
| }, |
| { |
| "epoch": 28.209947003668976, |
| "grad_norm": 0.4579707682132721, |
| "learning_rate": 0.0002616405476259831, |
| "loss": 3.1165, |
| "step": 96900 |
| }, |
| { |
| "epoch": 28.22450643526877, |
| "grad_norm": 0.4562205672264099, |
| "learning_rate": 0.0002614657733760559, |
| "loss": 3.1214, |
| "step": 96950 |
| }, |
| { |
| "epoch": 28.239065866868557, |
| "grad_norm": 0.4554135501384735, |
| "learning_rate": 0.00026129099912612874, |
| "loss": 3.1361, |
| "step": 97000 |
| }, |
| { |
| "epoch": 28.239065866868557, |
| "eval_accuracy": 0.37413753753421836, |
| "eval_loss": 3.554201364517212, |
| "eval_runtime": 176.0871, |
| "eval_samples_per_second": 94.516, |
| "eval_steps_per_second": 5.912, |
| "step": 97000 |
| }, |
| { |
| "epoch": 28.253625298468346, |
| "grad_norm": 0.4856763184070587, |
| "learning_rate": 0.0002611162248762016, |
| "loss": 3.1141, |
| "step": 97050 |
| }, |
| { |
| "epoch": 28.26818473006814, |
| "grad_norm": 0.43940815329551697, |
| "learning_rate": 0.0002609414506262744, |
| "loss": 3.1263, |
| "step": 97100 |
| }, |
| { |
| "epoch": 28.282744161667928, |
| "grad_norm": 0.4479314088821411, |
| "learning_rate": 0.0002607666763763472, |
| "loss": 3.1356, |
| "step": 97150 |
| }, |
| { |
| "epoch": 28.29730359326772, |
| "grad_norm": 0.4651418924331665, |
| "learning_rate": 0.00026059190212642, |
| "loss": 3.1287, |
| "step": 97200 |
| }, |
| { |
| "epoch": 28.31186302486751, |
| "grad_norm": 0.433713436126709, |
| "learning_rate": 0.00026041712787649285, |
| "loss": 3.1419, |
| "step": 97250 |
| }, |
| { |
| "epoch": 28.3264224564673, |
| "grad_norm": 0.44931450486183167, |
| "learning_rate": 0.0002602423536265657, |
| "loss": 3.1499, |
| "step": 97300 |
| }, |
| { |
| "epoch": 28.34098188806709, |
| "grad_norm": 0.4344748258590698, |
| "learning_rate": 0.00026006757937663846, |
| "loss": 3.1301, |
| "step": 97350 |
| }, |
| { |
| "epoch": 28.35554131966688, |
| "grad_norm": 0.4586000144481659, |
| "learning_rate": 0.0002598928051267113, |
| "loss": 3.1288, |
| "step": 97400 |
| }, |
| { |
| "epoch": 28.37010075126667, |
| "grad_norm": 0.4540034234523773, |
| "learning_rate": 0.00025971803087678413, |
| "loss": 3.1552, |
| "step": 97450 |
| }, |
| { |
| "epoch": 28.38466018286646, |
| "grad_norm": 0.4512953758239746, |
| "learning_rate": 0.00025954325662685696, |
| "loss": 3.1397, |
| "step": 97500 |
| }, |
| { |
| "epoch": 28.39921961446625, |
| "grad_norm": 0.42919859290122986, |
| "learning_rate": 0.0002593684823769298, |
| "loss": 3.1338, |
| "step": 97550 |
| }, |
| { |
| "epoch": 28.413779046066043, |
| "grad_norm": 0.4367040693759918, |
| "learning_rate": 0.0002591937081270026, |
| "loss": 3.1489, |
| "step": 97600 |
| }, |
| { |
| "epoch": 28.428338477665832, |
| "grad_norm": 0.4803733229637146, |
| "learning_rate": 0.0002590189338770754, |
| "loss": 3.1471, |
| "step": 97650 |
| }, |
| { |
| "epoch": 28.44289790926562, |
| "grad_norm": 0.45755213499069214, |
| "learning_rate": 0.00025884415962714824, |
| "loss": 3.1535, |
| "step": 97700 |
| }, |
| { |
| "epoch": 28.457457340865414, |
| "grad_norm": 0.4427001178264618, |
| "learning_rate": 0.0002586693853772211, |
| "loss": 3.1458, |
| "step": 97750 |
| }, |
| { |
| "epoch": 28.472016772465203, |
| "grad_norm": 0.46226221323013306, |
| "learning_rate": 0.0002584946111272939, |
| "loss": 3.1571, |
| "step": 97800 |
| }, |
| { |
| "epoch": 28.486576204064992, |
| "grad_norm": 0.4624316990375519, |
| "learning_rate": 0.0002583198368773667, |
| "loss": 3.1577, |
| "step": 97850 |
| }, |
| { |
| "epoch": 28.501135635664784, |
| "grad_norm": 0.4385261535644531, |
| "learning_rate": 0.0002581450626274395, |
| "loss": 3.1523, |
| "step": 97900 |
| }, |
| { |
| "epoch": 28.515695067264573, |
| "grad_norm": 0.4555385112762451, |
| "learning_rate": 0.00025797028837751235, |
| "loss": 3.1588, |
| "step": 97950 |
| }, |
| { |
| "epoch": 28.530254498864366, |
| "grad_norm": 0.4434893727302551, |
| "learning_rate": 0.0002577955141275852, |
| "loss": 3.1525, |
| "step": 98000 |
| }, |
| { |
| "epoch": 28.530254498864366, |
| "eval_accuracy": 0.37440010215680436, |
| "eval_loss": 3.545118570327759, |
| "eval_runtime": 176.0773, |
| "eval_samples_per_second": 94.521, |
| "eval_steps_per_second": 5.912, |
| "step": 98000 |
| }, |
| { |
| "epoch": 28.544813930464155, |
| "grad_norm": 0.4388698637485504, |
| "learning_rate": 0.00025762073987765796, |
| "loss": 3.1592, |
| "step": 98050 |
| }, |
| { |
| "epoch": 28.559373362063944, |
| "grad_norm": 0.5155254602432251, |
| "learning_rate": 0.0002574459656277308, |
| "loss": 3.1566, |
| "step": 98100 |
| }, |
| { |
| "epoch": 28.573932793663737, |
| "grad_norm": 0.44777607917785645, |
| "learning_rate": 0.00025727119137780363, |
| "loss": 3.1591, |
| "step": 98150 |
| }, |
| { |
| "epoch": 28.588492225263526, |
| "grad_norm": 0.47019049525260925, |
| "learning_rate": 0.00025709641712787646, |
| "loss": 3.1679, |
| "step": 98200 |
| }, |
| { |
| "epoch": 28.603051656863315, |
| "grad_norm": 0.4456532895565033, |
| "learning_rate": 0.0002569216428779493, |
| "loss": 3.1684, |
| "step": 98250 |
| }, |
| { |
| "epoch": 28.617611088463107, |
| "grad_norm": 0.46583524346351624, |
| "learning_rate": 0.00025674686862802213, |
| "loss": 3.1535, |
| "step": 98300 |
| }, |
| { |
| "epoch": 28.632170520062896, |
| "grad_norm": 0.4479556381702423, |
| "learning_rate": 0.00025657209437809496, |
| "loss": 3.1657, |
| "step": 98350 |
| }, |
| { |
| "epoch": 28.64672995166269, |
| "grad_norm": 0.4612073004245758, |
| "learning_rate": 0.0002563973201281678, |
| "loss": 3.1703, |
| "step": 98400 |
| }, |
| { |
| "epoch": 28.661289383262478, |
| "grad_norm": 0.4215568006038666, |
| "learning_rate": 0.0002562225458782406, |
| "loss": 3.1538, |
| "step": 98450 |
| }, |
| { |
| "epoch": 28.675848814862267, |
| "grad_norm": 0.44902729988098145, |
| "learning_rate": 0.0002560477716283134, |
| "loss": 3.1591, |
| "step": 98500 |
| }, |
| { |
| "epoch": 28.69040824646206, |
| "grad_norm": 0.42525118589401245, |
| "learning_rate": 0.00025587299737838624, |
| "loss": 3.1546, |
| "step": 98550 |
| }, |
| { |
| "epoch": 28.70496767806185, |
| "grad_norm": 0.47658178210258484, |
| "learning_rate": 0.00025569822312845907, |
| "loss": 3.1737, |
| "step": 98600 |
| }, |
| { |
| "epoch": 28.719527109661637, |
| "grad_norm": 0.4418516457080841, |
| "learning_rate": 0.00025552344887853185, |
| "loss": 3.1702, |
| "step": 98650 |
| }, |
| { |
| "epoch": 28.73408654126143, |
| "grad_norm": 0.4458206295967102, |
| "learning_rate": 0.0002553486746286047, |
| "loss": 3.1577, |
| "step": 98700 |
| }, |
| { |
| "epoch": 28.74864597286122, |
| "grad_norm": 0.44089481234550476, |
| "learning_rate": 0.0002551739003786775, |
| "loss": 3.1698, |
| "step": 98750 |
| }, |
| { |
| "epoch": 28.76320540446101, |
| "grad_norm": 0.4285340905189514, |
| "learning_rate": 0.00025499912612875035, |
| "loss": 3.1718, |
| "step": 98800 |
| }, |
| { |
| "epoch": 28.7777648360608, |
| "grad_norm": 0.435953825712204, |
| "learning_rate": 0.0002548243518788232, |
| "loss": 3.1659, |
| "step": 98850 |
| }, |
| { |
| "epoch": 28.79232426766059, |
| "grad_norm": 0.43512865900993347, |
| "learning_rate": 0.00025464957762889596, |
| "loss": 3.1716, |
| "step": 98900 |
| }, |
| { |
| "epoch": 28.806883699260382, |
| "grad_norm": 0.4703885316848755, |
| "learning_rate": 0.0002544748033789688, |
| "loss": 3.1799, |
| "step": 98950 |
| }, |
| { |
| "epoch": 28.82144313086017, |
| "grad_norm": 0.4259127974510193, |
| "learning_rate": 0.00025430002912904163, |
| "loss": 3.1863, |
| "step": 99000 |
| }, |
| { |
| "epoch": 28.82144313086017, |
| "eval_accuracy": 0.37484139415347484, |
| "eval_loss": 3.539607048034668, |
| "eval_runtime": 176.2346, |
| "eval_samples_per_second": 94.437, |
| "eval_steps_per_second": 5.907, |
| "step": 99000 |
| }, |
| { |
| "epoch": 28.83600256245996, |
| "grad_norm": 0.4716574251651764, |
| "learning_rate": 0.00025412525487911446, |
| "loss": 3.1813, |
| "step": 99050 |
| }, |
| { |
| "epoch": 28.850561994059753, |
| "grad_norm": 0.4820871353149414, |
| "learning_rate": 0.0002539504806291873, |
| "loss": 3.1705, |
| "step": 99100 |
| }, |
| { |
| "epoch": 28.86512142565954, |
| "grad_norm": 0.4526395797729492, |
| "learning_rate": 0.00025377570637926007, |
| "loss": 3.1719, |
| "step": 99150 |
| }, |
| { |
| "epoch": 28.879680857259334, |
| "grad_norm": 0.4278837740421295, |
| "learning_rate": 0.0002536009321293329, |
| "loss": 3.1699, |
| "step": 99200 |
| }, |
| { |
| "epoch": 28.894240288859123, |
| "grad_norm": 0.4406258761882782, |
| "learning_rate": 0.00025342615787940574, |
| "loss": 3.1713, |
| "step": 99250 |
| }, |
| { |
| "epoch": 28.908799720458912, |
| "grad_norm": 0.4723201394081116, |
| "learning_rate": 0.00025325138362947857, |
| "loss": 3.1784, |
| "step": 99300 |
| }, |
| { |
| "epoch": 28.923359152058705, |
| "grad_norm": 0.4460213780403137, |
| "learning_rate": 0.00025307660937955135, |
| "loss": 3.1768, |
| "step": 99350 |
| }, |
| { |
| "epoch": 28.937918583658494, |
| "grad_norm": 0.44851601123809814, |
| "learning_rate": 0.0002529018351296242, |
| "loss": 3.1791, |
| "step": 99400 |
| }, |
| { |
| "epoch": 28.952478015258283, |
| "grad_norm": 0.45233815908432007, |
| "learning_rate": 0.00025272706087969707, |
| "loss": 3.1863, |
| "step": 99450 |
| }, |
| { |
| "epoch": 28.967037446858075, |
| "grad_norm": 0.4619120955467224, |
| "learning_rate": 0.00025255228662976985, |
| "loss": 3.1833, |
| "step": 99500 |
| }, |
| { |
| "epoch": 28.981596878457864, |
| "grad_norm": 0.44890516996383667, |
| "learning_rate": 0.0002523775123798427, |
| "loss": 3.1821, |
| "step": 99550 |
| }, |
| { |
| "epoch": 28.996156310057657, |
| "grad_norm": 0.4457606077194214, |
| "learning_rate": 0.0002522027381299155, |
| "loss": 3.1888, |
| "step": 99600 |
| }, |
| { |
| "epoch": 29.010482790751848, |
| "grad_norm": 0.46348798274993896, |
| "learning_rate": 0.00025202796387998835, |
| "loss": 3.1317, |
| "step": 99650 |
| }, |
| { |
| "epoch": 29.02504222235164, |
| "grad_norm": 0.45175158977508545, |
| "learning_rate": 0.0002518531896300612, |
| "loss": 3.0791, |
| "step": 99700 |
| }, |
| { |
| "epoch": 29.03960165395143, |
| "grad_norm": 0.492388516664505, |
| "learning_rate": 0.00025167841538013396, |
| "loss": 3.0982, |
| "step": 99750 |
| }, |
| { |
| "epoch": 29.054161085551222, |
| "grad_norm": 0.4821106195449829, |
| "learning_rate": 0.0002515036411302068, |
| "loss": 3.0746, |
| "step": 99800 |
| }, |
| { |
| "epoch": 29.06872051715101, |
| "grad_norm": 0.4595145285129547, |
| "learning_rate": 0.0002513288668802796, |
| "loss": 3.1005, |
| "step": 99850 |
| }, |
| { |
| "epoch": 29.0832799487508, |
| "grad_norm": 0.4326048195362091, |
| "learning_rate": 0.00025115409263035246, |
| "loss": 3.0932, |
| "step": 99900 |
| }, |
| { |
| "epoch": 29.097839380350592, |
| "grad_norm": 0.4561901092529297, |
| "learning_rate": 0.00025097931838042524, |
| "loss": 3.0916, |
| "step": 99950 |
| }, |
| { |
| "epoch": 29.11239881195038, |
| "grad_norm": 0.470004677772522, |
| "learning_rate": 0.00025080454413049807, |
| "loss": 3.0999, |
| "step": 100000 |
| }, |
| { |
| "epoch": 29.11239881195038, |
| "eval_accuracy": 0.3739163624087888, |
| "eval_loss": 3.552521228790283, |
| "eval_runtime": 178.7823, |
| "eval_samples_per_second": 93.091, |
| "eval_steps_per_second": 5.823, |
| "step": 100000 |
| }, |
| { |
| "epoch": 29.11239881195038, |
| "step": 100000, |
| "total_flos": 2.089834314006528e+18, |
| "train_loss": 0.6335119366455079, |
| "train_runtime": 39859.0956, |
| "train_samples_per_second": 344.628, |
| "train_steps_per_second": 4.309 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 171750, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 20, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 20 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.089834314006528e+18, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|