diff --git "a/last_to_drop_frequency_40817/checkpoint-40000/trainer_state.json" "b/last_to_drop_frequency_40817/checkpoint-40000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/last_to_drop_frequency_40817/checkpoint-40000/trainer_state.json" @@ -0,0 +1,6003 @@ +{ + "best_global_step": 40000, + "best_metric": 3.5595271587371826, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_drop_frequency_40817/checkpoint-40000", + "epoch": 11.644982819870712, + "eval_steps": 1000, + "global_step": 40000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.014559431599790344, + "grad_norm": 1.5551334619522095, + "learning_rate": 0.000294, + "loss": 8.4667, + "step": 50 + }, + { + "epoch": 0.029118863199580687, + "grad_norm": 0.7336986064910889, + "learning_rate": 0.0005939999999999999, + "loss": 6.7245, + "step": 100 + }, + { + "epoch": 0.043678294799371034, + "grad_norm": 0.4792507588863373, + "learning_rate": 0.0005998287212350713, + "loss": 6.3255, + "step": 150 + }, + { + "epoch": 0.058237726399161374, + "grad_norm": 0.47392821311950684, + "learning_rate": 0.0005996539469851441, + "loss": 6.1138, + "step": 200 + }, + { + "epoch": 0.07279715799895171, + "grad_norm": 0.442217618227005, + "learning_rate": 0.000599479172735217, + "loss": 5.9746, + "step": 250 + }, + { + "epoch": 0.08735658959874207, + "grad_norm": 0.4978708028793335, + "learning_rate": 0.0005993043984852897, + "loss": 5.8573, + "step": 300 + }, + { + "epoch": 0.10191602119853241, + "grad_norm": 0.5078408122062683, + "learning_rate": 0.0005991296242353626, + "loss": 5.7377, + "step": 350 + }, + { + "epoch": 0.11647545279832275, + "grad_norm": 0.4501552879810333, + "learning_rate": 0.0005989548499854355, + "loss": 5.613, + "step": 400 + }, + { + "epoch": 0.1310348843981131, + "grad_norm": 0.41562119126319885, + "learning_rate": 0.0005987800757355083, + "loss": 5.5049, + "step": 450 + }, + { + "epoch": 0.14559431599790343, + "grad_norm": 0.39685097336769104, + "learning_rate": 0.0005986053014855811, + "loss": 5.4153, + "step": 500 + }, + { + "epoch": 0.1601537475976938, + "grad_norm": 0.4735598862171173, + "learning_rate": 0.000598430527235654, + "loss": 5.3339, + "step": 550 + }, + { + "epoch": 0.17471317919748414, + "grad_norm": 0.4490765929222107, + "learning_rate": 0.0005982557529857267, + "loss": 5.2571, + "step": 600 + }, + { + "epoch": 0.18927261079727448, + "grad_norm": 0.5662270188331604, + "learning_rate": 0.0005980809787357995, + "loss": 5.183, + "step": 650 + }, + { + "epoch": 0.20383204239706482, + "grad_norm": 0.4178728759288788, + "learning_rate": 0.0005979062044858724, + "loss": 5.1337, + "step": 700 + }, + { + "epoch": 0.21839147399685516, + "grad_norm": 0.4277268648147583, + "learning_rate": 0.0005977314302359452, + "loss": 5.082, + "step": 750 + }, + { + "epoch": 0.2329509055966455, + "grad_norm": 0.49093976616859436, + "learning_rate": 0.0005975566559860181, + "loss": 5.0414, + "step": 800 + }, + { + "epoch": 0.24751033719643586, + "grad_norm": 0.40832236409187317, + "learning_rate": 0.0005973818817360908, + "loss": 4.9782, + "step": 850 + }, + { + "epoch": 0.2620697687962262, + "grad_norm": 0.42992544174194336, + "learning_rate": 0.0005972071074861636, + "loss": 4.9343, + "step": 900 + }, + { + "epoch": 0.2766292003960165, + "grad_norm": 0.5416184067726135, + "learning_rate": 0.0005970323332362365, + "loss": 4.8685, + "step": 950 + }, + { + "epoch": 0.29118863199580686, + "grad_norm": 0.5198241472244263, + "learning_rate": 0.0005968575589863093, + "loss": 4.849, + "step": 1000 + }, + { + "epoch": 0.29118863199580686, + "eval_accuracy": 0.25379510529217636, + "eval_loss": 4.761143684387207, + "eval_runtime": 183.641, + "eval_samples_per_second": 90.628, + "eval_steps_per_second": 5.669, + "step": 1000 + }, + { + "epoch": 0.30574806359559725, + "grad_norm": 0.47911056876182556, + "learning_rate": 0.0005966827847363822, + "loss": 4.7726, + "step": 1050 + }, + { + "epoch": 0.3203074951953876, + "grad_norm": 0.4758392572402954, + "learning_rate": 0.000596508010486455, + "loss": 4.7537, + "step": 1100 + }, + { + "epoch": 0.33486692679517793, + "grad_norm": 0.47129762172698975, + "learning_rate": 0.0005963332362365277, + "loss": 4.7115, + "step": 1150 + }, + { + "epoch": 0.3494263583949683, + "grad_norm": 0.42803141474723816, + "learning_rate": 0.0005961584619866006, + "loss": 4.6751, + "step": 1200 + }, + { + "epoch": 0.3639857899947586, + "grad_norm": 0.4740878641605377, + "learning_rate": 0.0005959836877366734, + "loss": 4.6417, + "step": 1250 + }, + { + "epoch": 0.37854522159454895, + "grad_norm": 0.40221309661865234, + "learning_rate": 0.0005958089134867463, + "loss": 4.6053, + "step": 1300 + }, + { + "epoch": 0.3931046531943393, + "grad_norm": 0.44672706723213196, + "learning_rate": 0.0005956341392368191, + "loss": 4.5801, + "step": 1350 + }, + { + "epoch": 0.40766408479412963, + "grad_norm": 0.4823697507381439, + "learning_rate": 0.0005954593649868918, + "loss": 4.5599, + "step": 1400 + }, + { + "epoch": 0.42222351639392, + "grad_norm": 0.5122449398040771, + "learning_rate": 0.0005952845907369647, + "loss": 4.5344, + "step": 1450 + }, + { + "epoch": 0.4367829479937103, + "grad_norm": 0.4088864028453827, + "learning_rate": 0.0005951098164870375, + "loss": 4.4951, + "step": 1500 + }, + { + "epoch": 0.45134237959350065, + "grad_norm": 0.40731462836265564, + "learning_rate": 0.0005949350422371104, + "loss": 4.5018, + "step": 1550 + }, + { + "epoch": 0.465901811193291, + "grad_norm": 0.4263319671154022, + "learning_rate": 0.0005947602679871832, + "loss": 4.4755, + "step": 1600 + }, + { + "epoch": 0.48046124279308133, + "grad_norm": 0.38340768218040466, + "learning_rate": 0.000594585493737256, + "loss": 4.4569, + "step": 1650 + }, + { + "epoch": 0.49502067439287173, + "grad_norm": 0.3979549705982208, + "learning_rate": 0.0005944107194873288, + "loss": 4.4444, + "step": 1700 + }, + { + "epoch": 0.509580105992662, + "grad_norm": 0.4176700711250305, + "learning_rate": 0.0005942359452374016, + "loss": 4.4173, + "step": 1750 + }, + { + "epoch": 0.5241395375924524, + "grad_norm": 0.3926246464252472, + "learning_rate": 0.0005940611709874745, + "loss": 4.4004, + "step": 1800 + }, + { + "epoch": 0.5386989691922427, + "grad_norm": 0.3877740502357483, + "learning_rate": 0.0005938863967375473, + "loss": 4.3869, + "step": 1850 + }, + { + "epoch": 0.553258400792033, + "grad_norm": 0.4315814971923828, + "learning_rate": 0.0005937116224876201, + "loss": 4.3636, + "step": 1900 + }, + { + "epoch": 0.5678178323918234, + "grad_norm": 0.403978556394577, + "learning_rate": 0.000593536848237693, + "loss": 4.3628, + "step": 1950 + }, + { + "epoch": 0.5823772639916137, + "grad_norm": 0.39984941482543945, + "learning_rate": 0.0005933620739877657, + "loss": 4.3402, + "step": 2000 + }, + { + "epoch": 0.5823772639916137, + "eval_accuracy": 0.29928804185701036, + "eval_loss": 4.288719177246094, + "eval_runtime": 180.6232, + "eval_samples_per_second": 92.142, + "eval_steps_per_second": 5.763, + "step": 2000 + }, + { + "epoch": 0.5969366955914042, + "grad_norm": 0.4173935055732727, + "learning_rate": 0.0005931872997378385, + "loss": 4.3411, + "step": 2050 + }, + { + "epoch": 0.6114961271911945, + "grad_norm": 0.37241849303245544, + "learning_rate": 0.0005930125254879114, + "loss": 4.3243, + "step": 2100 + }, + { + "epoch": 0.6260555587909848, + "grad_norm": 0.4274754822254181, + "learning_rate": 0.0005928377512379842, + "loss": 4.2883, + "step": 2150 + }, + { + "epoch": 0.6406149903907752, + "grad_norm": 0.4375714063644409, + "learning_rate": 0.0005926629769880571, + "loss": 4.2941, + "step": 2200 + }, + { + "epoch": 0.6551744219905655, + "grad_norm": 0.39245837926864624, + "learning_rate": 0.0005924882027381298, + "loss": 4.2863, + "step": 2250 + }, + { + "epoch": 0.6697338535903559, + "grad_norm": 0.3508373498916626, + "learning_rate": 0.0005923134284882026, + "loss": 4.2683, + "step": 2300 + }, + { + "epoch": 0.6842932851901462, + "grad_norm": 0.37966057658195496, + "learning_rate": 0.0005921386542382755, + "loss": 4.268, + "step": 2350 + }, + { + "epoch": 0.6988527167899365, + "grad_norm": 0.4270515441894531, + "learning_rate": 0.0005919638799883483, + "loss": 4.2548, + "step": 2400 + }, + { + "epoch": 0.7134121483897269, + "grad_norm": 0.36582618951797485, + "learning_rate": 0.0005917891057384212, + "loss": 4.2418, + "step": 2450 + }, + { + "epoch": 0.7279715799895172, + "grad_norm": 0.3588745594024658, + "learning_rate": 0.000591614331488494, + "loss": 4.2315, + "step": 2500 + }, + { + "epoch": 0.7425310115893076, + "grad_norm": 0.3805822730064392, + "learning_rate": 0.0005914395572385667, + "loss": 4.2263, + "step": 2550 + }, + { + "epoch": 0.7570904431890979, + "grad_norm": 0.37862271070480347, + "learning_rate": 0.0005912647829886396, + "loss": 4.2177, + "step": 2600 + }, + { + "epoch": 0.7716498747888882, + "grad_norm": 0.40694668889045715, + "learning_rate": 0.0005910900087387124, + "loss": 4.1886, + "step": 2650 + }, + { + "epoch": 0.7862093063886786, + "grad_norm": 0.3988340497016907, + "learning_rate": 0.0005909152344887853, + "loss": 4.1907, + "step": 2700 + }, + { + "epoch": 0.8007687379884689, + "grad_norm": 0.4412493109703064, + "learning_rate": 0.0005907404602388581, + "loss": 4.1929, + "step": 2750 + }, + { + "epoch": 0.8153281695882593, + "grad_norm": 0.37306517362594604, + "learning_rate": 0.0005905656859889308, + "loss": 4.1721, + "step": 2800 + }, + { + "epoch": 0.8298876011880496, + "grad_norm": 0.36752834916114807, + "learning_rate": 0.0005903909117390037, + "loss": 4.1729, + "step": 2850 + }, + { + "epoch": 0.84444703278784, + "grad_norm": 0.38249292969703674, + "learning_rate": 0.0005902161374890766, + "loss": 4.17, + "step": 2900 + }, + { + "epoch": 0.8590064643876303, + "grad_norm": 0.3479909598827362, + "learning_rate": 0.0005900413632391494, + "loss": 4.1629, + "step": 2950 + }, + { + "epoch": 0.8735658959874206, + "grad_norm": 0.34885624051094055, + "learning_rate": 0.0005898665889892223, + "loss": 4.1563, + "step": 3000 + }, + { + "epoch": 0.8735658959874206, + "eval_accuracy": 0.31519818808069494, + "eval_loss": 4.099164009094238, + "eval_runtime": 183.4247, + "eval_samples_per_second": 90.735, + "eval_steps_per_second": 5.675, + "step": 3000 + }, + { + "epoch": 0.888125327587211, + "grad_norm": 0.38681846857070923, + "learning_rate": 0.0005896918147392951, + "loss": 4.1567, + "step": 3050 + }, + { + "epoch": 0.9026847591870013, + "grad_norm": 0.3432327210903168, + "learning_rate": 0.0005895170404893678, + "loss": 4.1293, + "step": 3100 + }, + { + "epoch": 0.9172441907867916, + "grad_norm": 0.3937830626964569, + "learning_rate": 0.0005893422662394407, + "loss": 4.1285, + "step": 3150 + }, + { + "epoch": 0.931803622386582, + "grad_norm": 0.39171546697616577, + "learning_rate": 0.0005891674919895135, + "loss": 4.1279, + "step": 3200 + }, + { + "epoch": 0.9463630539863723, + "grad_norm": 0.37026646733283997, + "learning_rate": 0.0005889927177395864, + "loss": 4.1106, + "step": 3250 + }, + { + "epoch": 0.9609224855861627, + "grad_norm": 0.3460790812969208, + "learning_rate": 0.0005888179434896592, + "loss": 4.1132, + "step": 3300 + }, + { + "epoch": 0.975481917185953, + "grad_norm": 0.36886388063430786, + "learning_rate": 0.000588643169239732, + "loss": 4.0977, + "step": 3350 + }, + { + "epoch": 0.9900413487857435, + "grad_norm": 0.36020082235336304, + "learning_rate": 0.0005884683949898048, + "loss": 4.0966, + "step": 3400 + }, + { + "epoch": 1.0043678294799372, + "grad_norm": 0.33763444423675537, + "learning_rate": 0.0005882936207398776, + "loss": 4.0577, + "step": 3450 + }, + { + "epoch": 1.0189272610797275, + "grad_norm": 0.34525808691978455, + "learning_rate": 0.0005881188464899504, + "loss": 4.0248, + "step": 3500 + }, + { + "epoch": 1.0334866926795179, + "grad_norm": 0.37068355083465576, + "learning_rate": 0.0005879440722400233, + "loss": 4.0183, + "step": 3550 + }, + { + "epoch": 1.0480461242793082, + "grad_norm": 0.34973421692848206, + "learning_rate": 0.0005877692979900961, + "loss": 4.0291, + "step": 3600 + }, + { + "epoch": 1.0626055558790986, + "grad_norm": 0.3637358248233795, + "learning_rate": 0.000587594523740169, + "loss": 4.0199, + "step": 3650 + }, + { + "epoch": 1.077164987478889, + "grad_norm": 0.34920114278793335, + "learning_rate": 0.0005874197494902417, + "loss": 4.0247, + "step": 3700 + }, + { + "epoch": 1.0917244190786792, + "grad_norm": 0.3420464098453522, + "learning_rate": 0.0005872449752403145, + "loss": 4.0189, + "step": 3750 + }, + { + "epoch": 1.1062838506784696, + "grad_norm": 0.34696176648139954, + "learning_rate": 0.0005870702009903874, + "loss": 4.0089, + "step": 3800 + }, + { + "epoch": 1.12084328227826, + "grad_norm": 0.3416752815246582, + "learning_rate": 0.0005868954267404602, + "loss": 3.9978, + "step": 3850 + }, + { + "epoch": 1.1354027138780503, + "grad_norm": 0.3729047179222107, + "learning_rate": 0.0005867206524905331, + "loss": 3.9976, + "step": 3900 + }, + { + "epoch": 1.1499621454778406, + "grad_norm": 0.34707263112068176, + "learning_rate": 0.0005865458782406058, + "loss": 3.9927, + "step": 3950 + }, + { + "epoch": 1.164521577077631, + "grad_norm": 0.3424519896507263, + "learning_rate": 0.0005863711039906786, + "loss": 3.9798, + "step": 4000 + }, + { + "epoch": 1.164521577077631, + "eval_accuracy": 0.32528629009357674, + "eval_loss": 3.9908077716827393, + "eval_runtime": 180.5563, + "eval_samples_per_second": 92.176, + "eval_steps_per_second": 5.766, + "step": 4000 + }, + { + "epoch": 1.1790810086774213, + "grad_norm": 0.3473677635192871, + "learning_rate": 0.0005861963297407515, + "loss": 3.9837, + "step": 4050 + }, + { + "epoch": 1.1936404402772116, + "grad_norm": 0.3695130944252014, + "learning_rate": 0.0005860215554908243, + "loss": 3.9857, + "step": 4100 + }, + { + "epoch": 1.208199871877002, + "grad_norm": 0.3494517207145691, + "learning_rate": 0.0005858467812408972, + "loss": 3.9749, + "step": 4150 + }, + { + "epoch": 1.2227593034767923, + "grad_norm": 0.3514440655708313, + "learning_rate": 0.00058567200699097, + "loss": 3.9773, + "step": 4200 + }, + { + "epoch": 1.2373187350765826, + "grad_norm": 0.33939051628112793, + "learning_rate": 0.0005854972327410427, + "loss": 3.9868, + "step": 4250 + }, + { + "epoch": 1.251878166676373, + "grad_norm": 0.39269140362739563, + "learning_rate": 0.0005853224584911156, + "loss": 3.9676, + "step": 4300 + }, + { + "epoch": 1.2664375982761633, + "grad_norm": 0.3487934470176697, + "learning_rate": 0.0005851476842411884, + "loss": 3.973, + "step": 4350 + }, + { + "epoch": 1.2809970298759537, + "grad_norm": 0.33803650736808777, + "learning_rate": 0.0005849729099912613, + "loss": 3.9805, + "step": 4400 + }, + { + "epoch": 1.295556461475744, + "grad_norm": 0.34375283122062683, + "learning_rate": 0.0005847981357413341, + "loss": 3.9729, + "step": 4450 + }, + { + "epoch": 1.3101158930755343, + "grad_norm": 0.3429529070854187, + "learning_rate": 0.0005846233614914068, + "loss": 3.9492, + "step": 4500 + }, + { + "epoch": 1.3246753246753247, + "grad_norm": 0.3482668399810791, + "learning_rate": 0.0005844485872414797, + "loss": 3.9654, + "step": 4550 + }, + { + "epoch": 1.339234756275115, + "grad_norm": 0.3361050486564636, + "learning_rate": 0.0005842738129915525, + "loss": 3.9693, + "step": 4600 + }, + { + "epoch": 1.3537941878749054, + "grad_norm": 0.34350207448005676, + "learning_rate": 0.0005840990387416253, + "loss": 3.9628, + "step": 4650 + }, + { + "epoch": 1.3683536194746957, + "grad_norm": 0.35732749104499817, + "learning_rate": 0.0005839242644916982, + "loss": 3.9383, + "step": 4700 + }, + { + "epoch": 1.382913051074486, + "grad_norm": 0.32812654972076416, + "learning_rate": 0.000583749490241771, + "loss": 3.9402, + "step": 4750 + }, + { + "epoch": 1.3974724826742764, + "grad_norm": 0.3359614312648773, + "learning_rate": 0.0005835747159918438, + "loss": 3.9409, + "step": 4800 + }, + { + "epoch": 1.4120319142740667, + "grad_norm": 0.36291930079460144, + "learning_rate": 0.0005833999417419166, + "loss": 3.9373, + "step": 4850 + }, + { + "epoch": 1.426591345873857, + "grad_norm": 0.3357282876968384, + "learning_rate": 0.0005832251674919894, + "loss": 3.9373, + "step": 4900 + }, + { + "epoch": 1.4411507774736474, + "grad_norm": 0.3662075996398926, + "learning_rate": 0.0005830503932420623, + "loss": 3.9326, + "step": 4950 + }, + { + "epoch": 1.4557102090734377, + "grad_norm": 0.3387506604194641, + "learning_rate": 0.0005828756189921351, + "loss": 3.9189, + "step": 5000 + }, + { + "epoch": 1.4557102090734377, + "eval_accuracy": 0.3320894535210645, + "eval_loss": 3.91398549079895, + "eval_runtime": 185.101, + "eval_samples_per_second": 89.913, + "eval_steps_per_second": 5.624, + "step": 5000 + }, + { + "epoch": 1.470269640673228, + "grad_norm": 0.32989710569381714, + "learning_rate": 0.000582700844742208, + "loss": 3.9282, + "step": 5050 + }, + { + "epoch": 1.4848290722730184, + "grad_norm": 0.3328815996646881, + "learning_rate": 0.0005825260704922807, + "loss": 3.9183, + "step": 5100 + }, + { + "epoch": 1.4993885038728088, + "grad_norm": 0.33961018919944763, + "learning_rate": 0.0005823512962423535, + "loss": 3.9253, + "step": 5150 + }, + { + "epoch": 1.5139479354725993, + "grad_norm": 0.33562958240509033, + "learning_rate": 0.0005821765219924264, + "loss": 3.9222, + "step": 5200 + }, + { + "epoch": 1.5285073670723897, + "grad_norm": 0.3406899571418762, + "learning_rate": 0.0005820017477424992, + "loss": 3.9185, + "step": 5250 + }, + { + "epoch": 1.54306679867218, + "grad_norm": 0.3406858742237091, + "learning_rate": 0.0005818269734925721, + "loss": 3.9156, + "step": 5300 + }, + { + "epoch": 1.5576262302719703, + "grad_norm": 0.34090015292167664, + "learning_rate": 0.0005816521992426448, + "loss": 3.8969, + "step": 5350 + }, + { + "epoch": 1.5721856618717607, + "grad_norm": 0.31158268451690674, + "learning_rate": 0.0005814774249927176, + "loss": 3.9143, + "step": 5400 + }, + { + "epoch": 1.586745093471551, + "grad_norm": 0.34926122426986694, + "learning_rate": 0.0005813026507427905, + "loss": 3.9132, + "step": 5450 + }, + { + "epoch": 1.6013045250713414, + "grad_norm": 0.34333717823028564, + "learning_rate": 0.0005811278764928634, + "loss": 3.9041, + "step": 5500 + }, + { + "epoch": 1.6158639566711317, + "grad_norm": 0.3164921998977661, + "learning_rate": 0.0005809531022429362, + "loss": 3.908, + "step": 5550 + }, + { + "epoch": 1.630423388270922, + "grad_norm": 0.3325600028038025, + "learning_rate": 0.0005807783279930091, + "loss": 3.8937, + "step": 5600 + }, + { + "epoch": 1.6449828198707124, + "grad_norm": 0.3716844916343689, + "learning_rate": 0.0005806035537430818, + "loss": 3.913, + "step": 5650 + }, + { + "epoch": 1.6595422514705027, + "grad_norm": 0.3302454352378845, + "learning_rate": 0.0005804287794931546, + "loss": 3.8894, + "step": 5700 + }, + { + "epoch": 1.674101683070293, + "grad_norm": 0.3286576271057129, + "learning_rate": 0.0005802540052432275, + "loss": 3.9061, + "step": 5750 + }, + { + "epoch": 1.6886611146700834, + "grad_norm": 0.31899774074554443, + "learning_rate": 0.0005800792309933003, + "loss": 3.885, + "step": 5800 + }, + { + "epoch": 1.7032205462698737, + "grad_norm": 0.38346347212791443, + "learning_rate": 0.0005799044567433732, + "loss": 3.8978, + "step": 5850 + }, + { + "epoch": 1.717779977869664, + "grad_norm": 0.32501021027565, + "learning_rate": 0.000579729682493446, + "loss": 3.8928, + "step": 5900 + }, + { + "epoch": 1.7323394094694544, + "grad_norm": 0.33264926075935364, + "learning_rate": 0.0005795549082435187, + "loss": 3.8917, + "step": 5950 + }, + { + "epoch": 1.7468988410692448, + "grad_norm": 0.35515546798706055, + "learning_rate": 0.0005793801339935916, + "loss": 3.8836, + "step": 6000 + }, + { + "epoch": 1.7468988410692448, + "eval_accuracy": 0.33675024013551297, + "eval_loss": 3.8585171699523926, + "eval_runtime": 185.0399, + "eval_samples_per_second": 89.943, + "eval_steps_per_second": 5.626, + "step": 6000 + }, + { + "epoch": 1.761458272669035, + "grad_norm": 0.3250105679035187, + "learning_rate": 0.0005792053597436644, + "loss": 3.8774, + "step": 6050 + }, + { + "epoch": 1.7760177042688254, + "grad_norm": 0.333280473947525, + "learning_rate": 0.0005790305854937372, + "loss": 3.8726, + "step": 6100 + }, + { + "epoch": 1.7905771358686158, + "grad_norm": 0.32873275876045227, + "learning_rate": 0.0005788558112438101, + "loss": 3.8701, + "step": 6150 + }, + { + "epoch": 1.8051365674684061, + "grad_norm": 0.3332742154598236, + "learning_rate": 0.0005786810369938828, + "loss": 3.8699, + "step": 6200 + }, + { + "epoch": 1.8196959990681965, + "grad_norm": 0.3222472369670868, + "learning_rate": 0.0005785062627439557, + "loss": 3.874, + "step": 6250 + }, + { + "epoch": 1.8342554306679868, + "grad_norm": 0.3324868381023407, + "learning_rate": 0.0005783314884940285, + "loss": 3.869, + "step": 6300 + }, + { + "epoch": 1.8488148622677771, + "grad_norm": 0.32730036973953247, + "learning_rate": 0.0005781567142441013, + "loss": 3.8536, + "step": 6350 + }, + { + "epoch": 1.8633742938675675, + "grad_norm": 0.3353622257709503, + "learning_rate": 0.0005779819399941742, + "loss": 3.869, + "step": 6400 + }, + { + "epoch": 1.8779337254673578, + "grad_norm": 0.33830076456069946, + "learning_rate": 0.000577807165744247, + "loss": 3.8726, + "step": 6450 + }, + { + "epoch": 1.8924931570671482, + "grad_norm": 0.31618306040763855, + "learning_rate": 0.0005776323914943198, + "loss": 3.8508, + "step": 6500 + }, + { + "epoch": 1.9070525886669385, + "grad_norm": 0.33165860176086426, + "learning_rate": 0.0005774576172443926, + "loss": 3.8566, + "step": 6550 + }, + { + "epoch": 1.9216120202667288, + "grad_norm": 0.3387751579284668, + "learning_rate": 0.0005772828429944654, + "loss": 3.8548, + "step": 6600 + }, + { + "epoch": 1.9361714518665192, + "grad_norm": 0.3364385664463043, + "learning_rate": 0.0005771080687445383, + "loss": 3.8539, + "step": 6650 + }, + { + "epoch": 1.9507308834663095, + "grad_norm": 0.34390878677368164, + "learning_rate": 0.0005769332944946111, + "loss": 3.8631, + "step": 6700 + }, + { + "epoch": 1.9652903150660999, + "grad_norm": 0.3324083685874939, + "learning_rate": 0.0005767585202446839, + "loss": 3.8482, + "step": 6750 + }, + { + "epoch": 1.9798497466658902, + "grad_norm": 0.32365697622299194, + "learning_rate": 0.0005765837459947567, + "loss": 3.8303, + "step": 6800 + }, + { + "epoch": 1.9944091782656805, + "grad_norm": 0.3342290222644806, + "learning_rate": 0.0005764089717448295, + "loss": 3.8508, + "step": 6850 + }, + { + "epoch": 2.0087356589598744, + "grad_norm": 0.3290010392665863, + "learning_rate": 0.0005762341974949024, + "loss": 3.7915, + "step": 6900 + }, + { + "epoch": 2.0232950905596647, + "grad_norm": 0.3240971565246582, + "learning_rate": 0.0005760594232449752, + "loss": 3.7587, + "step": 6950 + }, + { + "epoch": 2.037854522159455, + "grad_norm": 0.3391764163970947, + "learning_rate": 0.0005758846489950481, + "loss": 3.7526, + "step": 7000 + }, + { + "epoch": 2.037854522159455, + "eval_accuracy": 0.34109519666654636, + "eval_loss": 3.816195249557495, + "eval_runtime": 184.953, + "eval_samples_per_second": 89.985, + "eval_steps_per_second": 5.628, + "step": 7000 + }, + { + "epoch": 2.0524139537592454, + "grad_norm": 0.33266958594322205, + "learning_rate": 0.0005757098747451208, + "loss": 3.7541, + "step": 7050 + }, + { + "epoch": 2.0669733853590357, + "grad_norm": 0.34850549697875977, + "learning_rate": 0.0005755351004951936, + "loss": 3.7518, + "step": 7100 + }, + { + "epoch": 2.081532816958826, + "grad_norm": 0.3229345679283142, + "learning_rate": 0.0005753603262452665, + "loss": 3.7485, + "step": 7150 + }, + { + "epoch": 2.0960922485586164, + "grad_norm": 0.31956946849823, + "learning_rate": 0.0005751855519953393, + "loss": 3.7446, + "step": 7200 + }, + { + "epoch": 2.1106516801584068, + "grad_norm": 0.3483135402202606, + "learning_rate": 0.0005750107777454121, + "loss": 3.76, + "step": 7250 + }, + { + "epoch": 2.125211111758197, + "grad_norm": 0.3251873850822449, + "learning_rate": 0.0005748360034954849, + "loss": 3.7494, + "step": 7300 + }, + { + "epoch": 2.1397705433579874, + "grad_norm": 0.3456031382083893, + "learning_rate": 0.0005746612292455577, + "loss": 3.7564, + "step": 7350 + }, + { + "epoch": 2.154329974957778, + "grad_norm": 0.3253571093082428, + "learning_rate": 0.0005744864549956306, + "loss": 3.7517, + "step": 7400 + }, + { + "epoch": 2.168889406557568, + "grad_norm": 0.322238564491272, + "learning_rate": 0.0005743116807457034, + "loss": 3.7582, + "step": 7450 + }, + { + "epoch": 2.1834488381573585, + "grad_norm": 0.33640897274017334, + "learning_rate": 0.0005741369064957762, + "loss": 3.7567, + "step": 7500 + }, + { + "epoch": 2.198008269757149, + "grad_norm": 0.3346073627471924, + "learning_rate": 0.0005739621322458491, + "loss": 3.7461, + "step": 7550 + }, + { + "epoch": 2.212567701356939, + "grad_norm": 0.3327328860759735, + "learning_rate": 0.0005737873579959218, + "loss": 3.7594, + "step": 7600 + }, + { + "epoch": 2.2271271329567295, + "grad_norm": 0.3236997723579407, + "learning_rate": 0.0005736125837459947, + "loss": 3.7726, + "step": 7650 + }, + { + "epoch": 2.24168656455652, + "grad_norm": 0.33130574226379395, + "learning_rate": 0.0005734378094960675, + "loss": 3.7486, + "step": 7700 + }, + { + "epoch": 2.25624599615631, + "grad_norm": 0.34919485449790955, + "learning_rate": 0.0005732630352461403, + "loss": 3.7578, + "step": 7750 + }, + { + "epoch": 2.2708054277561005, + "grad_norm": 0.3182968497276306, + "learning_rate": 0.0005730882609962132, + "loss": 3.7604, + "step": 7800 + }, + { + "epoch": 2.285364859355891, + "grad_norm": 0.30436646938323975, + "learning_rate": 0.0005729134867462859, + "loss": 3.7412, + "step": 7850 + }, + { + "epoch": 2.299924290955681, + "grad_norm": 0.3302886188030243, + "learning_rate": 0.0005727387124963588, + "loss": 3.7515, + "step": 7900 + }, + { + "epoch": 2.3144837225554715, + "grad_norm": 0.30620837211608887, + "learning_rate": 0.0005725639382464317, + "loss": 3.7695, + "step": 7950 + }, + { + "epoch": 2.329043154155262, + "grad_norm": 0.3169257640838623, + "learning_rate": 0.0005723891639965045, + "loss": 3.7682, + "step": 8000 + }, + { + "epoch": 2.329043154155262, + "eval_accuracy": 0.34396188967982283, + "eval_loss": 3.788954496383667, + "eval_runtime": 182.8165, + "eval_samples_per_second": 91.037, + "eval_steps_per_second": 5.694, + "step": 8000 + }, + { + "epoch": 2.343602585755052, + "grad_norm": 0.3280718922615051, + "learning_rate": 0.0005722143897465773, + "loss": 3.7452, + "step": 8050 + }, + { + "epoch": 2.3581620173548425, + "grad_norm": 0.3237084746360779, + "learning_rate": 0.0005720396154966502, + "loss": 3.762, + "step": 8100 + }, + { + "epoch": 2.372721448954633, + "grad_norm": 0.31791386008262634, + "learning_rate": 0.0005718648412467229, + "loss": 3.7504, + "step": 8150 + }, + { + "epoch": 2.3872808805544232, + "grad_norm": 0.32723358273506165, + "learning_rate": 0.0005716900669967958, + "loss": 3.7561, + "step": 8200 + }, + { + "epoch": 2.4018403121542136, + "grad_norm": 0.3216814398765564, + "learning_rate": 0.0005715152927468686, + "loss": 3.7496, + "step": 8250 + }, + { + "epoch": 2.416399743754004, + "grad_norm": 0.32928794622421265, + "learning_rate": 0.0005713405184969414, + "loss": 3.7533, + "step": 8300 + }, + { + "epoch": 2.4309591753537942, + "grad_norm": 0.3223062753677368, + "learning_rate": 0.0005711657442470143, + "loss": 3.766, + "step": 8350 + }, + { + "epoch": 2.4455186069535846, + "grad_norm": 0.3292803168296814, + "learning_rate": 0.000570990969997087, + "loss": 3.7502, + "step": 8400 + }, + { + "epoch": 2.460078038553375, + "grad_norm": 0.3402736783027649, + "learning_rate": 0.0005708161957471599, + "loss": 3.744, + "step": 8450 + }, + { + "epoch": 2.4746374701531653, + "grad_norm": 0.3164720833301544, + "learning_rate": 0.0005706414214972327, + "loss": 3.7426, + "step": 8500 + }, + { + "epoch": 2.4891969017529556, + "grad_norm": 0.33465683460235596, + "learning_rate": 0.0005704666472473055, + "loss": 3.756, + "step": 8550 + }, + { + "epoch": 2.503756333352746, + "grad_norm": 0.3301171362400055, + "learning_rate": 0.0005702918729973784, + "loss": 3.7448, + "step": 8600 + }, + { + "epoch": 2.5183157649525363, + "grad_norm": 0.3436541259288788, + "learning_rate": 0.0005701170987474512, + "loss": 3.7449, + "step": 8650 + }, + { + "epoch": 2.5328751965523266, + "grad_norm": 0.3333314061164856, + "learning_rate": 0.0005699423244975239, + "loss": 3.7381, + "step": 8700 + }, + { + "epoch": 2.547434628152117, + "grad_norm": 0.3258245885372162, + "learning_rate": 0.0005697675502475968, + "loss": 3.7338, + "step": 8750 + }, + { + "epoch": 2.5619940597519073, + "grad_norm": 0.34784647822380066, + "learning_rate": 0.0005695927759976696, + "loss": 3.734, + "step": 8800 + }, + { + "epoch": 2.5765534913516976, + "grad_norm": 0.31109482049942017, + "learning_rate": 0.0005694180017477425, + "loss": 3.7372, + "step": 8850 + }, + { + "epoch": 2.591112922951488, + "grad_norm": 0.31201112270355225, + "learning_rate": 0.0005692432274978153, + "loss": 3.7499, + "step": 8900 + }, + { + "epoch": 2.6056723545512783, + "grad_norm": 0.31193050742149353, + "learning_rate": 0.000569068453247888, + "loss": 3.7385, + "step": 8950 + }, + { + "epoch": 2.6202317861510687, + "grad_norm": 0.3446432948112488, + "learning_rate": 0.0005688936789979609, + "loss": 3.7477, + "step": 9000 + }, + { + "epoch": 2.6202317861510687, + "eval_accuracy": 0.3468738524556142, + "eval_loss": 3.757246255874634, + "eval_runtime": 182.4423, + "eval_samples_per_second": 91.223, + "eval_steps_per_second": 5.706, + "step": 9000 + }, + { + "epoch": 2.634791217750859, + "grad_norm": 0.31883829832077026, + "learning_rate": 0.0005687189047480337, + "loss": 3.7364, + "step": 9050 + }, + { + "epoch": 2.6493506493506493, + "grad_norm": 0.3273116946220398, + "learning_rate": 0.0005685441304981066, + "loss": 3.7312, + "step": 9100 + }, + { + "epoch": 2.6639100809504397, + "grad_norm": 0.3443247973918915, + "learning_rate": 0.0005683693562481794, + "loss": 3.7366, + "step": 9150 + }, + { + "epoch": 2.67846951255023, + "grad_norm": 0.30951568484306335, + "learning_rate": 0.0005681945819982522, + "loss": 3.7425, + "step": 9200 + }, + { + "epoch": 2.6930289441500204, + "grad_norm": 0.3140866756439209, + "learning_rate": 0.000568019807748325, + "loss": 3.7396, + "step": 9250 + }, + { + "epoch": 2.7075883757498107, + "grad_norm": 0.32707467675209045, + "learning_rate": 0.0005678450334983978, + "loss": 3.7348, + "step": 9300 + }, + { + "epoch": 2.722147807349601, + "grad_norm": 0.32110151648521423, + "learning_rate": 0.0005676702592484707, + "loss": 3.7223, + "step": 9350 + }, + { + "epoch": 2.7367072389493914, + "grad_norm": 0.3235968053340912, + "learning_rate": 0.0005674954849985435, + "loss": 3.7379, + "step": 9400 + }, + { + "epoch": 2.7512666705491817, + "grad_norm": 0.34924793243408203, + "learning_rate": 0.0005673207107486163, + "loss": 3.7503, + "step": 9450 + }, + { + "epoch": 2.765826102148972, + "grad_norm": 0.32524895668029785, + "learning_rate": 0.0005671459364986892, + "loss": 3.7302, + "step": 9500 + }, + { + "epoch": 2.7803855337487624, + "grad_norm": 0.3183753490447998, + "learning_rate": 0.0005669711622487619, + "loss": 3.7229, + "step": 9550 + }, + { + "epoch": 2.7949449653485527, + "grad_norm": 0.31938815116882324, + "learning_rate": 0.0005667963879988348, + "loss": 3.7208, + "step": 9600 + }, + { + "epoch": 2.809504396948343, + "grad_norm": 0.3149973154067993, + "learning_rate": 0.0005666216137489076, + "loss": 3.7312, + "step": 9650 + }, + { + "epoch": 2.8240638285481334, + "grad_norm": 0.32664161920547485, + "learning_rate": 0.0005664468394989804, + "loss": 3.7436, + "step": 9700 + }, + { + "epoch": 2.8386232601479238, + "grad_norm": 0.31149327754974365, + "learning_rate": 0.0005662720652490533, + "loss": 3.728, + "step": 9750 + }, + { + "epoch": 2.853182691747714, + "grad_norm": 0.3289666175842285, + "learning_rate": 0.000566097290999126, + "loss": 3.7286, + "step": 9800 + }, + { + "epoch": 2.8677421233475044, + "grad_norm": 0.3204244077205658, + "learning_rate": 0.0005659225167491988, + "loss": 3.7122, + "step": 9850 + }, + { + "epoch": 2.882301554947295, + "grad_norm": 0.33363139629364014, + "learning_rate": 0.0005657477424992717, + "loss": 3.7409, + "step": 9900 + }, + { + "epoch": 2.896860986547085, + "grad_norm": 0.3554539084434509, + "learning_rate": 0.0005655729682493445, + "loss": 3.7301, + "step": 9950 + }, + { + "epoch": 2.9114204181468755, + "grad_norm": 0.306832879781723, + "learning_rate": 0.0005653981939994174, + "loss": 3.73, + "step": 10000 + }, + { + "epoch": 2.9114204181468755, + "eval_accuracy": 0.3494360034301546, + "eval_loss": 3.729952573776245, + "eval_runtime": 181.5285, + "eval_samples_per_second": 91.683, + "eval_steps_per_second": 5.735, + "step": 10000 + }, + { + "epoch": 2.925979849746666, + "grad_norm": 0.31433573365211487, + "learning_rate": 0.0005652234197494902, + "loss": 3.7247, + "step": 10050 + }, + { + "epoch": 2.940539281346456, + "grad_norm": 0.3179089426994324, + "learning_rate": 0.0005650486454995629, + "loss": 3.7153, + "step": 10100 + }, + { + "epoch": 2.9550987129462465, + "grad_norm": 0.3196451961994171, + "learning_rate": 0.0005648738712496358, + "loss": 3.7189, + "step": 10150 + }, + { + "epoch": 2.969658144546037, + "grad_norm": 0.30295759439468384, + "learning_rate": 0.0005646990969997086, + "loss": 3.7165, + "step": 10200 + }, + { + "epoch": 2.984217576145827, + "grad_norm": 0.32530921697616577, + "learning_rate": 0.0005645243227497815, + "loss": 3.715, + "step": 10250 + }, + { + "epoch": 2.9987770077456175, + "grad_norm": 0.30198994278907776, + "learning_rate": 0.0005643495484998543, + "loss": 3.7192, + "step": 10300 + }, + { + "epoch": 3.0131034884398114, + "grad_norm": 0.31793293356895447, + "learning_rate": 0.000564174774249927, + "loss": 3.6316, + "step": 10350 + }, + { + "epoch": 3.0276629200396017, + "grad_norm": 0.3131251633167267, + "learning_rate": 0.0005639999999999999, + "loss": 3.6161, + "step": 10400 + }, + { + "epoch": 3.042222351639392, + "grad_norm": 0.3221314251422882, + "learning_rate": 0.0005638252257500727, + "loss": 3.6239, + "step": 10450 + }, + { + "epoch": 3.0567817832391824, + "grad_norm": 0.3299553096294403, + "learning_rate": 0.0005636504515001456, + "loss": 3.6255, + "step": 10500 + }, + { + "epoch": 3.0713412148389727, + "grad_norm": 0.3239217698574066, + "learning_rate": 0.0005634756772502185, + "loss": 3.6207, + "step": 10550 + }, + { + "epoch": 3.085900646438763, + "grad_norm": 0.3120846152305603, + "learning_rate": 0.0005633009030002913, + "loss": 3.6305, + "step": 10600 + }, + { + "epoch": 3.1004600780385534, + "grad_norm": 0.324990838766098, + "learning_rate": 0.000563126128750364, + "loss": 3.6298, + "step": 10650 + }, + { + "epoch": 3.1150195096383437, + "grad_norm": 0.3125215172767639, + "learning_rate": 0.0005629513545004369, + "loss": 3.617, + "step": 10700 + }, + { + "epoch": 3.129578941238134, + "grad_norm": 0.3323279917240143, + "learning_rate": 0.0005627765802505097, + "loss": 3.6235, + "step": 10750 + }, + { + "epoch": 3.1441383728379244, + "grad_norm": 0.3290170133113861, + "learning_rate": 0.0005626018060005826, + "loss": 3.6227, + "step": 10800 + }, + { + "epoch": 3.1586978044377148, + "grad_norm": 0.3450184762477875, + "learning_rate": 0.0005624270317506554, + "loss": 3.64, + "step": 10850 + }, + { + "epoch": 3.173257236037505, + "grad_norm": 0.32774847745895386, + "learning_rate": 0.0005622522575007282, + "loss": 3.646, + "step": 10900 + }, + { + "epoch": 3.1878166676372954, + "grad_norm": 0.32285189628601074, + "learning_rate": 0.000562077483250801, + "loss": 3.643, + "step": 10950 + }, + { + "epoch": 3.2023760992370858, + "grad_norm": 0.3201664686203003, + "learning_rate": 0.0005619027090008738, + "loss": 3.6397, + "step": 11000 + }, + { + "epoch": 3.2023760992370858, + "eval_accuracy": 0.3513435653971105, + "eval_loss": 3.7152557373046875, + "eval_runtime": 180.6216, + "eval_samples_per_second": 92.143, + "eval_steps_per_second": 5.763, + "step": 11000 + }, + { + "epoch": 3.216935530836876, + "grad_norm": 0.32860246300697327, + "learning_rate": 0.0005617279347509467, + "loss": 3.6478, + "step": 11050 + }, + { + "epoch": 3.2314949624366665, + "grad_norm": 0.32338783144950867, + "learning_rate": 0.0005615531605010195, + "loss": 3.6419, + "step": 11100 + }, + { + "epoch": 3.246054394036457, + "grad_norm": 0.3216056823730469, + "learning_rate": 0.0005613783862510923, + "loss": 3.6497, + "step": 11150 + }, + { + "epoch": 3.260613825636247, + "grad_norm": 0.36512988805770874, + "learning_rate": 0.0005612036120011652, + "loss": 3.6238, + "step": 11200 + }, + { + "epoch": 3.2751732572360375, + "grad_norm": 0.33006951212882996, + "learning_rate": 0.0005610288377512379, + "loss": 3.65, + "step": 11250 + }, + { + "epoch": 3.289732688835828, + "grad_norm": 0.32506290078163147, + "learning_rate": 0.0005608540635013107, + "loss": 3.6369, + "step": 11300 + }, + { + "epoch": 3.304292120435618, + "grad_norm": 0.3291010856628418, + "learning_rate": 0.0005606792892513836, + "loss": 3.644, + "step": 11350 + }, + { + "epoch": 3.3188515520354085, + "grad_norm": 0.3134164810180664, + "learning_rate": 0.0005605045150014564, + "loss": 3.6428, + "step": 11400 + }, + { + "epoch": 3.333410983635199, + "grad_norm": 0.3079008162021637, + "learning_rate": 0.0005603297407515293, + "loss": 3.638, + "step": 11450 + }, + { + "epoch": 3.347970415234989, + "grad_norm": 0.2959432899951935, + "learning_rate": 0.000560154966501602, + "loss": 3.6469, + "step": 11500 + }, + { + "epoch": 3.3625298468347795, + "grad_norm": 0.3210470676422119, + "learning_rate": 0.0005599801922516748, + "loss": 3.6441, + "step": 11550 + }, + { + "epoch": 3.37708927843457, + "grad_norm": 0.3303925395011902, + "learning_rate": 0.0005598054180017477, + "loss": 3.6448, + "step": 11600 + }, + { + "epoch": 3.39164871003436, + "grad_norm": 0.3426654040813446, + "learning_rate": 0.0005596306437518205, + "loss": 3.638, + "step": 11650 + }, + { + "epoch": 3.4062081416341505, + "grad_norm": 0.35107845067977905, + "learning_rate": 0.0005594558695018934, + "loss": 3.6483, + "step": 11700 + }, + { + "epoch": 3.420767573233941, + "grad_norm": 0.3188258111476898, + "learning_rate": 0.0005592810952519662, + "loss": 3.6422, + "step": 11750 + }, + { + "epoch": 3.435327004833731, + "grad_norm": 0.33043134212493896, + "learning_rate": 0.0005591063210020389, + "loss": 3.6448, + "step": 11800 + }, + { + "epoch": 3.4498864364335216, + "grad_norm": 0.31511127948760986, + "learning_rate": 0.0005589315467521118, + "loss": 3.648, + "step": 11850 + }, + { + "epoch": 3.464445868033312, + "grad_norm": 0.3306327164173126, + "learning_rate": 0.0005587567725021846, + "loss": 3.6258, + "step": 11900 + }, + { + "epoch": 3.4790052996331022, + "grad_norm": 0.3343588411808014, + "learning_rate": 0.0005585819982522575, + "loss": 3.646, + "step": 11950 + }, + { + "epoch": 3.4935647312328926, + "grad_norm": 0.3293665945529938, + "learning_rate": 0.0005584072240023303, + "loss": 3.6405, + "step": 12000 + }, + { + "epoch": 3.4935647312328926, + "eval_accuracy": 0.35307957260170497, + "eval_loss": 3.6972014904022217, + "eval_runtime": 181.5639, + "eval_samples_per_second": 91.665, + "eval_steps_per_second": 5.734, + "step": 12000 + }, + { + "epoch": 3.508124162832683, + "grad_norm": 0.3309422433376312, + "learning_rate": 0.000558232449752403, + "loss": 3.6445, + "step": 12050 + }, + { + "epoch": 3.5226835944324733, + "grad_norm": 0.3296276032924652, + "learning_rate": 0.0005580576755024759, + "loss": 3.6433, + "step": 12100 + }, + { + "epoch": 3.5372430260322636, + "grad_norm": 0.3203052580356598, + "learning_rate": 0.0005578829012525487, + "loss": 3.6408, + "step": 12150 + }, + { + "epoch": 3.551802457632054, + "grad_norm": 0.31741246581077576, + "learning_rate": 0.0005577081270026216, + "loss": 3.6379, + "step": 12200 + }, + { + "epoch": 3.5663618892318443, + "grad_norm": 0.32449865341186523, + "learning_rate": 0.0005575333527526944, + "loss": 3.6515, + "step": 12250 + }, + { + "epoch": 3.5809213208316346, + "grad_norm": 0.3303356170654297, + "learning_rate": 0.0005573585785027672, + "loss": 3.6346, + "step": 12300 + }, + { + "epoch": 3.595480752431425, + "grad_norm": 0.3001437783241272, + "learning_rate": 0.00055718380425284, + "loss": 3.6476, + "step": 12350 + }, + { + "epoch": 3.6100401840312153, + "grad_norm": 0.3065738379955292, + "learning_rate": 0.0005570090300029128, + "loss": 3.6495, + "step": 12400 + }, + { + "epoch": 3.6245996156310056, + "grad_norm": 0.3155801594257355, + "learning_rate": 0.0005568342557529856, + "loss": 3.6398, + "step": 12450 + }, + { + "epoch": 3.639159047230796, + "grad_norm": 0.3072325587272644, + "learning_rate": 0.0005566594815030585, + "loss": 3.6446, + "step": 12500 + }, + { + "epoch": 3.6537184788305863, + "grad_norm": 0.331887811422348, + "learning_rate": 0.0005564847072531313, + "loss": 3.6402, + "step": 12550 + }, + { + "epoch": 3.6682779104303767, + "grad_norm": 0.30090418457984924, + "learning_rate": 0.0005563099330032042, + "loss": 3.6303, + "step": 12600 + }, + { + "epoch": 3.682837342030167, + "grad_norm": 0.3239140808582306, + "learning_rate": 0.0005561351587532769, + "loss": 3.6552, + "step": 12650 + }, + { + "epoch": 3.6973967736299573, + "grad_norm": 0.320881724357605, + "learning_rate": 0.0005559603845033497, + "loss": 3.6356, + "step": 12700 + }, + { + "epoch": 3.7119562052297477, + "grad_norm": 0.3165138363838196, + "learning_rate": 0.0005557856102534226, + "loss": 3.6434, + "step": 12750 + }, + { + "epoch": 3.726515636829538, + "grad_norm": 0.3095230162143707, + "learning_rate": 0.0005556108360034954, + "loss": 3.6385, + "step": 12800 + }, + { + "epoch": 3.7410750684293284, + "grad_norm": 0.34694117307662964, + "learning_rate": 0.0005554360617535683, + "loss": 3.6463, + "step": 12850 + }, + { + "epoch": 3.755634500029119, + "grad_norm": 0.32559525966644287, + "learning_rate": 0.000555261287503641, + "loss": 3.6323, + "step": 12900 + }, + { + "epoch": 3.770193931628909, + "grad_norm": 0.3220575451850891, + "learning_rate": 0.0005550865132537138, + "loss": 3.6369, + "step": 12950 + }, + { + "epoch": 3.7847533632287, + "grad_norm": 0.31526488065719604, + "learning_rate": 0.0005549117390037867, + "loss": 3.6412, + "step": 13000 + }, + { + "epoch": 3.7847533632287, + "eval_accuracy": 0.35453631828429244, + "eval_loss": 3.682695150375366, + "eval_runtime": 183.1807, + "eval_samples_per_second": 90.856, + "eval_steps_per_second": 5.683, + "step": 13000 + }, + { + "epoch": 3.7993127948284897, + "grad_norm": 0.3306889832019806, + "learning_rate": 0.0005547369647538596, + "loss": 3.6618, + "step": 13050 + }, + { + "epoch": 3.8138722264282805, + "grad_norm": 0.33385586738586426, + "learning_rate": 0.0005545621905039324, + "loss": 3.6427, + "step": 13100 + }, + { + "epoch": 3.8284316580280704, + "grad_norm": 0.30829793214797974, + "learning_rate": 0.0005543874162540053, + "loss": 3.6345, + "step": 13150 + }, + { + "epoch": 3.842991089627861, + "grad_norm": 0.3245658576488495, + "learning_rate": 0.000554212642004078, + "loss": 3.6519, + "step": 13200 + }, + { + "epoch": 3.857550521227651, + "grad_norm": 0.29873931407928467, + "learning_rate": 0.0005540378677541508, + "loss": 3.639, + "step": 13250 + }, + { + "epoch": 3.872109952827442, + "grad_norm": 0.3140360414981842, + "learning_rate": 0.0005538630935042237, + "loss": 3.644, + "step": 13300 + }, + { + "epoch": 3.8866693844272318, + "grad_norm": 0.31487107276916504, + "learning_rate": 0.0005536883192542965, + "loss": 3.6451, + "step": 13350 + }, + { + "epoch": 3.9012288160270225, + "grad_norm": 0.31665652990341187, + "learning_rate": 0.0005535135450043694, + "loss": 3.63, + "step": 13400 + }, + { + "epoch": 3.9157882476268124, + "grad_norm": 0.3285450339317322, + "learning_rate": 0.0005533387707544422, + "loss": 3.6402, + "step": 13450 + }, + { + "epoch": 3.930347679226603, + "grad_norm": 0.3168368935585022, + "learning_rate": 0.0005531639965045149, + "loss": 3.6433, + "step": 13500 + }, + { + "epoch": 3.944907110826393, + "grad_norm": 0.3096484839916229, + "learning_rate": 0.0005529892222545878, + "loss": 3.6292, + "step": 13550 + }, + { + "epoch": 3.959466542426184, + "grad_norm": 0.31400060653686523, + "learning_rate": 0.0005528144480046606, + "loss": 3.6337, + "step": 13600 + }, + { + "epoch": 3.974025974025974, + "grad_norm": 0.32995402812957764, + "learning_rate": 0.0005526396737547335, + "loss": 3.644, + "step": 13650 + }, + { + "epoch": 3.9885854056257646, + "grad_norm": 0.30545228719711304, + "learning_rate": 0.0005524648995048063, + "loss": 3.6337, + "step": 13700 + }, + { + "epoch": 4.002911886319958, + "grad_norm": 0.3340036869049072, + "learning_rate": 0.000552290125254879, + "loss": 3.6049, + "step": 13750 + }, + { + "epoch": 4.017471317919749, + "grad_norm": 0.3237653076648712, + "learning_rate": 0.0005521153510049519, + "loss": 3.5263, + "step": 13800 + }, + { + "epoch": 4.032030749519539, + "grad_norm": 0.33258405327796936, + "learning_rate": 0.0005519405767550247, + "loss": 3.5231, + "step": 13850 + }, + { + "epoch": 4.046590181119329, + "grad_norm": 0.33560073375701904, + "learning_rate": 0.0005517658025050975, + "loss": 3.5422, + "step": 13900 + }, + { + "epoch": 4.061149612719119, + "grad_norm": 0.32539400458335876, + "learning_rate": 0.0005515910282551704, + "loss": 3.5393, + "step": 13950 + }, + { + "epoch": 4.07570904431891, + "grad_norm": 0.3466116786003113, + "learning_rate": 0.0005514162540052432, + "loss": 3.5371, + "step": 14000 + }, + { + "epoch": 4.07570904431891, + "eval_accuracy": 0.35599906074061566, + "eval_loss": 3.6699209213256836, + "eval_runtime": 180.5976, + "eval_samples_per_second": 92.155, + "eval_steps_per_second": 5.764, + "step": 14000 + }, + { + "epoch": 4.0902684759187, + "grad_norm": 0.35234954953193665, + "learning_rate": 0.000551241479755316, + "loss": 3.5405, + "step": 14050 + }, + { + "epoch": 4.104827907518491, + "grad_norm": 0.3241097629070282, + "learning_rate": 0.0005510667055053888, + "loss": 3.5312, + "step": 14100 + }, + { + "epoch": 4.119387339118281, + "grad_norm": 0.35480767488479614, + "learning_rate": 0.0005508919312554616, + "loss": 3.541, + "step": 14150 + }, + { + "epoch": 4.1339467707180715, + "grad_norm": 0.31226274371147156, + "learning_rate": 0.0005507171570055345, + "loss": 3.5525, + "step": 14200 + }, + { + "epoch": 4.148506202317861, + "grad_norm": 0.3221980631351471, + "learning_rate": 0.0005505423827556073, + "loss": 3.5545, + "step": 14250 + }, + { + "epoch": 4.163065633917652, + "grad_norm": 0.33322617411613464, + "learning_rate": 0.0005503676085056802, + "loss": 3.5607, + "step": 14300 + }, + { + "epoch": 4.177625065517442, + "grad_norm": 0.31406116485595703, + "learning_rate": 0.0005501928342557529, + "loss": 3.5597, + "step": 14350 + }, + { + "epoch": 4.192184497117233, + "grad_norm": 0.30982154607772827, + "learning_rate": 0.0005500180600058257, + "loss": 3.5544, + "step": 14400 + }, + { + "epoch": 4.206743928717023, + "grad_norm": 0.31833505630493164, + "learning_rate": 0.0005498432857558986, + "loss": 3.5589, + "step": 14450 + }, + { + "epoch": 4.2213033603168135, + "grad_norm": 0.31112346053123474, + "learning_rate": 0.0005496685115059714, + "loss": 3.5535, + "step": 14500 + }, + { + "epoch": 4.235862791916603, + "grad_norm": 0.3102998733520508, + "learning_rate": 0.0005494937372560443, + "loss": 3.5584, + "step": 14550 + }, + { + "epoch": 4.250422223516394, + "grad_norm": 0.3442176878452301, + "learning_rate": 0.000549318963006117, + "loss": 3.5691, + "step": 14600 + }, + { + "epoch": 4.264981655116184, + "grad_norm": 0.3217466175556183, + "learning_rate": 0.0005491441887561898, + "loss": 3.5748, + "step": 14650 + }, + { + "epoch": 4.279541086715975, + "grad_norm": 0.32345715165138245, + "learning_rate": 0.0005489694145062627, + "loss": 3.5711, + "step": 14700 + }, + { + "epoch": 4.294100518315765, + "grad_norm": 0.31309959292411804, + "learning_rate": 0.0005487946402563355, + "loss": 3.5544, + "step": 14750 + }, + { + "epoch": 4.308659949915556, + "grad_norm": 0.31507858633995056, + "learning_rate": 0.0005486198660064084, + "loss": 3.5806, + "step": 14800 + }, + { + "epoch": 4.3232193815153455, + "grad_norm": 0.3113386631011963, + "learning_rate": 0.0005484450917564812, + "loss": 3.5698, + "step": 14850 + }, + { + "epoch": 4.337778813115136, + "grad_norm": 0.30662500858306885, + "learning_rate": 0.0005482703175065539, + "loss": 3.5684, + "step": 14900 + }, + { + "epoch": 4.352338244714926, + "grad_norm": 0.33159640431404114, + "learning_rate": 0.0005480955432566268, + "loss": 3.5681, + "step": 14950 + }, + { + "epoch": 4.366897676314717, + "grad_norm": 0.3497229218482971, + "learning_rate": 0.0005479207690066996, + "loss": 3.5768, + "step": 15000 + }, + { + "epoch": 4.366897676314717, + "eval_accuracy": 0.35680662627036064, + "eval_loss": 3.663057565689087, + "eval_runtime": 180.5674, + "eval_samples_per_second": 92.171, + "eval_steps_per_second": 5.765, + "step": 15000 + }, + { + "epoch": 4.381457107914507, + "grad_norm": 0.3152848184108734, + "learning_rate": 0.0005477459947567725, + "loss": 3.5651, + "step": 15050 + }, + { + "epoch": 4.396016539514298, + "grad_norm": 0.31485655903816223, + "learning_rate": 0.0005475712205068453, + "loss": 3.5724, + "step": 15100 + }, + { + "epoch": 4.4105759711140875, + "grad_norm": 0.3210237920284271, + "learning_rate": 0.000547396446256918, + "loss": 3.5743, + "step": 15150 + }, + { + "epoch": 4.425135402713878, + "grad_norm": 0.31647804379463196, + "learning_rate": 0.0005472216720069909, + "loss": 3.5643, + "step": 15200 + }, + { + "epoch": 4.439694834313668, + "grad_norm": 0.3220058083534241, + "learning_rate": 0.0005470468977570637, + "loss": 3.5777, + "step": 15250 + }, + { + "epoch": 4.454254265913459, + "grad_norm": 0.31475868821144104, + "learning_rate": 0.0005468721235071365, + "loss": 3.5759, + "step": 15300 + }, + { + "epoch": 4.468813697513249, + "grad_norm": 0.31258007884025574, + "learning_rate": 0.0005466973492572094, + "loss": 3.58, + "step": 15350 + }, + { + "epoch": 4.48337312911304, + "grad_norm": 0.3323783874511719, + "learning_rate": 0.0005465225750072822, + "loss": 3.5717, + "step": 15400 + }, + { + "epoch": 4.4979325607128295, + "grad_norm": 0.31647196412086487, + "learning_rate": 0.000546347800757355, + "loss": 3.5666, + "step": 15450 + }, + { + "epoch": 4.51249199231262, + "grad_norm": 0.3166157007217407, + "learning_rate": 0.0005461730265074279, + "loss": 3.5661, + "step": 15500 + }, + { + "epoch": 4.52705142391241, + "grad_norm": 0.33359718322753906, + "learning_rate": 0.0005459982522575007, + "loss": 3.581, + "step": 15550 + }, + { + "epoch": 4.541610855512201, + "grad_norm": 0.30880287289619446, + "learning_rate": 0.0005458234780075735, + "loss": 3.5767, + "step": 15600 + }, + { + "epoch": 4.556170287111991, + "grad_norm": 0.3321440517902374, + "learning_rate": 0.0005456487037576464, + "loss": 3.5927, + "step": 15650 + }, + { + "epoch": 4.570729718711782, + "grad_norm": 0.35169097781181335, + "learning_rate": 0.0005454739295077192, + "loss": 3.5777, + "step": 15700 + }, + { + "epoch": 4.585289150311572, + "grad_norm": 0.3210912048816681, + "learning_rate": 0.000545299155257792, + "loss": 3.5641, + "step": 15750 + }, + { + "epoch": 4.599848581911362, + "grad_norm": 0.3266526460647583, + "learning_rate": 0.0005451243810078648, + "loss": 3.5624, + "step": 15800 + }, + { + "epoch": 4.614408013511152, + "grad_norm": 0.3169322609901428, + "learning_rate": 0.0005449496067579376, + "loss": 3.582, + "step": 15850 + }, + { + "epoch": 4.628967445110943, + "grad_norm": 0.30979159474372864, + "learning_rate": 0.0005447748325080105, + "loss": 3.5808, + "step": 15900 + }, + { + "epoch": 4.643526876710733, + "grad_norm": 0.3104844391345978, + "learning_rate": 0.0005446000582580833, + "loss": 3.5779, + "step": 15950 + }, + { + "epoch": 4.658086308310524, + "grad_norm": 0.3167930543422699, + "learning_rate": 0.0005444252840081562, + "loss": 3.577, + "step": 16000 + }, + { + "epoch": 4.658086308310524, + "eval_accuracy": 0.3581025173162721, + "eval_loss": 3.6506025791168213, + "eval_runtime": 184.1909, + "eval_samples_per_second": 90.357, + "eval_steps_per_second": 5.652, + "step": 16000 + }, + { + "epoch": 4.672645739910314, + "grad_norm": 0.3104100823402405, + "learning_rate": 0.0005442505097582289, + "loss": 3.5755, + "step": 16050 + }, + { + "epoch": 4.687205171510104, + "grad_norm": 0.32251089811325073, + "learning_rate": 0.0005440757355083017, + "loss": 3.5785, + "step": 16100 + }, + { + "epoch": 4.701764603109894, + "grad_norm": 0.30579274892807007, + "learning_rate": 0.0005439009612583746, + "loss": 3.5736, + "step": 16150 + }, + { + "epoch": 4.716324034709685, + "grad_norm": 0.32924431562423706, + "learning_rate": 0.0005437261870084474, + "loss": 3.5859, + "step": 16200 + }, + { + "epoch": 4.730883466309475, + "grad_norm": 0.32339397072792053, + "learning_rate": 0.0005435514127585203, + "loss": 3.5714, + "step": 16250 + }, + { + "epoch": 4.745442897909266, + "grad_norm": 0.3301834762096405, + "learning_rate": 0.000543376638508593, + "loss": 3.581, + "step": 16300 + }, + { + "epoch": 4.760002329509056, + "grad_norm": 0.3323529064655304, + "learning_rate": 0.0005432018642586658, + "loss": 3.5745, + "step": 16350 + }, + { + "epoch": 4.7745617611088464, + "grad_norm": 0.31460458040237427, + "learning_rate": 0.0005430270900087387, + "loss": 3.5752, + "step": 16400 + }, + { + "epoch": 4.789121192708636, + "grad_norm": 0.30962061882019043, + "learning_rate": 0.0005428523157588115, + "loss": 3.5847, + "step": 16450 + }, + { + "epoch": 4.803680624308427, + "grad_norm": 0.31121689081192017, + "learning_rate": 0.0005426775415088843, + "loss": 3.581, + "step": 16500 + }, + { + "epoch": 4.818240055908217, + "grad_norm": 0.3271123170852661, + "learning_rate": 0.0005425027672589572, + "loss": 3.5747, + "step": 16550 + }, + { + "epoch": 4.832799487508008, + "grad_norm": 0.34155216813087463, + "learning_rate": 0.0005423279930090299, + "loss": 3.5757, + "step": 16600 + }, + { + "epoch": 4.847358919107798, + "grad_norm": 0.31826114654541016, + "learning_rate": 0.0005421532187591028, + "loss": 3.5863, + "step": 16650 + }, + { + "epoch": 4.8619183507075885, + "grad_norm": 0.3213462829589844, + "learning_rate": 0.0005419784445091756, + "loss": 3.5846, + "step": 16700 + }, + { + "epoch": 4.876477782307378, + "grad_norm": 0.3335978388786316, + "learning_rate": 0.0005418036702592484, + "loss": 3.5778, + "step": 16750 + }, + { + "epoch": 4.891037213907169, + "grad_norm": 0.32565537095069885, + "learning_rate": 0.0005416288960093213, + "loss": 3.5903, + "step": 16800 + }, + { + "epoch": 4.905596645506959, + "grad_norm": 0.31601616740226746, + "learning_rate": 0.000541454121759394, + "loss": 3.581, + "step": 16850 + }, + { + "epoch": 4.92015607710675, + "grad_norm": 0.3034924268722534, + "learning_rate": 0.0005412793475094669, + "loss": 3.5731, + "step": 16900 + }, + { + "epoch": 4.93471550870654, + "grad_norm": 0.30528074502944946, + "learning_rate": 0.0005411045732595397, + "loss": 3.5775, + "step": 16950 + }, + { + "epoch": 4.9492749403063305, + "grad_norm": 0.32346123456954956, + "learning_rate": 0.0005409297990096125, + "loss": 3.5711, + "step": 17000 + }, + { + "epoch": 4.9492749403063305, + "eval_accuracy": 0.3593712465046746, + "eval_loss": 3.6345937252044678, + "eval_runtime": 183.7217, + "eval_samples_per_second": 90.588, + "eval_steps_per_second": 5.666, + "step": 17000 + }, + { + "epoch": 4.96383437190612, + "grad_norm": 0.3116399049758911, + "learning_rate": 0.0005407550247596854, + "loss": 3.5657, + "step": 17050 + }, + { + "epoch": 4.978393803505911, + "grad_norm": 0.3291073143482208, + "learning_rate": 0.0005405802505097582, + "loss": 3.5751, + "step": 17100 + }, + { + "epoch": 4.992953235105701, + "grad_norm": 0.3149360716342926, + "learning_rate": 0.000540405476259831, + "loss": 3.5743, + "step": 17150 + }, + { + "epoch": 5.007279715799895, + "grad_norm": 0.3213154971599579, + "learning_rate": 0.0005402307020099038, + "loss": 3.5347, + "step": 17200 + }, + { + "epoch": 5.021839147399685, + "grad_norm": 0.3356756567955017, + "learning_rate": 0.0005400559277599766, + "loss": 3.4594, + "step": 17250 + }, + { + "epoch": 5.036398578999476, + "grad_norm": 0.3190675973892212, + "learning_rate": 0.0005398811535100495, + "loss": 3.4737, + "step": 17300 + }, + { + "epoch": 5.050958010599266, + "grad_norm": 0.30441927909851074, + "learning_rate": 0.0005397063792601223, + "loss": 3.4683, + "step": 17350 + }, + { + "epoch": 5.065517442199057, + "grad_norm": 0.3276670277118683, + "learning_rate": 0.0005395316050101951, + "loss": 3.4779, + "step": 17400 + }, + { + "epoch": 5.080076873798847, + "grad_norm": 0.3393913805484772, + "learning_rate": 0.0005393568307602679, + "loss": 3.4886, + "step": 17450 + }, + { + "epoch": 5.094636305398637, + "grad_norm": 0.33122798800468445, + "learning_rate": 0.0005391820565103407, + "loss": 3.4829, + "step": 17500 + }, + { + "epoch": 5.109195736998427, + "grad_norm": 0.32901448011398315, + "learning_rate": 0.0005390072822604136, + "loss": 3.4872, + "step": 17550 + }, + { + "epoch": 5.123755168598218, + "grad_norm": 0.3309627175331116, + "learning_rate": 0.0005388325080104864, + "loss": 3.4831, + "step": 17600 + }, + { + "epoch": 5.138314600198008, + "grad_norm": 0.32044172286987305, + "learning_rate": 0.0005386577337605593, + "loss": 3.4888, + "step": 17650 + }, + { + "epoch": 5.152874031797799, + "grad_norm": 0.3464089334011078, + "learning_rate": 0.000538482959510632, + "loss": 3.4972, + "step": 17700 + }, + { + "epoch": 5.167433463397589, + "grad_norm": 0.3171513080596924, + "learning_rate": 0.0005383081852607048, + "loss": 3.5026, + "step": 17750 + }, + { + "epoch": 5.1819928949973795, + "grad_norm": 0.3164452612400055, + "learning_rate": 0.0005381334110107777, + "loss": 3.4926, + "step": 17800 + }, + { + "epoch": 5.196552326597169, + "grad_norm": 0.32658103108406067, + "learning_rate": 0.0005379586367608505, + "loss": 3.5046, + "step": 17850 + }, + { + "epoch": 5.21111175819696, + "grad_norm": 0.32511815428733826, + "learning_rate": 0.0005377838625109233, + "loss": 3.4953, + "step": 17900 + }, + { + "epoch": 5.22567118979675, + "grad_norm": 0.343904972076416, + "learning_rate": 0.0005376090882609961, + "loss": 3.5065, + "step": 17950 + }, + { + "epoch": 5.240230621396541, + "grad_norm": 0.33408525586128235, + "learning_rate": 0.0005374343140110689, + "loss": 3.5066, + "step": 18000 + }, + { + "epoch": 5.240230621396541, + "eval_accuracy": 0.35997068871065013, + "eval_loss": 3.6377220153808594, + "eval_runtime": 181.2212, + "eval_samples_per_second": 91.838, + "eval_steps_per_second": 5.744, + "step": 18000 + }, + { + "epoch": 5.254790052996331, + "grad_norm": 0.3558831512928009, + "learning_rate": 0.0005372595397611418, + "loss": 3.5154, + "step": 18050 + }, + { + "epoch": 5.2693494845961215, + "grad_norm": 0.3240915536880493, + "learning_rate": 0.0005370847655112147, + "loss": 3.5104, + "step": 18100 + }, + { + "epoch": 5.283908916195911, + "grad_norm": 0.3641294538974762, + "learning_rate": 0.0005369099912612875, + "loss": 3.5125, + "step": 18150 + }, + { + "epoch": 5.298468347795702, + "grad_norm": 0.323595255613327, + "learning_rate": 0.0005367352170113603, + "loss": 3.5091, + "step": 18200 + }, + { + "epoch": 5.313027779395492, + "grad_norm": 0.31085318326950073, + "learning_rate": 0.0005365604427614331, + "loss": 3.5061, + "step": 18250 + }, + { + "epoch": 5.327587210995283, + "grad_norm": 0.3321459889411926, + "learning_rate": 0.0005363856685115059, + "loss": 3.5128, + "step": 18300 + }, + { + "epoch": 5.342146642595073, + "grad_norm": 0.3359740674495697, + "learning_rate": 0.0005362108942615788, + "loss": 3.5207, + "step": 18350 + }, + { + "epoch": 5.3567060741948636, + "grad_norm": 0.35164040327072144, + "learning_rate": 0.0005360361200116516, + "loss": 3.5206, + "step": 18400 + }, + { + "epoch": 5.3712655057946534, + "grad_norm": 0.33065569400787354, + "learning_rate": 0.0005358613457617244, + "loss": 3.5137, + "step": 18450 + }, + { + "epoch": 5.385824937394444, + "grad_norm": 0.31795698404312134, + "learning_rate": 0.0005356865715117973, + "loss": 3.5181, + "step": 18500 + }, + { + "epoch": 5.400384368994234, + "grad_norm": 0.3166426718235016, + "learning_rate": 0.00053551179726187, + "loss": 3.5129, + "step": 18550 + }, + { + "epoch": 5.414943800594025, + "grad_norm": 0.3113225996494293, + "learning_rate": 0.0005353370230119429, + "loss": 3.5224, + "step": 18600 + }, + { + "epoch": 5.429503232193815, + "grad_norm": 0.3037504553794861, + "learning_rate": 0.0005351622487620157, + "loss": 3.5212, + "step": 18650 + }, + { + "epoch": 5.444062663793606, + "grad_norm": 0.3170977830886841, + "learning_rate": 0.0005349874745120885, + "loss": 3.5185, + "step": 18700 + }, + { + "epoch": 5.4586220953933955, + "grad_norm": 0.3276199698448181, + "learning_rate": 0.0005348127002621614, + "loss": 3.5143, + "step": 18750 + }, + { + "epoch": 5.473181526993186, + "grad_norm": 0.35049423575401306, + "learning_rate": 0.0005346379260122341, + "loss": 3.5178, + "step": 18800 + }, + { + "epoch": 5.487740958592976, + "grad_norm": 0.3257882595062256, + "learning_rate": 0.000534463151762307, + "loss": 3.513, + "step": 18850 + }, + { + "epoch": 5.502300390192767, + "grad_norm": 0.3254280686378479, + "learning_rate": 0.0005342883775123798, + "loss": 3.5157, + "step": 18900 + }, + { + "epoch": 5.516859821792557, + "grad_norm": 0.35354653000831604, + "learning_rate": 0.0005341136032624526, + "loss": 3.5323, + "step": 18950 + }, + { + "epoch": 5.531419253392348, + "grad_norm": 0.3293665945529938, + "learning_rate": 0.0005339388290125255, + "loss": 3.5294, + "step": 19000 + }, + { + "epoch": 5.531419253392348, + "eval_accuracy": 0.36105410583223874, + "eval_loss": 3.6271042823791504, + "eval_runtime": 181.268, + "eval_samples_per_second": 91.814, + "eval_steps_per_second": 5.743, + "step": 19000 + }, + { + "epoch": 5.5459786849921375, + "grad_norm": 0.32479095458984375, + "learning_rate": 0.0005337640547625983, + "loss": 3.5257, + "step": 19050 + }, + { + "epoch": 5.560538116591928, + "grad_norm": 0.30282458662986755, + "learning_rate": 0.000533589280512671, + "loss": 3.5376, + "step": 19100 + }, + { + "epoch": 5.575097548191718, + "grad_norm": 0.3051811754703522, + "learning_rate": 0.0005334145062627439, + "loss": 3.5188, + "step": 19150 + }, + { + "epoch": 5.589656979791509, + "grad_norm": 0.34127405285835266, + "learning_rate": 0.0005332397320128167, + "loss": 3.5171, + "step": 19200 + }, + { + "epoch": 5.604216411391299, + "grad_norm": 0.3210941553115845, + "learning_rate": 0.0005330649577628896, + "loss": 3.5248, + "step": 19250 + }, + { + "epoch": 5.61877584299109, + "grad_norm": 0.3192020654678345, + "learning_rate": 0.0005328901835129624, + "loss": 3.533, + "step": 19300 + }, + { + "epoch": 5.6333352745908805, + "grad_norm": 0.34110450744628906, + "learning_rate": 0.0005327154092630351, + "loss": 3.5295, + "step": 19350 + }, + { + "epoch": 5.64789470619067, + "grad_norm": 0.3144545555114746, + "learning_rate": 0.000532540635013108, + "loss": 3.5335, + "step": 19400 + }, + { + "epoch": 5.66245413779046, + "grad_norm": 0.3245835304260254, + "learning_rate": 0.0005323658607631808, + "loss": 3.5229, + "step": 19450 + }, + { + "epoch": 5.677013569390251, + "grad_norm": 0.3528177738189697, + "learning_rate": 0.0005321910865132537, + "loss": 3.5209, + "step": 19500 + }, + { + "epoch": 5.691573000990042, + "grad_norm": 0.3312878906726837, + "learning_rate": 0.0005320163122633265, + "loss": 3.5321, + "step": 19550 + }, + { + "epoch": 5.706132432589832, + "grad_norm": 0.3077809512615204, + "learning_rate": 0.0005318415380133993, + "loss": 3.5183, + "step": 19600 + }, + { + "epoch": 5.720691864189622, + "grad_norm": 0.32409968972206116, + "learning_rate": 0.0005316667637634721, + "loss": 3.5276, + "step": 19650 + }, + { + "epoch": 5.735251295789412, + "grad_norm": 0.3110126852989197, + "learning_rate": 0.0005314919895135449, + "loss": 3.5253, + "step": 19700 + }, + { + "epoch": 5.749810727389203, + "grad_norm": 0.33343297243118286, + "learning_rate": 0.0005313172152636178, + "loss": 3.5272, + "step": 19750 + }, + { + "epoch": 5.764370158988993, + "grad_norm": 0.3315747082233429, + "learning_rate": 0.0005311424410136906, + "loss": 3.534, + "step": 19800 + }, + { + "epoch": 5.778929590588783, + "grad_norm": 0.3091914653778076, + "learning_rate": 0.0005309676667637634, + "loss": 3.5276, + "step": 19850 + }, + { + "epoch": 5.793489022188574, + "grad_norm": 0.30921050906181335, + "learning_rate": 0.0005307928925138363, + "loss": 3.5219, + "step": 19900 + }, + { + "epoch": 5.8080484537883645, + "grad_norm": 0.30907315015792847, + "learning_rate": 0.000530618118263909, + "loss": 3.534, + "step": 19950 + }, + { + "epoch": 5.822607885388154, + "grad_norm": 0.36628568172454834, + "learning_rate": 0.0005304433440139819, + "loss": 3.538, + "step": 20000 + }, + { + "epoch": 5.822607885388154, + "eval_accuracy": 0.36194950645964236, + "eval_loss": 3.61657452583313, + "eval_runtime": 183.6498, + "eval_samples_per_second": 90.624, + "eval_steps_per_second": 5.668, + "step": 20000 + }, + { + "epoch": 5.837167316987944, + "grad_norm": 0.3185259997844696, + "learning_rate": 0.0005302685697640547, + "loss": 3.5243, + "step": 20050 + }, + { + "epoch": 5.851726748587735, + "grad_norm": 0.3328113257884979, + "learning_rate": 0.0005300937955141275, + "loss": 3.5306, + "step": 20100 + }, + { + "epoch": 5.866286180187526, + "grad_norm": 0.31715288758277893, + "learning_rate": 0.0005299190212642004, + "loss": 3.5368, + "step": 20150 + }, + { + "epoch": 5.880845611787316, + "grad_norm": 0.3114943206310272, + "learning_rate": 0.0005297442470142731, + "loss": 3.5279, + "step": 20200 + }, + { + "epoch": 5.895405043387106, + "grad_norm": 0.3375224471092224, + "learning_rate": 0.000529569472764346, + "loss": 3.5214, + "step": 20250 + }, + { + "epoch": 5.9099644749868965, + "grad_norm": 0.29627102613449097, + "learning_rate": 0.0005293946985144188, + "loss": 3.5182, + "step": 20300 + }, + { + "epoch": 5.924523906586687, + "grad_norm": 0.33964815735816956, + "learning_rate": 0.0005292199242644916, + "loss": 3.541, + "step": 20350 + }, + { + "epoch": 5.939083338186477, + "grad_norm": 0.3077552914619446, + "learning_rate": 0.0005290451500145645, + "loss": 3.5246, + "step": 20400 + }, + { + "epoch": 5.953642769786267, + "grad_norm": 0.3167116641998291, + "learning_rate": 0.0005288703757646373, + "loss": 3.5294, + "step": 20450 + }, + { + "epoch": 5.968202201386058, + "grad_norm": 0.3327026665210724, + "learning_rate": 0.00052869560151471, + "loss": 3.5322, + "step": 20500 + }, + { + "epoch": 5.982761632985849, + "grad_norm": 0.3215795159339905, + "learning_rate": 0.0005285208272647829, + "loss": 3.5378, + "step": 20550 + }, + { + "epoch": 5.9973210645856385, + "grad_norm": 0.3464929759502411, + "learning_rate": 0.0005283460530148558, + "loss": 3.544, + "step": 20600 + }, + { + "epoch": 6.011647545279832, + "grad_norm": 0.37006425857543945, + "learning_rate": 0.0005281712787649286, + "loss": 3.4569, + "step": 20650 + }, + { + "epoch": 6.026206976879623, + "grad_norm": 0.32685425877571106, + "learning_rate": 0.0005279965045150015, + "loss": 3.4143, + "step": 20700 + }, + { + "epoch": 6.040766408479413, + "grad_norm": 0.31896543502807617, + "learning_rate": 0.0005278217302650743, + "loss": 3.4284, + "step": 20750 + }, + { + "epoch": 6.055325840079203, + "grad_norm": 0.3501061499118805, + "learning_rate": 0.000527646956015147, + "loss": 3.4257, + "step": 20800 + }, + { + "epoch": 6.069885271678993, + "grad_norm": 0.3293428421020508, + "learning_rate": 0.0005274721817652199, + "loss": 3.4253, + "step": 20850 + }, + { + "epoch": 6.084444703278784, + "grad_norm": 0.33916565775871277, + "learning_rate": 0.0005272974075152927, + "loss": 3.4532, + "step": 20900 + }, + { + "epoch": 6.099004134878574, + "grad_norm": 0.3229523301124573, + "learning_rate": 0.0005271226332653656, + "loss": 3.4476, + "step": 20950 + }, + { + "epoch": 6.113563566478365, + "grad_norm": 0.3364764153957367, + "learning_rate": 0.0005269478590154384, + "loss": 3.4415, + "step": 21000 + }, + { + "epoch": 6.113563566478365, + "eval_accuracy": 0.3621457538197391, + "eval_loss": 3.6198906898498535, + "eval_runtime": 183.4856, + "eval_samples_per_second": 90.705, + "eval_steps_per_second": 5.673, + "step": 21000 + }, + { + "epoch": 6.128122998078155, + "grad_norm": 0.3735044300556183, + "learning_rate": 0.0005267730847655111, + "loss": 3.4457, + "step": 21050 + }, + { + "epoch": 6.142682429677945, + "grad_norm": 0.34455105662345886, + "learning_rate": 0.000526598310515584, + "loss": 3.4528, + "step": 21100 + }, + { + "epoch": 6.157241861277735, + "grad_norm": 0.33916333317756653, + "learning_rate": 0.0005264235362656568, + "loss": 3.4609, + "step": 21150 + }, + { + "epoch": 6.171801292877526, + "grad_norm": 0.3121279180049896, + "learning_rate": 0.0005262487620157297, + "loss": 3.4478, + "step": 21200 + }, + { + "epoch": 6.186360724477316, + "grad_norm": 0.30740803480148315, + "learning_rate": 0.0005260739877658025, + "loss": 3.4448, + "step": 21250 + }, + { + "epoch": 6.200920156077107, + "grad_norm": 0.3505891263484955, + "learning_rate": 0.0005258992135158753, + "loss": 3.4585, + "step": 21300 + }, + { + "epoch": 6.215479587676897, + "grad_norm": 0.33900803327560425, + "learning_rate": 0.0005257244392659481, + "loss": 3.46, + "step": 21350 + }, + { + "epoch": 6.2300390192766875, + "grad_norm": 0.3224051892757416, + "learning_rate": 0.0005255496650160209, + "loss": 3.4557, + "step": 21400 + }, + { + "epoch": 6.244598450876477, + "grad_norm": 0.35417911410331726, + "learning_rate": 0.0005253748907660938, + "loss": 3.4583, + "step": 21450 + }, + { + "epoch": 6.259157882476268, + "grad_norm": 0.34107911586761475, + "learning_rate": 0.0005252001165161666, + "loss": 3.4659, + "step": 21500 + }, + { + "epoch": 6.273717314076059, + "grad_norm": 0.32315975427627563, + "learning_rate": 0.0005250253422662394, + "loss": 3.4613, + "step": 21550 + }, + { + "epoch": 6.288276745675849, + "grad_norm": 0.3344326615333557, + "learning_rate": 0.0005248505680163123, + "loss": 3.4729, + "step": 21600 + }, + { + "epoch": 6.302836177275639, + "grad_norm": 0.34388530254364014, + "learning_rate": 0.000524675793766385, + "loss": 3.4723, + "step": 21650 + }, + { + "epoch": 6.3173956088754295, + "grad_norm": 0.34264546632766724, + "learning_rate": 0.0005245010195164579, + "loss": 3.4751, + "step": 21700 + }, + { + "epoch": 6.33195504047522, + "grad_norm": 0.32228031754493713, + "learning_rate": 0.0005243262452665307, + "loss": 3.4586, + "step": 21750 + }, + { + "epoch": 6.34651447207501, + "grad_norm": 0.34229689836502075, + "learning_rate": 0.0005241514710166035, + "loss": 3.4657, + "step": 21800 + }, + { + "epoch": 6.3610739036748, + "grad_norm": 0.3267248570919037, + "learning_rate": 0.0005239766967666764, + "loss": 3.4747, + "step": 21850 + }, + { + "epoch": 6.375633335274591, + "grad_norm": 0.3363324999809265, + "learning_rate": 0.0005238019225167491, + "loss": 3.4736, + "step": 21900 + }, + { + "epoch": 6.390192766874382, + "grad_norm": 0.32636144757270813, + "learning_rate": 0.0005236271482668219, + "loss": 3.4731, + "step": 21950 + }, + { + "epoch": 6.4047521984741715, + "grad_norm": 0.3209141194820404, + "learning_rate": 0.0005234523740168948, + "loss": 3.475, + "step": 22000 + }, + { + "epoch": 6.4047521984741715, + "eval_accuracy": 0.36269357673806785, + "eval_loss": 3.6098952293395996, + "eval_runtime": 183.5371, + "eval_samples_per_second": 90.679, + "eval_steps_per_second": 5.672, + "step": 22000 + }, + { + "epoch": 6.419311630073962, + "grad_norm": 0.3223513066768646, + "learning_rate": 0.0005232775997669676, + "loss": 3.4759, + "step": 22050 + }, + { + "epoch": 6.433871061673752, + "grad_norm": 0.3284885585308075, + "learning_rate": 0.0005231028255170405, + "loss": 3.4796, + "step": 22100 + }, + { + "epoch": 6.448430493273543, + "grad_norm": 0.32980912923812866, + "learning_rate": 0.0005229280512671133, + "loss": 3.4839, + "step": 22150 + }, + { + "epoch": 6.462989924873333, + "grad_norm": 0.33856451511383057, + "learning_rate": 0.000522753277017186, + "loss": 3.4825, + "step": 22200 + }, + { + "epoch": 6.477549356473124, + "grad_norm": 0.3303597867488861, + "learning_rate": 0.0005225785027672589, + "loss": 3.4827, + "step": 22250 + }, + { + "epoch": 6.492108788072914, + "grad_norm": 0.32675686478614807, + "learning_rate": 0.0005224037285173317, + "loss": 3.4781, + "step": 22300 + }, + { + "epoch": 6.506668219672704, + "grad_norm": 0.3315143883228302, + "learning_rate": 0.0005222289542674046, + "loss": 3.4786, + "step": 22350 + }, + { + "epoch": 6.521227651272494, + "grad_norm": 0.35115185379981995, + "learning_rate": 0.0005220541800174774, + "loss": 3.4777, + "step": 22400 + }, + { + "epoch": 6.535787082872285, + "grad_norm": 0.32922348380088806, + "learning_rate": 0.0005218794057675501, + "loss": 3.4764, + "step": 22450 + }, + { + "epoch": 6.550346514472075, + "grad_norm": 0.32848137617111206, + "learning_rate": 0.000521704631517623, + "loss": 3.4864, + "step": 22500 + }, + { + "epoch": 6.564905946071866, + "grad_norm": 0.3455169200897217, + "learning_rate": 0.0005215298572676958, + "loss": 3.4872, + "step": 22550 + }, + { + "epoch": 6.579465377671656, + "grad_norm": 0.3491528034210205, + "learning_rate": 0.0005213550830177687, + "loss": 3.4941, + "step": 22600 + }, + { + "epoch": 6.594024809271446, + "grad_norm": 0.3292933404445648, + "learning_rate": 0.0005211803087678415, + "loss": 3.4849, + "step": 22650 + }, + { + "epoch": 6.608584240871236, + "grad_norm": 0.33583250641822815, + "learning_rate": 0.0005210055345179143, + "loss": 3.4787, + "step": 22700 + }, + { + "epoch": 6.623143672471027, + "grad_norm": 0.32590252161026, + "learning_rate": 0.0005208307602679871, + "loss": 3.4819, + "step": 22750 + }, + { + "epoch": 6.637703104070817, + "grad_norm": 0.34313255548477173, + "learning_rate": 0.0005206559860180599, + "loss": 3.4679, + "step": 22800 + }, + { + "epoch": 6.652262535670608, + "grad_norm": 0.3168715238571167, + "learning_rate": 0.0005204812117681328, + "loss": 3.4812, + "step": 22850 + }, + { + "epoch": 6.666821967270398, + "grad_norm": 0.33726438879966736, + "learning_rate": 0.0005203064375182056, + "loss": 3.4791, + "step": 22900 + }, + { + "epoch": 6.6813813988701884, + "grad_norm": 0.33907851576805115, + "learning_rate": 0.0005201316632682784, + "loss": 3.4817, + "step": 22950 + }, + { + "epoch": 6.695940830469978, + "grad_norm": 0.3657963275909424, + "learning_rate": 0.0005199568890183513, + "loss": 3.4859, + "step": 23000 + }, + { + "epoch": 6.695940830469978, + "eval_accuracy": 0.3636106127844396, + "eval_loss": 3.6045539379119873, + "eval_runtime": 184.0903, + "eval_samples_per_second": 90.407, + "eval_steps_per_second": 5.655, + "step": 23000 + }, + { + "epoch": 6.710500262069769, + "grad_norm": 0.3322959840297699, + "learning_rate": 0.000519782114768424, + "loss": 3.4824, + "step": 23050 + }, + { + "epoch": 6.725059693669559, + "grad_norm": 0.343662828207016, + "learning_rate": 0.0005196073405184969, + "loss": 3.4897, + "step": 23100 + }, + { + "epoch": 6.73961912526935, + "grad_norm": 0.32909801602363586, + "learning_rate": 0.0005194325662685697, + "loss": 3.4858, + "step": 23150 + }, + { + "epoch": 6.75417855686914, + "grad_norm": 0.3375694751739502, + "learning_rate": 0.0005192577920186426, + "loss": 3.4808, + "step": 23200 + }, + { + "epoch": 6.7687379884689305, + "grad_norm": 0.3138526678085327, + "learning_rate": 0.0005190830177687154, + "loss": 3.483, + "step": 23250 + }, + { + "epoch": 6.78329742006872, + "grad_norm": 0.3340669572353363, + "learning_rate": 0.0005189082435187883, + "loss": 3.4903, + "step": 23300 + }, + { + "epoch": 6.797856851668511, + "grad_norm": 0.3336253762245178, + "learning_rate": 0.000518733469268861, + "loss": 3.4864, + "step": 23350 + }, + { + "epoch": 6.812416283268301, + "grad_norm": 0.3235922157764435, + "learning_rate": 0.0005185586950189338, + "loss": 3.5037, + "step": 23400 + }, + { + "epoch": 6.826975714868092, + "grad_norm": 0.3445108234882355, + "learning_rate": 0.0005183839207690067, + "loss": 3.4908, + "step": 23450 + }, + { + "epoch": 6.841535146467882, + "grad_norm": 0.3229808211326599, + "learning_rate": 0.0005182091465190795, + "loss": 3.4906, + "step": 23500 + }, + { + "epoch": 6.8560945780676725, + "grad_norm": 0.29649391770362854, + "learning_rate": 0.0005180343722691524, + "loss": 3.4964, + "step": 23550 + }, + { + "epoch": 6.870654009667462, + "grad_norm": 0.3273935616016388, + "learning_rate": 0.0005178595980192251, + "loss": 3.493, + "step": 23600 + }, + { + "epoch": 6.885213441267253, + "grad_norm": 0.33352574706077576, + "learning_rate": 0.0005176848237692979, + "loss": 3.4915, + "step": 23650 + }, + { + "epoch": 6.899772872867043, + "grad_norm": 0.3277892768383026, + "learning_rate": 0.0005175100495193708, + "loss": 3.4986, + "step": 23700 + }, + { + "epoch": 6.914332304466834, + "grad_norm": 0.3182038366794586, + "learning_rate": 0.0005173352752694436, + "loss": 3.5041, + "step": 23750 + }, + { + "epoch": 6.928891736066624, + "grad_norm": 0.3153535723686218, + "learning_rate": 0.0005171605010195165, + "loss": 3.5035, + "step": 23800 + }, + { + "epoch": 6.943451167666415, + "grad_norm": 0.34128624200820923, + "learning_rate": 0.0005169857267695893, + "loss": 3.4943, + "step": 23850 + }, + { + "epoch": 6.9580105992662045, + "grad_norm": 0.3200225234031677, + "learning_rate": 0.000516810952519662, + "loss": 3.5004, + "step": 23900 + }, + { + "epoch": 6.972570030865995, + "grad_norm": 0.35053977370262146, + "learning_rate": 0.0005166361782697349, + "loss": 3.4938, + "step": 23950 + }, + { + "epoch": 6.987129462465785, + "grad_norm": 0.35640257596969604, + "learning_rate": 0.0005164614040198077, + "loss": 3.4951, + "step": 24000 + }, + { + "epoch": 6.987129462465785, + "eval_accuracy": 0.36452964775538993, + "eval_loss": 3.5928568840026855, + "eval_runtime": 184.1497, + "eval_samples_per_second": 90.378, + "eval_steps_per_second": 5.653, + "step": 24000 + }, + { + "epoch": 7.001455943159979, + "grad_norm": 0.3846636712551117, + "learning_rate": 0.0005162866297698806, + "loss": 3.4935, + "step": 24050 + }, + { + "epoch": 7.016015374759769, + "grad_norm": 0.3523205518722534, + "learning_rate": 0.0005161118555199534, + "loss": 3.3822, + "step": 24100 + }, + { + "epoch": 7.03057480635956, + "grad_norm": 0.36663973331451416, + "learning_rate": 0.0005159370812700261, + "loss": 3.3874, + "step": 24150 + }, + { + "epoch": 7.04513423795935, + "grad_norm": 0.38096940517425537, + "learning_rate": 0.000515762307020099, + "loss": 3.3899, + "step": 24200 + }, + { + "epoch": 7.059693669559141, + "grad_norm": 0.35516002774238586, + "learning_rate": 0.0005155875327701718, + "loss": 3.3847, + "step": 24250 + }, + { + "epoch": 7.074253101158931, + "grad_norm": 0.3651926815509796, + "learning_rate": 0.0005154127585202447, + "loss": 3.4049, + "step": 24300 + }, + { + "epoch": 7.0888125327587215, + "grad_norm": 0.36075493693351746, + "learning_rate": 0.0005152379842703175, + "loss": 3.3965, + "step": 24350 + }, + { + "epoch": 7.103371964358511, + "grad_norm": 0.38245540857315063, + "learning_rate": 0.0005150632100203903, + "loss": 3.4028, + "step": 24400 + }, + { + "epoch": 7.117931395958302, + "grad_norm": 0.32894188165664673, + "learning_rate": 0.0005148884357704631, + "loss": 3.3985, + "step": 24450 + }, + { + "epoch": 7.132490827558092, + "grad_norm": 0.3118518590927124, + "learning_rate": 0.0005147136615205359, + "loss": 3.4151, + "step": 24500 + }, + { + "epoch": 7.147050259157883, + "grad_norm": 0.3686443269252777, + "learning_rate": 0.0005145388872706087, + "loss": 3.4092, + "step": 24550 + }, + { + "epoch": 7.161609690757673, + "grad_norm": 0.35504400730133057, + "learning_rate": 0.0005143641130206816, + "loss": 3.4128, + "step": 24600 + }, + { + "epoch": 7.1761691223574635, + "grad_norm": 0.371929794549942, + "learning_rate": 0.0005141893387707544, + "loss": 3.4088, + "step": 24650 + }, + { + "epoch": 7.190728553957253, + "grad_norm": 0.35544171929359436, + "learning_rate": 0.0005140145645208272, + "loss": 3.4102, + "step": 24700 + }, + { + "epoch": 7.205287985557044, + "grad_norm": 0.32105565071105957, + "learning_rate": 0.0005138397902709, + "loss": 3.4146, + "step": 24750 + }, + { + "epoch": 7.219847417156834, + "grad_norm": 0.3172771632671356, + "learning_rate": 0.0005136650160209728, + "loss": 3.4218, + "step": 24800 + }, + { + "epoch": 7.234406848756625, + "grad_norm": 0.3447094261646271, + "learning_rate": 0.0005134902417710457, + "loss": 3.4251, + "step": 24850 + }, + { + "epoch": 7.248966280356415, + "grad_norm": 0.3414628505706787, + "learning_rate": 0.0005133154675211185, + "loss": 3.4205, + "step": 24900 + }, + { + "epoch": 7.2635257119562056, + "grad_norm": 0.36512497067451477, + "learning_rate": 0.0005131406932711914, + "loss": 3.4273, + "step": 24950 + }, + { + "epoch": 7.2780851435559955, + "grad_norm": 0.3672768771648407, + "learning_rate": 0.0005129659190212641, + "loss": 3.4339, + "step": 25000 + }, + { + "epoch": 7.2780851435559955, + "eval_accuracy": 0.36433104871931843, + "eval_loss": 3.5997819900512695, + "eval_runtime": 180.8992, + "eval_samples_per_second": 92.002, + "eval_steps_per_second": 5.755, + "step": 25000 + }, + { + "epoch": 7.292644575155786, + "grad_norm": 0.3394540846347809, + "learning_rate": 0.0005127911447713369, + "loss": 3.4232, + "step": 25050 + }, + { + "epoch": 7.307204006755576, + "grad_norm": 0.3045949637889862, + "learning_rate": 0.0005126163705214098, + "loss": 3.4267, + "step": 25100 + }, + { + "epoch": 7.321763438355367, + "grad_norm": 0.32903987169265747, + "learning_rate": 0.0005124415962714826, + "loss": 3.4341, + "step": 25150 + }, + { + "epoch": 7.336322869955157, + "grad_norm": 0.3628155589103699, + "learning_rate": 0.0005122668220215555, + "loss": 3.4336, + "step": 25200 + }, + { + "epoch": 7.350882301554948, + "grad_norm": 0.3750855624675751, + "learning_rate": 0.0005120920477716282, + "loss": 3.436, + "step": 25250 + }, + { + "epoch": 7.3654417331547375, + "grad_norm": 0.31662774085998535, + "learning_rate": 0.000511917273521701, + "loss": 3.4372, + "step": 25300 + }, + { + "epoch": 7.380001164754528, + "grad_norm": 0.3318006694316864, + "learning_rate": 0.0005117424992717739, + "loss": 3.4377, + "step": 25350 + }, + { + "epoch": 7.394560596354318, + "grad_norm": 0.3489433526992798, + "learning_rate": 0.0005115677250218467, + "loss": 3.4364, + "step": 25400 + }, + { + "epoch": 7.409120027954109, + "grad_norm": 0.3378850817680359, + "learning_rate": 0.0005113929507719196, + "loss": 3.4392, + "step": 25450 + }, + { + "epoch": 7.423679459553899, + "grad_norm": 0.3490906357765198, + "learning_rate": 0.0005112181765219924, + "loss": 3.4404, + "step": 25500 + }, + { + "epoch": 7.43823889115369, + "grad_norm": 0.33684709668159485, + "learning_rate": 0.0005110434022720651, + "loss": 3.4316, + "step": 25550 + }, + { + "epoch": 7.4527983227534795, + "grad_norm": 0.3533405363559723, + "learning_rate": 0.000510868628022138, + "loss": 3.4519, + "step": 25600 + }, + { + "epoch": 7.46735775435327, + "grad_norm": 0.364666610956192, + "learning_rate": 0.0005106938537722109, + "loss": 3.4428, + "step": 25650 + }, + { + "epoch": 7.48191718595306, + "grad_norm": 0.3563931882381439, + "learning_rate": 0.0005105190795222837, + "loss": 3.442, + "step": 25700 + }, + { + "epoch": 7.496476617552851, + "grad_norm": 0.35002008080482483, + "learning_rate": 0.0005103443052723565, + "loss": 3.4379, + "step": 25750 + }, + { + "epoch": 7.511036049152641, + "grad_norm": 0.3543298542499542, + "learning_rate": 0.0005101695310224294, + "loss": 3.457, + "step": 25800 + }, + { + "epoch": 7.525595480752432, + "grad_norm": 0.33176884055137634, + "learning_rate": 0.0005099947567725021, + "loss": 3.4399, + "step": 25850 + }, + { + "epoch": 7.540154912352222, + "grad_norm": 0.34475451707839966, + "learning_rate": 0.000509819982522575, + "loss": 3.4443, + "step": 25900 + }, + { + "epoch": 7.554714343952012, + "grad_norm": 0.33004602789878845, + "learning_rate": 0.0005096452082726478, + "loss": 3.4584, + "step": 25950 + }, + { + "epoch": 7.569273775551802, + "grad_norm": 0.3163653016090393, + "learning_rate": 0.0005094704340227206, + "loss": 3.45, + "step": 26000 + }, + { + "epoch": 7.569273775551802, + "eval_accuracy": 0.36467333515745, + "eval_loss": 3.5947012901306152, + "eval_runtime": 180.7981, + "eval_samples_per_second": 92.053, + "eval_steps_per_second": 5.758, + "step": 26000 + }, + { + "epoch": 7.583833207151593, + "grad_norm": 0.3281993567943573, + "learning_rate": 0.0005092956597727935, + "loss": 3.4512, + "step": 26050 + }, + { + "epoch": 7.598392638751383, + "grad_norm": 0.31753283739089966, + "learning_rate": 0.0005091208855228662, + "loss": 3.4646, + "step": 26100 + }, + { + "epoch": 7.612952070351174, + "grad_norm": 0.3362863063812256, + "learning_rate": 0.0005089461112729391, + "loss": 3.4544, + "step": 26150 + }, + { + "epoch": 7.627511501950964, + "grad_norm": 0.34793269634246826, + "learning_rate": 0.0005087713370230119, + "loss": 3.4481, + "step": 26200 + }, + { + "epoch": 7.642070933550754, + "grad_norm": 0.3432117700576782, + "learning_rate": 0.0005085965627730847, + "loss": 3.4591, + "step": 26250 + }, + { + "epoch": 7.656630365150544, + "grad_norm": 0.3630698323249817, + "learning_rate": 0.0005084217885231576, + "loss": 3.4618, + "step": 26300 + }, + { + "epoch": 7.671189796750335, + "grad_norm": 0.3361819088459015, + "learning_rate": 0.0005082470142732304, + "loss": 3.4646, + "step": 26350 + }, + { + "epoch": 7.685749228350125, + "grad_norm": 0.3673403263092041, + "learning_rate": 0.0005080722400233032, + "loss": 3.4539, + "step": 26400 + }, + { + "epoch": 7.700308659949916, + "grad_norm": 0.33987388014793396, + "learning_rate": 0.000507897465773376, + "loss": 3.4693, + "step": 26450 + }, + { + "epoch": 7.714868091549706, + "grad_norm": 0.32190704345703125, + "learning_rate": 0.0005077226915234488, + "loss": 3.4468, + "step": 26500 + }, + { + "epoch": 7.729427523149496, + "grad_norm": 0.3864888846874237, + "learning_rate": 0.0005075479172735217, + "loss": 3.4556, + "step": 26550 + }, + { + "epoch": 7.743986954749286, + "grad_norm": 0.3400271534919739, + "learning_rate": 0.0005073731430235945, + "loss": 3.4587, + "step": 26600 + }, + { + "epoch": 7.758546386349077, + "grad_norm": 0.3375173509120941, + "learning_rate": 0.0005071983687736674, + "loss": 3.4628, + "step": 26650 + }, + { + "epoch": 7.773105817948867, + "grad_norm": 0.3561650216579437, + "learning_rate": 0.0005070235945237401, + "loss": 3.4572, + "step": 26700 + }, + { + "epoch": 7.787665249548658, + "grad_norm": 0.3330904543399811, + "learning_rate": 0.0005068488202738129, + "loss": 3.4615, + "step": 26750 + }, + { + "epoch": 7.802224681148448, + "grad_norm": 0.3155699074268341, + "learning_rate": 0.0005066740460238858, + "loss": 3.4407, + "step": 26800 + }, + { + "epoch": 7.8167841127482385, + "grad_norm": 0.3466147780418396, + "learning_rate": 0.0005064992717739586, + "loss": 3.457, + "step": 26850 + }, + { + "epoch": 7.831343544348028, + "grad_norm": 0.3634095788002014, + "learning_rate": 0.0005063244975240315, + "loss": 3.4655, + "step": 26900 + }, + { + "epoch": 7.845902975947819, + "grad_norm": 0.3383113741874695, + "learning_rate": 0.0005061497232741042, + "loss": 3.4613, + "step": 26950 + }, + { + "epoch": 7.860462407547609, + "grad_norm": 0.33195146918296814, + "learning_rate": 0.000505974949024177, + "loss": 3.4628, + "step": 27000 + }, + { + "epoch": 7.860462407547609, + "eval_accuracy": 0.3657741546812521, + "eval_loss": 3.5823051929473877, + "eval_runtime": 183.1613, + "eval_samples_per_second": 90.865, + "eval_steps_per_second": 5.684, + "step": 27000 + }, + { + "epoch": 7.8750218391474, + "grad_norm": 0.35151028633117676, + "learning_rate": 0.0005058001747742499, + "loss": 3.4647, + "step": 27050 + }, + { + "epoch": 7.88958127074719, + "grad_norm": 0.35772132873535156, + "learning_rate": 0.0005056254005243227, + "loss": 3.4699, + "step": 27100 + }, + { + "epoch": 7.9041407023469805, + "grad_norm": 0.3402451276779175, + "learning_rate": 0.0005054506262743955, + "loss": 3.4738, + "step": 27150 + }, + { + "epoch": 7.91870013394677, + "grad_norm": 0.33174848556518555, + "learning_rate": 0.0005052758520244684, + "loss": 3.4582, + "step": 27200 + }, + { + "epoch": 7.933259565546561, + "grad_norm": 0.33006104826927185, + "learning_rate": 0.0005051010777745411, + "loss": 3.4596, + "step": 27250 + }, + { + "epoch": 7.947818997146351, + "grad_norm": 0.347843199968338, + "learning_rate": 0.000504926303524614, + "loss": 3.4695, + "step": 27300 + }, + { + "epoch": 7.962378428746142, + "grad_norm": 0.32010769844055176, + "learning_rate": 0.0005047515292746868, + "loss": 3.4707, + "step": 27350 + }, + { + "epoch": 7.976937860345932, + "grad_norm": 0.3584131896495819, + "learning_rate": 0.0005045767550247596, + "loss": 3.4584, + "step": 27400 + }, + { + "epoch": 7.991497291945723, + "grad_norm": 0.3257739543914795, + "learning_rate": 0.0005044019807748325, + "loss": 3.4689, + "step": 27450 + }, + { + "epoch": 8.005823772639916, + "grad_norm": 0.33740708231925964, + "learning_rate": 0.0005042272065249052, + "loss": 3.4213, + "step": 27500 + }, + { + "epoch": 8.020383204239707, + "grad_norm": 0.33311763405799866, + "learning_rate": 0.0005040524322749781, + "loss": 3.3665, + "step": 27550 + }, + { + "epoch": 8.034942635839498, + "grad_norm": 0.32844987511634827, + "learning_rate": 0.0005038776580250509, + "loss": 3.3403, + "step": 27600 + }, + { + "epoch": 8.049502067439287, + "grad_norm": 0.33761027455329895, + "learning_rate": 0.0005037028837751237, + "loss": 3.3569, + "step": 27650 + }, + { + "epoch": 8.064061499039077, + "grad_norm": 0.35406097769737244, + "learning_rate": 0.0005035281095251966, + "loss": 3.3567, + "step": 27700 + }, + { + "epoch": 8.078620930638868, + "grad_norm": 0.38495901226997375, + "learning_rate": 0.0005033533352752694, + "loss": 3.3788, + "step": 27750 + }, + { + "epoch": 8.093180362238659, + "grad_norm": 0.331709086894989, + "learning_rate": 0.0005031785610253422, + "loss": 3.3591, + "step": 27800 + }, + { + "epoch": 8.107739793838448, + "grad_norm": 0.3502473533153534, + "learning_rate": 0.000503003786775415, + "loss": 3.3805, + "step": 27850 + }, + { + "epoch": 8.122299225438239, + "grad_norm": 0.3384426534175873, + "learning_rate": 0.0005028290125254878, + "loss": 3.3839, + "step": 27900 + }, + { + "epoch": 8.13685865703803, + "grad_norm": 0.36867547035217285, + "learning_rate": 0.0005026542382755607, + "loss": 3.3739, + "step": 27950 + }, + { + "epoch": 8.15141808863782, + "grad_norm": 0.342602014541626, + "learning_rate": 0.0005024794640256335, + "loss": 3.3902, + "step": 28000 + }, + { + "epoch": 8.15141808863782, + "eval_accuracy": 0.3654784314274215, + "eval_loss": 3.5930871963500977, + "eval_runtime": 181.7974, + "eval_samples_per_second": 91.547, + "eval_steps_per_second": 5.726, + "step": 28000 + }, + { + "epoch": 8.16597752023761, + "grad_norm": 0.34287703037261963, + "learning_rate": 0.0005023046897757064, + "loss": 3.3962, + "step": 28050 + }, + { + "epoch": 8.1805369518374, + "grad_norm": 0.3469769060611725, + "learning_rate": 0.0005021299155257791, + "loss": 3.3802, + "step": 28100 + }, + { + "epoch": 8.19509638343719, + "grad_norm": 0.33318281173706055, + "learning_rate": 0.000501955141275852, + "loss": 3.3841, + "step": 28150 + }, + { + "epoch": 8.209655815036982, + "grad_norm": 0.3634045720100403, + "learning_rate": 0.0005017803670259248, + "loss": 3.3948, + "step": 28200 + }, + { + "epoch": 8.22421524663677, + "grad_norm": 0.33881765604019165, + "learning_rate": 0.0005016055927759977, + "loss": 3.3884, + "step": 28250 + }, + { + "epoch": 8.238774678236561, + "grad_norm": 0.3295370638370514, + "learning_rate": 0.0005014308185260705, + "loss": 3.3949, + "step": 28300 + }, + { + "epoch": 8.253334109836352, + "grad_norm": 0.32435914874076843, + "learning_rate": 0.0005012560442761432, + "loss": 3.3864, + "step": 28350 + }, + { + "epoch": 8.267893541436143, + "grad_norm": 0.3608424663543701, + "learning_rate": 0.0005010812700262161, + "loss": 3.3904, + "step": 28400 + }, + { + "epoch": 8.282452973035932, + "grad_norm": 0.34927839040756226, + "learning_rate": 0.0005009064957762889, + "loss": 3.4077, + "step": 28450 + }, + { + "epoch": 8.297012404635723, + "grad_norm": 0.37262028455734253, + "learning_rate": 0.0005007317215263618, + "loss": 3.3987, + "step": 28500 + }, + { + "epoch": 8.311571836235514, + "grad_norm": 0.335907906293869, + "learning_rate": 0.0005005569472764346, + "loss": 3.4031, + "step": 28550 + }, + { + "epoch": 8.326131267835304, + "grad_norm": 0.32725778222084045, + "learning_rate": 0.0005003821730265074, + "loss": 3.4097, + "step": 28600 + }, + { + "epoch": 8.340690699435093, + "grad_norm": 0.34939050674438477, + "learning_rate": 0.0005002073987765802, + "loss": 3.4022, + "step": 28650 + }, + { + "epoch": 8.355250131034884, + "grad_norm": 0.329519659280777, + "learning_rate": 0.000500032624526653, + "loss": 3.395, + "step": 28700 + }, + { + "epoch": 8.369809562634675, + "grad_norm": 0.342352032661438, + "learning_rate": 0.0004998578502767259, + "loss": 3.4104, + "step": 28750 + }, + { + "epoch": 8.384368994234466, + "grad_norm": 0.33699142932891846, + "learning_rate": 0.0004996830760267987, + "loss": 3.4088, + "step": 28800 + }, + { + "epoch": 8.398928425834255, + "grad_norm": 0.3412262797355652, + "learning_rate": 0.0004995083017768715, + "loss": 3.4005, + "step": 28850 + }, + { + "epoch": 8.413487857434045, + "grad_norm": 0.3386961817741394, + "learning_rate": 0.0004993335275269444, + "loss": 3.4233, + "step": 28900 + }, + { + "epoch": 8.428047289033836, + "grad_norm": 0.3285767138004303, + "learning_rate": 0.0004991587532770171, + "loss": 3.4173, + "step": 28950 + }, + { + "epoch": 8.442606720633627, + "grad_norm": 0.36489197611808777, + "learning_rate": 0.00049898397902709, + "loss": 3.4052, + "step": 29000 + }, + { + "epoch": 8.442606720633627, + "eval_accuracy": 0.3661564196109552, + "eval_loss": 3.5855534076690674, + "eval_runtime": 181.6388, + "eval_samples_per_second": 91.627, + "eval_steps_per_second": 5.731, + "step": 29000 + }, + { + "epoch": 8.457166152233416, + "grad_norm": 0.33611413836479187, + "learning_rate": 0.0004988092047771628, + "loss": 3.4126, + "step": 29050 + }, + { + "epoch": 8.471725583833207, + "grad_norm": 0.353683203458786, + "learning_rate": 0.0004986344305272356, + "loss": 3.411, + "step": 29100 + }, + { + "epoch": 8.486285015432998, + "grad_norm": 0.3237438201904297, + "learning_rate": 0.0004984596562773085, + "loss": 3.4116, + "step": 29150 + }, + { + "epoch": 8.500844447032788, + "grad_norm": 0.3344637155532837, + "learning_rate": 0.0004982848820273812, + "loss": 3.4252, + "step": 29200 + }, + { + "epoch": 8.515403878632577, + "grad_norm": 0.36675405502319336, + "learning_rate": 0.0004981101077774541, + "loss": 3.4157, + "step": 29250 + }, + { + "epoch": 8.529963310232368, + "grad_norm": 0.3269510269165039, + "learning_rate": 0.0004979353335275269, + "loss": 3.4108, + "step": 29300 + }, + { + "epoch": 8.544522741832159, + "grad_norm": 0.32414621114730835, + "learning_rate": 0.0004977605592775997, + "loss": 3.4189, + "step": 29350 + }, + { + "epoch": 8.55908217343195, + "grad_norm": 0.34106162190437317, + "learning_rate": 0.0004975857850276726, + "loss": 3.413, + "step": 29400 + }, + { + "epoch": 8.573641605031739, + "grad_norm": 0.3581826388835907, + "learning_rate": 0.0004974110107777454, + "loss": 3.4321, + "step": 29450 + }, + { + "epoch": 8.58820103663153, + "grad_norm": 0.34445253014564514, + "learning_rate": 0.0004972362365278182, + "loss": 3.4358, + "step": 29500 + }, + { + "epoch": 8.60276046823132, + "grad_norm": 0.3163345158100128, + "learning_rate": 0.000497061462277891, + "loss": 3.4149, + "step": 29550 + }, + { + "epoch": 8.617319899831111, + "grad_norm": 0.3480691909790039, + "learning_rate": 0.0004968866880279638, + "loss": 3.4231, + "step": 29600 + }, + { + "epoch": 8.6318793314309, + "grad_norm": 0.35285916924476624, + "learning_rate": 0.0004967119137780367, + "loss": 3.438, + "step": 29650 + }, + { + "epoch": 8.646438763030691, + "grad_norm": 0.3620506823062897, + "learning_rate": 0.0004965371395281095, + "loss": 3.428, + "step": 29700 + }, + { + "epoch": 8.660998194630482, + "grad_norm": 0.3322892189025879, + "learning_rate": 0.0004963623652781822, + "loss": 3.4191, + "step": 29750 + }, + { + "epoch": 8.675557626230272, + "grad_norm": 0.3197033405303955, + "learning_rate": 0.0004961875910282551, + "loss": 3.4174, + "step": 29800 + }, + { + "epoch": 8.690117057830061, + "grad_norm": 0.34963804483413696, + "learning_rate": 0.0004960128167783279, + "loss": 3.4357, + "step": 29850 + }, + { + "epoch": 8.704676489429852, + "grad_norm": 0.3247370421886444, + "learning_rate": 0.0004958380425284008, + "loss": 3.421, + "step": 29900 + }, + { + "epoch": 8.719235921029643, + "grad_norm": 0.32233262062072754, + "learning_rate": 0.0004956632682784736, + "loss": 3.4173, + "step": 29950 + }, + { + "epoch": 8.733795352629434, + "grad_norm": 0.36941850185394287, + "learning_rate": 0.0004954884940285464, + "loss": 3.4335, + "step": 30000 + }, + { + "epoch": 8.733795352629434, + "eval_accuracy": 0.36665227049024096, + "eval_loss": 3.578705310821533, + "eval_runtime": 182.0872, + "eval_samples_per_second": 91.401, + "eval_steps_per_second": 5.717, + "step": 30000 + }, + { + "epoch": 8.748354784229225, + "grad_norm": 0.3275495767593384, + "learning_rate": 0.0004953137197786192, + "loss": 3.4322, + "step": 30050 + }, + { + "epoch": 8.762914215829014, + "grad_norm": 0.3445492684841156, + "learning_rate": 0.000495138945528692, + "loss": 3.4295, + "step": 30100 + }, + { + "epoch": 8.777473647428804, + "grad_norm": 0.364786297082901, + "learning_rate": 0.0004949641712787649, + "loss": 3.4453, + "step": 30150 + }, + { + "epoch": 8.792033079028595, + "grad_norm": 0.3113223612308502, + "learning_rate": 0.0004947893970288377, + "loss": 3.4291, + "step": 30200 + }, + { + "epoch": 8.806592510628384, + "grad_norm": 0.3300077021121979, + "learning_rate": 0.0004946146227789105, + "loss": 3.4139, + "step": 30250 + }, + { + "epoch": 8.821151942228175, + "grad_norm": 0.3236207067966461, + "learning_rate": 0.0004944398485289834, + "loss": 3.4379, + "step": 30300 + }, + { + "epoch": 8.835711373827966, + "grad_norm": 0.34814879298210144, + "learning_rate": 0.0004942650742790561, + "loss": 3.4444, + "step": 30350 + }, + { + "epoch": 8.850270805427757, + "grad_norm": 0.35743340849876404, + "learning_rate": 0.000494090300029129, + "loss": 3.4288, + "step": 30400 + }, + { + "epoch": 8.864830237027547, + "grad_norm": 0.3275567889213562, + "learning_rate": 0.0004939155257792018, + "loss": 3.4436, + "step": 30450 + }, + { + "epoch": 8.879389668627336, + "grad_norm": 0.3158799707889557, + "learning_rate": 0.0004937407515292746, + "loss": 3.4395, + "step": 30500 + }, + { + "epoch": 8.893949100227127, + "grad_norm": 0.33471766114234924, + "learning_rate": 0.0004935659772793475, + "loss": 3.4325, + "step": 30550 + }, + { + "epoch": 8.908508531826918, + "grad_norm": 0.34228864312171936, + "learning_rate": 0.0004933912030294202, + "loss": 3.427, + "step": 30600 + }, + { + "epoch": 8.923067963426707, + "grad_norm": 0.3427802324295044, + "learning_rate": 0.0004932164287794931, + "loss": 3.4251, + "step": 30650 + }, + { + "epoch": 8.937627395026498, + "grad_norm": 0.35321247577667236, + "learning_rate": 0.000493041654529566, + "loss": 3.4431, + "step": 30700 + }, + { + "epoch": 8.952186826626289, + "grad_norm": 0.3708088994026184, + "learning_rate": 0.0004928668802796388, + "loss": 3.4538, + "step": 30750 + }, + { + "epoch": 8.96674625822608, + "grad_norm": 0.34218692779541016, + "learning_rate": 0.0004926921060297116, + "loss": 3.4313, + "step": 30800 + }, + { + "epoch": 8.98130568982587, + "grad_norm": 0.35662880539894104, + "learning_rate": 0.0004925173317797845, + "loss": 3.4317, + "step": 30850 + }, + { + "epoch": 8.995865121425659, + "grad_norm": 0.3506118655204773, + "learning_rate": 0.0004923425575298572, + "loss": 3.4502, + "step": 30900 + }, + { + "epoch": 9.010191602119853, + "grad_norm": 0.34358587861061096, + "learning_rate": 0.0004921677832799301, + "loss": 3.3637, + "step": 30950 + }, + { + "epoch": 9.024751033719644, + "grad_norm": 0.3556085526943207, + "learning_rate": 0.0004919930090300029, + "loss": 3.3189, + "step": 31000 + }, + { + "epoch": 9.024751033719644, + "eval_accuracy": 0.36717398980524946, + "eval_loss": 3.5799267292022705, + "eval_runtime": 181.8055, + "eval_samples_per_second": 91.543, + "eval_steps_per_second": 5.726, + "step": 31000 + }, + { + "epoch": 9.039310465319433, + "grad_norm": 0.34578433632850647, + "learning_rate": 0.0004918182347800757, + "loss": 3.3231, + "step": 31050 + }, + { + "epoch": 9.053869896919224, + "grad_norm": 0.3525102138519287, + "learning_rate": 0.0004916434605301486, + "loss": 3.3193, + "step": 31100 + }, + { + "epoch": 9.068429328519015, + "grad_norm": 0.3447619080543518, + "learning_rate": 0.0004914686862802213, + "loss": 3.3293, + "step": 31150 + }, + { + "epoch": 9.082988760118806, + "grad_norm": 0.316193550825119, + "learning_rate": 0.0004912939120302941, + "loss": 3.35, + "step": 31200 + }, + { + "epoch": 9.097548191718595, + "grad_norm": 0.3357117772102356, + "learning_rate": 0.000491119137780367, + "loss": 3.3503, + "step": 31250 + }, + { + "epoch": 9.112107623318385, + "grad_norm": 0.3565595746040344, + "learning_rate": 0.0004909443635304398, + "loss": 3.3394, + "step": 31300 + }, + { + "epoch": 9.126667054918176, + "grad_norm": 0.35598695278167725, + "learning_rate": 0.0004907695892805127, + "loss": 3.3571, + "step": 31350 + }, + { + "epoch": 9.141226486517967, + "grad_norm": 0.3496910035610199, + "learning_rate": 0.0004905948150305855, + "loss": 3.354, + "step": 31400 + }, + { + "epoch": 9.155785918117756, + "grad_norm": 0.34782034158706665, + "learning_rate": 0.0004904200407806582, + "loss": 3.3431, + "step": 31450 + }, + { + "epoch": 9.170345349717547, + "grad_norm": 0.34046244621276855, + "learning_rate": 0.0004902452665307311, + "loss": 3.3657, + "step": 31500 + }, + { + "epoch": 9.184904781317337, + "grad_norm": 0.37150949239730835, + "learning_rate": 0.0004900704922808039, + "loss": 3.3665, + "step": 31550 + }, + { + "epoch": 9.199464212917128, + "grad_norm": 0.36348044872283936, + "learning_rate": 0.0004898957180308768, + "loss": 3.3567, + "step": 31600 + }, + { + "epoch": 9.214023644516917, + "grad_norm": 0.3551836311817169, + "learning_rate": 0.0004897209437809496, + "loss": 3.3674, + "step": 31650 + }, + { + "epoch": 9.228583076116708, + "grad_norm": 0.3500552475452423, + "learning_rate": 0.0004895461695310223, + "loss": 3.3814, + "step": 31700 + }, + { + "epoch": 9.243142507716499, + "grad_norm": 0.3479650318622589, + "learning_rate": 0.0004893713952810952, + "loss": 3.3613, + "step": 31750 + }, + { + "epoch": 9.25770193931629, + "grad_norm": 0.3503901958465576, + "learning_rate": 0.000489196621031168, + "loss": 3.3602, + "step": 31800 + }, + { + "epoch": 9.272261370916079, + "grad_norm": 0.33610227704048157, + "learning_rate": 0.0004890218467812409, + "loss": 3.3631, + "step": 31850 + }, + { + "epoch": 9.28682080251587, + "grad_norm": 0.3341948091983795, + "learning_rate": 0.0004888470725313137, + "loss": 3.3609, + "step": 31900 + }, + { + "epoch": 9.30138023411566, + "grad_norm": 0.3447319567203522, + "learning_rate": 0.0004886722982813865, + "loss": 3.3727, + "step": 31950 + }, + { + "epoch": 9.315939665715451, + "grad_norm": 0.32863977551460266, + "learning_rate": 0.0004884975240314593, + "loss": 3.3782, + "step": 32000 + }, + { + "epoch": 9.315939665715451, + "eval_accuracy": 0.366884968827947, + "eval_loss": 3.581573724746704, + "eval_runtime": 182.0337, + "eval_samples_per_second": 91.428, + "eval_steps_per_second": 5.719, + "step": 32000 + }, + { + "epoch": 9.33049909731524, + "grad_norm": 0.3508942127227783, + "learning_rate": 0.0004883227497815321, + "loss": 3.3778, + "step": 32050 + }, + { + "epoch": 9.34505852891503, + "grad_norm": 0.3674251437187195, + "learning_rate": 0.00048814797553160496, + "loss": 3.3807, + "step": 32100 + }, + { + "epoch": 9.359617960514822, + "grad_norm": 0.3387126922607422, + "learning_rate": 0.0004879732012816778, + "loss": 3.3823, + "step": 32150 + }, + { + "epoch": 9.374177392114612, + "grad_norm": 0.3542914390563965, + "learning_rate": 0.0004877984270317506, + "loss": 3.398, + "step": 32200 + }, + { + "epoch": 9.388736823714403, + "grad_norm": 0.354044109582901, + "learning_rate": 0.0004876236527818234, + "loss": 3.3764, + "step": 32250 + }, + { + "epoch": 9.403296255314192, + "grad_norm": 0.3662169575691223, + "learning_rate": 0.00048744887853189624, + "loss": 3.3919, + "step": 32300 + }, + { + "epoch": 9.417855686913983, + "grad_norm": 0.33728882670402527, + "learning_rate": 0.00048727410428196907, + "loss": 3.383, + "step": 32350 + }, + { + "epoch": 9.432415118513774, + "grad_norm": 0.32222864031791687, + "learning_rate": 0.0004870993300320419, + "loss": 3.3877, + "step": 32400 + }, + { + "epoch": 9.446974550113563, + "grad_norm": 0.3222348988056183, + "learning_rate": 0.00048692455578211474, + "loss": 3.3822, + "step": 32450 + }, + { + "epoch": 9.461533981713353, + "grad_norm": 0.3391883671283722, + "learning_rate": 0.0004867497815321875, + "loss": 3.3887, + "step": 32500 + }, + { + "epoch": 9.476093413313144, + "grad_norm": 0.3517501652240753, + "learning_rate": 0.00048657500728226035, + "loss": 3.3825, + "step": 32550 + }, + { + "epoch": 9.490652844912935, + "grad_norm": 0.3315829932689667, + "learning_rate": 0.0004864002330323332, + "loss": 3.3849, + "step": 32600 + }, + { + "epoch": 9.505212276512726, + "grad_norm": 0.33583584427833557, + "learning_rate": 0.000486225458782406, + "loss": 3.3938, + "step": 32650 + }, + { + "epoch": 9.519771708112515, + "grad_norm": 0.3496243357658386, + "learning_rate": 0.0004860506845324788, + "loss": 3.3901, + "step": 32700 + }, + { + "epoch": 9.534331139712306, + "grad_norm": 0.34915950894355774, + "learning_rate": 0.0004858759102825516, + "loss": 3.402, + "step": 32750 + }, + { + "epoch": 9.548890571312096, + "grad_norm": 0.3658216893672943, + "learning_rate": 0.00048570113603262446, + "loss": 3.391, + "step": 32800 + }, + { + "epoch": 9.563450002911885, + "grad_norm": 0.3504136800765991, + "learning_rate": 0.0004855263617826973, + "loss": 3.3906, + "step": 32850 + }, + { + "epoch": 9.578009434511676, + "grad_norm": 0.33254560828208923, + "learning_rate": 0.0004853515875327701, + "loss": 3.4056, + "step": 32900 + }, + { + "epoch": 9.592568866111467, + "grad_norm": 0.34906646609306335, + "learning_rate": 0.0004851768132828429, + "loss": 3.4075, + "step": 32950 + }, + { + "epoch": 9.607128297711258, + "grad_norm": 0.34559518098831177, + "learning_rate": 0.00048500203903291574, + "loss": 3.4026, + "step": 33000 + }, + { + "epoch": 9.607128297711258, + "eval_accuracy": 0.36745666125742, + "eval_loss": 3.5726640224456787, + "eval_runtime": 181.7757, + "eval_samples_per_second": 91.558, + "eval_steps_per_second": 5.727, + "step": 33000 + }, + { + "epoch": 9.621687729311049, + "grad_norm": 0.3735829293727875, + "learning_rate": 0.00048482726478298857, + "loss": 3.4065, + "step": 33050 + }, + { + "epoch": 9.636247160910838, + "grad_norm": 0.3518868684768677, + "learning_rate": 0.0004846524905330614, + "loss": 3.4036, + "step": 33100 + }, + { + "epoch": 9.650806592510628, + "grad_norm": 0.3787810802459717, + "learning_rate": 0.00048447771628313424, + "loss": 3.4012, + "step": 33150 + }, + { + "epoch": 9.66536602411042, + "grad_norm": 0.36960500478744507, + "learning_rate": 0.0004843029420332071, + "loss": 3.408, + "step": 33200 + }, + { + "epoch": 9.67992545571021, + "grad_norm": 0.34325626492500305, + "learning_rate": 0.0004841281677832799, + "loss": 3.4017, + "step": 33250 + }, + { + "epoch": 9.694484887309999, + "grad_norm": 0.3455840051174164, + "learning_rate": 0.00048395339353335273, + "loss": 3.4139, + "step": 33300 + }, + { + "epoch": 9.70904431890979, + "grad_norm": 0.35434481501579285, + "learning_rate": 0.00048377861928342557, + "loss": 3.3996, + "step": 33350 + }, + { + "epoch": 9.72360375050958, + "grad_norm": 0.33681508898735046, + "learning_rate": 0.0004836038450334984, + "loss": 3.4125, + "step": 33400 + }, + { + "epoch": 9.738163182109371, + "grad_norm": 0.35238656401634216, + "learning_rate": 0.0004834290707835712, + "loss": 3.4157, + "step": 33450 + }, + { + "epoch": 9.75272261370916, + "grad_norm": 0.37718260288238525, + "learning_rate": 0.000483254296533644, + "loss": 3.4033, + "step": 33500 + }, + { + "epoch": 9.767282045308951, + "grad_norm": 0.3434363901615143, + "learning_rate": 0.00048307952228371685, + "loss": 3.4143, + "step": 33550 + }, + { + "epoch": 9.781841476908742, + "grad_norm": 0.34627440571784973, + "learning_rate": 0.0004829047480337897, + "loss": 3.4043, + "step": 33600 + }, + { + "epoch": 9.796400908508533, + "grad_norm": 0.33534497022628784, + "learning_rate": 0.0004827299737838625, + "loss": 3.4029, + "step": 33650 + }, + { + "epoch": 9.810960340108322, + "grad_norm": 0.3508129417896271, + "learning_rate": 0.0004825551995339353, + "loss": 3.406, + "step": 33700 + }, + { + "epoch": 9.825519771708112, + "grad_norm": 0.34650343656539917, + "learning_rate": 0.0004823804252840081, + "loss": 3.404, + "step": 33750 + }, + { + "epoch": 9.840079203307903, + "grad_norm": 0.33442333340644836, + "learning_rate": 0.00048220565103408096, + "loss": 3.4015, + "step": 33800 + }, + { + "epoch": 9.854638634907694, + "grad_norm": 0.3506050407886505, + "learning_rate": 0.0004820308767841538, + "loss": 3.4156, + "step": 33850 + }, + { + "epoch": 9.869198066507483, + "grad_norm": 0.341828316450119, + "learning_rate": 0.0004818561025342266, + "loss": 3.4171, + "step": 33900 + }, + { + "epoch": 9.883757498107274, + "grad_norm": 0.3377910554409027, + "learning_rate": 0.0004816813282842994, + "loss": 3.4102, + "step": 33950 + }, + { + "epoch": 9.898316929707065, + "grad_norm": 0.35400837659835815, + "learning_rate": 0.00048150655403437223, + "loss": 3.4082, + "step": 34000 + }, + { + "epoch": 9.898316929707065, + "eval_accuracy": 0.36838721944064684, + "eval_loss": 3.5640623569488525, + "eval_runtime": 182.8947, + "eval_samples_per_second": 90.998, + "eval_steps_per_second": 5.692, + "step": 34000 + }, + { + "epoch": 9.912876361306855, + "grad_norm": 0.3472040593624115, + "learning_rate": 0.00048133177978444507, + "loss": 3.4179, + "step": 34050 + }, + { + "epoch": 9.927435792906644, + "grad_norm": 0.3496232032775879, + "learning_rate": 0.0004811570055345179, + "loss": 3.4113, + "step": 34100 + }, + { + "epoch": 9.941995224506435, + "grad_norm": 0.33684638142585754, + "learning_rate": 0.0004809822312845907, + "loss": 3.4137, + "step": 34150 + }, + { + "epoch": 9.956554656106226, + "grad_norm": 0.34335857629776, + "learning_rate": 0.0004808074570346635, + "loss": 3.4172, + "step": 34200 + }, + { + "epoch": 9.971114087706017, + "grad_norm": 0.34269091486930847, + "learning_rate": 0.00048063268278473634, + "loss": 3.4183, + "step": 34250 + }, + { + "epoch": 9.985673519305806, + "grad_norm": 0.3301508128643036, + "learning_rate": 0.0004804579085348092, + "loss": 3.4135, + "step": 34300 + }, + { + "epoch": 10.0, + "grad_norm": 0.8190501928329468, + "learning_rate": 0.000480283134284882, + "loss": 3.4084, + "step": 34350 + }, + { + "epoch": 10.01455943159979, + "grad_norm": 0.34881967306137085, + "learning_rate": 0.0004801083600349548, + "loss": 3.3121, + "step": 34400 + }, + { + "epoch": 10.029118863199582, + "grad_norm": 0.3504365086555481, + "learning_rate": 0.0004799335857850276, + "loss": 3.3012, + "step": 34450 + }, + { + "epoch": 10.04367829479937, + "grad_norm": 0.3723757565021515, + "learning_rate": 0.00047975881153510046, + "loss": 3.3123, + "step": 34500 + }, + { + "epoch": 10.058237726399161, + "grad_norm": 0.3652939200401306, + "learning_rate": 0.0004795840372851733, + "loss": 3.3082, + "step": 34550 + }, + { + "epoch": 10.072797157998952, + "grad_norm": 0.36539286375045776, + "learning_rate": 0.00047940926303524607, + "loss": 3.3053, + "step": 34600 + }, + { + "epoch": 10.087356589598743, + "grad_norm": 0.34552112221717834, + "learning_rate": 0.0004792344887853189, + "loss": 3.3203, + "step": 34650 + }, + { + "epoch": 10.101916021198532, + "grad_norm": 0.34289079904556274, + "learning_rate": 0.00047905971453539173, + "loss": 3.3335, + "step": 34700 + }, + { + "epoch": 10.116475452798323, + "grad_norm": 0.34614643454551697, + "learning_rate": 0.00047888494028546457, + "loss": 3.3293, + "step": 34750 + }, + { + "epoch": 10.131034884398114, + "grad_norm": 0.365692675113678, + "learning_rate": 0.0004787101660355374, + "loss": 3.3347, + "step": 34800 + }, + { + "epoch": 10.145594315997904, + "grad_norm": 0.3478696644306183, + "learning_rate": 0.0004785353917856102, + "loss": 3.3419, + "step": 34850 + }, + { + "epoch": 10.160153747597693, + "grad_norm": 0.345829576253891, + "learning_rate": 0.000478360617535683, + "loss": 3.3263, + "step": 34900 + }, + { + "epoch": 10.174713179197484, + "grad_norm": 0.4017032980918884, + "learning_rate": 0.00047818584328575584, + "loss": 3.3348, + "step": 34950 + }, + { + "epoch": 10.189272610797275, + "grad_norm": 0.34451884031295776, + "learning_rate": 0.0004780110690358287, + "loss": 3.3356, + "step": 35000 + }, + { + "epoch": 10.189272610797275, + "eval_accuracy": 0.36771605111744, + "eval_loss": 3.5778610706329346, + "eval_runtime": 183.2483, + "eval_samples_per_second": 90.822, + "eval_steps_per_second": 5.681, + "step": 35000 + }, + { + "epoch": 10.203832042397066, + "grad_norm": 0.35025554895401, + "learning_rate": 0.0004778362947859015, + "loss": 3.3442, + "step": 35050 + }, + { + "epoch": 10.218391473996855, + "grad_norm": 0.34518471360206604, + "learning_rate": 0.0004776615205359743, + "loss": 3.3374, + "step": 35100 + }, + { + "epoch": 10.232950905596645, + "grad_norm": 0.35896578431129456, + "learning_rate": 0.0004774867462860471, + "loss": 3.3453, + "step": 35150 + }, + { + "epoch": 10.247510337196436, + "grad_norm": 0.3396795094013214, + "learning_rate": 0.00047731197203611995, + "loss": 3.3457, + "step": 35200 + }, + { + "epoch": 10.262069768796227, + "grad_norm": 0.3721248209476471, + "learning_rate": 0.0004771371977861928, + "loss": 3.3458, + "step": 35250 + }, + { + "epoch": 10.276629200396016, + "grad_norm": 0.3700907230377197, + "learning_rate": 0.00047696242353626557, + "loss": 3.3381, + "step": 35300 + }, + { + "epoch": 10.291188631995807, + "grad_norm": 0.3764047622680664, + "learning_rate": 0.0004767876492863384, + "loss": 3.3418, + "step": 35350 + }, + { + "epoch": 10.305748063595598, + "grad_norm": 0.3617747724056244, + "learning_rate": 0.00047661287503641123, + "loss": 3.347, + "step": 35400 + }, + { + "epoch": 10.320307495195388, + "grad_norm": 0.34759700298309326, + "learning_rate": 0.00047643810078648407, + "loss": 3.3512, + "step": 35450 + }, + { + "epoch": 10.334866926795177, + "grad_norm": 0.35689282417297363, + "learning_rate": 0.0004762633265365569, + "loss": 3.3663, + "step": 35500 + }, + { + "epoch": 10.349426358394968, + "grad_norm": 0.32792720198631287, + "learning_rate": 0.0004760885522866297, + "loss": 3.3568, + "step": 35550 + }, + { + "epoch": 10.363985789994759, + "grad_norm": 0.3390996754169464, + "learning_rate": 0.0004759137780367025, + "loss": 3.3689, + "step": 35600 + }, + { + "epoch": 10.37854522159455, + "grad_norm": 0.35693955421447754, + "learning_rate": 0.00047573900378677534, + "loss": 3.3575, + "step": 35650 + }, + { + "epoch": 10.393104653194339, + "grad_norm": 0.3452168405056, + "learning_rate": 0.00047556422953684823, + "loss": 3.3642, + "step": 35700 + }, + { + "epoch": 10.40766408479413, + "grad_norm": 0.370328426361084, + "learning_rate": 0.00047538945528692106, + "loss": 3.3595, + "step": 35750 + }, + { + "epoch": 10.42222351639392, + "grad_norm": 0.37136757373809814, + "learning_rate": 0.0004752146810369939, + "loss": 3.346, + "step": 35800 + }, + { + "epoch": 10.436782947993711, + "grad_norm": 0.3773367702960968, + "learning_rate": 0.0004750399067870667, + "loss": 3.3645, + "step": 35850 + }, + { + "epoch": 10.4513423795935, + "grad_norm": 0.3447873592376709, + "learning_rate": 0.0004748651325371395, + "loss": 3.3598, + "step": 35900 + }, + { + "epoch": 10.46590181119329, + "grad_norm": 0.355688214302063, + "learning_rate": 0.00047469035828721234, + "loss": 3.3672, + "step": 35950 + }, + { + "epoch": 10.480461242793082, + "grad_norm": 0.3678136169910431, + "learning_rate": 0.0004745155840372852, + "loss": 3.3828, + "step": 36000 + }, + { + "epoch": 10.480461242793082, + "eval_accuracy": 0.36819038416155636, + "eval_loss": 3.568837881088257, + "eval_runtime": 183.926, + "eval_samples_per_second": 90.487, + "eval_steps_per_second": 5.66, + "step": 36000 + }, + { + "epoch": 10.495020674392872, + "grad_norm": 0.3659283220767975, + "learning_rate": 0.00047434080978735795, + "loss": 3.3647, + "step": 36050 + }, + { + "epoch": 10.509580105992661, + "grad_norm": 0.3798047602176666, + "learning_rate": 0.0004741660355374308, + "loss": 3.3631, + "step": 36100 + }, + { + "epoch": 10.524139537592452, + "grad_norm": 0.3466806411743164, + "learning_rate": 0.0004739912612875036, + "loss": 3.359, + "step": 36150 + }, + { + "epoch": 10.538698969192243, + "grad_norm": 0.35511037707328796, + "learning_rate": 0.00047381648703757645, + "loss": 3.3755, + "step": 36200 + }, + { + "epoch": 10.553258400792034, + "grad_norm": 0.3418614864349365, + "learning_rate": 0.0004736417127876493, + "loss": 3.3799, + "step": 36250 + }, + { + "epoch": 10.567817832391823, + "grad_norm": 0.38244953751564026, + "learning_rate": 0.00047346693853772206, + "loss": 3.382, + "step": 36300 + }, + { + "epoch": 10.582377263991614, + "grad_norm": 0.3323763310909271, + "learning_rate": 0.0004732921642877949, + "loss": 3.3828, + "step": 36350 + }, + { + "epoch": 10.596936695591404, + "grad_norm": 0.3437618315219879, + "learning_rate": 0.00047311739003786773, + "loss": 3.391, + "step": 36400 + }, + { + "epoch": 10.611496127191195, + "grad_norm": 0.36182549595832825, + "learning_rate": 0.00047294261578794056, + "loss": 3.3829, + "step": 36450 + }, + { + "epoch": 10.626055558790984, + "grad_norm": 0.38253724575042725, + "learning_rate": 0.0004727678415380134, + "loss": 3.3803, + "step": 36500 + }, + { + "epoch": 10.640614990390775, + "grad_norm": 0.36465519666671753, + "learning_rate": 0.0004725930672880862, + "loss": 3.3703, + "step": 36550 + }, + { + "epoch": 10.655174421990566, + "grad_norm": 0.3479657769203186, + "learning_rate": 0.000472418293038159, + "loss": 3.3709, + "step": 36600 + }, + { + "epoch": 10.669733853590357, + "grad_norm": 0.3454592227935791, + "learning_rate": 0.00047224351878823184, + "loss": 3.3876, + "step": 36650 + }, + { + "epoch": 10.684293285190146, + "grad_norm": 0.34455588459968567, + "learning_rate": 0.0004720687445383047, + "loss": 3.3788, + "step": 36700 + }, + { + "epoch": 10.698852716789936, + "grad_norm": 0.357598215341568, + "learning_rate": 0.00047189397028837745, + "loss": 3.3759, + "step": 36750 + }, + { + "epoch": 10.713412148389727, + "grad_norm": 0.36810582876205444, + "learning_rate": 0.0004717191960384503, + "loss": 3.3817, + "step": 36800 + }, + { + "epoch": 10.727971579989518, + "grad_norm": 0.37969326972961426, + "learning_rate": 0.0004715444217885231, + "loss": 3.3845, + "step": 36850 + }, + { + "epoch": 10.742531011589307, + "grad_norm": 0.362560898065567, + "learning_rate": 0.00047136964753859595, + "loss": 3.397, + "step": 36900 + }, + { + "epoch": 10.757090443189098, + "grad_norm": 0.36402398347854614, + "learning_rate": 0.0004711948732886688, + "loss": 3.3797, + "step": 36950 + }, + { + "epoch": 10.771649874788888, + "grad_norm": 0.3478822410106659, + "learning_rate": 0.00047102009903874156, + "loss": 3.3911, + "step": 37000 + }, + { + "epoch": 10.771649874788888, + "eval_accuracy": 0.368904117819907, + "eval_loss": 3.5626118183135986, + "eval_runtime": 183.8574, + "eval_samples_per_second": 90.521, + "eval_steps_per_second": 5.662, + "step": 37000 + }, + { + "epoch": 10.78620930638868, + "grad_norm": 0.34672781825065613, + "learning_rate": 0.0004708453247888144, + "loss": 3.3796, + "step": 37050 + }, + { + "epoch": 10.800768737988468, + "grad_norm": 0.35510483384132385, + "learning_rate": 0.00047067055053888723, + "loss": 3.3921, + "step": 37100 + }, + { + "epoch": 10.815328169588259, + "grad_norm": 0.3330132067203522, + "learning_rate": 0.00047049577628896006, + "loss": 3.3707, + "step": 37150 + }, + { + "epoch": 10.82988760118805, + "grad_norm": 0.35041606426239014, + "learning_rate": 0.0004703210020390329, + "loss": 3.3993, + "step": 37200 + }, + { + "epoch": 10.84444703278784, + "grad_norm": 0.34748944640159607, + "learning_rate": 0.0004701462277891057, + "loss": 3.3854, + "step": 37250 + }, + { + "epoch": 10.85900646438763, + "grad_norm": 0.3505236506462097, + "learning_rate": 0.0004699714535391785, + "loss": 3.3933, + "step": 37300 + }, + { + "epoch": 10.87356589598742, + "grad_norm": 0.3472146689891815, + "learning_rate": 0.00046979667928925134, + "loss": 3.3877, + "step": 37350 + }, + { + "epoch": 10.888125327587211, + "grad_norm": 0.33038902282714844, + "learning_rate": 0.0004696219050393242, + "loss": 3.3872, + "step": 37400 + }, + { + "epoch": 10.902684759187002, + "grad_norm": 0.33716917037963867, + "learning_rate": 0.00046944713078939695, + "loss": 3.3962, + "step": 37450 + }, + { + "epoch": 10.917244190786791, + "grad_norm": 0.3526748716831207, + "learning_rate": 0.0004692723565394698, + "loss": 3.3928, + "step": 37500 + }, + { + "epoch": 10.931803622386582, + "grad_norm": 0.36475178599357605, + "learning_rate": 0.0004690975822895426, + "loss": 3.3842, + "step": 37550 + }, + { + "epoch": 10.946363053986373, + "grad_norm": 0.36359477043151855, + "learning_rate": 0.00046892280803961545, + "loss": 3.401, + "step": 37600 + }, + { + "epoch": 10.960922485586163, + "grad_norm": 0.35189494490623474, + "learning_rate": 0.0004687480337896883, + "loss": 3.3937, + "step": 37650 + }, + { + "epoch": 10.975481917185952, + "grad_norm": 0.3400118350982666, + "learning_rate": 0.00046857325953976106, + "loss": 3.3934, + "step": 37700 + }, + { + "epoch": 10.990041348785743, + "grad_norm": 0.3473895490169525, + "learning_rate": 0.0004683984852898339, + "loss": 3.3902, + "step": 37750 + }, + { + "epoch": 11.004367829479937, + "grad_norm": 0.3693157732486725, + "learning_rate": 0.00046822371103990673, + "loss": 3.3584, + "step": 37800 + }, + { + "epoch": 11.018927261079728, + "grad_norm": 0.34884193539619446, + "learning_rate": 0.00046804893678997956, + "loss": 3.2712, + "step": 37850 + }, + { + "epoch": 11.033486692679517, + "grad_norm": 0.331039696931839, + "learning_rate": 0.00046787416254005234, + "loss": 3.2856, + "step": 37900 + }, + { + "epoch": 11.048046124279308, + "grad_norm": 0.34825077652931213, + "learning_rate": 0.0004676993882901252, + "loss": 3.2941, + "step": 37950 + }, + { + "epoch": 11.062605555879099, + "grad_norm": 0.3396894633769989, + "learning_rate": 0.000467524614040198, + "loss": 3.2861, + "step": 38000 + }, + { + "epoch": 11.062605555879099, + "eval_accuracy": 0.36855336534826616, + "eval_loss": 3.5719475746154785, + "eval_runtime": 180.4716, + "eval_samples_per_second": 92.219, + "eval_steps_per_second": 5.768, + "step": 38000 + }, + { + "epoch": 11.07716498747889, + "grad_norm": 0.41541653871536255, + "learning_rate": 0.00046734983979027084, + "loss": 3.2967, + "step": 38050 + }, + { + "epoch": 11.091724419078679, + "grad_norm": 0.34760013222694397, + "learning_rate": 0.00046717506554034367, + "loss": 3.2988, + "step": 38100 + }, + { + "epoch": 11.10628385067847, + "grad_norm": 0.3493053913116455, + "learning_rate": 0.00046700029129041645, + "loss": 3.2994, + "step": 38150 + }, + { + "epoch": 11.12084328227826, + "grad_norm": 0.35706987977027893, + "learning_rate": 0.0004668255170404893, + "loss": 3.315, + "step": 38200 + }, + { + "epoch": 11.135402713878051, + "grad_norm": 0.3363507390022278, + "learning_rate": 0.00046665074279056217, + "loss": 3.3071, + "step": 38250 + }, + { + "epoch": 11.14996214547784, + "grad_norm": 0.3618837296962738, + "learning_rate": 0.000466475968540635, + "loss": 3.3106, + "step": 38300 + }, + { + "epoch": 11.16452157707763, + "grad_norm": 0.33892515301704407, + "learning_rate": 0.00046630119429070784, + "loss": 3.3133, + "step": 38350 + }, + { + "epoch": 11.179081008677421, + "grad_norm": 0.33202266693115234, + "learning_rate": 0.00046612642004078067, + "loss": 3.3235, + "step": 38400 + }, + { + "epoch": 11.193640440277212, + "grad_norm": 0.3930901288986206, + "learning_rate": 0.00046595164579085345, + "loss": 3.3299, + "step": 38450 + }, + { + "epoch": 11.208199871877001, + "grad_norm": 0.4052780568599701, + "learning_rate": 0.0004657768715409263, + "loss": 3.3197, + "step": 38500 + }, + { + "epoch": 11.222759303476792, + "grad_norm": 0.3582177460193634, + "learning_rate": 0.0004656020972909991, + "loss": 3.3247, + "step": 38550 + }, + { + "epoch": 11.237318735076583, + "grad_norm": 0.3405052423477173, + "learning_rate": 0.00046542732304107195, + "loss": 3.3235, + "step": 38600 + }, + { + "epoch": 11.251878166676374, + "grad_norm": 0.32738906145095825, + "learning_rate": 0.0004652525487911447, + "loss": 3.3191, + "step": 38650 + }, + { + "epoch": 11.266437598276163, + "grad_norm": 0.36800041794776917, + "learning_rate": 0.00046507777454121756, + "loss": 3.3328, + "step": 38700 + }, + { + "epoch": 11.280997029875953, + "grad_norm": 0.37207457423210144, + "learning_rate": 0.0004649030002912904, + "loss": 3.3304, + "step": 38750 + }, + { + "epoch": 11.295556461475744, + "grad_norm": 0.36415359377861023, + "learning_rate": 0.0004647282260413632, + "loss": 3.3409, + "step": 38800 + }, + { + "epoch": 11.310115893075535, + "grad_norm": 0.3438774049282074, + "learning_rate": 0.00046455345179143606, + "loss": 3.3288, + "step": 38850 + }, + { + "epoch": 11.324675324675324, + "grad_norm": 0.3514200448989868, + "learning_rate": 0.00046437867754150884, + "loss": 3.331, + "step": 38900 + }, + { + "epoch": 11.339234756275115, + "grad_norm": 0.34444525837898254, + "learning_rate": 0.00046420390329158167, + "loss": 3.3253, + "step": 38950 + }, + { + "epoch": 11.353794187874906, + "grad_norm": 0.34927886724472046, + "learning_rate": 0.0004640291290416545, + "loss": 3.3361, + "step": 39000 + }, + { + "epoch": 11.353794187874906, + "eval_accuracy": 0.36882192674458786, + "eval_loss": 3.568171739578247, + "eval_runtime": 180.5055, + "eval_samples_per_second": 92.202, + "eval_steps_per_second": 5.767, + "step": 39000 + }, + { + "epoch": 11.368353619474696, + "grad_norm": 0.34765294194221497, + "learning_rate": 0.00046385435479172734, + "loss": 3.3385, + "step": 39050 + }, + { + "epoch": 11.382913051074485, + "grad_norm": 0.37567201256752014, + "learning_rate": 0.00046367958054180017, + "loss": 3.3372, + "step": 39100 + }, + { + "epoch": 11.397472482674276, + "grad_norm": 0.3298972547054291, + "learning_rate": 0.00046350480629187295, + "loss": 3.3449, + "step": 39150 + }, + { + "epoch": 11.412031914274067, + "grad_norm": 0.3385719656944275, + "learning_rate": 0.0004633300320419458, + "loss": 3.3381, + "step": 39200 + }, + { + "epoch": 11.426591345873858, + "grad_norm": 0.3834417164325714, + "learning_rate": 0.0004631552577920186, + "loss": 3.3453, + "step": 39250 + }, + { + "epoch": 11.441150777473647, + "grad_norm": 0.36645200848579407, + "learning_rate": 0.00046298048354209145, + "loss": 3.3585, + "step": 39300 + }, + { + "epoch": 11.455710209073438, + "grad_norm": 0.3596128523349762, + "learning_rate": 0.0004628057092921642, + "loss": 3.3505, + "step": 39350 + }, + { + "epoch": 11.470269640673228, + "grad_norm": 0.37306201457977295, + "learning_rate": 0.00046263093504223706, + "loss": 3.3489, + "step": 39400 + }, + { + "epoch": 11.484829072273019, + "grad_norm": 0.40729859471321106, + "learning_rate": 0.0004624561607923099, + "loss": 3.3555, + "step": 39450 + }, + { + "epoch": 11.499388503872808, + "grad_norm": 0.35100769996643066, + "learning_rate": 0.0004622813865423827, + "loss": 3.3443, + "step": 39500 + }, + { + "epoch": 11.513947935472599, + "grad_norm": 0.347989559173584, + "learning_rate": 0.00046210661229245556, + "loss": 3.3647, + "step": 39550 + }, + { + "epoch": 11.52850736707239, + "grad_norm": 0.35340970754623413, + "learning_rate": 0.00046193183804252834, + "loss": 3.346, + "step": 39600 + }, + { + "epoch": 11.54306679867218, + "grad_norm": 0.3439280092716217, + "learning_rate": 0.00046175706379260117, + "loss": 3.3613, + "step": 39650 + }, + { + "epoch": 11.55762623027197, + "grad_norm": 0.34520137310028076, + "learning_rate": 0.000461582289542674, + "loss": 3.347, + "step": 39700 + }, + { + "epoch": 11.57218566187176, + "grad_norm": 0.3320297598838806, + "learning_rate": 0.00046140751529274684, + "loss": 3.3489, + "step": 39750 + }, + { + "epoch": 11.586745093471551, + "grad_norm": 0.35040003061294556, + "learning_rate": 0.00046123274104281967, + "loss": 3.3462, + "step": 39800 + }, + { + "epoch": 11.601304525071342, + "grad_norm": 0.3691483438014984, + "learning_rate": 0.00046105796679289245, + "loss": 3.3593, + "step": 39850 + }, + { + "epoch": 11.61586395667113, + "grad_norm": 0.3896438777446747, + "learning_rate": 0.0004608831925429653, + "loss": 3.3616, + "step": 39900 + }, + { + "epoch": 11.630423388270922, + "grad_norm": 0.36567434668540955, + "learning_rate": 0.0004607084182930381, + "loss": 3.3553, + "step": 39950 + }, + { + "epoch": 11.644982819870712, + "grad_norm": 0.343128502368927, + "learning_rate": 0.00046053364404311095, + "loss": 3.361, + "step": 40000 + }, + { + "epoch": 11.644982819870712, + "eval_accuracy": 0.36932248097582326, + "eval_loss": 3.5595271587371826, + "eval_runtime": 180.4161, + "eval_samples_per_second": 92.248, + "eval_steps_per_second": 5.77, + "step": 40000 + } + ], + "logging_steps": 50, + "max_steps": 171750, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 10000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 20, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.35936181747712e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}