{ "best_global_step": 89000, "best_metric": 3.530644655227661, "best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_push_frequency_5039/checkpoint-40000", "epoch": 31.741743840643018, "eval_steps": 1000, "global_step": 109000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014561127613722406, "grad_norm": 0.7142499685287476, "learning_rate": 0.000294, "loss": 8.4587, "step": 50 }, { "epoch": 0.029122255227444813, "grad_norm": 0.5678296685218811, "learning_rate": 0.0005939999999999999, "loss": 6.7387, "step": 100 }, { "epoch": 0.04368338284116722, "grad_norm": 0.5606561303138733, "learning_rate": 0.0005998286713286713, "loss": 6.3655, "step": 150 }, { "epoch": 0.058244510454889625, "grad_norm": 0.47408413887023926, "learning_rate": 0.0005996538461538461, "loss": 6.1262, "step": 200 }, { "epoch": 0.07280563806861204, "grad_norm": 0.4732152223587036, "learning_rate": 0.0005994790209790209, "loss": 5.9972, "step": 250 }, { "epoch": 0.08736676568233444, "grad_norm": 0.438856303691864, "learning_rate": 0.0005993041958041958, "loss": 5.8873, "step": 300 }, { "epoch": 0.10192789329605685, "grad_norm": 0.4995385408401489, "learning_rate": 0.0005991293706293705, "loss": 5.7343, "step": 350 }, { "epoch": 0.11648902090977925, "grad_norm": 0.4228087067604065, "learning_rate": 0.0005989545454545454, "loss": 5.63, "step": 400 }, { "epoch": 0.13105014852350166, "grad_norm": 0.4598565399646759, "learning_rate": 0.0005987797202797202, "loss": 5.5141, "step": 450 }, { "epoch": 0.14561127613722408, "grad_norm": 0.47586789727211, "learning_rate": 0.000598604895104895, "loss": 5.414, "step": 500 }, { "epoch": 0.16017240375094646, "grad_norm": 0.477698415517807, "learning_rate": 0.0005984300699300698, "loss": 5.3322, "step": 550 }, { "epoch": 0.17473353136466888, "grad_norm": 0.44980600476264954, "learning_rate": 0.0005982552447552447, "loss": 5.2471, "step": 600 }, { "epoch": 0.1892946589783913, "grad_norm": 0.4625977873802185, "learning_rate": 0.0005980804195804195, "loss": 5.1807, "step": 650 }, { "epoch": 0.2038557865921137, "grad_norm": 0.5372968912124634, "learning_rate": 0.0005979055944055943, "loss": 5.1209, "step": 700 }, { "epoch": 0.2184169142058361, "grad_norm": 0.5401430726051331, "learning_rate": 0.0005977307692307691, "loss": 5.0708, "step": 750 }, { "epoch": 0.2329780418195585, "grad_norm": 0.4346584379673004, "learning_rate": 0.000597555944055944, "loss": 5.0076, "step": 800 }, { "epoch": 0.24753916943328091, "grad_norm": 0.460715115070343, "learning_rate": 0.0005973811188811188, "loss": 4.9693, "step": 850 }, { "epoch": 0.2621002970470033, "grad_norm": 0.4268857538700104, "learning_rate": 0.0005972062937062936, "loss": 4.9309, "step": 900 }, { "epoch": 0.27666142466072574, "grad_norm": 0.44858410954475403, "learning_rate": 0.0005970314685314685, "loss": 4.8723, "step": 950 }, { "epoch": 0.29122255227444815, "grad_norm": 0.4247923493385315, "learning_rate": 0.0005968566433566433, "loss": 4.8399, "step": 1000 }, { "epoch": 0.29122255227444815, "eval_accuracy": 0.2545978147932791, "eval_loss": 4.757110118865967, "eval_runtime": 177.2928, "eval_samples_per_second": 93.89, "eval_steps_per_second": 5.872, "step": 1000 }, { "epoch": 0.30578367988817057, "grad_norm": 0.47108718752861023, "learning_rate": 0.0005966818181818181, "loss": 4.7695, "step": 1050 }, { "epoch": 0.3203448075018929, "grad_norm": 0.4697106182575226, "learning_rate": 0.0005965069930069929, "loss": 4.7315, "step": 1100 }, { "epoch": 0.33490593511561534, "grad_norm": 0.4338913559913635, "learning_rate": 0.0005963321678321677, "loss": 4.7008, "step": 1150 }, { "epoch": 0.34946706272933775, "grad_norm": 0.47174957394599915, "learning_rate": 0.0005961573426573425, "loss": 4.6608, "step": 1200 }, { "epoch": 0.36402819034306017, "grad_norm": 0.4433532953262329, "learning_rate": 0.0005959825174825174, "loss": 4.632, "step": 1250 }, { "epoch": 0.3785893179567826, "grad_norm": 0.5037450194358826, "learning_rate": 0.0005958076923076922, "loss": 4.5994, "step": 1300 }, { "epoch": 0.393150445570505, "grad_norm": 0.4283815622329712, "learning_rate": 0.000595632867132867, "loss": 4.567, "step": 1350 }, { "epoch": 0.4077115731842274, "grad_norm": 0.39518365263938904, "learning_rate": 0.0005954580419580418, "loss": 4.556, "step": 1400 }, { "epoch": 0.4222727007979498, "grad_norm": 0.435833215713501, "learning_rate": 0.0005952832167832168, "loss": 4.5333, "step": 1450 }, { "epoch": 0.4368338284116722, "grad_norm": 0.4726100265979767, "learning_rate": 0.0005951083916083916, "loss": 4.4928, "step": 1500 }, { "epoch": 0.4513949560253946, "grad_norm": 0.44213613867759705, "learning_rate": 0.0005949335664335664, "loss": 4.4892, "step": 1550 }, { "epoch": 0.465956083639117, "grad_norm": 0.39726656675338745, "learning_rate": 0.0005947587412587413, "loss": 4.4738, "step": 1600 }, { "epoch": 0.4805172112528394, "grad_norm": 0.4227396547794342, "learning_rate": 0.0005945839160839161, "loss": 4.4391, "step": 1650 }, { "epoch": 0.49507833886656183, "grad_norm": 0.4156210124492645, "learning_rate": 0.0005944090909090909, "loss": 4.4382, "step": 1700 }, { "epoch": 0.5096394664802842, "grad_norm": 0.4048631489276886, "learning_rate": 0.0005942342657342657, "loss": 4.4035, "step": 1750 }, { "epoch": 0.5242005940940067, "grad_norm": 0.3983406722545624, "learning_rate": 0.0005940594405594406, "loss": 4.3835, "step": 1800 }, { "epoch": 0.5387617217077291, "grad_norm": 0.4011361002922058, "learning_rate": 0.0005938846153846153, "loss": 4.3834, "step": 1850 }, { "epoch": 0.5533228493214515, "grad_norm": 0.4169325828552246, "learning_rate": 0.0005937097902097902, "loss": 4.3576, "step": 1900 }, { "epoch": 0.5678839769351739, "grad_norm": 0.4109379053115845, "learning_rate": 0.000593534965034965, "loss": 4.3456, "step": 1950 }, { "epoch": 0.5824451045488963, "grad_norm": 0.4288181960582733, "learning_rate": 0.0005933601398601398, "loss": 4.3272, "step": 2000 }, { "epoch": 0.5824451045488963, "eval_accuracy": 0.3003302568766484, "eval_loss": 4.275340557098389, "eval_runtime": 177.3788, "eval_samples_per_second": 93.844, "eval_steps_per_second": 5.869, "step": 2000 }, { "epoch": 0.5970062321626187, "grad_norm": 0.3974442183971405, "learning_rate": 0.0005931853146853146, "loss": 4.3343, "step": 2050 }, { "epoch": 0.6115673597763411, "grad_norm": 0.39755979180336, "learning_rate": 0.0005930104895104895, "loss": 4.3154, "step": 2100 }, { "epoch": 0.6261284873900634, "grad_norm": 0.3861689865589142, "learning_rate": 0.0005928356643356643, "loss": 4.2906, "step": 2150 }, { "epoch": 0.6406896150037859, "grad_norm": 0.3685368597507477, "learning_rate": 0.0005926608391608391, "loss": 4.2912, "step": 2200 }, { "epoch": 0.6552507426175083, "grad_norm": 0.4314819574356079, "learning_rate": 0.000592486013986014, "loss": 4.2757, "step": 2250 }, { "epoch": 0.6698118702312307, "grad_norm": 0.38772013783454895, "learning_rate": 0.0005923111888111888, "loss": 4.2653, "step": 2300 }, { "epoch": 0.6843729978449531, "grad_norm": 0.36975449323654175, "learning_rate": 0.0005921363636363636, "loss": 4.2463, "step": 2350 }, { "epoch": 0.6989341254586755, "grad_norm": 0.374622642993927, "learning_rate": 0.0005919615384615384, "loss": 4.2391, "step": 2400 }, { "epoch": 0.7134952530723979, "grad_norm": 0.3807830214500427, "learning_rate": 0.0005917867132867133, "loss": 4.2362, "step": 2450 }, { "epoch": 0.7280563806861203, "grad_norm": 0.35665392875671387, "learning_rate": 0.0005916118881118881, "loss": 4.2275, "step": 2500 }, { "epoch": 0.7426175082998427, "grad_norm": 0.39687132835388184, "learning_rate": 0.0005914370629370629, "loss": 4.2086, "step": 2550 }, { "epoch": 0.7571786359135652, "grad_norm": 0.3794260025024414, "learning_rate": 0.0005912622377622377, "loss": 4.2127, "step": 2600 }, { "epoch": 0.7717397635272876, "grad_norm": 0.40508922934532166, "learning_rate": 0.0005910874125874125, "loss": 4.2015, "step": 2650 }, { "epoch": 0.78630089114101, "grad_norm": 0.3626640737056732, "learning_rate": 0.0005909125874125873, "loss": 4.187, "step": 2700 }, { "epoch": 0.8008620187547324, "grad_norm": 0.36335116624832153, "learning_rate": 0.0005907377622377622, "loss": 4.173, "step": 2750 }, { "epoch": 0.8154231463684548, "grad_norm": 0.38054153323173523, "learning_rate": 0.000590562937062937, "loss": 4.1755, "step": 2800 }, { "epoch": 0.8299842739821772, "grad_norm": 0.3813888132572174, "learning_rate": 0.0005903881118881118, "loss": 4.171, "step": 2850 }, { "epoch": 0.8445454015958996, "grad_norm": 0.393034428358078, "learning_rate": 0.0005902132867132867, "loss": 4.154, "step": 2900 }, { "epoch": 0.8591065292096219, "grad_norm": 0.36573460698127747, "learning_rate": 0.0005900384615384615, "loss": 4.1445, "step": 2950 }, { "epoch": 0.8736676568233444, "grad_norm": 0.36031314730644226, "learning_rate": 0.0005898636363636363, "loss": 4.1337, "step": 3000 }, { "epoch": 0.8736676568233444, "eval_accuracy": 0.3159021295996076, "eval_loss": 4.092033386230469, "eval_runtime": 177.2002, "eval_samples_per_second": 93.939, "eval_steps_per_second": 5.875, "step": 3000 }, { "epoch": 0.8882287844370668, "grad_norm": 0.33011677861213684, "learning_rate": 0.0005896888111888111, "loss": 4.1267, "step": 3050 }, { "epoch": 0.9027899120507892, "grad_norm": 0.3441181480884552, "learning_rate": 0.000589513986013986, "loss": 4.1295, "step": 3100 }, { "epoch": 0.9173510396645116, "grad_norm": 0.3503768742084503, "learning_rate": 0.0005893391608391608, "loss": 4.1174, "step": 3150 }, { "epoch": 0.931912167278234, "grad_norm": 0.36162590980529785, "learning_rate": 0.0005891643356643356, "loss": 4.1218, "step": 3200 }, { "epoch": 0.9464732948919564, "grad_norm": 0.3388279974460602, "learning_rate": 0.0005889895104895104, "loss": 4.1123, "step": 3250 }, { "epoch": 0.9610344225056788, "grad_norm": 0.36532530188560486, "learning_rate": 0.0005888146853146853, "loss": 4.1076, "step": 3300 }, { "epoch": 0.9755955501194012, "grad_norm": 0.42103567719459534, "learning_rate": 0.00058863986013986, "loss": 4.0885, "step": 3350 }, { "epoch": 0.9901566777331237, "grad_norm": 0.354731947183609, "learning_rate": 0.0005884650349650349, "loss": 4.0847, "step": 3400 }, { "epoch": 1.004659560836391, "grad_norm": 0.34602439403533936, "learning_rate": 0.0005882902097902097, "loss": 4.0677, "step": 3450 }, { "epoch": 1.0192206884501136, "grad_norm": 0.3426474332809448, "learning_rate": 0.0005881153846153845, "loss": 4.0154, "step": 3500 }, { "epoch": 1.033781816063836, "grad_norm": 0.34196737408638, "learning_rate": 0.0005879405594405594, "loss": 4.0117, "step": 3550 }, { "epoch": 1.0483429436775584, "grad_norm": 0.35845261812210083, "learning_rate": 0.0005877657342657342, "loss": 4.0123, "step": 3600 }, { "epoch": 1.0629040712912807, "grad_norm": 0.3470519185066223, "learning_rate": 0.000587590909090909, "loss": 4.0003, "step": 3650 }, { "epoch": 1.0774651989050033, "grad_norm": 0.3520037531852722, "learning_rate": 0.0005874160839160838, "loss": 3.989, "step": 3700 }, { "epoch": 1.0920263265187256, "grad_norm": 0.3449414074420929, "learning_rate": 0.0005872412587412587, "loss": 4.0093, "step": 3750 }, { "epoch": 1.106587454132448, "grad_norm": 0.3322015106678009, "learning_rate": 0.0005870664335664335, "loss": 4.0085, "step": 3800 }, { "epoch": 1.1211485817461704, "grad_norm": 0.35296934843063354, "learning_rate": 0.0005868916083916083, "loss": 3.9958, "step": 3850 }, { "epoch": 1.135709709359893, "grad_norm": 0.3486291766166687, "learning_rate": 0.0005867167832167831, "loss": 4.0075, "step": 3900 }, { "epoch": 1.1502708369736152, "grad_norm": 0.34043097496032715, "learning_rate": 0.000586541958041958, "loss": 3.9973, "step": 3950 }, { "epoch": 1.1648319645873377, "grad_norm": 0.33894768357276917, "learning_rate": 0.0005863671328671328, "loss": 3.9915, "step": 4000 }, { "epoch": 1.1648319645873377, "eval_accuracy": 0.32561280096909206, "eval_loss": 3.9853665828704834, "eval_runtime": 177.196, "eval_samples_per_second": 93.941, "eval_steps_per_second": 5.875, "step": 4000 }, { "epoch": 1.17939309220106, "grad_norm": 0.3345903754234314, "learning_rate": 0.0005861923076923076, "loss": 3.9927, "step": 4050 }, { "epoch": 1.1939542198147826, "grad_norm": 0.3380124270915985, "learning_rate": 0.0005860174825174824, "loss": 3.9764, "step": 4100 }, { "epoch": 1.2085153474285049, "grad_norm": 0.3687117099761963, "learning_rate": 0.0005858426573426573, "loss": 3.985, "step": 4150 }, { "epoch": 1.2230764750422272, "grad_norm": 0.3383759558200836, "learning_rate": 0.000585667832167832, "loss": 3.967, "step": 4200 }, { "epoch": 1.2376376026559497, "grad_norm": 0.3614034354686737, "learning_rate": 0.000585493006993007, "loss": 3.9671, "step": 4250 }, { "epoch": 1.2521987302696722, "grad_norm": 0.3286982476711273, "learning_rate": 0.0005853181818181817, "loss": 3.9622, "step": 4300 }, { "epoch": 1.2667598578833945, "grad_norm": 0.34898343682289124, "learning_rate": 0.0005851433566433565, "loss": 3.9527, "step": 4350 }, { "epoch": 1.2813209854971168, "grad_norm": 0.3299115300178528, "learning_rate": 0.0005849685314685315, "loss": 3.9661, "step": 4400 }, { "epoch": 1.2958821131108393, "grad_norm": 0.3476330637931824, "learning_rate": 0.0005847937062937063, "loss": 3.9757, "step": 4450 }, { "epoch": 1.3104432407245616, "grad_norm": 0.3423220217227936, "learning_rate": 0.0005846188811188811, "loss": 3.9664, "step": 4500 }, { "epoch": 1.3250043683382842, "grad_norm": 0.36620113253593445, "learning_rate": 0.0005844440559440559, "loss": 3.9626, "step": 4550 }, { "epoch": 1.3395654959520065, "grad_norm": 0.34613823890686035, "learning_rate": 0.0005842692307692308, "loss": 3.9469, "step": 4600 }, { "epoch": 1.354126623565729, "grad_norm": 0.3371380865573883, "learning_rate": 0.0005840944055944056, "loss": 3.9431, "step": 4650 }, { "epoch": 1.3686877511794513, "grad_norm": 0.3557433485984802, "learning_rate": 0.0005839195804195804, "loss": 3.9371, "step": 4700 }, { "epoch": 1.3832488787931738, "grad_norm": 0.34661224484443665, "learning_rate": 0.0005837447552447552, "loss": 3.9475, "step": 4750 }, { "epoch": 1.3978100064068961, "grad_norm": 0.3363933563232422, "learning_rate": 0.0005835699300699301, "loss": 3.9525, "step": 4800 }, { "epoch": 1.4123711340206184, "grad_norm": 0.3473567068576813, "learning_rate": 0.0005833951048951048, "loss": 3.9478, "step": 4850 }, { "epoch": 1.426932261634341, "grad_norm": 0.35772213339805603, "learning_rate": 0.0005832202797202797, "loss": 3.9494, "step": 4900 }, { "epoch": 1.4414933892480635, "grad_norm": 0.3194500803947449, "learning_rate": 0.0005830454545454546, "loss": 3.9274, "step": 4950 }, { "epoch": 1.4560545168617858, "grad_norm": 0.3184325397014618, "learning_rate": 0.0005828706293706293, "loss": 3.91, "step": 5000 }, { "epoch": 1.4560545168617858, "eval_accuracy": 0.3322197019411703, "eval_loss": 3.9121792316436768, "eval_runtime": 177.1541, "eval_samples_per_second": 93.963, "eval_steps_per_second": 5.876, "step": 5000 }, { "epoch": 1.470615644475508, "grad_norm": 0.334377646446228, "learning_rate": 0.0005826958041958042, "loss": 3.9292, "step": 5050 }, { "epoch": 1.4851767720892306, "grad_norm": 0.3505079746246338, "learning_rate": 0.000582520979020979, "loss": 3.9197, "step": 5100 }, { "epoch": 1.4997378997029531, "grad_norm": 0.3423149585723877, "learning_rate": 0.0005823461538461538, "loss": 3.9216, "step": 5150 }, { "epoch": 1.5142990273166754, "grad_norm": 0.3242221772670746, "learning_rate": 0.0005821713286713286, "loss": 3.9264, "step": 5200 }, { "epoch": 1.5288601549303977, "grad_norm": 0.3360047936439514, "learning_rate": 0.0005819965034965035, "loss": 3.9075, "step": 5250 }, { "epoch": 1.5434212825441203, "grad_norm": 0.3281499445438385, "learning_rate": 0.0005818216783216783, "loss": 3.9156, "step": 5300 }, { "epoch": 1.5579824101578428, "grad_norm": 0.3143256604671478, "learning_rate": 0.0005816468531468531, "loss": 3.9054, "step": 5350 }, { "epoch": 1.572543537771565, "grad_norm": 0.34801602363586426, "learning_rate": 0.0005814720279720279, "loss": 3.9052, "step": 5400 }, { "epoch": 1.5871046653852874, "grad_norm": 0.33646127581596375, "learning_rate": 0.0005812972027972028, "loss": 3.9049, "step": 5450 }, { "epoch": 1.6016657929990097, "grad_norm": 0.34947100281715393, "learning_rate": 0.0005811223776223776, "loss": 3.9141, "step": 5500 }, { "epoch": 1.6162269206127322, "grad_norm": 0.3342869281768799, "learning_rate": 0.0005809475524475524, "loss": 3.8802, "step": 5550 }, { "epoch": 1.6307880482264547, "grad_norm": 0.3248964250087738, "learning_rate": 0.0005807727272727272, "loss": 3.9011, "step": 5600 }, { "epoch": 1.645349175840177, "grad_norm": 0.3466763496398926, "learning_rate": 0.0005805979020979021, "loss": 3.8889, "step": 5650 }, { "epoch": 1.6599103034538993, "grad_norm": 0.34471645951271057, "learning_rate": 0.0005804230769230769, "loss": 3.8915, "step": 5700 }, { "epoch": 1.6744714310676219, "grad_norm": 0.3128034770488739, "learning_rate": 0.0005802482517482517, "loss": 3.8916, "step": 5750 }, { "epoch": 1.6890325586813444, "grad_norm": 0.3492928445339203, "learning_rate": 0.0005800734265734265, "loss": 3.8693, "step": 5800 }, { "epoch": 1.7035936862950667, "grad_norm": 0.3331936001777649, "learning_rate": 0.0005798986013986013, "loss": 3.8772, "step": 5850 }, { "epoch": 1.718154813908789, "grad_norm": 0.3292458951473236, "learning_rate": 0.0005797237762237762, "loss": 3.874, "step": 5900 }, { "epoch": 1.7327159415225115, "grad_norm": 0.3334185779094696, "learning_rate": 0.000579548951048951, "loss": 3.878, "step": 5950 }, { "epoch": 1.747277069136234, "grad_norm": 0.3228028118610382, "learning_rate": 0.0005793741258741258, "loss": 3.8822, "step": 6000 }, { "epoch": 1.747277069136234, "eval_accuracy": 0.33703835809241034, "eval_loss": 3.8543479442596436, "eval_runtime": 177.1903, "eval_samples_per_second": 93.944, "eval_steps_per_second": 5.875, "step": 6000 }, { "epoch": 1.7618381967499563, "grad_norm": 0.35547763109207153, "learning_rate": 0.0005791993006993006, "loss": 3.8844, "step": 6050 }, { "epoch": 1.7763993243636786, "grad_norm": 0.33292338252067566, "learning_rate": 0.0005790244755244755, "loss": 3.8799, "step": 6100 }, { "epoch": 1.7909604519774012, "grad_norm": 0.3139176666736603, "learning_rate": 0.0005788496503496503, "loss": 3.869, "step": 6150 }, { "epoch": 1.8055215795911237, "grad_norm": 0.3225059509277344, "learning_rate": 0.0005786748251748251, "loss": 3.8519, "step": 6200 }, { "epoch": 1.820082707204846, "grad_norm": 0.3270617127418518, "learning_rate": 0.0005784999999999999, "loss": 3.8537, "step": 6250 }, { "epoch": 1.8346438348185683, "grad_norm": 0.32356396317481995, "learning_rate": 0.0005783251748251748, "loss": 3.8637, "step": 6300 }, { "epoch": 1.8492049624322906, "grad_norm": 0.34862807393074036, "learning_rate": 0.0005781503496503496, "loss": 3.8608, "step": 6350 }, { "epoch": 1.8637660900460131, "grad_norm": 0.3327373266220093, "learning_rate": 0.0005779755244755244, "loss": 3.8592, "step": 6400 }, { "epoch": 1.8783272176597356, "grad_norm": 0.323494553565979, "learning_rate": 0.0005778006993006993, "loss": 3.859, "step": 6450 }, { "epoch": 1.892888345273458, "grad_norm": 0.3193258047103882, "learning_rate": 0.000577625874125874, "loss": 3.8555, "step": 6500 }, { "epoch": 1.9074494728871803, "grad_norm": 0.34051451086997986, "learning_rate": 0.0005774510489510489, "loss": 3.8548, "step": 6550 }, { "epoch": 1.9220106005009028, "grad_norm": 0.34305649995803833, "learning_rate": 0.0005772762237762237, "loss": 3.8472, "step": 6600 }, { "epoch": 1.9365717281146253, "grad_norm": 0.3282603323459625, "learning_rate": 0.0005771013986013985, "loss": 3.8501, "step": 6650 }, { "epoch": 1.9511328557283476, "grad_norm": 0.31083357334136963, "learning_rate": 0.0005769265734265733, "loss": 3.8399, "step": 6700 }, { "epoch": 1.96569398334207, "grad_norm": 0.3474798798561096, "learning_rate": 0.0005767517482517482, "loss": 3.8562, "step": 6750 }, { "epoch": 1.9802551109557924, "grad_norm": 0.3274882435798645, "learning_rate": 0.000576576923076923, "loss": 3.8483, "step": 6800 }, { "epoch": 1.994816238569515, "grad_norm": 0.33568403124809265, "learning_rate": 0.0005764020979020978, "loss": 3.841, "step": 6850 }, { "epoch": 2.009319121672782, "grad_norm": 0.3249611556529999, "learning_rate": 0.0005762272727272726, "loss": 3.7798, "step": 6900 }, { "epoch": 2.023880249286505, "grad_norm": 0.3431794047355652, "learning_rate": 0.0005760524475524475, "loss": 3.7367, "step": 6950 }, { "epoch": 2.038441376900227, "grad_norm": 0.322765976190567, "learning_rate": 0.0005758776223776223, "loss": 3.7549, "step": 7000 }, { "epoch": 2.038441376900227, "eval_accuracy": 0.3418397325403657, "eval_loss": 3.810866355895996, "eval_runtime": 177.2789, "eval_samples_per_second": 93.897, "eval_steps_per_second": 5.872, "step": 7000 }, { "epoch": 2.0530025045139495, "grad_norm": 0.3592188060283661, "learning_rate": 0.0005757027972027971, "loss": 3.7467, "step": 7050 }, { "epoch": 2.067563632127672, "grad_norm": 0.3323240876197815, "learning_rate": 0.000575527972027972, "loss": 3.7268, "step": 7100 }, { "epoch": 2.0821247597413945, "grad_norm": 0.3217570185661316, "learning_rate": 0.0005753531468531468, "loss": 3.7491, "step": 7150 }, { "epoch": 2.096685887355117, "grad_norm": 0.33970892429351807, "learning_rate": 0.0005751783216783216, "loss": 3.7353, "step": 7200 }, { "epoch": 2.111247014968839, "grad_norm": 0.33313965797424316, "learning_rate": 0.0005750034965034964, "loss": 3.7521, "step": 7250 }, { "epoch": 2.1258081425825615, "grad_norm": 0.31688767671585083, "learning_rate": 0.0005748286713286712, "loss": 3.7518, "step": 7300 }, { "epoch": 2.140369270196284, "grad_norm": 0.32191213965415955, "learning_rate": 0.000574653846153846, "loss": 3.7579, "step": 7350 }, { "epoch": 2.1549303978100065, "grad_norm": 0.3361343741416931, "learning_rate": 0.000574479020979021, "loss": 3.7556, "step": 7400 }, { "epoch": 2.169491525423729, "grad_norm": 0.31816932559013367, "learning_rate": 0.0005743041958041958, "loss": 3.7599, "step": 7450 }, { "epoch": 2.184052653037451, "grad_norm": 0.32364562153816223, "learning_rate": 0.0005741293706293706, "loss": 3.7605, "step": 7500 }, { "epoch": 2.198613780651174, "grad_norm": 0.32209041714668274, "learning_rate": 0.0005739545454545454, "loss": 3.7569, "step": 7550 }, { "epoch": 2.213174908264896, "grad_norm": 0.33622536063194275, "learning_rate": 0.0005737797202797203, "loss": 3.7571, "step": 7600 }, { "epoch": 2.2277360358786185, "grad_norm": 0.34482723474502563, "learning_rate": 0.0005736048951048951, "loss": 3.7517, "step": 7650 }, { "epoch": 2.2422971634923408, "grad_norm": 0.322247177362442, "learning_rate": 0.0005734300699300699, "loss": 3.7608, "step": 7700 }, { "epoch": 2.256858291106063, "grad_norm": 0.3295249044895172, "learning_rate": 0.0005732552447552448, "loss": 3.7445, "step": 7750 }, { "epoch": 2.271419418719786, "grad_norm": 0.3132839798927307, "learning_rate": 0.0005730804195804196, "loss": 3.7522, "step": 7800 }, { "epoch": 2.285980546333508, "grad_norm": 0.31943801045417786, "learning_rate": 0.0005729055944055944, "loss": 3.7567, "step": 7850 }, { "epoch": 2.3005416739472304, "grad_norm": 0.3134506940841675, "learning_rate": 0.0005727307692307692, "loss": 3.7585, "step": 7900 }, { "epoch": 2.3151028015609527, "grad_norm": 0.32046815752983093, "learning_rate": 0.0005725559440559441, "loss": 3.7384, "step": 7950 }, { "epoch": 2.3296639291746755, "grad_norm": 0.34398654103279114, "learning_rate": 0.0005723811188811188, "loss": 3.751, "step": 8000 }, { "epoch": 2.3296639291746755, "eval_accuracy": 0.34463184446561096, "eval_loss": 3.7817397117614746, "eval_runtime": 177.3233, "eval_samples_per_second": 93.874, "eval_steps_per_second": 5.871, "step": 8000 }, { "epoch": 2.3442250567883978, "grad_norm": 0.3400885760784149, "learning_rate": 0.0005722062937062937, "loss": 3.7609, "step": 8050 }, { "epoch": 2.35878618440212, "grad_norm": 0.30969181656837463, "learning_rate": 0.0005720314685314685, "loss": 3.754, "step": 8100 }, { "epoch": 2.3733473120158424, "grad_norm": 0.35174381732940674, "learning_rate": 0.0005718566433566433, "loss": 3.7636, "step": 8150 }, { "epoch": 2.387908439629565, "grad_norm": 0.30494076013565063, "learning_rate": 0.0005716818181818181, "loss": 3.7505, "step": 8200 }, { "epoch": 2.4024695672432874, "grad_norm": 0.3317963480949402, "learning_rate": 0.000571506993006993, "loss": 3.7411, "step": 8250 }, { "epoch": 2.4170306948570097, "grad_norm": 0.3330554962158203, "learning_rate": 0.0005713321678321678, "loss": 3.7478, "step": 8300 }, { "epoch": 2.431591822470732, "grad_norm": 0.33387240767478943, "learning_rate": 0.0005711573426573426, "loss": 3.7595, "step": 8350 }, { "epoch": 2.4461529500844543, "grad_norm": 0.3251585066318512, "learning_rate": 0.0005709825174825175, "loss": 3.744, "step": 8400 }, { "epoch": 2.460714077698177, "grad_norm": 0.3279932141304016, "learning_rate": 0.0005708076923076923, "loss": 3.75, "step": 8450 }, { "epoch": 2.4752752053118994, "grad_norm": 0.317627876996994, "learning_rate": 0.0005706328671328671, "loss": 3.7403, "step": 8500 }, { "epoch": 2.4898363329256217, "grad_norm": 0.33804240822792053, "learning_rate": 0.0005704580419580419, "loss": 3.7408, "step": 8550 }, { "epoch": 2.5043974605393444, "grad_norm": 0.333448588848114, "learning_rate": 0.0005702832167832168, "loss": 3.7505, "step": 8600 }, { "epoch": 2.5189585881530667, "grad_norm": 0.325022429227829, "learning_rate": 0.0005701083916083916, "loss": 3.7527, "step": 8650 }, { "epoch": 2.533519715766789, "grad_norm": 0.3052625358104706, "learning_rate": 0.0005699335664335664, "loss": 3.7333, "step": 8700 }, { "epoch": 2.5480808433805113, "grad_norm": 0.3203795850276947, "learning_rate": 0.0005697587412587412, "loss": 3.7428, "step": 8750 }, { "epoch": 2.5626419709942336, "grad_norm": 0.30669328570365906, "learning_rate": 0.000569583916083916, "loss": 3.7437, "step": 8800 }, { "epoch": 2.5772030986079564, "grad_norm": 0.3302299976348877, "learning_rate": 0.0005694090909090908, "loss": 3.7527, "step": 8850 }, { "epoch": 2.5917642262216787, "grad_norm": 0.3199024498462677, "learning_rate": 0.0005692342657342657, "loss": 3.7376, "step": 8900 }, { "epoch": 2.606325353835401, "grad_norm": 0.3255689740180969, "learning_rate": 0.0005690594405594405, "loss": 3.7342, "step": 8950 }, { "epoch": 2.6208864814491233, "grad_norm": 0.33225247263908386, "learning_rate": 0.0005688846153846153, "loss": 3.7396, "step": 9000 }, { "epoch": 2.6208864814491233, "eval_accuracy": 0.347539990684339, "eval_loss": 3.750495672225952, "eval_runtime": 177.3467, "eval_samples_per_second": 93.861, "eval_steps_per_second": 5.87, "step": 9000 }, { "epoch": 2.6354476090628456, "grad_norm": 0.31644511222839355, "learning_rate": 0.0005687097902097901, "loss": 3.737, "step": 9050 }, { "epoch": 2.6500087366765683, "grad_norm": 0.32711660861968994, "learning_rate": 0.000568534965034965, "loss": 3.7258, "step": 9100 }, { "epoch": 2.6645698642902906, "grad_norm": 0.32171499729156494, "learning_rate": 0.0005683601398601398, "loss": 3.7459, "step": 9150 }, { "epoch": 2.679130991904013, "grad_norm": 0.30605417490005493, "learning_rate": 0.0005681853146853146, "loss": 3.7252, "step": 9200 }, { "epoch": 2.6936921195177357, "grad_norm": 0.31815701723098755, "learning_rate": 0.0005680104895104895, "loss": 3.7312, "step": 9250 }, { "epoch": 2.708253247131458, "grad_norm": 0.320014625787735, "learning_rate": 0.0005678356643356643, "loss": 3.7287, "step": 9300 }, { "epoch": 2.7228143747451803, "grad_norm": 0.31022703647613525, "learning_rate": 0.0005676608391608391, "loss": 3.7285, "step": 9350 }, { "epoch": 2.7373755023589026, "grad_norm": 0.33272796869277954, "learning_rate": 0.0005674860139860139, "loss": 3.7302, "step": 9400 }, { "epoch": 2.751936629972625, "grad_norm": 0.336722731590271, "learning_rate": 0.0005673111888111888, "loss": 3.713, "step": 9450 }, { "epoch": 2.7664977575863476, "grad_norm": 0.3396904468536377, "learning_rate": 0.0005671363636363635, "loss": 3.7374, "step": 9500 }, { "epoch": 2.78105888520007, "grad_norm": 0.3151884973049164, "learning_rate": 0.0005669615384615384, "loss": 3.7306, "step": 9550 }, { "epoch": 2.7956200128137922, "grad_norm": 0.31197404861450195, "learning_rate": 0.0005667867132867132, "loss": 3.7201, "step": 9600 }, { "epoch": 2.8101811404275145, "grad_norm": 0.33173638582229614, "learning_rate": 0.000566611888111888, "loss": 3.7265, "step": 9650 }, { "epoch": 2.824742268041237, "grad_norm": 0.31983959674835205, "learning_rate": 0.0005664370629370628, "loss": 3.7218, "step": 9700 }, { "epoch": 2.8393033956549596, "grad_norm": 0.3080846071243286, "learning_rate": 0.0005662622377622377, "loss": 3.7198, "step": 9750 }, { "epoch": 2.853864523268682, "grad_norm": 0.31331005692481995, "learning_rate": 0.0005660874125874125, "loss": 3.7078, "step": 9800 }, { "epoch": 2.868425650882404, "grad_norm": 0.3520505428314209, "learning_rate": 0.0005659125874125873, "loss": 3.726, "step": 9850 }, { "epoch": 2.882986778496127, "grad_norm": 0.3302844762802124, "learning_rate": 0.0005657377622377622, "loss": 3.7182, "step": 9900 }, { "epoch": 2.8975479061098492, "grad_norm": 0.3250056803226471, "learning_rate": 0.000565562937062937, "loss": 3.7079, "step": 9950 }, { "epoch": 2.9121090337235715, "grad_norm": 0.3227342963218689, "learning_rate": 0.0005653881118881118, "loss": 3.7105, "step": 10000 }, { "epoch": 2.9121090337235715, "eval_accuracy": 0.3495654768468674, "eval_loss": 3.724705457687378, "eval_runtime": 177.4012, "eval_samples_per_second": 93.833, "eval_steps_per_second": 5.868, "step": 10000 }, { "epoch": 2.926670161337294, "grad_norm": 0.32378628849983215, "learning_rate": 0.0005652132867132866, "loss": 3.7132, "step": 10050 }, { "epoch": 2.941231288951016, "grad_norm": 0.3264535665512085, "learning_rate": 0.0005650384615384615, "loss": 3.72, "step": 10100 }, { "epoch": 2.955792416564739, "grad_norm": 0.3335714042186737, "learning_rate": 0.0005648636363636363, "loss": 3.714, "step": 10150 }, { "epoch": 2.970353544178461, "grad_norm": 0.3222605288028717, "learning_rate": 0.0005646888111888111, "loss": 3.7256, "step": 10200 }, { "epoch": 2.9849146717921835, "grad_norm": 0.3324715495109558, "learning_rate": 0.000564513986013986, "loss": 3.7218, "step": 10250 }, { "epoch": 2.9994757994059063, "grad_norm": 0.3224101960659027, "learning_rate": 0.0005643391608391607, "loss": 3.7147, "step": 10300 }, { "epoch": 3.0139786825091734, "grad_norm": 0.3330100476741791, "learning_rate": 0.0005641643356643355, "loss": 3.6132, "step": 10350 }, { "epoch": 3.0285398101228957, "grad_norm": 0.3183719515800476, "learning_rate": 0.0005639895104895105, "loss": 3.6057, "step": 10400 }, { "epoch": 3.0431009377366185, "grad_norm": 0.31805160641670227, "learning_rate": 0.0005638146853146853, "loss": 3.616, "step": 10450 }, { "epoch": 3.057662065350341, "grad_norm": 0.3436165153980255, "learning_rate": 0.0005636398601398601, "loss": 3.625, "step": 10500 }, { "epoch": 3.072223192964063, "grad_norm": 0.32004937529563904, "learning_rate": 0.000563465034965035, "loss": 3.6283, "step": 10550 }, { "epoch": 3.0867843205777854, "grad_norm": 0.3213531970977783, "learning_rate": 0.0005632902097902098, "loss": 3.6217, "step": 10600 }, { "epoch": 3.101345448191508, "grad_norm": 0.31485000252723694, "learning_rate": 0.0005631153846153846, "loss": 3.6229, "step": 10650 }, { "epoch": 3.1159065758052304, "grad_norm": 0.3263075649738312, "learning_rate": 0.0005629405594405594, "loss": 3.6317, "step": 10700 }, { "epoch": 3.1304677034189528, "grad_norm": 0.34470370411872864, "learning_rate": 0.0005627657342657343, "loss": 3.6219, "step": 10750 }, { "epoch": 3.145028831032675, "grad_norm": 0.33321434259414673, "learning_rate": 0.0005625909090909091, "loss": 3.6205, "step": 10800 }, { "epoch": 3.1595899586463974, "grad_norm": 0.32217034697532654, "learning_rate": 0.0005624160839160839, "loss": 3.6309, "step": 10850 }, { "epoch": 3.17415108626012, "grad_norm": 0.34395185112953186, "learning_rate": 0.0005622412587412587, "loss": 3.6366, "step": 10900 }, { "epoch": 3.1887122138738424, "grad_norm": 0.3254980146884918, "learning_rate": 0.0005620664335664336, "loss": 3.6369, "step": 10950 }, { "epoch": 3.2032733414875647, "grad_norm": 0.338420033454895, "learning_rate": 0.0005618916083916083, "loss": 3.6418, "step": 11000 }, { "epoch": 3.2032733414875647, "eval_accuracy": 0.3518970960390101, "eval_loss": 3.711723566055298, "eval_runtime": 177.6335, "eval_samples_per_second": 93.71, "eval_steps_per_second": 5.86, "step": 11000 }, { "epoch": 3.217834469101287, "grad_norm": 0.3291839361190796, "learning_rate": 0.0005617167832167832, "loss": 3.6324, "step": 11050 }, { "epoch": 3.2323955967150098, "grad_norm": 0.3438150882720947, "learning_rate": 0.000561541958041958, "loss": 3.6253, "step": 11100 }, { "epoch": 3.246956724328732, "grad_norm": 0.36098095774650574, "learning_rate": 0.0005613671328671328, "loss": 3.6401, "step": 11150 }, { "epoch": 3.2615178519424544, "grad_norm": 0.351468026638031, "learning_rate": 0.0005611923076923077, "loss": 3.6395, "step": 11200 }, { "epoch": 3.2760789795561767, "grad_norm": 0.3300299644470215, "learning_rate": 0.0005610174825174825, "loss": 3.644, "step": 11250 }, { "epoch": 3.2906401071698994, "grad_norm": 0.3039533495903015, "learning_rate": 0.0005608426573426573, "loss": 3.6416, "step": 11300 }, { "epoch": 3.3052012347836217, "grad_norm": 0.35001346468925476, "learning_rate": 0.0005606678321678321, "loss": 3.6403, "step": 11350 }, { "epoch": 3.319762362397344, "grad_norm": 0.3169427514076233, "learning_rate": 0.000560493006993007, "loss": 3.6419, "step": 11400 }, { "epoch": 3.3343234900110663, "grad_norm": 0.3217058777809143, "learning_rate": 0.0005603181818181818, "loss": 3.6448, "step": 11450 }, { "epoch": 3.3488846176247886, "grad_norm": 0.3203667402267456, "learning_rate": 0.0005601433566433566, "loss": 3.6387, "step": 11500 }, { "epoch": 3.3634457452385114, "grad_norm": 0.3251610994338989, "learning_rate": 0.0005599685314685314, "loss": 3.6272, "step": 11550 }, { "epoch": 3.3780068728522337, "grad_norm": 0.329180508852005, "learning_rate": 0.0005597937062937063, "loss": 3.6358, "step": 11600 }, { "epoch": 3.392568000465956, "grad_norm": 0.31434664130210876, "learning_rate": 0.0005596188811188811, "loss": 3.6503, "step": 11650 }, { "epoch": 3.4071291280796787, "grad_norm": 0.3283628225326538, "learning_rate": 0.0005594440559440559, "loss": 3.6348, "step": 11700 }, { "epoch": 3.421690255693401, "grad_norm": 0.3240419626235962, "learning_rate": 0.0005592692307692307, "loss": 3.6482, "step": 11750 }, { "epoch": 3.4362513833071233, "grad_norm": 0.32653725147247314, "learning_rate": 0.0005590944055944055, "loss": 3.6465, "step": 11800 }, { "epoch": 3.4508125109208456, "grad_norm": 0.3058265447616577, "learning_rate": 0.0005589195804195803, "loss": 3.6452, "step": 11850 }, { "epoch": 3.465373638534568, "grad_norm": 0.3274167776107788, "learning_rate": 0.0005587447552447552, "loss": 3.6373, "step": 11900 }, { "epoch": 3.4799347661482907, "grad_norm": 0.34979650378227234, "learning_rate": 0.00055856993006993, "loss": 3.6462, "step": 11950 }, { "epoch": 3.494495893762013, "grad_norm": 0.3270816206932068, "learning_rate": 0.0005583951048951048, "loss": 3.6293, "step": 12000 }, { "epoch": 3.494495893762013, "eval_accuracy": 0.3533076122023403, "eval_loss": 3.6936569213867188, "eval_runtime": 177.1638, "eval_samples_per_second": 93.958, "eval_steps_per_second": 5.876, "step": 12000 }, { "epoch": 3.5090570213757353, "grad_norm": 0.3215574026107788, "learning_rate": 0.0005582202797202797, "loss": 3.632, "step": 12050 }, { "epoch": 3.523618148989458, "grad_norm": 0.326255738735199, "learning_rate": 0.0005580454545454545, "loss": 3.6415, "step": 12100 }, { "epoch": 3.53817927660318, "grad_norm": 0.33679527044296265, "learning_rate": 0.0005578706293706293, "loss": 3.6456, "step": 12150 }, { "epoch": 3.5527404042169026, "grad_norm": 0.3434588313102722, "learning_rate": 0.0005576958041958041, "loss": 3.6466, "step": 12200 }, { "epoch": 3.567301531830625, "grad_norm": 0.31146910786628723, "learning_rate": 0.000557520979020979, "loss": 3.6367, "step": 12250 }, { "epoch": 3.5818626594443472, "grad_norm": 0.3134685158729553, "learning_rate": 0.0005573461538461538, "loss": 3.6253, "step": 12300 }, { "epoch": 3.59642378705807, "grad_norm": 0.31741416454315186, "learning_rate": 0.0005571713286713286, "loss": 3.6344, "step": 12350 }, { "epoch": 3.6109849146717923, "grad_norm": 0.3213517963886261, "learning_rate": 0.0005569965034965034, "loss": 3.6461, "step": 12400 }, { "epoch": 3.6255460422855146, "grad_norm": 0.3236750662326813, "learning_rate": 0.0005568216783216783, "loss": 3.6363, "step": 12450 }, { "epoch": 3.640107169899237, "grad_norm": 0.3350413143634796, "learning_rate": 0.000556646853146853, "loss": 3.6341, "step": 12500 }, { "epoch": 3.654668297512959, "grad_norm": 0.3210690915584564, "learning_rate": 0.0005564720279720279, "loss": 3.6341, "step": 12550 }, { "epoch": 3.669229425126682, "grad_norm": 0.31280940771102905, "learning_rate": 0.0005562972027972027, "loss": 3.6406, "step": 12600 }, { "epoch": 3.6837905527404042, "grad_norm": 0.3478114902973175, "learning_rate": 0.0005561223776223775, "loss": 3.6437, "step": 12650 }, { "epoch": 3.6983516803541265, "grad_norm": 0.3223908245563507, "learning_rate": 0.0005559475524475524, "loss": 3.6346, "step": 12700 }, { "epoch": 3.7129128079678493, "grad_norm": 0.30570903420448303, "learning_rate": 0.0005557727272727272, "loss": 3.6475, "step": 12750 }, { "epoch": 3.7274739355815716, "grad_norm": 0.3094755709171295, "learning_rate": 0.000555597902097902, "loss": 3.6307, "step": 12800 }, { "epoch": 3.742035063195294, "grad_norm": 0.2975589334964752, "learning_rate": 0.0005554230769230768, "loss": 3.6396, "step": 12850 }, { "epoch": 3.756596190809016, "grad_norm": 0.3289732038974762, "learning_rate": 0.0005552482517482517, "loss": 3.6434, "step": 12900 }, { "epoch": 3.7711573184227385, "grad_norm": 0.32117950916290283, "learning_rate": 0.0005550734265734265, "loss": 3.6423, "step": 12950 }, { "epoch": 3.7857184460364612, "grad_norm": 0.32686787843704224, "learning_rate": 0.0005548986013986013, "loss": 3.642, "step": 13000 }, { "epoch": 3.7857184460364612, "eval_accuracy": 0.35558350671858546, "eval_loss": 3.676309823989868, "eval_runtime": 177.0944, "eval_samples_per_second": 93.995, "eval_steps_per_second": 5.878, "step": 13000 }, { "epoch": 3.8002795736501835, "grad_norm": 0.3189132809638977, "learning_rate": 0.0005547237762237761, "loss": 3.6439, "step": 13050 }, { "epoch": 3.814840701263906, "grad_norm": 0.33861854672431946, "learning_rate": 0.000554548951048951, "loss": 3.6343, "step": 13100 }, { "epoch": 3.829401828877628, "grad_norm": 0.3242887854576111, "learning_rate": 0.0005543741258741258, "loss": 3.64, "step": 13150 }, { "epoch": 3.8439629564913504, "grad_norm": 0.3283834457397461, "learning_rate": 0.0005541993006993006, "loss": 3.6236, "step": 13200 }, { "epoch": 3.858524084105073, "grad_norm": 0.3189845085144043, "learning_rate": 0.0005540244755244756, "loss": 3.6343, "step": 13250 }, { "epoch": 3.8730852117187955, "grad_norm": 0.3211795687675476, "learning_rate": 0.0005538496503496502, "loss": 3.6512, "step": 13300 }, { "epoch": 3.887646339332518, "grad_norm": 0.3205946087837219, "learning_rate": 0.0005536748251748252, "loss": 3.63, "step": 13350 }, { "epoch": 3.9022074669462405, "grad_norm": 0.3182278275489807, "learning_rate": 0.0005535, "loss": 3.6192, "step": 13400 }, { "epoch": 3.916768594559963, "grad_norm": 0.3093646466732025, "learning_rate": 0.0005533251748251748, "loss": 3.629, "step": 13450 }, { "epoch": 3.931329722173685, "grad_norm": 0.325284481048584, "learning_rate": 0.0005531503496503496, "loss": 3.6445, "step": 13500 }, { "epoch": 3.9458908497874075, "grad_norm": 0.34685397148132324, "learning_rate": 0.0005529755244755245, "loss": 3.6269, "step": 13550 }, { "epoch": 3.9604519774011298, "grad_norm": 0.3556680977344513, "learning_rate": 0.0005528006993006993, "loss": 3.6371, "step": 13600 }, { "epoch": 3.9750131050148525, "grad_norm": 0.3256296217441559, "learning_rate": 0.0005526258741258741, "loss": 3.6367, "step": 13650 }, { "epoch": 3.989574232628575, "grad_norm": 0.31314557790756226, "learning_rate": 0.0005524510489510489, "loss": 3.6367, "step": 13700 }, { "epoch": 4.004077115731842, "grad_norm": 0.34284132719039917, "learning_rate": 0.0005522762237762238, "loss": 3.6039, "step": 13750 }, { "epoch": 4.018638243345564, "grad_norm": 0.31531134247779846, "learning_rate": 0.0005521013986013986, "loss": 3.5281, "step": 13800 }, { "epoch": 4.033199370959287, "grad_norm": 0.329609751701355, "learning_rate": 0.0005519265734265734, "loss": 3.5438, "step": 13850 }, { "epoch": 4.04776049857301, "grad_norm": 0.3221539855003357, "learning_rate": 0.0005517517482517482, "loss": 3.5235, "step": 13900 }, { "epoch": 4.062321626186732, "grad_norm": 0.3120812177658081, "learning_rate": 0.0005515769230769231, "loss": 3.5343, "step": 13950 }, { "epoch": 4.076882753800454, "grad_norm": 0.32255053520202637, "learning_rate": 0.0005514020979020979, "loss": 3.543, "step": 14000 }, { "epoch": 4.076882753800454, "eval_accuracy": 0.35604047257346666, "eval_loss": 3.672227382659912, "eval_runtime": 177.2502, "eval_samples_per_second": 93.912, "eval_steps_per_second": 5.873, "step": 14000 }, { "epoch": 4.091443881414177, "grad_norm": 0.33779409527778625, "learning_rate": 0.0005512272727272727, "loss": 3.5375, "step": 14050 }, { "epoch": 4.106005009027899, "grad_norm": 0.30933794379234314, "learning_rate": 0.0005510524475524475, "loss": 3.5311, "step": 14100 }, { "epoch": 4.120566136641622, "grad_norm": 0.3431943356990814, "learning_rate": 0.0005508776223776223, "loss": 3.532, "step": 14150 }, { "epoch": 4.135127264255344, "grad_norm": 0.31620094180107117, "learning_rate": 0.0005507027972027972, "loss": 3.5503, "step": 14200 }, { "epoch": 4.149688391869066, "grad_norm": 0.3254013657569885, "learning_rate": 0.000550527972027972, "loss": 3.5538, "step": 14250 }, { "epoch": 4.164249519482789, "grad_norm": 0.33322015404701233, "learning_rate": 0.0005503531468531468, "loss": 3.5521, "step": 14300 }, { "epoch": 4.178810647096511, "grad_norm": 0.3261924386024475, "learning_rate": 0.0005501783216783216, "loss": 3.5439, "step": 14350 }, { "epoch": 4.193371774710234, "grad_norm": 0.32681429386138916, "learning_rate": 0.0005500034965034965, "loss": 3.5543, "step": 14400 }, { "epoch": 4.207932902323956, "grad_norm": 0.3240799307823181, "learning_rate": 0.0005498286713286713, "loss": 3.5622, "step": 14450 }, { "epoch": 4.222494029937678, "grad_norm": 0.32154789566993713, "learning_rate": 0.0005496538461538461, "loss": 3.5425, "step": 14500 }, { "epoch": 4.237055157551401, "grad_norm": 0.3319743275642395, "learning_rate": 0.0005494790209790209, "loss": 3.5755, "step": 14550 }, { "epoch": 4.251616285165123, "grad_norm": 0.32192280888557434, "learning_rate": 0.0005493041958041958, "loss": 3.5474, "step": 14600 }, { "epoch": 4.266177412778846, "grad_norm": 0.3264658451080322, "learning_rate": 0.0005491293706293706, "loss": 3.5525, "step": 14650 }, { "epoch": 4.280738540392568, "grad_norm": 0.30953332781791687, "learning_rate": 0.0005489545454545454, "loss": 3.5686, "step": 14700 }, { "epoch": 4.29529966800629, "grad_norm": 0.32248425483703613, "learning_rate": 0.0005487797202797203, "loss": 3.5576, "step": 14750 }, { "epoch": 4.309860795620013, "grad_norm": 0.3248192071914673, "learning_rate": 0.000548604895104895, "loss": 3.5652, "step": 14800 }, { "epoch": 4.324421923233735, "grad_norm": 0.3153315484523773, "learning_rate": 0.0005484300699300699, "loss": 3.5583, "step": 14850 }, { "epoch": 4.338983050847458, "grad_norm": 0.33788663148880005, "learning_rate": 0.0005482552447552447, "loss": 3.5586, "step": 14900 }, { "epoch": 4.35354417846118, "grad_norm": 0.3236865699291229, "learning_rate": 0.0005480804195804195, "loss": 3.5692, "step": 14950 }, { "epoch": 4.368105306074902, "grad_norm": 0.3407519459724426, "learning_rate": 0.0005479055944055943, "loss": 3.5577, "step": 15000 }, { "epoch": 4.368105306074902, "eval_accuracy": 0.3575495062017802, "eval_loss": 3.658498764038086, "eval_runtime": 176.9515, "eval_samples_per_second": 94.071, "eval_steps_per_second": 5.883, "step": 15000 }, { "epoch": 4.382666433688625, "grad_norm": 0.3334459364414215, "learning_rate": 0.0005477307692307692, "loss": 3.5686, "step": 15050 }, { "epoch": 4.397227561302348, "grad_norm": 0.32275843620300293, "learning_rate": 0.000547555944055944, "loss": 3.5695, "step": 15100 }, { "epoch": 4.41178868891607, "grad_norm": 0.3123950660228729, "learning_rate": 0.0005473811188811188, "loss": 3.5732, "step": 15150 }, { "epoch": 4.426349816529792, "grad_norm": 0.32868602871894836, "learning_rate": 0.0005472062937062936, "loss": 3.5836, "step": 15200 }, { "epoch": 4.440910944143514, "grad_norm": 0.3395336866378784, "learning_rate": 0.0005470314685314685, "loss": 3.5748, "step": 15250 }, { "epoch": 4.455472071757237, "grad_norm": 0.3110513687133789, "learning_rate": 0.0005468566433566433, "loss": 3.5758, "step": 15300 }, { "epoch": 4.47003319937096, "grad_norm": 0.3269290328025818, "learning_rate": 0.0005466818181818181, "loss": 3.5562, "step": 15350 }, { "epoch": 4.4845943269846815, "grad_norm": 0.32525432109832764, "learning_rate": 0.000546506993006993, "loss": 3.5786, "step": 15400 }, { "epoch": 4.499155454598404, "grad_norm": 0.33424168825149536, "learning_rate": 0.0005463321678321678, "loss": 3.577, "step": 15450 }, { "epoch": 4.513716582212126, "grad_norm": 0.31379204988479614, "learning_rate": 0.0005461573426573426, "loss": 3.5716, "step": 15500 }, { "epoch": 4.528277709825849, "grad_norm": 0.33003076910972595, "learning_rate": 0.0005459825174825174, "loss": 3.5673, "step": 15550 }, { "epoch": 4.542838837439572, "grad_norm": 0.31797513365745544, "learning_rate": 0.0005458076923076922, "loss": 3.5634, "step": 15600 }, { "epoch": 4.5573999650532935, "grad_norm": 0.3288755416870117, "learning_rate": 0.000545632867132867, "loss": 3.5642, "step": 15650 }, { "epoch": 4.571961092667016, "grad_norm": 0.3203357756137848, "learning_rate": 0.0005454580419580419, "loss": 3.5713, "step": 15700 }, { "epoch": 4.586522220280738, "grad_norm": 0.31958380341529846, "learning_rate": 0.0005452832167832167, "loss": 3.5633, "step": 15750 }, { "epoch": 4.601083347894461, "grad_norm": 0.33480215072631836, "learning_rate": 0.0005451083916083915, "loss": 3.5648, "step": 15800 }, { "epoch": 4.615644475508184, "grad_norm": 0.31152933835983276, "learning_rate": 0.0005449335664335663, "loss": 3.5824, "step": 15850 }, { "epoch": 4.630205603121905, "grad_norm": 0.31223437190055847, "learning_rate": 0.0005447587412587412, "loss": 3.5717, "step": 15900 }, { "epoch": 4.644766730735628, "grad_norm": 0.32843029499053955, "learning_rate": 0.000544583916083916, "loss": 3.561, "step": 15950 }, { "epoch": 4.659327858349351, "grad_norm": 0.3143811523914337, "learning_rate": 0.0005444090909090908, "loss": 3.5824, "step": 16000 }, { "epoch": 4.659327858349351, "eval_accuracy": 0.35877027631680114, "eval_loss": 3.646221399307251, "eval_runtime": 176.8737, "eval_samples_per_second": 94.112, "eval_steps_per_second": 5.886, "step": 16000 }, { "epoch": 4.673888985963073, "grad_norm": 0.32724830508232117, "learning_rate": 0.0005442342657342657, "loss": 3.5791, "step": 16050 }, { "epoch": 4.6884501135767955, "grad_norm": 0.3239496648311615, "learning_rate": 0.0005440594405594405, "loss": 3.5755, "step": 16100 }, { "epoch": 4.703011241190518, "grad_norm": 0.317526251077652, "learning_rate": 0.0005438846153846153, "loss": 3.5594, "step": 16150 }, { "epoch": 4.71757236880424, "grad_norm": 0.32207709550857544, "learning_rate": 0.0005437097902097901, "loss": 3.566, "step": 16200 }, { "epoch": 4.732133496417963, "grad_norm": 0.32053446769714355, "learning_rate": 0.0005435349650349651, "loss": 3.5819, "step": 16250 }, { "epoch": 4.746694624031685, "grad_norm": 0.3298093378543854, "learning_rate": 0.0005433601398601397, "loss": 3.576, "step": 16300 }, { "epoch": 4.7612557516454075, "grad_norm": 0.32744932174682617, "learning_rate": 0.0005431853146853147, "loss": 3.5798, "step": 16350 }, { "epoch": 4.77581687925913, "grad_norm": 0.3389306664466858, "learning_rate": 0.0005430104895104895, "loss": 3.5866, "step": 16400 }, { "epoch": 4.790378006872852, "grad_norm": 0.3112584352493286, "learning_rate": 0.0005428356643356643, "loss": 3.5773, "step": 16450 }, { "epoch": 4.804939134486575, "grad_norm": 0.32686665654182434, "learning_rate": 0.0005426608391608391, "loss": 3.5815, "step": 16500 }, { "epoch": 4.819500262100297, "grad_norm": 0.3334883749485016, "learning_rate": 0.000542486013986014, "loss": 3.5664, "step": 16550 }, { "epoch": 4.834061389714019, "grad_norm": 0.3258337378501892, "learning_rate": 0.0005423111888111888, "loss": 3.5747, "step": 16600 }, { "epoch": 4.848622517327742, "grad_norm": 0.29793718457221985, "learning_rate": 0.0005421363636363636, "loss": 3.5803, "step": 16650 }, { "epoch": 4.863183644941464, "grad_norm": 0.3122703433036804, "learning_rate": 0.0005419615384615385, "loss": 3.5709, "step": 16700 }, { "epoch": 4.877744772555187, "grad_norm": 0.31609809398651123, "learning_rate": 0.0005417867132867133, "loss": 3.5709, "step": 16750 }, { "epoch": 4.892305900168909, "grad_norm": 0.3263167142868042, "learning_rate": 0.0005416118881118881, "loss": 3.5827, "step": 16800 }, { "epoch": 4.906867027782631, "grad_norm": 0.3147113025188446, "learning_rate": 0.0005414370629370629, "loss": 3.5783, "step": 16850 }, { "epoch": 4.921428155396354, "grad_norm": 0.3340342938899994, "learning_rate": 0.0005412622377622378, "loss": 3.5829, "step": 16900 }, { "epoch": 4.935989283010076, "grad_norm": 0.3196568489074707, "learning_rate": 0.0005410874125874126, "loss": 3.5705, "step": 16950 }, { "epoch": 4.950550410623799, "grad_norm": 0.3239845931529999, "learning_rate": 0.0005409125874125874, "loss": 3.5754, "step": 17000 }, { "epoch": 4.950550410623799, "eval_accuracy": 0.3598982895346002, "eval_loss": 3.6311697959899902, "eval_runtime": 176.9446, "eval_samples_per_second": 94.075, "eval_steps_per_second": 5.883, "step": 17000 }, { "epoch": 4.9651115382375215, "grad_norm": 0.3366827368736267, "learning_rate": 0.0005407377622377622, "loss": 3.5755, "step": 17050 }, { "epoch": 4.979672665851243, "grad_norm": 0.3341779112815857, "learning_rate": 0.000540562937062937, "loss": 3.5782, "step": 17100 }, { "epoch": 4.994233793464966, "grad_norm": 0.3099096119403839, "learning_rate": 0.0005403881118881118, "loss": 3.5812, "step": 17150 }, { "epoch": 5.008736676568233, "grad_norm": 0.3530953824520111, "learning_rate": 0.0005402132867132867, "loss": 3.4902, "step": 17200 }, { "epoch": 5.023297804181956, "grad_norm": 0.3362038731575012, "learning_rate": 0.0005400384615384615, "loss": 3.4618, "step": 17250 }, { "epoch": 5.037858931795678, "grad_norm": 0.35798731446266174, "learning_rate": 0.0005398636363636363, "loss": 3.4807, "step": 17300 }, { "epoch": 5.052420059409401, "grad_norm": 0.3367793560028076, "learning_rate": 0.0005396888111888111, "loss": 3.4656, "step": 17350 }, { "epoch": 5.066981187023123, "grad_norm": 0.3349305987358093, "learning_rate": 0.000539513986013986, "loss": 3.4776, "step": 17400 }, { "epoch": 5.081542314636845, "grad_norm": 0.31668785214424133, "learning_rate": 0.0005393391608391608, "loss": 3.4841, "step": 17450 }, { "epoch": 5.096103442250568, "grad_norm": 0.32060810923576355, "learning_rate": 0.0005391643356643356, "loss": 3.4835, "step": 17500 }, { "epoch": 5.110664569864291, "grad_norm": 0.31662890315055847, "learning_rate": 0.0005389895104895105, "loss": 3.4778, "step": 17550 }, { "epoch": 5.125225697478013, "grad_norm": 0.3131362795829773, "learning_rate": 0.0005388146853146853, "loss": 3.4767, "step": 17600 }, { "epoch": 5.139786825091735, "grad_norm": 0.32624542713165283, "learning_rate": 0.0005386398601398601, "loss": 3.4872, "step": 17650 }, { "epoch": 5.154347952705457, "grad_norm": 0.3383287787437439, "learning_rate": 0.0005384650349650349, "loss": 3.4867, "step": 17700 }, { "epoch": 5.16890908031918, "grad_norm": 0.3286052644252777, "learning_rate": 0.0005382902097902098, "loss": 3.4852, "step": 17750 }, { "epoch": 5.183470207932903, "grad_norm": 0.31900569796562195, "learning_rate": 0.0005381153846153845, "loss": 3.4994, "step": 17800 }, { "epoch": 5.1980313355466246, "grad_norm": 0.3237035572528839, "learning_rate": 0.0005379405594405594, "loss": 3.4957, "step": 17850 }, { "epoch": 5.212592463160347, "grad_norm": 0.3051255941390991, "learning_rate": 0.0005377657342657342, "loss": 3.5044, "step": 17900 }, { "epoch": 5.227153590774069, "grad_norm": 0.33518698811531067, "learning_rate": 0.000537590909090909, "loss": 3.4936, "step": 17950 }, { "epoch": 5.241714718387792, "grad_norm": 0.3128952980041504, "learning_rate": 0.0005374160839160838, "loss": 3.5, "step": 18000 }, { "epoch": 5.241714718387792, "eval_accuracy": 0.36015245989175304, "eval_loss": 3.6348447799682617, "eval_runtime": 176.8036, "eval_samples_per_second": 94.15, "eval_steps_per_second": 5.888, "step": 18000 }, { "epoch": 5.256275846001515, "grad_norm": 0.33034732937812805, "learning_rate": 0.0005372412587412587, "loss": 3.509, "step": 18050 }, { "epoch": 5.2708369736152365, "grad_norm": 0.3423735499382019, "learning_rate": 0.0005370664335664335, "loss": 3.5055, "step": 18100 }, { "epoch": 5.285398101228959, "grad_norm": 0.3269577920436859, "learning_rate": 0.0005368916083916083, "loss": 3.5036, "step": 18150 }, { "epoch": 5.299959228842681, "grad_norm": 0.3377140462398529, "learning_rate": 0.0005367167832167832, "loss": 3.5139, "step": 18200 }, { "epoch": 5.314520356456404, "grad_norm": 0.3444065749645233, "learning_rate": 0.000536541958041958, "loss": 3.5031, "step": 18250 }, { "epoch": 5.329081484070127, "grad_norm": 0.3035888671875, "learning_rate": 0.0005363671328671328, "loss": 3.4989, "step": 18300 }, { "epoch": 5.3436426116838485, "grad_norm": 0.3229525089263916, "learning_rate": 0.0005361923076923076, "loss": 3.5053, "step": 18350 }, { "epoch": 5.358203739297571, "grad_norm": 0.3133704960346222, "learning_rate": 0.0005360174825174825, "loss": 3.5075, "step": 18400 }, { "epoch": 5.372764866911294, "grad_norm": 0.31471043825149536, "learning_rate": 0.0005358426573426573, "loss": 3.5161, "step": 18450 }, { "epoch": 5.387325994525016, "grad_norm": 0.3169424533843994, "learning_rate": 0.0005356678321678321, "loss": 3.5031, "step": 18500 }, { "epoch": 5.401887122138739, "grad_norm": 0.3456338047981262, "learning_rate": 0.0005354930069930069, "loss": 3.5103, "step": 18550 }, { "epoch": 5.41644824975246, "grad_norm": 0.3145761489868164, "learning_rate": 0.0005353181818181817, "loss": 3.5202, "step": 18600 }, { "epoch": 5.431009377366183, "grad_norm": 0.3344932198524475, "learning_rate": 0.0005351433566433565, "loss": 3.5115, "step": 18650 }, { "epoch": 5.445570504979906, "grad_norm": 0.32802441716194153, "learning_rate": 0.0005349685314685314, "loss": 3.5108, "step": 18700 }, { "epoch": 5.460131632593628, "grad_norm": 0.33433371782302856, "learning_rate": 0.0005347937062937062, "loss": 3.5175, "step": 18750 }, { "epoch": 5.4746927602073505, "grad_norm": 0.3225354552268982, "learning_rate": 0.000534618881118881, "loss": 3.5294, "step": 18800 }, { "epoch": 5.489253887821073, "grad_norm": 0.3271482586860657, "learning_rate": 0.0005344440559440559, "loss": 3.515, "step": 18850 }, { "epoch": 5.503815015434795, "grad_norm": 0.32893088459968567, "learning_rate": 0.0005342692307692307, "loss": 3.517, "step": 18900 }, { "epoch": 5.518376143048518, "grad_norm": 0.3370042145252228, "learning_rate": 0.0005340944055944055, "loss": 3.5241, "step": 18950 }, { "epoch": 5.53293727066224, "grad_norm": 0.32803529500961304, "learning_rate": 0.0005339195804195803, "loss": 3.5244, "step": 19000 }, { "epoch": 5.53293727066224, "eval_accuracy": 0.36118689327407866, "eval_loss": 3.623746395111084, "eval_runtime": 177.0774, "eval_samples_per_second": 94.004, "eval_steps_per_second": 5.879, "step": 19000 }, { "epoch": 5.5474983982759625, "grad_norm": 0.325192391872406, "learning_rate": 0.0005337447552447552, "loss": 3.5244, "step": 19050 }, { "epoch": 5.562059525889685, "grad_norm": 0.3339932858943939, "learning_rate": 0.00053356993006993, "loss": 3.5122, "step": 19100 }, { "epoch": 5.576620653503407, "grad_norm": 0.3150177597999573, "learning_rate": 0.0005333951048951048, "loss": 3.5213, "step": 19150 }, { "epoch": 5.59118178111713, "grad_norm": 0.3161846697330475, "learning_rate": 0.0005332202797202796, "loss": 3.5178, "step": 19200 }, { "epoch": 5.605742908730852, "grad_norm": 0.3393925130367279, "learning_rate": 0.0005330454545454546, "loss": 3.5075, "step": 19250 }, { "epoch": 5.620304036344574, "grad_norm": 0.3091968595981598, "learning_rate": 0.0005328706293706292, "loss": 3.5195, "step": 19300 }, { "epoch": 5.634865163958297, "grad_norm": 0.3470529317855835, "learning_rate": 0.0005326958041958042, "loss": 3.5365, "step": 19350 }, { "epoch": 5.649426291572019, "grad_norm": 0.3210427165031433, "learning_rate": 0.000532520979020979, "loss": 3.5194, "step": 19400 }, { "epoch": 5.663987419185742, "grad_norm": 0.3233715295791626, "learning_rate": 0.0005323461538461538, "loss": 3.5358, "step": 19450 }, { "epoch": 5.6785485467994645, "grad_norm": 0.3228999376296997, "learning_rate": 0.0005321713286713287, "loss": 3.5195, "step": 19500 }, { "epoch": 5.693109674413186, "grad_norm": 0.32254138588905334, "learning_rate": 0.0005319965034965035, "loss": 3.5227, "step": 19550 }, { "epoch": 5.707670802026909, "grad_norm": 0.3360702097415924, "learning_rate": 0.0005318216783216783, "loss": 3.5264, "step": 19600 }, { "epoch": 5.722231929640631, "grad_norm": 0.32106834650039673, "learning_rate": 0.0005316468531468531, "loss": 3.5294, "step": 19650 }, { "epoch": 5.736793057254354, "grad_norm": 0.3194294273853302, "learning_rate": 0.000531472027972028, "loss": 3.5303, "step": 19700 }, { "epoch": 5.7513541848680765, "grad_norm": 0.31817537546157837, "learning_rate": 0.0005312972027972028, "loss": 3.5289, "step": 19750 }, { "epoch": 5.765915312481798, "grad_norm": 0.32823848724365234, "learning_rate": 0.0005311223776223776, "loss": 3.542, "step": 19800 }, { "epoch": 5.780476440095521, "grad_norm": 0.33044731616973877, "learning_rate": 0.0005309475524475524, "loss": 3.5308, "step": 19850 }, { "epoch": 5.795037567709244, "grad_norm": 0.30992332100868225, "learning_rate": 0.0005307727272727273, "loss": 3.5285, "step": 19900 }, { "epoch": 5.809598695322966, "grad_norm": 0.3125179409980774, "learning_rate": 0.0005305979020979021, "loss": 3.516, "step": 19950 }, { "epoch": 5.824159822936688, "grad_norm": 0.32367056608200073, "learning_rate": 0.0005304230769230769, "loss": 3.5255, "step": 20000 }, { "epoch": 5.824159822936688, "eval_accuracy": 0.36226388432027534, "eval_loss": 3.613421678543091, "eval_runtime": 176.9052, "eval_samples_per_second": 94.096, "eval_steps_per_second": 5.885, "step": 20000 }, { "epoch": 5.83872095055041, "grad_norm": 0.3150468170642853, "learning_rate": 0.0005302482517482517, "loss": 3.5362, "step": 20050 }, { "epoch": 5.853282078164133, "grad_norm": 0.2990545332431793, "learning_rate": 0.0005300734265734265, "loss": 3.5295, "step": 20100 }, { "epoch": 5.867843205777856, "grad_norm": 0.29398471117019653, "learning_rate": 0.0005298986013986013, "loss": 3.5227, "step": 20150 }, { "epoch": 5.882404333391578, "grad_norm": 0.30902978777885437, "learning_rate": 0.0005297237762237762, "loss": 3.5248, "step": 20200 }, { "epoch": 5.8969654610053, "grad_norm": 0.3113495409488678, "learning_rate": 0.000529548951048951, "loss": 3.538, "step": 20250 }, { "epoch": 5.911526588619022, "grad_norm": 0.31345126032829285, "learning_rate": 0.0005293741258741258, "loss": 3.5334, "step": 20300 }, { "epoch": 5.926087716232745, "grad_norm": 0.29870733618736267, "learning_rate": 0.0005291993006993007, "loss": 3.5317, "step": 20350 }, { "epoch": 5.940648843846468, "grad_norm": 0.32265445590019226, "learning_rate": 0.0005290244755244755, "loss": 3.5114, "step": 20400 }, { "epoch": 5.95520997146019, "grad_norm": 0.34477177262306213, "learning_rate": 0.0005288496503496503, "loss": 3.5371, "step": 20450 }, { "epoch": 5.969771099073912, "grad_norm": 0.3217330574989319, "learning_rate": 0.0005286748251748251, "loss": 3.541, "step": 20500 }, { "epoch": 5.984332226687634, "grad_norm": 0.3138887882232666, "learning_rate": 0.0005285, "loss": 3.5157, "step": 20550 }, { "epoch": 5.998893354301357, "grad_norm": 0.34451672434806824, "learning_rate": 0.0005283251748251748, "loss": 3.5283, "step": 20600 }, { "epoch": 6.013396237404625, "grad_norm": 0.3342776894569397, "learning_rate": 0.0005281503496503496, "loss": 3.4197, "step": 20650 }, { "epoch": 6.027957365018347, "grad_norm": 0.31999471783638, "learning_rate": 0.0005279755244755244, "loss": 3.4206, "step": 20700 }, { "epoch": 6.04251849263207, "grad_norm": 0.3338138461112976, "learning_rate": 0.0005278006993006993, "loss": 3.4196, "step": 20750 }, { "epoch": 6.0570796202457915, "grad_norm": 0.3524891436100006, "learning_rate": 0.000527625874125874, "loss": 3.4353, "step": 20800 }, { "epoch": 6.071640747859514, "grad_norm": 0.3189772367477417, "learning_rate": 0.0005274510489510489, "loss": 3.4282, "step": 20850 }, { "epoch": 6.086201875473237, "grad_norm": 0.3258211314678192, "learning_rate": 0.0005272762237762238, "loss": 3.4414, "step": 20900 }, { "epoch": 6.100763003086959, "grad_norm": 0.33525657653808594, "learning_rate": 0.0005271013986013985, "loss": 3.4436, "step": 20950 }, { "epoch": 6.115324130700682, "grad_norm": 0.3542822599411011, "learning_rate": 0.0005269265734265734, "loss": 3.4384, "step": 21000 }, { "epoch": 6.115324130700682, "eval_accuracy": 0.3627670522798564, "eval_loss": 3.6149725914001465, "eval_runtime": 176.9568, "eval_samples_per_second": 94.068, "eval_steps_per_second": 5.883, "step": 21000 }, { "epoch": 6.1298852583144035, "grad_norm": 0.3309166431427002, "learning_rate": 0.0005267517482517482, "loss": 3.4315, "step": 21050 }, { "epoch": 6.144446385928126, "grad_norm": 0.3391399681568146, "learning_rate": 0.000526576923076923, "loss": 3.4495, "step": 21100 }, { "epoch": 6.159007513541849, "grad_norm": 0.3149496018886566, "learning_rate": 0.0005264020979020978, "loss": 3.4587, "step": 21150 }, { "epoch": 6.173568641155571, "grad_norm": 0.32553061842918396, "learning_rate": 0.0005262272727272727, "loss": 3.4494, "step": 21200 }, { "epoch": 6.1881297687692935, "grad_norm": 0.33138251304626465, "learning_rate": 0.0005260524475524475, "loss": 3.4513, "step": 21250 }, { "epoch": 6.202690896383016, "grad_norm": 0.32948899269104004, "learning_rate": 0.0005258776223776223, "loss": 3.4544, "step": 21300 }, { "epoch": 6.217252023996738, "grad_norm": 0.341678261756897, "learning_rate": 0.0005257027972027971, "loss": 3.442, "step": 21350 }, { "epoch": 6.231813151610461, "grad_norm": 0.33684396743774414, "learning_rate": 0.000525527972027972, "loss": 3.4379, "step": 21400 }, { "epoch": 6.246374279224183, "grad_norm": 0.34825167059898376, "learning_rate": 0.0005253531468531468, "loss": 3.4584, "step": 21450 }, { "epoch": 6.2609354068379055, "grad_norm": 0.335224449634552, "learning_rate": 0.0005251783216783216, "loss": 3.4535, "step": 21500 }, { "epoch": 6.275496534451628, "grad_norm": 0.3297387659549713, "learning_rate": 0.0005250034965034965, "loss": 3.4562, "step": 21550 }, { "epoch": 6.29005766206535, "grad_norm": 0.3344549238681793, "learning_rate": 0.0005248286713286712, "loss": 3.4579, "step": 21600 }, { "epoch": 6.304618789679073, "grad_norm": 0.3421954810619354, "learning_rate": 0.0005246538461538461, "loss": 3.4619, "step": 21650 }, { "epoch": 6.319179917292795, "grad_norm": 0.3300471007823944, "learning_rate": 0.0005244790209790209, "loss": 3.4587, "step": 21700 }, { "epoch": 6.3337410449065175, "grad_norm": 0.3263568878173828, "learning_rate": 0.0005243041958041957, "loss": 3.4679, "step": 21750 }, { "epoch": 6.34830217252024, "grad_norm": 0.33946892619132996, "learning_rate": 0.0005241293706293705, "loss": 3.4643, "step": 21800 }, { "epoch": 6.362863300133962, "grad_norm": 0.3320576250553131, "learning_rate": 0.0005239545454545454, "loss": 3.4587, "step": 21850 }, { "epoch": 6.377424427747685, "grad_norm": 0.327897846698761, "learning_rate": 0.0005237797202797202, "loss": 3.484, "step": 21900 }, { "epoch": 6.391985555361408, "grad_norm": 0.32064688205718994, "learning_rate": 0.000523604895104895, "loss": 3.4837, "step": 21950 }, { "epoch": 6.406546682975129, "grad_norm": 0.3372121751308441, "learning_rate": 0.0005234300699300698, "loss": 3.4611, "step": 22000 }, { "epoch": 6.406546682975129, "eval_accuracy": 0.36337061870613885, "eval_loss": 3.607774019241333, "eval_runtime": 176.7678, "eval_samples_per_second": 94.169, "eval_steps_per_second": 5.889, "step": 22000 }, { "epoch": 6.421107810588852, "grad_norm": 0.3300037384033203, "learning_rate": 0.0005232552447552447, "loss": 3.4701, "step": 22050 }, { "epoch": 6.435668938202574, "grad_norm": 0.35658490657806396, "learning_rate": 0.0005230804195804195, "loss": 3.4581, "step": 22100 }, { "epoch": 6.450230065816297, "grad_norm": 0.3384239077568054, "learning_rate": 0.0005229055944055943, "loss": 3.4855, "step": 22150 }, { "epoch": 6.4647911934300195, "grad_norm": 0.3166094720363617, "learning_rate": 0.0005227307692307691, "loss": 3.4814, "step": 22200 }, { "epoch": 6.479352321043741, "grad_norm": 0.31494995951652527, "learning_rate": 0.0005225559440559441, "loss": 3.4707, "step": 22250 }, { "epoch": 6.493913448657464, "grad_norm": 0.33063846826553345, "learning_rate": 0.0005223811188811189, "loss": 3.49, "step": 22300 }, { "epoch": 6.508474576271187, "grad_norm": 0.3317149579524994, "learning_rate": 0.0005222062937062937, "loss": 3.4876, "step": 22350 }, { "epoch": 6.523035703884909, "grad_norm": 0.3236587941646576, "learning_rate": 0.0005220314685314686, "loss": 3.476, "step": 22400 }, { "epoch": 6.5375968314986315, "grad_norm": 0.32496577501296997, "learning_rate": 0.0005218566433566433, "loss": 3.4801, "step": 22450 }, { "epoch": 6.552157959112353, "grad_norm": 0.3040722906589508, "learning_rate": 0.0005216818181818182, "loss": 3.4714, "step": 22500 }, { "epoch": 6.566719086726076, "grad_norm": 0.3201970160007477, "learning_rate": 0.000521506993006993, "loss": 3.4741, "step": 22550 }, { "epoch": 6.581280214339799, "grad_norm": 0.3287389576435089, "learning_rate": 0.0005213321678321678, "loss": 3.4773, "step": 22600 }, { "epoch": 6.595841341953521, "grad_norm": 0.3368239998817444, "learning_rate": 0.0005211573426573426, "loss": 3.4811, "step": 22650 }, { "epoch": 6.610402469567243, "grad_norm": 0.341665655374527, "learning_rate": 0.0005209825174825175, "loss": 3.4828, "step": 22700 }, { "epoch": 6.624963597180965, "grad_norm": 0.32684972882270813, "learning_rate": 0.0005208076923076923, "loss": 3.4814, "step": 22750 }, { "epoch": 6.639524724794688, "grad_norm": 0.32613569498062134, "learning_rate": 0.0005206328671328671, "loss": 3.4808, "step": 22800 }, { "epoch": 6.654085852408411, "grad_norm": 0.3358040452003479, "learning_rate": 0.0005204580419580419, "loss": 3.4861, "step": 22850 }, { "epoch": 6.668646980022133, "grad_norm": 0.3069293797016144, "learning_rate": 0.0005202832167832168, "loss": 3.4817, "step": 22900 }, { "epoch": 6.683208107635855, "grad_norm": 0.3199792504310608, "learning_rate": 0.0005201083916083916, "loss": 3.4666, "step": 22950 }, { "epoch": 6.697769235249577, "grad_norm": 0.33008405566215515, "learning_rate": 0.0005199335664335664, "loss": 3.5017, "step": 23000 }, { "epoch": 6.697769235249577, "eval_accuracy": 0.3638195903037183, "eval_loss": 3.6003408432006836, "eval_runtime": 176.8026, "eval_samples_per_second": 94.15, "eval_steps_per_second": 5.888, "step": 23000 }, { "epoch": 6.7123303628633, "grad_norm": 0.3358388841152191, "learning_rate": 0.0005197587412587413, "loss": 3.4842, "step": 23050 }, { "epoch": 6.726891490477023, "grad_norm": 0.32142195105552673, "learning_rate": 0.0005195839160839161, "loss": 3.4794, "step": 23100 }, { "epoch": 6.741452618090745, "grad_norm": 0.3318839371204376, "learning_rate": 0.0005194090909090909, "loss": 3.4883, "step": 23150 }, { "epoch": 6.756013745704467, "grad_norm": 0.33538463711738586, "learning_rate": 0.0005192342657342657, "loss": 3.4851, "step": 23200 }, { "epoch": 6.77057487331819, "grad_norm": 0.31580328941345215, "learning_rate": 0.0005190594405594405, "loss": 3.482, "step": 23250 }, { "epoch": 6.785136000931912, "grad_norm": 0.3154962658882141, "learning_rate": 0.0005188846153846153, "loss": 3.5067, "step": 23300 }, { "epoch": 6.799697128545635, "grad_norm": 0.32202768325805664, "learning_rate": 0.0005187097902097902, "loss": 3.4836, "step": 23350 }, { "epoch": 6.814258256159357, "grad_norm": 0.3195551335811615, "learning_rate": 0.000518534965034965, "loss": 3.4966, "step": 23400 }, { "epoch": 6.828819383773079, "grad_norm": 0.33164918422698975, "learning_rate": 0.0005183601398601398, "loss": 3.4705, "step": 23450 }, { "epoch": 6.843380511386802, "grad_norm": 0.32325154542922974, "learning_rate": 0.0005181853146853146, "loss": 3.487, "step": 23500 }, { "epoch": 6.857941639000524, "grad_norm": 0.3257133662700653, "learning_rate": 0.0005180104895104895, "loss": 3.4918, "step": 23550 }, { "epoch": 6.872502766614247, "grad_norm": 0.33086922764778137, "learning_rate": 0.0005178356643356643, "loss": 3.4844, "step": 23600 }, { "epoch": 6.887063894227969, "grad_norm": 0.3378217816352844, "learning_rate": 0.0005176608391608391, "loss": 3.4906, "step": 23650 }, { "epoch": 6.901625021841691, "grad_norm": 0.3248620629310608, "learning_rate": 0.000517486013986014, "loss": 3.4827, "step": 23700 }, { "epoch": 6.916186149455414, "grad_norm": 0.3082542419433594, "learning_rate": 0.0005173111888111888, "loss": 3.4889, "step": 23750 }, { "epoch": 6.930747277069136, "grad_norm": 0.326917439699173, "learning_rate": 0.0005171363636363636, "loss": 3.4922, "step": 23800 }, { "epoch": 6.945308404682859, "grad_norm": 0.32005560398101807, "learning_rate": 0.0005169615384615384, "loss": 3.5049, "step": 23850 }, { "epoch": 6.959869532296581, "grad_norm": 0.3334341049194336, "learning_rate": 0.0005167867132867133, "loss": 3.492, "step": 23900 }, { "epoch": 6.974430659910303, "grad_norm": 0.32381048798561096, "learning_rate": 0.000516611888111888, "loss": 3.4998, "step": 23950 }, { "epoch": 6.988991787524026, "grad_norm": 0.34249672293663025, "learning_rate": 0.0005164370629370629, "loss": 3.4861, "step": 24000 }, { "epoch": 6.988991787524026, "eval_accuracy": 0.36504106579438345, "eval_loss": 3.589481830596924, "eval_runtime": 176.7844, "eval_samples_per_second": 94.16, "eval_steps_per_second": 5.889, "step": 24000 }, { "epoch": 7.003494670627293, "grad_norm": 0.34342801570892334, "learning_rate": 0.0005162622377622377, "loss": 3.4622, "step": 24050 }, { "epoch": 7.018055798241016, "grad_norm": 0.3223758935928345, "learning_rate": 0.0005160874125874125, "loss": 3.3769, "step": 24100 }, { "epoch": 7.032616925854738, "grad_norm": 0.33334001898765564, "learning_rate": 0.0005159125874125873, "loss": 3.3793, "step": 24150 }, { "epoch": 7.0471780534684605, "grad_norm": 0.3260599374771118, "learning_rate": 0.0005157377622377622, "loss": 3.3868, "step": 24200 }, { "epoch": 7.061739181082183, "grad_norm": 0.32783156633377075, "learning_rate": 0.000515562937062937, "loss": 3.3913, "step": 24250 }, { "epoch": 7.076300308695905, "grad_norm": 0.33441364765167236, "learning_rate": 0.0005153881118881118, "loss": 3.398, "step": 24300 }, { "epoch": 7.090861436309628, "grad_norm": 0.31968820095062256, "learning_rate": 0.0005152132867132867, "loss": 3.4004, "step": 24350 }, { "epoch": 7.105422563923351, "grad_norm": 0.3459835648536682, "learning_rate": 0.0005150384615384615, "loss": 3.3953, "step": 24400 }, { "epoch": 7.1199836915370724, "grad_norm": 0.3527763783931732, "learning_rate": 0.0005148636363636363, "loss": 3.3997, "step": 24450 }, { "epoch": 7.134544819150795, "grad_norm": 0.33494487404823303, "learning_rate": 0.0005146888111888111, "loss": 3.4178, "step": 24500 }, { "epoch": 7.149105946764517, "grad_norm": 0.3221123218536377, "learning_rate": 0.000514513986013986, "loss": 3.4155, "step": 24550 }, { "epoch": 7.16366707437824, "grad_norm": 0.34533846378326416, "learning_rate": 0.0005143391608391608, "loss": 3.4131, "step": 24600 }, { "epoch": 7.1782282019919625, "grad_norm": 0.34984448552131653, "learning_rate": 0.0005141643356643356, "loss": 3.4036, "step": 24650 }, { "epoch": 7.192789329605684, "grad_norm": 0.32715174555778503, "learning_rate": 0.0005139895104895104, "loss": 3.4013, "step": 24700 }, { "epoch": 7.207350457219407, "grad_norm": 0.3404192626476288, "learning_rate": 0.0005138146853146852, "loss": 3.4106, "step": 24750 }, { "epoch": 7.22191158483313, "grad_norm": 0.31030765175819397, "learning_rate": 0.00051363986013986, "loss": 3.4243, "step": 24800 }, { "epoch": 7.236472712446852, "grad_norm": 0.3313232958316803, "learning_rate": 0.0005134650349650349, "loss": 3.422, "step": 24850 }, { "epoch": 7.2510338400605745, "grad_norm": 0.31506332755088806, "learning_rate": 0.0005132902097902097, "loss": 3.4182, "step": 24900 }, { "epoch": 7.265594967674296, "grad_norm": 0.32670408487319946, "learning_rate": 0.0005131153846153845, "loss": 3.4169, "step": 24950 }, { "epoch": 7.280156095288019, "grad_norm": 0.3303210437297821, "learning_rate": 0.0005129405594405594, "loss": 3.4213, "step": 25000 }, { "epoch": 7.280156095288019, "eval_accuracy": 0.3645839823768949, "eval_loss": 3.5995564460754395, "eval_runtime": 177.1172, "eval_samples_per_second": 93.983, "eval_steps_per_second": 5.877, "step": 25000 }, { "epoch": 7.294717222901742, "grad_norm": 0.3087610900402069, "learning_rate": 0.0005127657342657342, "loss": 3.4324, "step": 25050 }, { "epoch": 7.309278350515464, "grad_norm": 0.3354821801185608, "learning_rate": 0.000512590909090909, "loss": 3.4259, "step": 25100 }, { "epoch": 7.3238394781291865, "grad_norm": 0.32773345708847046, "learning_rate": 0.0005124160839160838, "loss": 3.4342, "step": 25150 }, { "epoch": 7.338400605742908, "grad_norm": 0.3321012854576111, "learning_rate": 0.0005122412587412588, "loss": 3.4347, "step": 25200 }, { "epoch": 7.352961733356631, "grad_norm": 0.33389997482299805, "learning_rate": 0.0005120664335664336, "loss": 3.4243, "step": 25250 }, { "epoch": 7.367522860970354, "grad_norm": 0.34130993485450745, "learning_rate": 0.0005118916083916084, "loss": 3.4286, "step": 25300 }, { "epoch": 7.382083988584076, "grad_norm": 0.32038894295692444, "learning_rate": 0.0005117167832167832, "loss": 3.4303, "step": 25350 }, { "epoch": 7.396645116197798, "grad_norm": 0.33677011728286743, "learning_rate": 0.0005115419580419581, "loss": 3.4209, "step": 25400 }, { "epoch": 7.411206243811521, "grad_norm": 0.32208898663520813, "learning_rate": 0.0005113671328671328, "loss": 3.4312, "step": 25450 }, { "epoch": 7.425767371425243, "grad_norm": 0.3141185939311981, "learning_rate": 0.0005111923076923077, "loss": 3.4408, "step": 25500 }, { "epoch": 7.440328499038966, "grad_norm": 0.33646243810653687, "learning_rate": 0.0005110174825174825, "loss": 3.4376, "step": 25550 }, { "epoch": 7.454889626652688, "grad_norm": 0.34247833490371704, "learning_rate": 0.0005108426573426573, "loss": 3.4332, "step": 25600 }, { "epoch": 7.46945075426641, "grad_norm": 0.31767964363098145, "learning_rate": 0.0005106678321678321, "loss": 3.4316, "step": 25650 }, { "epoch": 7.484011881880133, "grad_norm": 0.3620831072330475, "learning_rate": 0.000510493006993007, "loss": 3.4366, "step": 25700 }, { "epoch": 7.498573009493855, "grad_norm": 0.34545835852622986, "learning_rate": 0.0005103181818181818, "loss": 3.4369, "step": 25750 }, { "epoch": 7.513134137107578, "grad_norm": 0.35401225090026855, "learning_rate": 0.0005101433566433566, "loss": 3.4461, "step": 25800 }, { "epoch": 7.5276952647213005, "grad_norm": 0.31585973501205444, "learning_rate": 0.0005099685314685315, "loss": 3.4509, "step": 25850 }, { "epoch": 7.542256392335022, "grad_norm": 0.32004138827323914, "learning_rate": 0.0005097937062937063, "loss": 3.4571, "step": 25900 }, { "epoch": 7.556817519948745, "grad_norm": 0.34505805373191833, "learning_rate": 0.0005096188811188811, "loss": 3.4596, "step": 25950 }, { "epoch": 7.571378647562467, "grad_norm": 0.3367854356765747, "learning_rate": 0.0005094440559440559, "loss": 3.4574, "step": 26000 }, { "epoch": 7.571378647562467, "eval_accuracy": 0.36489834478902566, "eval_loss": 3.5899016857147217, "eval_runtime": 177.0415, "eval_samples_per_second": 94.023, "eval_steps_per_second": 5.88, "step": 26000 }, { "epoch": 7.58593977517619, "grad_norm": 0.31563594937324524, "learning_rate": 0.0005092692307692308, "loss": 3.4589, "step": 26050 }, { "epoch": 7.600500902789912, "grad_norm": 0.3145526945590973, "learning_rate": 0.0005090944055944056, "loss": 3.4487, "step": 26100 }, { "epoch": 7.615062030403634, "grad_norm": 0.3387201130390167, "learning_rate": 0.0005089195804195804, "loss": 3.4584, "step": 26150 }, { "epoch": 7.629623158017357, "grad_norm": 0.3117615282535553, "learning_rate": 0.0005087447552447552, "loss": 3.4548, "step": 26200 }, { "epoch": 7.644184285631079, "grad_norm": 0.3377816379070282, "learning_rate": 0.00050856993006993, "loss": 3.456, "step": 26250 }, { "epoch": 7.658745413244802, "grad_norm": 0.36369189620018005, "learning_rate": 0.0005083951048951048, "loss": 3.4473, "step": 26300 }, { "epoch": 7.673306540858524, "grad_norm": 0.3553776741027832, "learning_rate": 0.0005082202797202797, "loss": 3.4433, "step": 26350 }, { "epoch": 7.687867668472246, "grad_norm": 0.33874639868736267, "learning_rate": 0.0005080454545454545, "loss": 3.4423, "step": 26400 }, { "epoch": 7.702428796085969, "grad_norm": 0.33977967500686646, "learning_rate": 0.0005078706293706293, "loss": 3.4613, "step": 26450 }, { "epoch": 7.716989923699691, "grad_norm": 0.3333614468574524, "learning_rate": 0.0005076958041958042, "loss": 3.4496, "step": 26500 }, { "epoch": 7.731551051313414, "grad_norm": 0.3278277814388275, "learning_rate": 0.000507520979020979, "loss": 3.4582, "step": 26550 }, { "epoch": 7.746112178927136, "grad_norm": 0.33077800273895264, "learning_rate": 0.0005073461538461538, "loss": 3.4525, "step": 26600 }, { "epoch": 7.760673306540858, "grad_norm": 0.3300637900829315, "learning_rate": 0.0005071713286713286, "loss": 3.4449, "step": 26650 }, { "epoch": 7.775234434154581, "grad_norm": 0.3329484760761261, "learning_rate": 0.0005069965034965035, "loss": 3.4613, "step": 26700 }, { "epoch": 7.789795561768304, "grad_norm": 0.35463809967041016, "learning_rate": 0.0005068216783216783, "loss": 3.4637, "step": 26750 }, { "epoch": 7.8043566893820255, "grad_norm": 0.3434412181377411, "learning_rate": 0.0005066468531468531, "loss": 3.4636, "step": 26800 }, { "epoch": 7.818917816995748, "grad_norm": 0.33126378059387207, "learning_rate": 0.0005064720279720279, "loss": 3.4617, "step": 26850 }, { "epoch": 7.833478944609471, "grad_norm": 0.3325732350349426, "learning_rate": 0.0005062972027972028, "loss": 3.4551, "step": 26900 }, { "epoch": 7.848040072223193, "grad_norm": 0.3155645430088043, "learning_rate": 0.0005061223776223775, "loss": 3.4577, "step": 26950 }, { "epoch": 7.862601199836916, "grad_norm": 0.3395119607448578, "learning_rate": 0.0005059475524475524, "loss": 3.4488, "step": 27000 }, { "epoch": 7.862601199836916, "eval_accuracy": 0.36595053012506545, "eval_loss": 3.581655263900757, "eval_runtime": 176.4968, "eval_samples_per_second": 94.313, "eval_steps_per_second": 5.898, "step": 27000 }, { "epoch": 7.8771623274506375, "grad_norm": 0.3462318480014801, "learning_rate": 0.0005057727272727272, "loss": 3.4536, "step": 27050 }, { "epoch": 7.89172345506436, "grad_norm": 0.34466618299484253, "learning_rate": 0.000505597902097902, "loss": 3.45, "step": 27100 }, { "epoch": 7.906284582678083, "grad_norm": 0.3201284110546112, "learning_rate": 0.0005054230769230769, "loss": 3.4584, "step": 27150 }, { "epoch": 7.920845710291805, "grad_norm": 0.32837069034576416, "learning_rate": 0.0005052482517482517, "loss": 3.4479, "step": 27200 }, { "epoch": 7.935406837905528, "grad_norm": 0.3236668109893799, "learning_rate": 0.0005050734265734265, "loss": 3.4531, "step": 27250 }, { "epoch": 7.9499679655192494, "grad_norm": 0.34047573804855347, "learning_rate": 0.0005048986013986013, "loss": 3.4721, "step": 27300 }, { "epoch": 7.964529093132972, "grad_norm": 0.3239133954048157, "learning_rate": 0.0005047237762237762, "loss": 3.4515, "step": 27350 }, { "epoch": 7.979090220746695, "grad_norm": 0.32450243830680847, "learning_rate": 0.000504548951048951, "loss": 3.4584, "step": 27400 }, { "epoch": 7.993651348360417, "grad_norm": 0.3248816132545471, "learning_rate": 0.0005043741258741258, "loss": 3.4567, "step": 27450 }, { "epoch": 8.008154231463685, "grad_norm": 0.32652345299720764, "learning_rate": 0.0005041993006993006, "loss": 3.401, "step": 27500 }, { "epoch": 8.022715359077408, "grad_norm": 0.3587798774242401, "learning_rate": 0.0005040244755244755, "loss": 3.3408, "step": 27550 }, { "epoch": 8.037276486691129, "grad_norm": 0.3377417027950287, "learning_rate": 0.0005038496503496503, "loss": 3.3456, "step": 27600 }, { "epoch": 8.051837614304851, "grad_norm": 0.3346528708934784, "learning_rate": 0.0005036748251748251, "loss": 3.3501, "step": 27650 }, { "epoch": 8.066398741918574, "grad_norm": 0.35164663195610046, "learning_rate": 0.0005034999999999999, "loss": 3.3577, "step": 27700 }, { "epoch": 8.080959869532297, "grad_norm": 0.3102585971355438, "learning_rate": 0.0005033251748251747, "loss": 3.375, "step": 27750 }, { "epoch": 8.09552099714602, "grad_norm": 0.3394184708595276, "learning_rate": 0.0005031503496503496, "loss": 3.3755, "step": 27800 }, { "epoch": 8.11008212475974, "grad_norm": 0.32756221294403076, "learning_rate": 0.0005029755244755244, "loss": 3.3746, "step": 27850 }, { "epoch": 8.124643252373463, "grad_norm": 0.3159530758857727, "learning_rate": 0.0005028006993006992, "loss": 3.3724, "step": 27900 }, { "epoch": 8.139204379987186, "grad_norm": 0.33860981464385986, "learning_rate": 0.000502625874125874, "loss": 3.3789, "step": 27950 }, { "epoch": 8.153765507600909, "grad_norm": 0.34003502130508423, "learning_rate": 0.000502451048951049, "loss": 3.3653, "step": 28000 }, { "epoch": 8.153765507600909, "eval_accuracy": 0.36557009752758785, "eval_loss": 3.589177370071411, "eval_runtime": 176.3386, "eval_samples_per_second": 94.398, "eval_steps_per_second": 5.903, "step": 28000 }, { "epoch": 8.168326635214632, "grad_norm": 0.351193904876709, "learning_rate": 0.0005022762237762237, "loss": 3.3757, "step": 28050 }, { "epoch": 8.182887762828354, "grad_norm": 0.3602054715156555, "learning_rate": 0.0005021013986013985, "loss": 3.3774, "step": 28100 }, { "epoch": 8.197448890442075, "grad_norm": 0.31822383403778076, "learning_rate": 0.0005019265734265733, "loss": 3.3891, "step": 28150 }, { "epoch": 8.212010018055798, "grad_norm": 0.3160878121852875, "learning_rate": 0.0005017517482517483, "loss": 3.3887, "step": 28200 }, { "epoch": 8.22657114566952, "grad_norm": 0.3426283895969391, "learning_rate": 0.0005015769230769231, "loss": 3.3857, "step": 28250 }, { "epoch": 8.241132273283243, "grad_norm": 0.3402179777622223, "learning_rate": 0.0005014020979020979, "loss": 3.3947, "step": 28300 }, { "epoch": 8.255693400896966, "grad_norm": 0.3560255169868469, "learning_rate": 0.0005012272727272727, "loss": 3.3902, "step": 28350 }, { "epoch": 8.270254528510687, "grad_norm": 0.3526296317577362, "learning_rate": 0.0005010524475524476, "loss": 3.392, "step": 28400 }, { "epoch": 8.28481565612441, "grad_norm": 0.3429584503173828, "learning_rate": 0.0005008776223776223, "loss": 3.3907, "step": 28450 }, { "epoch": 8.299376783738133, "grad_norm": 0.3741019070148468, "learning_rate": 0.0005007027972027972, "loss": 3.3867, "step": 28500 }, { "epoch": 8.313937911351855, "grad_norm": 0.3399593234062195, "learning_rate": 0.000500527972027972, "loss": 3.4004, "step": 28550 }, { "epoch": 8.328499038965578, "grad_norm": 0.3139590322971344, "learning_rate": 0.0005003531468531468, "loss": 3.4095, "step": 28600 }, { "epoch": 8.3430601665793, "grad_norm": 0.3386857807636261, "learning_rate": 0.0005001783216783217, "loss": 3.4005, "step": 28650 }, { "epoch": 8.357621294193022, "grad_norm": 0.34544748067855835, "learning_rate": 0.0005000034965034965, "loss": 3.4143, "step": 28700 }, { "epoch": 8.372182421806745, "grad_norm": 0.3371305763721466, "learning_rate": 0.0004998286713286713, "loss": 3.4018, "step": 28750 }, { "epoch": 8.386743549420467, "grad_norm": 0.3295310437679291, "learning_rate": 0.0004996538461538461, "loss": 3.3987, "step": 28800 }, { "epoch": 8.40130467703419, "grad_norm": 0.3291172385215759, "learning_rate": 0.000499479020979021, "loss": 3.4058, "step": 28850 }, { "epoch": 8.415865804647911, "grad_norm": 0.3469097912311554, "learning_rate": 0.0004993041958041958, "loss": 3.4172, "step": 28900 }, { "epoch": 8.430426932261634, "grad_norm": 0.36806628108024597, "learning_rate": 0.0004991293706293706, "loss": 3.4096, "step": 28950 }, { "epoch": 8.444988059875357, "grad_norm": 0.3716620206832886, "learning_rate": 0.0004989545454545454, "loss": 3.4157, "step": 29000 }, { "epoch": 8.444988059875357, "eval_accuracy": 0.36656561768687107, "eval_loss": 3.5810399055480957, "eval_runtime": 176.3981, "eval_samples_per_second": 94.366, "eval_steps_per_second": 5.901, "step": 29000 }, { "epoch": 8.45954918748908, "grad_norm": 0.35446473956108093, "learning_rate": 0.0004987797202797203, "loss": 3.4093, "step": 29050 }, { "epoch": 8.474110315102802, "grad_norm": 0.3686777353286743, "learning_rate": 0.0004986048951048951, "loss": 3.4094, "step": 29100 }, { "epoch": 8.488671442716523, "grad_norm": 0.35613036155700684, "learning_rate": 0.0004984300699300699, "loss": 3.3991, "step": 29150 }, { "epoch": 8.503232570330246, "grad_norm": 0.32252880930900574, "learning_rate": 0.0004982552447552448, "loss": 3.4118, "step": 29200 }, { "epoch": 8.517793697943969, "grad_norm": 0.3352912962436676, "learning_rate": 0.0004980804195804195, "loss": 3.4036, "step": 29250 }, { "epoch": 8.532354825557691, "grad_norm": 0.31796717643737793, "learning_rate": 0.0004979055944055944, "loss": 3.4199, "step": 29300 }, { "epoch": 8.546915953171414, "grad_norm": 0.32738158106803894, "learning_rate": 0.0004977307692307692, "loss": 3.4043, "step": 29350 }, { "epoch": 8.561477080785137, "grad_norm": 0.33792996406555176, "learning_rate": 0.000497555944055944, "loss": 3.419, "step": 29400 }, { "epoch": 8.576038208398858, "grad_norm": 0.3542904555797577, "learning_rate": 0.0004973811188811188, "loss": 3.4204, "step": 29450 }, { "epoch": 8.59059933601258, "grad_norm": 0.3942735195159912, "learning_rate": 0.0004972062937062937, "loss": 3.4245, "step": 29500 }, { "epoch": 8.605160463626303, "grad_norm": 0.31929633021354675, "learning_rate": 0.0004970314685314685, "loss": 3.4133, "step": 29550 }, { "epoch": 8.619721591240026, "grad_norm": 0.3423312306404114, "learning_rate": 0.0004968566433566433, "loss": 3.4228, "step": 29600 }, { "epoch": 8.634282718853749, "grad_norm": 0.3417450189590454, "learning_rate": 0.0004966818181818181, "loss": 3.4134, "step": 29650 }, { "epoch": 8.64884384646747, "grad_norm": 0.3385900855064392, "learning_rate": 0.000496506993006993, "loss": 3.4247, "step": 29700 }, { "epoch": 8.663404974081192, "grad_norm": 0.32018157839775085, "learning_rate": 0.0004963321678321678, "loss": 3.3997, "step": 29750 }, { "epoch": 8.677966101694915, "grad_norm": 0.34629449248313904, "learning_rate": 0.0004961573426573426, "loss": 3.4207, "step": 29800 }, { "epoch": 8.692527229308638, "grad_norm": 0.32567837834358215, "learning_rate": 0.0004959825174825175, "loss": 3.4374, "step": 29850 }, { "epoch": 8.70708835692236, "grad_norm": 0.334450900554657, "learning_rate": 0.0004958076923076923, "loss": 3.4314, "step": 29900 }, { "epoch": 8.721649484536082, "grad_norm": 0.3887094557285309, "learning_rate": 0.0004956328671328671, "loss": 3.4253, "step": 29950 }, { "epoch": 8.736210612149804, "grad_norm": 0.3402259051799774, "learning_rate": 0.0004954580419580419, "loss": 3.4305, "step": 30000 }, { "epoch": 8.736210612149804, "eval_accuracy": 0.36701117996883653, "eval_loss": 3.574934244155884, "eval_runtime": 176.9658, "eval_samples_per_second": 94.063, "eval_steps_per_second": 5.882, "step": 30000 }, { "epoch": 8.750771739763527, "grad_norm": 0.3436213731765747, "learning_rate": 0.0004952832167832167, "loss": 3.4177, "step": 30050 }, { "epoch": 8.76533286737725, "grad_norm": 0.328987717628479, "learning_rate": 0.0004951083916083915, "loss": 3.4336, "step": 30100 }, { "epoch": 8.779893994990973, "grad_norm": 0.3385300934314728, "learning_rate": 0.0004949335664335664, "loss": 3.4331, "step": 30150 }, { "epoch": 8.794455122604695, "grad_norm": 0.33109623193740845, "learning_rate": 0.0004947587412587412, "loss": 3.4107, "step": 30200 }, { "epoch": 8.809016250218416, "grad_norm": 0.31443682312965393, "learning_rate": 0.000494583916083916, "loss": 3.43, "step": 30250 }, { "epoch": 8.82357737783214, "grad_norm": 0.29897189140319824, "learning_rate": 0.0004944090909090908, "loss": 3.444, "step": 30300 }, { "epoch": 8.838138505445862, "grad_norm": 0.34077444672584534, "learning_rate": 0.0004942342657342657, "loss": 3.4399, "step": 30350 }, { "epoch": 8.852699633059585, "grad_norm": 0.3268566131591797, "learning_rate": 0.0004940594405594405, "loss": 3.4268, "step": 30400 }, { "epoch": 8.867260760673307, "grad_norm": 0.32913246750831604, "learning_rate": 0.0004938846153846153, "loss": 3.433, "step": 30450 }, { "epoch": 8.881821888287028, "grad_norm": 0.31822359561920166, "learning_rate": 0.0004937097902097901, "loss": 3.4374, "step": 30500 }, { "epoch": 8.896383015900751, "grad_norm": 0.34571900963783264, "learning_rate": 0.000493534965034965, "loss": 3.4361, "step": 30550 }, { "epoch": 8.910944143514474, "grad_norm": 0.3010501563549042, "learning_rate": 0.0004933601398601398, "loss": 3.4355, "step": 30600 }, { "epoch": 8.925505271128197, "grad_norm": 0.3567883372306824, "learning_rate": 0.0004931853146853146, "loss": 3.4244, "step": 30650 }, { "epoch": 8.94006639874192, "grad_norm": 0.32925522327423096, "learning_rate": 0.0004930104895104895, "loss": 3.4203, "step": 30700 }, { "epoch": 8.95462752635564, "grad_norm": 0.33393311500549316, "learning_rate": 0.0004928356643356642, "loss": 3.4295, "step": 30750 }, { "epoch": 8.969188653969363, "grad_norm": 0.34621018171310425, "learning_rate": 0.0004926608391608391, "loss": 3.437, "step": 30800 }, { "epoch": 8.983749781583086, "grad_norm": 0.359050989151001, "learning_rate": 0.0004924860139860139, "loss": 3.4432, "step": 30850 }, { "epoch": 8.998310909196809, "grad_norm": 0.329822838306427, "learning_rate": 0.0004923111888111887, "loss": 3.4229, "step": 30900 }, { "epoch": 9.012813792300076, "grad_norm": 0.34674400091171265, "learning_rate": 0.0004921363636363635, "loss": 3.3361, "step": 30950 }, { "epoch": 9.027374919913798, "grad_norm": 0.33987441658973694, "learning_rate": 0.0004919615384615384, "loss": 3.3188, "step": 31000 }, { "epoch": 9.027374919913798, "eval_accuracy": 0.3674608569420602, "eval_loss": 3.5798559188842773, "eval_runtime": 176.6125, "eval_samples_per_second": 94.252, "eval_steps_per_second": 5.894, "step": 31000 }, { "epoch": 9.041936047527521, "grad_norm": 0.33782774209976196, "learning_rate": 0.0004917867132867132, "loss": 3.327, "step": 31050 }, { "epoch": 9.056497175141242, "grad_norm": 0.3608669340610504, "learning_rate": 0.000491611888111888, "loss": 3.3257, "step": 31100 }, { "epoch": 9.071058302754965, "grad_norm": 0.33250337839126587, "learning_rate": 0.0004914370629370628, "loss": 3.3302, "step": 31150 }, { "epoch": 9.085619430368688, "grad_norm": 0.380667120218277, "learning_rate": 0.0004912622377622378, "loss": 3.3367, "step": 31200 }, { "epoch": 9.10018055798241, "grad_norm": 0.3177809715270996, "learning_rate": 0.0004910874125874126, "loss": 3.341, "step": 31250 }, { "epoch": 9.114741685596133, "grad_norm": 0.3426327109336853, "learning_rate": 0.0004909125874125874, "loss": 3.3328, "step": 31300 }, { "epoch": 9.129302813209854, "grad_norm": 0.3812158703804016, "learning_rate": 0.0004907377622377623, "loss": 3.3401, "step": 31350 }, { "epoch": 9.143863940823577, "grad_norm": 0.3903525471687317, "learning_rate": 0.0004905629370629371, "loss": 3.358, "step": 31400 }, { "epoch": 9.1584250684373, "grad_norm": 0.3273092210292816, "learning_rate": 0.0004903881118881119, "loss": 3.3457, "step": 31450 }, { "epoch": 9.172986196051022, "grad_norm": 0.34901750087738037, "learning_rate": 0.0004902132867132867, "loss": 3.3489, "step": 31500 }, { "epoch": 9.187547323664745, "grad_norm": 0.3518262803554535, "learning_rate": 0.0004900384615384615, "loss": 3.3524, "step": 31550 }, { "epoch": 9.202108451278466, "grad_norm": 0.34695637226104736, "learning_rate": 0.0004898636363636363, "loss": 3.3472, "step": 31600 }, { "epoch": 9.216669578892189, "grad_norm": 0.330387145280838, "learning_rate": 0.0004896888111888112, "loss": 3.3538, "step": 31650 }, { "epoch": 9.231230706505912, "grad_norm": 0.36280760169029236, "learning_rate": 0.000489513986013986, "loss": 3.3641, "step": 31700 }, { "epoch": 9.245791834119634, "grad_norm": 0.3654526472091675, "learning_rate": 0.0004893391608391608, "loss": 3.3713, "step": 31750 }, { "epoch": 9.260352961733357, "grad_norm": 0.3399411737918854, "learning_rate": 0.0004891643356643356, "loss": 3.3678, "step": 31800 }, { "epoch": 9.27491408934708, "grad_norm": 0.3387416899204254, "learning_rate": 0.0004889895104895105, "loss": 3.3738, "step": 31850 }, { "epoch": 9.2894752169608, "grad_norm": 0.3442247807979584, "learning_rate": 0.0004888146853146853, "loss": 3.3624, "step": 31900 }, { "epoch": 9.304036344574524, "grad_norm": 0.3599652647972107, "learning_rate": 0.0004886398601398601, "loss": 3.3695, "step": 31950 }, { "epoch": 9.318597472188246, "grad_norm": 0.33344852924346924, "learning_rate": 0.000488465034965035, "loss": 3.367, "step": 32000 }, { "epoch": 9.318597472188246, "eval_accuracy": 0.3671916385711629, "eval_loss": 3.579378604888916, "eval_runtime": 176.8517, "eval_samples_per_second": 94.124, "eval_steps_per_second": 5.886, "step": 32000 }, { "epoch": 9.333158599801969, "grad_norm": 0.36125627160072327, "learning_rate": 0.0004882902097902098, "loss": 3.3745, "step": 32050 }, { "epoch": 9.347719727415692, "grad_norm": 0.3284973204135895, "learning_rate": 0.0004881153846153846, "loss": 3.3784, "step": 32100 }, { "epoch": 9.362280855029413, "grad_norm": 0.3623260259628296, "learning_rate": 0.0004879405594405594, "loss": 3.3884, "step": 32150 }, { "epoch": 9.376841982643136, "grad_norm": 0.35829871892929077, "learning_rate": 0.00048776573426573424, "loss": 3.3828, "step": 32200 }, { "epoch": 9.391403110256858, "grad_norm": 0.35492923855781555, "learning_rate": 0.00048759090909090904, "loss": 3.389, "step": 32250 }, { "epoch": 9.405964237870581, "grad_norm": 0.38198089599609375, "learning_rate": 0.0004874160839160839, "loss": 3.3864, "step": 32300 }, { "epoch": 9.420525365484304, "grad_norm": 0.3594827651977539, "learning_rate": 0.0004872412587412587, "loss": 3.3896, "step": 32350 }, { "epoch": 9.435086493098025, "grad_norm": 0.3426271975040436, "learning_rate": 0.00048706643356643354, "loss": 3.3825, "step": 32400 }, { "epoch": 9.449647620711747, "grad_norm": 0.3374452590942383, "learning_rate": 0.00048689160839160834, "loss": 3.3773, "step": 32450 }, { "epoch": 9.46420874832547, "grad_norm": 0.3500727415084839, "learning_rate": 0.0004867167832167832, "loss": 3.3978, "step": 32500 }, { "epoch": 9.478769875939193, "grad_norm": 0.3206511437892914, "learning_rate": 0.00048654195804195794, "loss": 3.3935, "step": 32550 }, { "epoch": 9.493331003552916, "grad_norm": 0.3213861882686615, "learning_rate": 0.00048636713286713285, "loss": 3.3972, "step": 32600 }, { "epoch": 9.507892131166638, "grad_norm": 0.3542587459087372, "learning_rate": 0.0004861923076923077, "loss": 3.3891, "step": 32650 }, { "epoch": 9.52245325878036, "grad_norm": 0.35178741812705994, "learning_rate": 0.00048601748251748245, "loss": 3.3902, "step": 32700 }, { "epoch": 9.537014386394082, "grad_norm": 0.3213842511177063, "learning_rate": 0.0004858426573426573, "loss": 3.3894, "step": 32750 }, { "epoch": 9.551575514007805, "grad_norm": 0.3286217451095581, "learning_rate": 0.0004856678321678321, "loss": 3.3926, "step": 32800 }, { "epoch": 9.566136641621528, "grad_norm": 0.3548128008842468, "learning_rate": 0.00048549300699300696, "loss": 3.3883, "step": 32850 }, { "epoch": 9.58069776923525, "grad_norm": 0.3568916618824005, "learning_rate": 0.00048531818181818176, "loss": 3.4017, "step": 32900 }, { "epoch": 9.595258896848971, "grad_norm": 0.3281558156013489, "learning_rate": 0.0004851433566433566, "loss": 3.4038, "step": 32950 }, { "epoch": 9.609820024462694, "grad_norm": 0.35022494196891785, "learning_rate": 0.0004849685314685314, "loss": 3.4041, "step": 33000 }, { "epoch": 9.609820024462694, "eval_accuracy": 0.3675565529044665, "eval_loss": 3.573331594467163, "eval_runtime": 177.4614, "eval_samples_per_second": 93.801, "eval_steps_per_second": 5.866, "step": 33000 }, { "epoch": 9.624381152076417, "grad_norm": 0.34615230560302734, "learning_rate": 0.00048479370629370627, "loss": 3.3877, "step": 33050 }, { "epoch": 9.63894227969014, "grad_norm": 0.3371587097644806, "learning_rate": 0.00048461888111888106, "loss": 3.3997, "step": 33100 }, { "epoch": 9.653503407303862, "grad_norm": 0.32450369000434875, "learning_rate": 0.0004844440559440559, "loss": 3.3985, "step": 33150 }, { "epoch": 9.668064534917583, "grad_norm": 0.3822547495365143, "learning_rate": 0.0004842692307692307, "loss": 3.396, "step": 33200 }, { "epoch": 9.682625662531306, "grad_norm": 0.3404769003391266, "learning_rate": 0.00048409440559440557, "loss": 3.4074, "step": 33250 }, { "epoch": 9.697186790145029, "grad_norm": 0.35298749804496765, "learning_rate": 0.0004839195804195803, "loss": 3.4048, "step": 33300 }, { "epoch": 9.711747917758752, "grad_norm": 0.346872478723526, "learning_rate": 0.0004837447552447552, "loss": 3.3976, "step": 33350 }, { "epoch": 9.726309045372474, "grad_norm": 0.32681095600128174, "learning_rate": 0.0004835699300699301, "loss": 3.4035, "step": 33400 }, { "epoch": 9.740870172986195, "grad_norm": 0.3425065279006958, "learning_rate": 0.0004833951048951048, "loss": 3.3915, "step": 33450 }, { "epoch": 9.755431300599918, "grad_norm": 0.33771848678588867, "learning_rate": 0.0004832202797202797, "loss": 3.3985, "step": 33500 }, { "epoch": 9.76999242821364, "grad_norm": 0.3263910412788391, "learning_rate": 0.0004830454545454545, "loss": 3.4014, "step": 33550 }, { "epoch": 9.784553555827364, "grad_norm": 0.332059770822525, "learning_rate": 0.00048287062937062933, "loss": 3.3994, "step": 33600 }, { "epoch": 9.799114683441086, "grad_norm": 0.35489657521247864, "learning_rate": 0.00048269580419580413, "loss": 3.4078, "step": 33650 }, { "epoch": 9.813675811054807, "grad_norm": 0.35381078720092773, "learning_rate": 0.000482520979020979, "loss": 3.4005, "step": 33700 }, { "epoch": 9.82823693866853, "grad_norm": 0.324863463640213, "learning_rate": 0.0004823461538461538, "loss": 3.3952, "step": 33750 }, { "epoch": 9.842798066282253, "grad_norm": 0.3365383446216583, "learning_rate": 0.00048217132867132864, "loss": 3.4049, "step": 33800 }, { "epoch": 9.857359193895975, "grad_norm": 0.34103065729141235, "learning_rate": 0.00048199650349650344, "loss": 3.4089, "step": 33850 }, { "epoch": 9.871920321509698, "grad_norm": 0.3343670070171356, "learning_rate": 0.0004818216783216783, "loss": 3.4227, "step": 33900 }, { "epoch": 9.88648144912342, "grad_norm": 0.32671719789505005, "learning_rate": 0.0004816468531468531, "loss": 3.4102, "step": 33950 }, { "epoch": 9.901042576737142, "grad_norm": 0.36884456872940063, "learning_rate": 0.00048147202797202795, "loss": 3.4055, "step": 34000 }, { "epoch": 9.901042576737142, "eval_accuracy": 0.368497524013926, "eval_loss": 3.564042329788208, "eval_runtime": 176.835, "eval_samples_per_second": 94.133, "eval_steps_per_second": 5.887, "step": 34000 }, { "epoch": 9.915603704350865, "grad_norm": 0.3321670591831207, "learning_rate": 0.0004812972027972028, "loss": 3.3949, "step": 34050 }, { "epoch": 9.930164831964587, "grad_norm": 0.36186736822128296, "learning_rate": 0.0004811223776223776, "loss": 3.4091, "step": 34100 }, { "epoch": 9.94472595957831, "grad_norm": 0.3428935408592224, "learning_rate": 0.00048094755244755245, "loss": 3.4074, "step": 34150 }, { "epoch": 9.959287087192033, "grad_norm": 0.35069432854652405, "learning_rate": 0.0004807727272727272, "loss": 3.3979, "step": 34200 }, { "epoch": 9.973848214805754, "grad_norm": 0.32886582612991333, "learning_rate": 0.00048059790209790205, "loss": 3.4107, "step": 34250 }, { "epoch": 9.988409342419477, "grad_norm": 0.3682115972042084, "learning_rate": 0.00048042307692307685, "loss": 3.4094, "step": 34300 }, { "epoch": 10.002912225522744, "grad_norm": 0.35744708776474, "learning_rate": 0.0004802482517482517, "loss": 3.3896, "step": 34350 }, { "epoch": 10.017473353136467, "grad_norm": 0.35240817070007324, "learning_rate": 0.0004800734265734265, "loss": 3.3044, "step": 34400 }, { "epoch": 10.03203448075019, "grad_norm": 0.34491005539894104, "learning_rate": 0.00047989860139860136, "loss": 3.2982, "step": 34450 }, { "epoch": 10.046595608363912, "grad_norm": 0.33637934923171997, "learning_rate": 0.00047972377622377616, "loss": 3.3106, "step": 34500 }, { "epoch": 10.061156735977635, "grad_norm": 0.3516424894332886, "learning_rate": 0.000479548951048951, "loss": 3.3193, "step": 34550 }, { "epoch": 10.075717863591356, "grad_norm": 0.3707565665245056, "learning_rate": 0.0004793741258741258, "loss": 3.3098, "step": 34600 }, { "epoch": 10.090278991205079, "grad_norm": 0.33687910437583923, "learning_rate": 0.00047919930069930067, "loss": 3.3132, "step": 34650 }, { "epoch": 10.104840118818801, "grad_norm": 0.353137731552124, "learning_rate": 0.0004790244755244755, "loss": 3.3231, "step": 34700 }, { "epoch": 10.119401246432524, "grad_norm": 0.36714377999305725, "learning_rate": 0.0004788496503496503, "loss": 3.324, "step": 34750 }, { "epoch": 10.133962374046247, "grad_norm": 0.32502633333206177, "learning_rate": 0.0004786748251748252, "loss": 3.3239, "step": 34800 }, { "epoch": 10.148523501659968, "grad_norm": 0.33693206310272217, "learning_rate": 0.0004785, "loss": 3.3299, "step": 34850 }, { "epoch": 10.16308462927369, "grad_norm": 0.3602798581123352, "learning_rate": 0.00047832517482517483, "loss": 3.3361, "step": 34900 }, { "epoch": 10.177645756887413, "grad_norm": 0.3324842154979706, "learning_rate": 0.0004781503496503496, "loss": 3.3397, "step": 34950 }, { "epoch": 10.192206884501136, "grad_norm": 0.338860422372818, "learning_rate": 0.00047797552447552443, "loss": 3.3366, "step": 35000 }, { "epoch": 10.192206884501136, "eval_accuracy": 0.3681085093461097, "eval_loss": 3.572119951248169, "eval_runtime": 176.7411, "eval_samples_per_second": 94.183, "eval_steps_per_second": 5.89, "step": 35000 }, { "epoch": 10.206768012114859, "grad_norm": 0.37433600425720215, "learning_rate": 0.00047780069930069923, "loss": 3.3329, "step": 35050 }, { "epoch": 10.221329139728581, "grad_norm": 0.3410218060016632, "learning_rate": 0.0004776258741258741, "loss": 3.3312, "step": 35100 }, { "epoch": 10.235890267342302, "grad_norm": 0.33754661679267883, "learning_rate": 0.0004774510489510489, "loss": 3.3364, "step": 35150 }, { "epoch": 10.250451394956025, "grad_norm": 0.3520354628562927, "learning_rate": 0.00047727622377622374, "loss": 3.3463, "step": 35200 }, { "epoch": 10.265012522569748, "grad_norm": 0.3703749179840088, "learning_rate": 0.00047710139860139854, "loss": 3.352, "step": 35250 }, { "epoch": 10.27957365018347, "grad_norm": 0.3496933877468109, "learning_rate": 0.0004769265734265734, "loss": 3.3469, "step": 35300 }, { "epoch": 10.294134777797193, "grad_norm": 0.3573146164417267, "learning_rate": 0.0004767517482517482, "loss": 3.3455, "step": 35350 }, { "epoch": 10.308695905410914, "grad_norm": 0.34615665674209595, "learning_rate": 0.00047657692307692304, "loss": 3.3578, "step": 35400 }, { "epoch": 10.323257033024637, "grad_norm": 0.3539220690727234, "learning_rate": 0.0004764020979020979, "loss": 3.3548, "step": 35450 }, { "epoch": 10.33781816063836, "grad_norm": 0.3664166033267975, "learning_rate": 0.0004762272727272727, "loss": 3.3533, "step": 35500 }, { "epoch": 10.352379288252083, "grad_norm": 0.35653188824653625, "learning_rate": 0.00047605244755244755, "loss": 3.3557, "step": 35550 }, { "epoch": 10.366940415865805, "grad_norm": 0.3385823965072632, "learning_rate": 0.00047587762237762235, "loss": 3.3517, "step": 35600 }, { "epoch": 10.381501543479526, "grad_norm": 0.36101484298706055, "learning_rate": 0.0004757027972027972, "loss": 3.3607, "step": 35650 }, { "epoch": 10.396062671093249, "grad_norm": 0.34660804271698, "learning_rate": 0.00047552797202797195, "loss": 3.3548, "step": 35700 }, { "epoch": 10.410623798706972, "grad_norm": 0.3321472704410553, "learning_rate": 0.0004753531468531468, "loss": 3.3513, "step": 35750 }, { "epoch": 10.425184926320695, "grad_norm": 0.3524532616138458, "learning_rate": 0.0004751783216783216, "loss": 3.3542, "step": 35800 }, { "epoch": 10.439746053934417, "grad_norm": 0.33926790952682495, "learning_rate": 0.00047500349650349646, "loss": 3.3685, "step": 35850 }, { "epoch": 10.454307181548138, "grad_norm": 0.3707771599292755, "learning_rate": 0.00047482867132867126, "loss": 3.3754, "step": 35900 }, { "epoch": 10.468868309161861, "grad_norm": 0.36390265822410583, "learning_rate": 0.0004746538461538461, "loss": 3.3671, "step": 35950 }, { "epoch": 10.483429436775584, "grad_norm": 0.32835954427719116, "learning_rate": 0.0004744790209790209, "loss": 3.3638, "step": 36000 }, { "epoch": 10.483429436775584, "eval_accuracy": 0.36857488020958123, "eval_loss": 3.5680227279663086, "eval_runtime": 176.8143, "eval_samples_per_second": 94.144, "eval_steps_per_second": 5.888, "step": 36000 }, { "epoch": 10.497990564389307, "grad_norm": 0.36081641912460327, "learning_rate": 0.00047430419580419576, "loss": 3.3636, "step": 36050 }, { "epoch": 10.51255169200303, "grad_norm": 0.3701050579547882, "learning_rate": 0.0004741293706293706, "loss": 3.3462, "step": 36100 }, { "epoch": 10.52711281961675, "grad_norm": 0.3523276150226593, "learning_rate": 0.0004739545454545454, "loss": 3.3713, "step": 36150 }, { "epoch": 10.541673947230473, "grad_norm": 0.34822702407836914, "learning_rate": 0.00047377972027972027, "loss": 3.3808, "step": 36200 }, { "epoch": 10.556235074844196, "grad_norm": 0.34755954146385193, "learning_rate": 0.00047360489510489507, "loss": 3.3718, "step": 36250 }, { "epoch": 10.570796202457919, "grad_norm": 0.3505362868309021, "learning_rate": 0.0004734300699300699, "loss": 3.3684, "step": 36300 }, { "epoch": 10.585357330071641, "grad_norm": 0.36556077003479004, "learning_rate": 0.0004732552447552447, "loss": 3.3585, "step": 36350 }, { "epoch": 10.599918457685362, "grad_norm": 0.36185914278030396, "learning_rate": 0.0004730804195804196, "loss": 3.3745, "step": 36400 }, { "epoch": 10.614479585299085, "grad_norm": 0.3472917973995209, "learning_rate": 0.0004729055944055943, "loss": 3.369, "step": 36450 }, { "epoch": 10.629040712912808, "grad_norm": 0.3542565405368805, "learning_rate": 0.0004727307692307692, "loss": 3.377, "step": 36500 }, { "epoch": 10.64360184052653, "grad_norm": 0.3553348779678345, "learning_rate": 0.000472555944055944, "loss": 3.3614, "step": 36550 }, { "epoch": 10.658162968140253, "grad_norm": 0.3704865872859955, "learning_rate": 0.00047238111888111883, "loss": 3.3763, "step": 36600 }, { "epoch": 10.672724095753976, "grad_norm": 0.3559943437576294, "learning_rate": 0.00047220629370629363, "loss": 3.3715, "step": 36650 }, { "epoch": 10.687285223367697, "grad_norm": 0.34045952558517456, "learning_rate": 0.0004720314685314685, "loss": 3.3823, "step": 36700 }, { "epoch": 10.70184635098142, "grad_norm": 0.3242470622062683, "learning_rate": 0.0004718566433566433, "loss": 3.3757, "step": 36750 }, { "epoch": 10.716407478595142, "grad_norm": 0.35755789279937744, "learning_rate": 0.00047168181818181814, "loss": 3.3811, "step": 36800 }, { "epoch": 10.730968606208865, "grad_norm": 0.37564322352409363, "learning_rate": 0.000471506993006993, "loss": 3.3723, "step": 36850 }, { "epoch": 10.745529733822588, "grad_norm": 0.3541555404663086, "learning_rate": 0.0004713321678321678, "loss": 3.373, "step": 36900 }, { "epoch": 10.760090861436309, "grad_norm": 0.3599989414215088, "learning_rate": 0.00047115734265734265, "loss": 3.3864, "step": 36950 }, { "epoch": 10.774651989050032, "grad_norm": 0.35538411140441895, "learning_rate": 0.00047098251748251745, "loss": 3.3768, "step": 37000 }, { "epoch": 10.774651989050032, "eval_accuracy": 0.3687768527690579, "eval_loss": 3.5605952739715576, "eval_runtime": 176.8123, "eval_samples_per_second": 94.145, "eval_steps_per_second": 5.888, "step": 37000 }, { "epoch": 10.789213116663754, "grad_norm": 0.32753708958625793, "learning_rate": 0.0004708076923076923, "loss": 3.3872, "step": 37050 }, { "epoch": 10.803774244277477, "grad_norm": 0.3197953402996063, "learning_rate": 0.0004706328671328671, "loss": 3.3832, "step": 37100 }, { "epoch": 10.8183353718912, "grad_norm": 0.33160483837127686, "learning_rate": 0.00047045804195804195, "loss": 3.375, "step": 37150 }, { "epoch": 10.83289649950492, "grad_norm": 0.36038827896118164, "learning_rate": 0.0004702832167832167, "loss": 3.3957, "step": 37200 }, { "epoch": 10.847457627118644, "grad_norm": 0.33877766132354736, "learning_rate": 0.00047010839160839155, "loss": 3.3873, "step": 37250 }, { "epoch": 10.862018754732366, "grad_norm": 0.34722471237182617, "learning_rate": 0.00046993356643356635, "loss": 3.3901, "step": 37300 }, { "epoch": 10.876579882346089, "grad_norm": 0.3494495153427124, "learning_rate": 0.0004697587412587412, "loss": 3.3954, "step": 37350 }, { "epoch": 10.891141009959812, "grad_norm": 0.34382206201553345, "learning_rate": 0.000469583916083916, "loss": 3.3816, "step": 37400 }, { "epoch": 10.905702137573535, "grad_norm": 0.33641430735588074, "learning_rate": 0.00046940909090909086, "loss": 3.3772, "step": 37450 }, { "epoch": 10.920263265187256, "grad_norm": 0.3528263568878174, "learning_rate": 0.0004692342657342657, "loss": 3.3876, "step": 37500 }, { "epoch": 10.934824392800978, "grad_norm": 0.34849458932876587, "learning_rate": 0.0004690594405594405, "loss": 3.3914, "step": 37550 }, { "epoch": 10.949385520414701, "grad_norm": 0.34705743193626404, "learning_rate": 0.00046888461538461537, "loss": 3.3793, "step": 37600 }, { "epoch": 10.963946648028424, "grad_norm": 0.3272669017314911, "learning_rate": 0.00046870979020979017, "loss": 3.3788, "step": 37650 }, { "epoch": 10.978507775642147, "grad_norm": 0.33112722635269165, "learning_rate": 0.000468534965034965, "loss": 3.3837, "step": 37700 }, { "epoch": 10.993068903255867, "grad_norm": 0.3313654363155365, "learning_rate": 0.0004683601398601398, "loss": 3.3966, "step": 37750 }, { "epoch": 11.007571786359136, "grad_norm": 0.3786119222640991, "learning_rate": 0.0004681853146853147, "loss": 3.3267, "step": 37800 }, { "epoch": 11.022132913972857, "grad_norm": 0.35445159673690796, "learning_rate": 0.0004680104895104895, "loss": 3.2648, "step": 37850 }, { "epoch": 11.03669404158658, "grad_norm": 0.3338647782802582, "learning_rate": 0.00046783566433566433, "loss": 3.2768, "step": 37900 }, { "epoch": 11.051255169200303, "grad_norm": 0.33461859822273254, "learning_rate": 0.0004676608391608391, "loss": 3.2856, "step": 37950 }, { "epoch": 11.065816296814026, "grad_norm": 0.3517346978187561, "learning_rate": 0.00046748601398601393, "loss": 3.2775, "step": 38000 }, { "epoch": 11.065816296814026, "eval_accuracy": 0.36888348205395044, "eval_loss": 3.570584297180176, "eval_runtime": 176.7964, "eval_samples_per_second": 94.153, "eval_steps_per_second": 5.888, "step": 38000 }, { "epoch": 11.080377424427748, "grad_norm": 0.3435273766517639, "learning_rate": 0.00046731118881118873, "loss": 3.2915, "step": 38050 }, { "epoch": 11.09493855204147, "grad_norm": 0.3325134813785553, "learning_rate": 0.0004671363636363636, "loss": 3.2978, "step": 38100 }, { "epoch": 11.109499679655192, "grad_norm": 0.36426037549972534, "learning_rate": 0.00046696153846153844, "loss": 3.3033, "step": 38150 }, { "epoch": 11.124060807268915, "grad_norm": 0.327764093875885, "learning_rate": 0.00046678671328671324, "loss": 3.3043, "step": 38200 }, { "epoch": 11.138621934882638, "grad_norm": 0.3818574845790863, "learning_rate": 0.0004666118881118881, "loss": 3.3042, "step": 38250 }, { "epoch": 11.15318306249636, "grad_norm": 0.38499313592910767, "learning_rate": 0.0004664370629370629, "loss": 3.2968, "step": 38300 }, { "epoch": 11.167744190110081, "grad_norm": 0.3457299768924713, "learning_rate": 0.00046626223776223774, "loss": 3.299, "step": 38350 }, { "epoch": 11.182305317723804, "grad_norm": 0.37761256098747253, "learning_rate": 0.00046608741258741254, "loss": 3.3019, "step": 38400 }, { "epoch": 11.196866445337527, "grad_norm": 0.35469600558280945, "learning_rate": 0.0004659125874125874, "loss": 3.3116, "step": 38450 }, { "epoch": 11.21142757295125, "grad_norm": 0.3749145269393921, "learning_rate": 0.0004657377622377622, "loss": 3.3128, "step": 38500 }, { "epoch": 11.225988700564972, "grad_norm": 0.38890284299850464, "learning_rate": 0.00046556293706293705, "loss": 3.332, "step": 38550 }, { "epoch": 11.240549828178693, "grad_norm": 0.3799166679382324, "learning_rate": 0.00046538811188811185, "loss": 3.3111, "step": 38600 }, { "epoch": 11.255110955792416, "grad_norm": 0.3511936366558075, "learning_rate": 0.0004652132867132867, "loss": 3.3251, "step": 38650 }, { "epoch": 11.269672083406139, "grad_norm": 0.37755730748176575, "learning_rate": 0.00046503846153846145, "loss": 3.3257, "step": 38700 }, { "epoch": 11.284233211019862, "grad_norm": 0.34114590287208557, "learning_rate": 0.0004648636363636363, "loss": 3.3212, "step": 38750 }, { "epoch": 11.298794338633584, "grad_norm": 0.3562406897544861, "learning_rate": 0.0004646888111888111, "loss": 3.3259, "step": 38800 }, { "epoch": 11.313355466247307, "grad_norm": 0.3757557272911072, "learning_rate": 0.00046451398601398596, "loss": 3.3395, "step": 38850 }, { "epoch": 11.327916593861028, "grad_norm": 0.3693144619464874, "learning_rate": 0.0004643391608391608, "loss": 3.3458, "step": 38900 }, { "epoch": 11.34247772147475, "grad_norm": 0.3550606071949005, "learning_rate": 0.0004641643356643356, "loss": 3.336, "step": 38950 }, { "epoch": 11.357038849088473, "grad_norm": 0.36543989181518555, "learning_rate": 0.00046398951048951046, "loss": 3.331, "step": 39000 }, { "epoch": 11.357038849088473, "eval_accuracy": 0.3695512376638617, "eval_loss": 3.567185878753662, "eval_runtime": 176.8379, "eval_samples_per_second": 94.131, "eval_steps_per_second": 5.887, "step": 39000 }, { "epoch": 11.371599976702196, "grad_norm": 0.36690041422843933, "learning_rate": 0.00046381468531468526, "loss": 3.3383, "step": 39050 }, { "epoch": 11.386161104315919, "grad_norm": 0.3439081013202667, "learning_rate": 0.0004636398601398601, "loss": 3.3272, "step": 39100 }, { "epoch": 11.40072223192964, "grad_norm": 0.3694179654121399, "learning_rate": 0.0004634650349650349, "loss": 3.337, "step": 39150 }, { "epoch": 11.415283359543363, "grad_norm": 0.36797547340393066, "learning_rate": 0.00046329020979020977, "loss": 3.3276, "step": 39200 }, { "epoch": 11.429844487157085, "grad_norm": 0.40091055631637573, "learning_rate": 0.00046311538461538457, "loss": 3.3355, "step": 39250 }, { "epoch": 11.444405614770808, "grad_norm": 0.34869131445884705, "learning_rate": 0.0004629405594405594, "loss": 3.3352, "step": 39300 }, { "epoch": 11.458966742384531, "grad_norm": 0.35575979948043823, "learning_rate": 0.0004627657342657342, "loss": 3.3462, "step": 39350 }, { "epoch": 11.473527869998252, "grad_norm": 0.3693273365497589, "learning_rate": 0.0004625909090909091, "loss": 3.3398, "step": 39400 }, { "epoch": 11.488088997611975, "grad_norm": 0.3422088623046875, "learning_rate": 0.0004624160839160838, "loss": 3.3537, "step": 39450 }, { "epoch": 11.502650125225697, "grad_norm": 0.35034725069999695, "learning_rate": 0.0004622412587412587, "loss": 3.3536, "step": 39500 }, { "epoch": 11.51721125283942, "grad_norm": 0.3721543550491333, "learning_rate": 0.00046206643356643353, "loss": 3.3611, "step": 39550 }, { "epoch": 11.531772380453143, "grad_norm": 0.3786129653453827, "learning_rate": 0.00046189160839160833, "loss": 3.3612, "step": 39600 }, { "epoch": 11.546333508066864, "grad_norm": 0.37621766328811646, "learning_rate": 0.0004617167832167832, "loss": 3.3513, "step": 39650 }, { "epoch": 11.560894635680587, "grad_norm": 0.33818793296813965, "learning_rate": 0.000461541958041958, "loss": 3.3585, "step": 39700 }, { "epoch": 11.57545576329431, "grad_norm": 0.3617473840713501, "learning_rate": 0.00046136713286713284, "loss": 3.3416, "step": 39750 }, { "epoch": 11.590016890908032, "grad_norm": 0.36892908811569214, "learning_rate": 0.00046119230769230764, "loss": 3.3564, "step": 39800 }, { "epoch": 11.604578018521755, "grad_norm": 0.3816312849521637, "learning_rate": 0.0004610174825174825, "loss": 3.3536, "step": 39850 }, { "epoch": 11.619139146135478, "grad_norm": 0.3597758114337921, "learning_rate": 0.0004608426573426573, "loss": 3.3576, "step": 39900 }, { "epoch": 11.633700273749199, "grad_norm": 0.3639199733734131, "learning_rate": 0.00046066783216783215, "loss": 3.354, "step": 39950 }, { "epoch": 11.648261401362921, "grad_norm": 0.3267276883125305, "learning_rate": 0.00046049300699300695, "loss": 3.3565, "step": 40000 }, { "epoch": 11.648261401362921, "eval_accuracy": 0.36973110845315116, "eval_loss": 3.5581958293914795, "eval_runtime": 176.7361, "eval_samples_per_second": 94.186, "eval_steps_per_second": 5.89, "step": 40000 }, { "epoch": 11.662822528976644, "grad_norm": 0.34614065289497375, "learning_rate": 0.0004603181818181818, "loss": 3.3599, "step": 40050 }, { "epoch": 11.677383656590367, "grad_norm": 0.3481432795524597, "learning_rate": 0.0004601433566433566, "loss": 3.3579, "step": 40100 }, { "epoch": 11.69194478420409, "grad_norm": 0.36765721440315247, "learning_rate": 0.00045996853146853145, "loss": 3.3666, "step": 40150 }, { "epoch": 11.70650591181781, "grad_norm": 0.34848764538764954, "learning_rate": 0.0004597937062937062, "loss": 3.3521, "step": 40200 }, { "epoch": 11.721067039431533, "grad_norm": 0.33765318989753723, "learning_rate": 0.00045961888111888105, "loss": 3.3765, "step": 40250 }, { "epoch": 11.735628167045256, "grad_norm": 0.35295817255973816, "learning_rate": 0.0004594440559440559, "loss": 3.3707, "step": 40300 }, { "epoch": 11.750189294658979, "grad_norm": 0.3578189015388489, "learning_rate": 0.0004592692307692307, "loss": 3.3524, "step": 40350 }, { "epoch": 11.764750422272702, "grad_norm": 0.36456865072250366, "learning_rate": 0.00045909440559440556, "loss": 3.3704, "step": 40400 }, { "epoch": 11.779311549886422, "grad_norm": 0.3516547381877899, "learning_rate": 0.00045891958041958036, "loss": 3.3665, "step": 40450 }, { "epoch": 11.793872677500145, "grad_norm": 0.3550742268562317, "learning_rate": 0.0004587447552447552, "loss": 3.3572, "step": 40500 }, { "epoch": 11.808433805113868, "grad_norm": 0.34383824467658997, "learning_rate": 0.00045856993006993, "loss": 3.3619, "step": 40550 }, { "epoch": 11.82299493272759, "grad_norm": 0.3489612638950348, "learning_rate": 0.00045839510489510487, "loss": 3.3644, "step": 40600 }, { "epoch": 11.837556060341313, "grad_norm": 0.3638046085834503, "learning_rate": 0.00045822027972027967, "loss": 3.3625, "step": 40650 }, { "epoch": 11.852117187955034, "grad_norm": 0.350432425737381, "learning_rate": 0.0004580454545454545, "loss": 3.3741, "step": 40700 }, { "epoch": 11.866678315568757, "grad_norm": 0.34485307335853577, "learning_rate": 0.0004578706293706293, "loss": 3.3554, "step": 40750 }, { "epoch": 11.88123944318248, "grad_norm": 0.35631051659584045, "learning_rate": 0.0004576958041958042, "loss": 3.3613, "step": 40800 }, { "epoch": 11.895800570796203, "grad_norm": 0.3506964147090912, "learning_rate": 0.000457520979020979, "loss": 3.3655, "step": 40850 }, { "epoch": 11.910361698409925, "grad_norm": 0.35819366574287415, "learning_rate": 0.00045734615384615383, "loss": 3.3665, "step": 40900 }, { "epoch": 11.924922826023646, "grad_norm": 0.35843780636787415, "learning_rate": 0.0004571713286713287, "loss": 3.3722, "step": 40950 }, { "epoch": 11.93948395363737, "grad_norm": 0.3654157519340515, "learning_rate": 0.00045699650349650343, "loss": 3.3535, "step": 41000 }, { "epoch": 11.93948395363737, "eval_accuracy": 0.37033843688286977, "eval_loss": 3.5503153800964355, "eval_runtime": 176.6657, "eval_samples_per_second": 94.223, "eval_steps_per_second": 5.892, "step": 41000 }, { "epoch": 11.954045081251092, "grad_norm": 0.34321489930152893, "learning_rate": 0.0004568216783216783, "loss": 3.3734, "step": 41050 }, { "epoch": 11.968606208864815, "grad_norm": 0.34735623002052307, "learning_rate": 0.0004566468531468531, "loss": 3.3769, "step": 41100 }, { "epoch": 11.983167336478537, "grad_norm": 0.33347296714782715, "learning_rate": 0.00045647202797202794, "loss": 3.3724, "step": 41150 }, { "epoch": 11.99772846409226, "grad_norm": 0.35811206698417664, "learning_rate": 0.00045629720279720274, "loss": 3.3783, "step": 41200 }, { "epoch": 12.012231347195527, "grad_norm": 0.35955357551574707, "learning_rate": 0.0004561223776223776, "loss": 3.2728, "step": 41250 }, { "epoch": 12.02679247480925, "grad_norm": 0.3704414963722229, "learning_rate": 0.0004559475524475524, "loss": 3.2625, "step": 41300 }, { "epoch": 12.041353602422971, "grad_norm": 0.3613720238208771, "learning_rate": 0.00045577272727272724, "loss": 3.2562, "step": 41350 }, { "epoch": 12.055914730036694, "grad_norm": 0.34454989433288574, "learning_rate": 0.00045559790209790204, "loss": 3.2825, "step": 41400 }, { "epoch": 12.070475857650417, "grad_norm": 0.3962979018688202, "learning_rate": 0.0004554230769230769, "loss": 3.2717, "step": 41450 }, { "epoch": 12.08503698526414, "grad_norm": 0.39520618319511414, "learning_rate": 0.0004552482517482517, "loss": 3.2835, "step": 41500 }, { "epoch": 12.099598112877862, "grad_norm": 0.34733277559280396, "learning_rate": 0.00045507342657342655, "loss": 3.2717, "step": 41550 }, { "epoch": 12.114159240491583, "grad_norm": 0.3350951671600342, "learning_rate": 0.00045489860139860135, "loss": 3.2852, "step": 41600 }, { "epoch": 12.128720368105306, "grad_norm": 0.3716607391834259, "learning_rate": 0.0004547237762237762, "loss": 3.2812, "step": 41650 }, { "epoch": 12.143281495719028, "grad_norm": 0.3612712025642395, "learning_rate": 0.00045454895104895106, "loss": 3.2897, "step": 41700 }, { "epoch": 12.157842623332751, "grad_norm": 0.36439794301986694, "learning_rate": 0.0004543741258741258, "loss": 3.289, "step": 41750 }, { "epoch": 12.172403750946474, "grad_norm": 0.3498424291610718, "learning_rate": 0.00045419930069930066, "loss": 3.2888, "step": 41800 }, { "epoch": 12.186964878560195, "grad_norm": 0.34217751026153564, "learning_rate": 0.00045402447552447546, "loss": 3.2825, "step": 41850 }, { "epoch": 12.201526006173918, "grad_norm": 0.35701295733451843, "learning_rate": 0.0004538496503496503, "loss": 3.2988, "step": 41900 }, { "epoch": 12.21608713378764, "grad_norm": 0.4013568162918091, "learning_rate": 0.0004536748251748251, "loss": 3.2988, "step": 41950 }, { "epoch": 12.230648261401363, "grad_norm": 0.35291993618011475, "learning_rate": 0.00045349999999999996, "loss": 3.3005, "step": 42000 }, { "epoch": 12.230648261401363, "eval_accuracy": 0.370066279446788, "eval_loss": 3.563735246658325, "eval_runtime": 176.7458, "eval_samples_per_second": 94.18, "eval_steps_per_second": 5.89, "step": 42000 }, { "epoch": 12.245209389015086, "grad_norm": 0.3605182468891144, "learning_rate": 0.00045332517482517476, "loss": 3.3089, "step": 42050 }, { "epoch": 12.259770516628807, "grad_norm": 0.36443638801574707, "learning_rate": 0.0004531503496503496, "loss": 3.3091, "step": 42100 }, { "epoch": 12.27433164424253, "grad_norm": 0.3468221127986908, "learning_rate": 0.0004529755244755244, "loss": 3.309, "step": 42150 }, { "epoch": 12.288892771856252, "grad_norm": 0.3665069341659546, "learning_rate": 0.00045280069930069927, "loss": 3.3124, "step": 42200 }, { "epoch": 12.303453899469975, "grad_norm": 0.35970741510391235, "learning_rate": 0.00045262587412587407, "loss": 3.3093, "step": 42250 }, { "epoch": 12.318015027083698, "grad_norm": 0.35841190814971924, "learning_rate": 0.0004524510489510489, "loss": 3.3132, "step": 42300 }, { "epoch": 12.33257615469742, "grad_norm": 0.37371912598609924, "learning_rate": 0.0004522762237762238, "loss": 3.3182, "step": 42350 }, { "epoch": 12.347137282311142, "grad_norm": 0.37245097756385803, "learning_rate": 0.0004521013986013986, "loss": 3.3059, "step": 42400 }, { "epoch": 12.361698409924864, "grad_norm": 0.3761231303215027, "learning_rate": 0.00045192657342657343, "loss": 3.3175, "step": 42450 }, { "epoch": 12.376259537538587, "grad_norm": 0.3797931969165802, "learning_rate": 0.0004517517482517482, "loss": 3.3263, "step": 42500 }, { "epoch": 12.39082066515231, "grad_norm": 0.37721502780914307, "learning_rate": 0.00045157692307692303, "loss": 3.3078, "step": 42550 }, { "epoch": 12.405381792766033, "grad_norm": 0.3475891351699829, "learning_rate": 0.00045140209790209783, "loss": 3.332, "step": 42600 }, { "epoch": 12.419942920379754, "grad_norm": 0.3628634214401245, "learning_rate": 0.0004512272727272727, "loss": 3.3225, "step": 42650 }, { "epoch": 12.434504047993476, "grad_norm": 0.3399854302406311, "learning_rate": 0.0004510524475524475, "loss": 3.3231, "step": 42700 }, { "epoch": 12.449065175607199, "grad_norm": 0.36382219195365906, "learning_rate": 0.00045087762237762234, "loss": 3.3166, "step": 42750 }, { "epoch": 12.463626303220922, "grad_norm": 0.3841249942779541, "learning_rate": 0.00045070279720279714, "loss": 3.3256, "step": 42800 }, { "epoch": 12.478187430834645, "grad_norm": 0.37852030992507935, "learning_rate": 0.000450527972027972, "loss": 3.3214, "step": 42850 }, { "epoch": 12.492748558448366, "grad_norm": 0.3684305250644684, "learning_rate": 0.0004503531468531468, "loss": 3.3269, "step": 42900 }, { "epoch": 12.507309686062088, "grad_norm": 0.36599355936050415, "learning_rate": 0.00045017832167832165, "loss": 3.3291, "step": 42950 }, { "epoch": 12.521870813675811, "grad_norm": 0.35258835554122925, "learning_rate": 0.0004500034965034965, "loss": 3.3365, "step": 43000 }, { "epoch": 12.521870813675811, "eval_accuracy": 0.3700295999132858, "eval_loss": 3.5569796562194824, "eval_runtime": 176.7498, "eval_samples_per_second": 94.178, "eval_steps_per_second": 5.89, "step": 43000 }, { "epoch": 12.536431941289534, "grad_norm": 0.39876219630241394, "learning_rate": 0.0004498286713286713, "loss": 3.3302, "step": 43050 }, { "epoch": 12.550993068903256, "grad_norm": 0.3408920168876648, "learning_rate": 0.00044965384615384615, "loss": 3.3342, "step": 43100 }, { "epoch": 12.565554196516977, "grad_norm": 0.3744554817676544, "learning_rate": 0.00044947902097902095, "loss": 3.3331, "step": 43150 }, { "epoch": 12.5801153241307, "grad_norm": 0.3452394902706146, "learning_rate": 0.0004493041958041958, "loss": 3.3371, "step": 43200 }, { "epoch": 12.594676451744423, "grad_norm": 0.3524150550365448, "learning_rate": 0.00044912937062937055, "loss": 3.3321, "step": 43250 }, { "epoch": 12.609237579358146, "grad_norm": 0.37291622161865234, "learning_rate": 0.0004489545454545454, "loss": 3.3219, "step": 43300 }, { "epoch": 12.623798706971868, "grad_norm": 0.34290096163749695, "learning_rate": 0.0004487797202797202, "loss": 3.3307, "step": 43350 }, { "epoch": 12.63835983458559, "grad_norm": 0.36508339643478394, "learning_rate": 0.00044860489510489506, "loss": 3.3332, "step": 43400 }, { "epoch": 12.652920962199312, "grad_norm": 0.4050483703613281, "learning_rate": 0.00044843006993006986, "loss": 3.3433, "step": 43450 }, { "epoch": 12.667482089813035, "grad_norm": 0.34905606508255005, "learning_rate": 0.0004482552447552447, "loss": 3.3518, "step": 43500 }, { "epoch": 12.682043217426758, "grad_norm": 0.34155765175819397, "learning_rate": 0.0004480804195804195, "loss": 3.3444, "step": 43550 }, { "epoch": 12.69660434504048, "grad_norm": 0.3530701994895935, "learning_rate": 0.00044790559440559437, "loss": 3.3392, "step": 43600 }, { "epoch": 12.711165472654203, "grad_norm": 0.4012036621570587, "learning_rate": 0.00044773076923076917, "loss": 3.3461, "step": 43650 }, { "epoch": 12.725726600267924, "grad_norm": 0.3666689097881317, "learning_rate": 0.000447555944055944, "loss": 3.3431, "step": 43700 }, { "epoch": 12.740287727881647, "grad_norm": 0.377628892660141, "learning_rate": 0.0004473811188811189, "loss": 3.3503, "step": 43750 }, { "epoch": 12.75484885549537, "grad_norm": 0.3553931713104248, "learning_rate": 0.0004472062937062937, "loss": 3.3467, "step": 43800 }, { "epoch": 12.769409983109092, "grad_norm": 0.3805387020111084, "learning_rate": 0.00044703146853146853, "loss": 3.3458, "step": 43850 }, { "epoch": 12.783971110722815, "grad_norm": 0.39872968196868896, "learning_rate": 0.00044685664335664333, "loss": 3.347, "step": 43900 }, { "epoch": 12.798532238336536, "grad_norm": 0.3444846272468567, "learning_rate": 0.0004466818181818182, "loss": 3.3526, "step": 43950 }, { "epoch": 12.813093365950259, "grad_norm": 0.3671543598175049, "learning_rate": 0.00044650699300699293, "loss": 3.3409, "step": 44000 }, { "epoch": 12.813093365950259, "eval_accuracy": 0.37057367966023463, "eval_loss": 3.548551559448242, "eval_runtime": 176.7365, "eval_samples_per_second": 94.185, "eval_steps_per_second": 5.89, "step": 44000 }, { "epoch": 12.827654493563982, "grad_norm": 0.35598888993263245, "learning_rate": 0.0004463321678321678, "loss": 3.3533, "step": 44050 }, { "epoch": 12.842215621177704, "grad_norm": 0.34838467836380005, "learning_rate": 0.0004461573426573426, "loss": 3.3469, "step": 44100 }, { "epoch": 12.856776748791427, "grad_norm": 0.36640146374702454, "learning_rate": 0.00044598251748251744, "loss": 3.3389, "step": 44150 }, { "epoch": 12.871337876405148, "grad_norm": 0.33382320404052734, "learning_rate": 0.00044580769230769224, "loss": 3.3511, "step": 44200 }, { "epoch": 12.88589900401887, "grad_norm": 0.341463565826416, "learning_rate": 0.0004456328671328671, "loss": 3.3517, "step": 44250 }, { "epoch": 12.900460131632594, "grad_norm": 0.37801268696784973, "learning_rate": 0.0004454580419580419, "loss": 3.3596, "step": 44300 }, { "epoch": 12.915021259246316, "grad_norm": 0.37234899401664734, "learning_rate": 0.00044528321678321674, "loss": 3.3506, "step": 44350 }, { "epoch": 12.929582386860039, "grad_norm": 0.38583022356033325, "learning_rate": 0.0004451083916083916, "loss": 3.3452, "step": 44400 }, { "epoch": 12.944143514473762, "grad_norm": 0.3857607841491699, "learning_rate": 0.0004449335664335664, "loss": 3.3561, "step": 44450 }, { "epoch": 12.958704642087483, "grad_norm": 0.3415960371494293, "learning_rate": 0.00044475874125874125, "loss": 3.3509, "step": 44500 }, { "epoch": 12.973265769701205, "grad_norm": 0.35617539286613464, "learning_rate": 0.00044458391608391605, "loss": 3.3557, "step": 44550 }, { "epoch": 12.987826897314928, "grad_norm": 0.3598819673061371, "learning_rate": 0.0004444090909090909, "loss": 3.3423, "step": 44600 }, { "epoch": 13.002329780418195, "grad_norm": 0.3762507140636444, "learning_rate": 0.0004442342657342657, "loss": 3.3355, "step": 44650 }, { "epoch": 13.016890908031918, "grad_norm": 0.36438143253326416, "learning_rate": 0.00044405944055944056, "loss": 3.2297, "step": 44700 }, { "epoch": 13.031452035645641, "grad_norm": 0.35098105669021606, "learning_rate": 0.0004438846153846153, "loss": 3.2446, "step": 44750 }, { "epoch": 13.046013163259364, "grad_norm": 0.3709072470664978, "learning_rate": 0.00044370979020979016, "loss": 3.2529, "step": 44800 }, { "epoch": 13.060574290873085, "grad_norm": 0.3667590916156769, "learning_rate": 0.00044353496503496496, "loss": 3.2429, "step": 44850 }, { "epoch": 13.075135418486807, "grad_norm": 0.3482988774776459, "learning_rate": 0.0004433601398601398, "loss": 3.2617, "step": 44900 }, { "epoch": 13.08969654610053, "grad_norm": 0.342864453792572, "learning_rate": 0.0004431853146853146, "loss": 3.2595, "step": 44950 }, { "epoch": 13.104257673714253, "grad_norm": 0.36293044686317444, "learning_rate": 0.00044301048951048946, "loss": 3.2575, "step": 45000 }, { "epoch": 13.104257673714253, "eval_accuracy": 0.37003489023061786, "eval_loss": 3.5607776641845703, "eval_runtime": 176.7546, "eval_samples_per_second": 94.176, "eval_steps_per_second": 5.89, "step": 45000 }, { "epoch": 13.118818801327976, "grad_norm": 0.3598528504371643, "learning_rate": 0.00044283566433566426, "loss": 3.2656, "step": 45050 }, { "epoch": 13.133379928941697, "grad_norm": 0.387949675321579, "learning_rate": 0.0004426608391608391, "loss": 3.266, "step": 45100 }, { "epoch": 13.14794105655542, "grad_norm": 0.37262654304504395, "learning_rate": 0.00044248601398601397, "loss": 3.2734, "step": 45150 }, { "epoch": 13.162502184169142, "grad_norm": 0.38459402322769165, "learning_rate": 0.00044231118881118877, "loss": 3.2757, "step": 45200 }, { "epoch": 13.177063311782865, "grad_norm": 0.3644809424877167, "learning_rate": 0.0004421363636363636, "loss": 3.2803, "step": 45250 }, { "epoch": 13.191624439396588, "grad_norm": 0.36597248911857605, "learning_rate": 0.0004419615384615384, "loss": 3.2775, "step": 45300 }, { "epoch": 13.206185567010309, "grad_norm": 0.3780094385147095, "learning_rate": 0.0004417867132867133, "loss": 3.2709, "step": 45350 }, { "epoch": 13.220746694624031, "grad_norm": 0.3701019585132599, "learning_rate": 0.0004416118881118881, "loss": 3.2711, "step": 45400 }, { "epoch": 13.235307822237754, "grad_norm": 0.4011037051677704, "learning_rate": 0.00044143706293706293, "loss": 3.2755, "step": 45450 }, { "epoch": 13.249868949851477, "grad_norm": 0.39571163058280945, "learning_rate": 0.0004412622377622377, "loss": 3.2932, "step": 45500 }, { "epoch": 13.2644300774652, "grad_norm": 0.3917701244354248, "learning_rate": 0.00044108741258741253, "loss": 3.2888, "step": 45550 }, { "epoch": 13.27899120507892, "grad_norm": 0.36588260531425476, "learning_rate": 0.00044091258741258733, "loss": 3.29, "step": 45600 }, { "epoch": 13.293552332692643, "grad_norm": 0.374526709318161, "learning_rate": 0.0004407377622377622, "loss": 3.286, "step": 45650 }, { "epoch": 13.308113460306366, "grad_norm": 0.3714640438556671, "learning_rate": 0.000440562937062937, "loss": 3.2921, "step": 45700 }, { "epoch": 13.322674587920089, "grad_norm": 0.3636552095413208, "learning_rate": 0.00044038811188811184, "loss": 3.2898, "step": 45750 }, { "epoch": 13.337235715533811, "grad_norm": 0.3829694092273712, "learning_rate": 0.0004402132867132867, "loss": 3.295, "step": 45800 }, { "epoch": 13.351796843147532, "grad_norm": 0.3656383156776428, "learning_rate": 0.0004400384615384615, "loss": 3.302, "step": 45850 }, { "epoch": 13.366357970761255, "grad_norm": 0.36748331785202026, "learning_rate": 0.00043986363636363635, "loss": 3.3006, "step": 45900 }, { "epoch": 13.380919098374978, "grad_norm": 0.4061829745769501, "learning_rate": 0.00043968881118881115, "loss": 3.3072, "step": 45950 }, { "epoch": 13.3954802259887, "grad_norm": 0.35320812463760376, "learning_rate": 0.000439513986013986, "loss": 3.2947, "step": 46000 }, { "epoch": 13.3954802259887, "eval_accuracy": 0.3706560910480072, "eval_loss": 3.5595102310180664, "eval_runtime": 176.8796, "eval_samples_per_second": 94.109, "eval_steps_per_second": 5.885, "step": 46000 }, { "epoch": 13.410041353602423, "grad_norm": 0.37065041065216064, "learning_rate": 0.0004393391608391608, "loss": 3.3009, "step": 46050 }, { "epoch": 13.424602481216146, "grad_norm": 0.34920239448547363, "learning_rate": 0.00043916433566433565, "loss": 3.3226, "step": 46100 }, { "epoch": 13.439163608829867, "grad_norm": 0.34855538606643677, "learning_rate": 0.00043898951048951045, "loss": 3.3077, "step": 46150 }, { "epoch": 13.45372473644359, "grad_norm": 0.393020898103714, "learning_rate": 0.0004388146853146853, "loss": 3.3102, "step": 46200 }, { "epoch": 13.468285864057313, "grad_norm": 0.3607465922832489, "learning_rate": 0.00043863986013986005, "loss": 3.3301, "step": 46250 }, { "epoch": 13.482846991671035, "grad_norm": 0.3694727420806885, "learning_rate": 0.0004384650349650349, "loss": 3.3042, "step": 46300 }, { "epoch": 13.497408119284758, "grad_norm": 0.363496869802475, "learning_rate": 0.0004382902097902097, "loss": 3.3042, "step": 46350 }, { "epoch": 13.51196924689848, "grad_norm": 0.371970534324646, "learning_rate": 0.00043811538461538456, "loss": 3.3135, "step": 46400 }, { "epoch": 13.526530374512202, "grad_norm": 0.3483518362045288, "learning_rate": 0.0004379405594405594, "loss": 3.3094, "step": 46450 }, { "epoch": 13.541091502125925, "grad_norm": 0.3581492304801941, "learning_rate": 0.0004377657342657342, "loss": 3.3157, "step": 46500 }, { "epoch": 13.555652629739647, "grad_norm": 0.38505807518959045, "learning_rate": 0.00043759090909090907, "loss": 3.3226, "step": 46550 }, { "epoch": 13.57021375735337, "grad_norm": 0.37216296792030334, "learning_rate": 0.00043741608391608387, "loss": 3.3191, "step": 46600 }, { "epoch": 13.584774884967091, "grad_norm": 0.3617168366909027, "learning_rate": 0.0004372412587412587, "loss": 3.3292, "step": 46650 }, { "epoch": 13.599336012580814, "grad_norm": 0.3550806939601898, "learning_rate": 0.0004370664335664335, "loss": 3.3298, "step": 46700 }, { "epoch": 13.613897140194537, "grad_norm": 0.3403339684009552, "learning_rate": 0.0004368916083916084, "loss": 3.3266, "step": 46750 }, { "epoch": 13.62845826780826, "grad_norm": 0.3585992753505707, "learning_rate": 0.0004367167832167832, "loss": 3.3226, "step": 46800 }, { "epoch": 13.643019395421982, "grad_norm": 0.37410402297973633, "learning_rate": 0.00043654195804195803, "loss": 3.3137, "step": 46850 }, { "epoch": 13.657580523035705, "grad_norm": 0.33496248722076416, "learning_rate": 0.00043636713286713283, "loss": 3.3236, "step": 46900 }, { "epoch": 13.672141650649426, "grad_norm": 0.3673466145992279, "learning_rate": 0.0004361923076923077, "loss": 3.3212, "step": 46950 }, { "epoch": 13.686702778263149, "grad_norm": 0.39346593618392944, "learning_rate": 0.00043601748251748243, "loss": 3.316, "step": 47000 }, { "epoch": 13.686702778263149, "eval_accuracy": 0.3710174785030894, "eval_loss": 3.5519208908081055, "eval_runtime": 176.6516, "eval_samples_per_second": 94.231, "eval_steps_per_second": 5.893, "step": 47000 }, { "epoch": 13.701263905876871, "grad_norm": 0.3582032024860382, "learning_rate": 0.00043584265734265734, "loss": 3.3268, "step": 47050 }, { "epoch": 13.715825033490594, "grad_norm": 0.3773166835308075, "learning_rate": 0.0004356678321678321, "loss": 3.3306, "step": 47100 }, { "epoch": 13.730386161104317, "grad_norm": 0.35744547843933105, "learning_rate": 0.00043549300699300694, "loss": 3.3312, "step": 47150 }, { "epoch": 13.744947288718038, "grad_norm": 0.3659146726131439, "learning_rate": 0.0004353181818181818, "loss": 3.3377, "step": 47200 }, { "epoch": 13.75950841633176, "grad_norm": 0.3603513240814209, "learning_rate": 0.0004351433566433566, "loss": 3.3284, "step": 47250 }, { "epoch": 13.774069543945483, "grad_norm": 0.3560996651649475, "learning_rate": 0.00043496853146853144, "loss": 3.333, "step": 47300 }, { "epoch": 13.788630671559206, "grad_norm": 0.3501158058643341, "learning_rate": 0.00043479370629370624, "loss": 3.3274, "step": 47350 }, { "epoch": 13.803191799172929, "grad_norm": 0.38781869411468506, "learning_rate": 0.0004346188811188811, "loss": 3.3277, "step": 47400 }, { "epoch": 13.81775292678665, "grad_norm": 0.3612052798271179, "learning_rate": 0.0004344440559440559, "loss": 3.3432, "step": 47450 }, { "epoch": 13.832314054400372, "grad_norm": 0.35562631487846375, "learning_rate": 0.00043426923076923075, "loss": 3.3315, "step": 47500 }, { "epoch": 13.846875182014095, "grad_norm": 0.36037319898605347, "learning_rate": 0.00043409440559440555, "loss": 3.343, "step": 47550 }, { "epoch": 13.861436309627818, "grad_norm": 0.3716438114643097, "learning_rate": 0.0004339195804195804, "loss": 3.327, "step": 47600 }, { "epoch": 13.87599743724154, "grad_norm": 0.3590458929538727, "learning_rate": 0.0004337447552447552, "loss": 3.3399, "step": 47650 }, { "epoch": 13.890558564855262, "grad_norm": 0.36003828048706055, "learning_rate": 0.00043356993006993006, "loss": 3.3323, "step": 47700 }, { "epoch": 13.905119692468984, "grad_norm": 0.34905195236206055, "learning_rate": 0.0004333951048951048, "loss": 3.3425, "step": 47750 }, { "epoch": 13.919680820082707, "grad_norm": 0.36625543236732483, "learning_rate": 0.0004332202797202797, "loss": 3.3323, "step": 47800 }, { "epoch": 13.93424194769643, "grad_norm": 0.3424186110496521, "learning_rate": 0.00043304545454545456, "loss": 3.3272, "step": 47850 }, { "epoch": 13.948803075310153, "grad_norm": 0.38451874256134033, "learning_rate": 0.0004328706293706293, "loss": 3.3336, "step": 47900 }, { "epoch": 13.963364202923874, "grad_norm": 0.3730528950691223, "learning_rate": 0.00043269580419580416, "loss": 3.3345, "step": 47950 }, { "epoch": 13.977925330537596, "grad_norm": 0.35438069701194763, "learning_rate": 0.00043252097902097896, "loss": 3.3273, "step": 48000 }, { "epoch": 13.977925330537596, "eval_accuracy": 0.37150407013503006, "eval_loss": 3.540936231613159, "eval_runtime": 176.1535, "eval_samples_per_second": 94.497, "eval_steps_per_second": 5.91, "step": 48000 }, { "epoch": 13.992486458151319, "grad_norm": 0.4073244631290436, "learning_rate": 0.0004323461538461538, "loss": 3.3316, "step": 48050 }, { "epoch": 14.006989341254586, "grad_norm": 0.3481259346008301, "learning_rate": 0.0004321713286713286, "loss": 3.2878, "step": 48100 }, { "epoch": 14.021550468868309, "grad_norm": 0.3760628402233124, "learning_rate": 0.00043199650349650347, "loss": 3.2307, "step": 48150 }, { "epoch": 14.036111596482032, "grad_norm": 0.3724685609340668, "learning_rate": 0.00043182167832167827, "loss": 3.2366, "step": 48200 }, { "epoch": 14.050672724095755, "grad_norm": 0.3769129812717438, "learning_rate": 0.0004316468531468531, "loss": 3.2383, "step": 48250 }, { "epoch": 14.065233851709475, "grad_norm": 0.35858991742134094, "learning_rate": 0.0004314720279720279, "loss": 3.24, "step": 48300 }, { "epoch": 14.079794979323198, "grad_norm": 0.3559223413467407, "learning_rate": 0.0004312972027972028, "loss": 3.2413, "step": 48350 }, { "epoch": 14.094356106936921, "grad_norm": 0.3592166006565094, "learning_rate": 0.0004311223776223776, "loss": 3.2387, "step": 48400 }, { "epoch": 14.108917234550644, "grad_norm": 0.35541239380836487, "learning_rate": 0.00043094755244755243, "loss": 3.2419, "step": 48450 }, { "epoch": 14.123478362164366, "grad_norm": 0.3605928421020508, "learning_rate": 0.0004307727272727272, "loss": 3.2576, "step": 48500 }, { "epoch": 14.13803948977809, "grad_norm": 0.3428098261356354, "learning_rate": 0.0004305979020979021, "loss": 3.2561, "step": 48550 }, { "epoch": 14.15260061739181, "grad_norm": 0.35958048701286316, "learning_rate": 0.00043042307692307694, "loss": 3.2628, "step": 48600 }, { "epoch": 14.167161745005533, "grad_norm": 0.3569062352180481, "learning_rate": 0.0004302482517482517, "loss": 3.2525, "step": 48650 }, { "epoch": 14.181722872619256, "grad_norm": 0.37909266352653503, "learning_rate": 0.00043007342657342654, "loss": 3.2621, "step": 48700 }, { "epoch": 14.196284000232978, "grad_norm": 0.36472129821777344, "learning_rate": 0.00042989860139860134, "loss": 3.2607, "step": 48750 }, { "epoch": 14.210845127846701, "grad_norm": 0.3571818768978119, "learning_rate": 0.0004297237762237762, "loss": 3.2711, "step": 48800 }, { "epoch": 14.225406255460422, "grad_norm": 0.3748188316822052, "learning_rate": 0.000429548951048951, "loss": 3.2693, "step": 48850 }, { "epoch": 14.239967383074145, "grad_norm": 0.35874032974243164, "learning_rate": 0.00042937412587412585, "loss": 3.2678, "step": 48900 }, { "epoch": 14.254528510687868, "grad_norm": 0.3767135739326477, "learning_rate": 0.00042919930069930065, "loss": 3.2705, "step": 48950 }, { "epoch": 14.26908963830159, "grad_norm": 0.39121341705322266, "learning_rate": 0.0004290244755244755, "loss": 3.2788, "step": 49000 }, { "epoch": 14.26908963830159, "eval_accuracy": 0.37083478621122284, "eval_loss": 3.5569944381713867, "eval_runtime": 176.6049, "eval_samples_per_second": 94.256, "eval_steps_per_second": 5.895, "step": 49000 }, { "epoch": 14.283650765915313, "grad_norm": 0.34727412462234497, "learning_rate": 0.0004288496503496503, "loss": 3.2756, "step": 49050 }, { "epoch": 14.298211893529034, "grad_norm": 0.38754183053970337, "learning_rate": 0.00042867482517482515, "loss": 3.2804, "step": 49100 }, { "epoch": 14.312773021142757, "grad_norm": 0.3641073703765869, "learning_rate": 0.00042849999999999995, "loss": 3.2789, "step": 49150 }, { "epoch": 14.32733414875648, "grad_norm": 0.3624279499053955, "learning_rate": 0.0004283251748251748, "loss": 3.2686, "step": 49200 }, { "epoch": 14.341895276370202, "grad_norm": 0.3492709696292877, "learning_rate": 0.00042815034965034966, "loss": 3.2804, "step": 49250 }, { "epoch": 14.356456403983925, "grad_norm": 0.3812512159347534, "learning_rate": 0.00042797552447552446, "loss": 3.2914, "step": 49300 }, { "epoch": 14.371017531597648, "grad_norm": 0.3958264887332916, "learning_rate": 0.0004278006993006993, "loss": 3.2856, "step": 49350 }, { "epoch": 14.385578659211369, "grad_norm": 0.42628660798072815, "learning_rate": 0.00042762587412587406, "loss": 3.2918, "step": 49400 }, { "epoch": 14.400139786825092, "grad_norm": 0.3979325592517853, "learning_rate": 0.0004274510489510489, "loss": 3.2881, "step": 49450 }, { "epoch": 14.414700914438814, "grad_norm": 0.36341267824172974, "learning_rate": 0.0004272762237762237, "loss": 3.3029, "step": 49500 }, { "epoch": 14.429262042052537, "grad_norm": 0.41063278913497925, "learning_rate": 0.00042710139860139857, "loss": 3.2871, "step": 49550 }, { "epoch": 14.44382316966626, "grad_norm": 0.37008652091026306, "learning_rate": 0.00042692657342657337, "loss": 3.2857, "step": 49600 }, { "epoch": 14.45838429727998, "grad_norm": 0.3727281391620636, "learning_rate": 0.0004267517482517482, "loss": 3.2988, "step": 49650 }, { "epoch": 14.472945424893704, "grad_norm": 0.390828013420105, "learning_rate": 0.000426576923076923, "loss": 3.3007, "step": 49700 }, { "epoch": 14.487506552507426, "grad_norm": 0.39125722646713257, "learning_rate": 0.0004264020979020979, "loss": 3.2951, "step": 49750 }, { "epoch": 14.502067680121149, "grad_norm": 0.3569597899913788, "learning_rate": 0.0004262272727272727, "loss": 3.29, "step": 49800 }, { "epoch": 14.516628807734872, "grad_norm": 0.3710508644580841, "learning_rate": 0.00042605244755244753, "loss": 3.31, "step": 49850 }, { "epoch": 14.531189935348593, "grad_norm": 0.3962744474411011, "learning_rate": 0.00042587762237762233, "loss": 3.2959, "step": 49900 }, { "epoch": 14.545751062962315, "grad_norm": 0.381999671459198, "learning_rate": 0.0004257027972027972, "loss": 3.301, "step": 49950 }, { "epoch": 14.560312190576038, "grad_norm": 0.3498106300830841, "learning_rate": 0.00042552797202797204, "loss": 3.3039, "step": 50000 }, { "epoch": 14.560312190576038, "eval_accuracy": 0.3711573780058701, "eval_loss": 3.5517921447753906, "eval_runtime": 176.7467, "eval_samples_per_second": 94.18, "eval_steps_per_second": 5.89, "step": 50000 }, { "epoch": 14.574873318189761, "grad_norm": 0.3523785173892975, "learning_rate": 0.00042535314685314684, "loss": 3.3028, "step": 50050 }, { "epoch": 14.589434445803484, "grad_norm": 0.36807239055633545, "learning_rate": 0.0004251783216783217, "loss": 3.2884, "step": 50100 }, { "epoch": 14.603995573417205, "grad_norm": 0.36314699053764343, "learning_rate": 0.00042500349650349643, "loss": 3.2948, "step": 50150 }, { "epoch": 14.618556701030927, "grad_norm": 0.36440083384513855, "learning_rate": 0.0004248286713286713, "loss": 3.3073, "step": 50200 }, { "epoch": 14.63311782864465, "grad_norm": 0.37359005212783813, "learning_rate": 0.0004246538461538461, "loss": 3.2999, "step": 50250 }, { "epoch": 14.647678956258373, "grad_norm": 0.3689667284488678, "learning_rate": 0.00042447902097902094, "loss": 3.311, "step": 50300 }, { "epoch": 14.662240083872096, "grad_norm": 0.3932121694087982, "learning_rate": 0.00042430419580419574, "loss": 3.3077, "step": 50350 }, { "epoch": 14.676801211485817, "grad_norm": 0.3882981538772583, "learning_rate": 0.0004241293706293706, "loss": 3.3027, "step": 50400 }, { "epoch": 14.69136233909954, "grad_norm": 0.3506374955177307, "learning_rate": 0.0004239545454545454, "loss": 3.3008, "step": 50450 }, { "epoch": 14.705923466713262, "grad_norm": 0.3615427017211914, "learning_rate": 0.00042377972027972025, "loss": 3.3073, "step": 50500 }, { "epoch": 14.720484594326985, "grad_norm": 0.4344766139984131, "learning_rate": 0.00042360489510489505, "loss": 3.3079, "step": 50550 }, { "epoch": 14.735045721940708, "grad_norm": 0.36327511072158813, "learning_rate": 0.0004234300699300699, "loss": 3.3088, "step": 50600 }, { "epoch": 14.749606849554429, "grad_norm": 0.3784516751766205, "learning_rate": 0.00042325524475524476, "loss": 3.3218, "step": 50650 }, { "epoch": 14.764167977168151, "grad_norm": 0.3505086302757263, "learning_rate": 0.00042308041958041956, "loss": 3.3191, "step": 50700 }, { "epoch": 14.778729104781874, "grad_norm": 0.36759206652641296, "learning_rate": 0.0004229055944055944, "loss": 3.3038, "step": 50750 }, { "epoch": 14.793290232395597, "grad_norm": 0.36110028624534607, "learning_rate": 0.0004227307692307692, "loss": 3.316, "step": 50800 }, { "epoch": 14.80785136000932, "grad_norm": 0.3693578839302063, "learning_rate": 0.00042255594405594406, "loss": 3.3142, "step": 50850 }, { "epoch": 14.822412487623042, "grad_norm": 0.39012613892555237, "learning_rate": 0.0004223811188811188, "loss": 3.3252, "step": 50900 }, { "epoch": 14.836973615236763, "grad_norm": 0.3453630805015564, "learning_rate": 0.00042220629370629366, "loss": 3.3151, "step": 50950 }, { "epoch": 14.851534742850486, "grad_norm": 0.3628343343734741, "learning_rate": 0.00042203146853146846, "loss": 3.3262, "step": 51000 }, { "epoch": 14.851534742850486, "eval_accuracy": 0.371780930075407, "eval_loss": 3.543164014816284, "eval_runtime": 192.0146, "eval_samples_per_second": 86.691, "eval_steps_per_second": 5.421, "step": 51000 }, { "epoch": 14.866095870464209, "grad_norm": 0.3488563597202301, "learning_rate": 0.0004218566433566433, "loss": 3.3265, "step": 51050 }, { "epoch": 14.880656998077932, "grad_norm": 0.35563579201698303, "learning_rate": 0.0004216818181818181, "loss": 3.319, "step": 51100 }, { "epoch": 14.895218125691654, "grad_norm": 0.3641704022884369, "learning_rate": 0.00042150699300699297, "loss": 3.3326, "step": 51150 }, { "epoch": 14.909779253305375, "grad_norm": 0.35692253708839417, "learning_rate": 0.00042133216783216777, "loss": 3.3256, "step": 51200 }, { "epoch": 14.924340380919098, "grad_norm": 0.35215649008750916, "learning_rate": 0.0004211573426573426, "loss": 3.3076, "step": 51250 }, { "epoch": 14.93890150853282, "grad_norm": 0.38561251759529114, "learning_rate": 0.0004209825174825175, "loss": 3.3155, "step": 51300 }, { "epoch": 14.953462636146543, "grad_norm": 0.38138601183891296, "learning_rate": 0.0004208076923076923, "loss": 3.3298, "step": 51350 }, { "epoch": 14.968023763760266, "grad_norm": 0.33545321226119995, "learning_rate": 0.00042063286713286713, "loss": 3.3351, "step": 51400 }, { "epoch": 14.982584891373987, "grad_norm": 0.3515348732471466, "learning_rate": 0.00042045804195804193, "loss": 3.3166, "step": 51450 }, { "epoch": 14.99714601898771, "grad_norm": 0.3943118751049042, "learning_rate": 0.0004202832167832168, "loss": 3.3196, "step": 51500 }, { "epoch": 15.011648902090977, "grad_norm": 0.38474971055984497, "learning_rate": 0.0004201083916083916, "loss": 3.2397, "step": 51550 }, { "epoch": 15.0262100297047, "grad_norm": 0.34515100717544556, "learning_rate": 0.00041993356643356644, "loss": 3.2115, "step": 51600 }, { "epoch": 15.040771157318423, "grad_norm": 0.3761281967163086, "learning_rate": 0.0004197587412587412, "loss": 3.2062, "step": 51650 }, { "epoch": 15.055332284932145, "grad_norm": 0.37440815567970276, "learning_rate": 0.00041958391608391604, "loss": 3.21, "step": 51700 }, { "epoch": 15.069893412545868, "grad_norm": 0.372003436088562, "learning_rate": 0.00041940909090909084, "loss": 3.219, "step": 51750 }, { "epoch": 15.084454540159589, "grad_norm": 0.3606085479259491, "learning_rate": 0.0004192342657342657, "loss": 3.2271, "step": 51800 }, { "epoch": 15.099015667773312, "grad_norm": 0.3450397551059723, "learning_rate": 0.0004190594405594405, "loss": 3.2275, "step": 51850 }, { "epoch": 15.113576795387035, "grad_norm": 0.3841650187969208, "learning_rate": 0.00041888461538461535, "loss": 3.2392, "step": 51900 }, { "epoch": 15.128137923000757, "grad_norm": 0.37573206424713135, "learning_rate": 0.00041870979020979015, "loss": 3.2433, "step": 51950 }, { "epoch": 15.14269905061448, "grad_norm": 0.3840627372264862, "learning_rate": 0.000418534965034965, "loss": 3.2326, "step": 52000 }, { "epoch": 15.14269905061448, "eval_accuracy": 0.3707733009675638, "eval_loss": 3.5588457584381104, "eval_runtime": 234.5819, "eval_samples_per_second": 70.96, "eval_steps_per_second": 4.438, "step": 52000 }, { "epoch": 15.157260178228203, "grad_norm": 0.3871009647846222, "learning_rate": 0.00041836013986013985, "loss": 3.2445, "step": 52050 }, { "epoch": 15.171821305841924, "grad_norm": 0.38697710633277893, "learning_rate": 0.00041818531468531465, "loss": 3.2426, "step": 52100 }, { "epoch": 15.186382433455647, "grad_norm": 0.3763573467731476, "learning_rate": 0.0004180104895104895, "loss": 3.2361, "step": 52150 }, { "epoch": 15.20094356106937, "grad_norm": 0.3644348084926605, "learning_rate": 0.0004178356643356643, "loss": 3.2524, "step": 52200 }, { "epoch": 15.215504688683092, "grad_norm": 0.3680911064147949, "learning_rate": 0.00041766083916083916, "loss": 3.2562, "step": 52250 }, { "epoch": 15.230065816296815, "grad_norm": 0.36646798253059387, "learning_rate": 0.00041748601398601396, "loss": 3.253, "step": 52300 }, { "epoch": 15.244626943910536, "grad_norm": 0.36125648021698, "learning_rate": 0.0004173111888111888, "loss": 3.2513, "step": 52350 }, { "epoch": 15.259188071524258, "grad_norm": 0.4070035517215729, "learning_rate": 0.00041713636363636356, "loss": 3.2524, "step": 52400 }, { "epoch": 15.273749199137981, "grad_norm": 0.3583230972290039, "learning_rate": 0.0004169615384615384, "loss": 3.2626, "step": 52450 }, { "epoch": 15.288310326751704, "grad_norm": 0.35534557700157166, "learning_rate": 0.0004167867132867132, "loss": 3.2654, "step": 52500 }, { "epoch": 15.302871454365427, "grad_norm": 0.3749006688594818, "learning_rate": 0.00041661188811188807, "loss": 3.2722, "step": 52550 }, { "epoch": 15.317432581979148, "grad_norm": 0.3583391010761261, "learning_rate": 0.00041643706293706287, "loss": 3.2732, "step": 52600 }, { "epoch": 15.33199370959287, "grad_norm": 0.38538479804992676, "learning_rate": 0.0004162622377622377, "loss": 3.2584, "step": 52650 }, { "epoch": 15.346554837206593, "grad_norm": 0.3732128143310547, "learning_rate": 0.0004160874125874126, "loss": 3.2695, "step": 52700 }, { "epoch": 15.361115964820316, "grad_norm": 0.3843606412410736, "learning_rate": 0.0004159125874125874, "loss": 3.2872, "step": 52750 }, { "epoch": 15.375677092434039, "grad_norm": 0.3661695122718811, "learning_rate": 0.00041573776223776223, "loss": 3.2795, "step": 52800 }, { "epoch": 15.39023822004776, "grad_norm": 0.37233254313468933, "learning_rate": 0.00041556293706293703, "loss": 3.2667, "step": 52850 }, { "epoch": 15.404799347661482, "grad_norm": 0.3598570227622986, "learning_rate": 0.0004153881118881119, "loss": 3.2878, "step": 52900 }, { "epoch": 15.419360475275205, "grad_norm": 0.4191581606864929, "learning_rate": 0.0004152132867132867, "loss": 3.2742, "step": 52950 }, { "epoch": 15.433921602888928, "grad_norm": 0.3881734013557434, "learning_rate": 0.00041503846153846154, "loss": 3.2782, "step": 53000 }, { "epoch": 15.433921602888928, "eval_accuracy": 0.3715597948109276, "eval_loss": 3.553468704223633, "eval_runtime": 176.6089, "eval_samples_per_second": 94.253, "eval_steps_per_second": 5.894, "step": 53000 }, { "epoch": 15.44848273050265, "grad_norm": 0.3607991337776184, "learning_rate": 0.00041486363636363634, "loss": 3.2807, "step": 53050 }, { "epoch": 15.463043858116373, "grad_norm": 0.3780747056007385, "learning_rate": 0.0004146888111888112, "loss": 3.2863, "step": 53100 }, { "epoch": 15.477604985730094, "grad_norm": 0.36878126859664917, "learning_rate": 0.00041451398601398593, "loss": 3.2814, "step": 53150 }, { "epoch": 15.492166113343817, "grad_norm": 0.3719026744365692, "learning_rate": 0.0004143391608391608, "loss": 3.2728, "step": 53200 }, { "epoch": 15.50672724095754, "grad_norm": 0.3782918155193329, "learning_rate": 0.0004141643356643356, "loss": 3.2923, "step": 53250 }, { "epoch": 15.521288368571263, "grad_norm": 0.39025405049324036, "learning_rate": 0.00041398951048951044, "loss": 3.2833, "step": 53300 }, { "epoch": 15.535849496184985, "grad_norm": 0.3703659474849701, "learning_rate": 0.00041381468531468524, "loss": 3.2985, "step": 53350 }, { "epoch": 15.550410623798706, "grad_norm": 0.38350167870521545, "learning_rate": 0.0004136398601398601, "loss": 3.2747, "step": 53400 }, { "epoch": 15.564971751412429, "grad_norm": 0.3728027939796448, "learning_rate": 0.00041346503496503495, "loss": 3.2896, "step": 53450 }, { "epoch": 15.579532879026152, "grad_norm": 0.3606811761856079, "learning_rate": 0.00041329020979020975, "loss": 3.2925, "step": 53500 }, { "epoch": 15.594094006639875, "grad_norm": 0.396258682012558, "learning_rate": 0.0004131153846153846, "loss": 3.2913, "step": 53550 }, { "epoch": 15.608655134253597, "grad_norm": 0.34805792570114136, "learning_rate": 0.0004129405594405594, "loss": 3.2967, "step": 53600 }, { "epoch": 15.623216261867318, "grad_norm": 0.36827948689460754, "learning_rate": 0.00041276573426573426, "loss": 3.3035, "step": 53650 }, { "epoch": 15.637777389481041, "grad_norm": 0.3775792717933655, "learning_rate": 0.00041259090909090906, "loss": 3.2914, "step": 53700 }, { "epoch": 15.652338517094764, "grad_norm": 0.349970281124115, "learning_rate": 0.0004124160839160839, "loss": 3.2794, "step": 53750 }, { "epoch": 15.666899644708487, "grad_norm": 0.3787984848022461, "learning_rate": 0.0004122412587412587, "loss": 3.3042, "step": 53800 }, { "epoch": 15.68146077232221, "grad_norm": 0.37652289867401123, "learning_rate": 0.00041206643356643356, "loss": 3.2926, "step": 53850 }, { "epoch": 15.69602189993593, "grad_norm": 0.3538573980331421, "learning_rate": 0.0004118916083916083, "loss": 3.2842, "step": 53900 }, { "epoch": 15.710583027549653, "grad_norm": 0.38587185740470886, "learning_rate": 0.00041171678321678316, "loss": 3.2925, "step": 53950 }, { "epoch": 15.725144155163376, "grad_norm": 0.389236181974411, "learning_rate": 0.00041154195804195796, "loss": 3.3013, "step": 54000 }, { "epoch": 15.725144155163376, "eval_accuracy": 0.37206578427308573, "eval_loss": 3.5431649684906006, "eval_runtime": 176.9505, "eval_samples_per_second": 94.072, "eval_steps_per_second": 5.883, "step": 54000 }, { "epoch": 15.739705282777098, "grad_norm": 0.3759095072746277, "learning_rate": 0.0004113671328671328, "loss": 3.2932, "step": 54050 }, { "epoch": 15.754266410390821, "grad_norm": 0.3525058329105377, "learning_rate": 0.00041119230769230767, "loss": 3.3078, "step": 54100 }, { "epoch": 15.768827538004544, "grad_norm": 0.3641799986362457, "learning_rate": 0.00041101748251748247, "loss": 3.3085, "step": 54150 }, { "epoch": 15.783388665618265, "grad_norm": 0.35214030742645264, "learning_rate": 0.0004108426573426573, "loss": 3.304, "step": 54200 }, { "epoch": 15.797949793231988, "grad_norm": 0.37992826104164124, "learning_rate": 0.0004106678321678321, "loss": 3.3111, "step": 54250 }, { "epoch": 15.81251092084571, "grad_norm": 0.3650667667388916, "learning_rate": 0.000410493006993007, "loss": 3.302, "step": 54300 }, { "epoch": 15.827072048459433, "grad_norm": 0.40603387355804443, "learning_rate": 0.0004103181818181818, "loss": 3.3107, "step": 54350 }, { "epoch": 15.841633176073156, "grad_norm": 0.3534162938594818, "learning_rate": 0.00041014335664335663, "loss": 3.2988, "step": 54400 }, { "epoch": 15.856194303686877, "grad_norm": 0.3707028925418854, "learning_rate": 0.00040996853146853143, "loss": 3.2963, "step": 54450 }, { "epoch": 15.8707554313006, "grad_norm": 0.3668697774410248, "learning_rate": 0.0004097937062937063, "loss": 3.2991, "step": 54500 }, { "epoch": 15.885316558914322, "grad_norm": 0.36460986733436584, "learning_rate": 0.0004096188811188811, "loss": 3.3037, "step": 54550 }, { "epoch": 15.899877686528045, "grad_norm": 0.37511947751045227, "learning_rate": 0.00040944405594405594, "loss": 3.3046, "step": 54600 }, { "epoch": 15.914438814141768, "grad_norm": 0.3535802364349365, "learning_rate": 0.0004092692307692307, "loss": 3.3105, "step": 54650 }, { "epoch": 15.928999941755489, "grad_norm": 0.3434038758277893, "learning_rate": 0.00040909440559440554, "loss": 3.3249, "step": 54700 }, { "epoch": 15.943561069369212, "grad_norm": 0.3652832806110382, "learning_rate": 0.00040891958041958034, "loss": 3.3041, "step": 54750 }, { "epoch": 15.958122196982934, "grad_norm": 0.4125228226184845, "learning_rate": 0.0004087447552447552, "loss": 3.2997, "step": 54800 }, { "epoch": 15.972683324596657, "grad_norm": 0.3930729329586029, "learning_rate": 0.00040856993006993005, "loss": 3.3162, "step": 54850 }, { "epoch": 15.98724445221038, "grad_norm": 0.3785282373428345, "learning_rate": 0.00040839510489510485, "loss": 3.3036, "step": 54900 }, { "epoch": 16.001747335313645, "grad_norm": 0.38858112692832947, "learning_rate": 0.0004082202797202797, "loss": 3.3013, "step": 54950 }, { "epoch": 16.01630846292737, "grad_norm": 0.3704979419708252, "learning_rate": 0.0004080454545454545, "loss": 3.2179, "step": 55000 }, { "epoch": 16.01630846292737, "eval_accuracy": 0.37145504652775313, "eval_loss": 3.5535995960235596, "eval_runtime": 1031.4847, "eval_samples_per_second": 16.138, "eval_steps_per_second": 1.009, "step": 55000 }, { "epoch": 16.03086959054109, "grad_norm": 0.3996082842350006, "learning_rate": 0.00040787062937062935, "loss": 3.1959, "step": 55050 }, { "epoch": 16.045430718154815, "grad_norm": 0.36863553524017334, "learning_rate": 0.00040769580419580415, "loss": 3.2073, "step": 55100 }, { "epoch": 16.059991845768536, "grad_norm": 0.3670997619628906, "learning_rate": 0.000407520979020979, "loss": 3.2119, "step": 55150 }, { "epoch": 16.074552973382257, "grad_norm": 0.36851897835731506, "learning_rate": 0.0004073461538461538, "loss": 3.2101, "step": 55200 }, { "epoch": 16.08911410099598, "grad_norm": 0.35575056076049805, "learning_rate": 0.00040717132867132866, "loss": 3.221, "step": 55250 }, { "epoch": 16.103675228609703, "grad_norm": 0.35587427020072937, "learning_rate": 0.00040699650349650346, "loss": 3.2227, "step": 55300 }, { "epoch": 16.118236356223427, "grad_norm": 0.3824006915092468, "learning_rate": 0.0004068216783216783, "loss": 3.2373, "step": 55350 }, { "epoch": 16.132797483837148, "grad_norm": 0.36476144194602966, "learning_rate": 0.00040664685314685306, "loss": 3.2184, "step": 55400 }, { "epoch": 16.14735861145087, "grad_norm": 0.3826502859592438, "learning_rate": 0.0004064720279720279, "loss": 3.223, "step": 55450 }, { "epoch": 16.161919739064594, "grad_norm": 0.3793790638446808, "learning_rate": 0.00040629720279720277, "loss": 3.2229, "step": 55500 }, { "epoch": 16.176480866678315, "grad_norm": 0.3965374529361725, "learning_rate": 0.00040612237762237757, "loss": 3.2315, "step": 55550 }, { "epoch": 16.19104199429204, "grad_norm": 0.40393194556236267, "learning_rate": 0.0004059475524475524, "loss": 3.2372, "step": 55600 }, { "epoch": 16.20560312190576, "grad_norm": 0.37616032361984253, "learning_rate": 0.0004057727272727272, "loss": 3.2396, "step": 55650 }, { "epoch": 16.22016424951948, "grad_norm": 0.3601635992527008, "learning_rate": 0.0004055979020979021, "loss": 3.2295, "step": 55700 }, { "epoch": 16.234725377133206, "grad_norm": 0.35567811131477356, "learning_rate": 0.0004054230769230769, "loss": 3.2405, "step": 55750 }, { "epoch": 16.249286504746927, "grad_norm": 0.39285311102867126, "learning_rate": 0.00040524825174825173, "loss": 3.2432, "step": 55800 }, { "epoch": 16.26384763236065, "grad_norm": 0.37538665533065796, "learning_rate": 0.00040507342657342653, "loss": 3.2541, "step": 55850 }, { "epoch": 16.278408759974372, "grad_norm": 0.3745563328266144, "learning_rate": 0.0004048986013986014, "loss": 3.243, "step": 55900 }, { "epoch": 16.292969887588093, "grad_norm": 0.37519165873527527, "learning_rate": 0.0004047237762237762, "loss": 3.2425, "step": 55950 }, { "epoch": 16.307531015201818, "grad_norm": 0.39395275712013245, "learning_rate": 0.00040454895104895104, "loss": 3.2511, "step": 56000 }, { "epoch": 16.307531015201818, "eval_accuracy": 0.37147256335625256, "eval_loss": 3.5547873973846436, "eval_runtime": 176.8077, "eval_samples_per_second": 94.147, "eval_steps_per_second": 5.888, "step": 56000 }, { "epoch": 16.32209214281554, "grad_norm": 0.3879060745239258, "learning_rate": 0.00040437412587412583, "loss": 3.2549, "step": 56050 }, { "epoch": 16.336653270429263, "grad_norm": 0.40138155221939087, "learning_rate": 0.0004041993006993007, "loss": 3.2667, "step": 56100 }, { "epoch": 16.351214398042984, "grad_norm": 0.37023717164993286, "learning_rate": 0.00040402447552447554, "loss": 3.2522, "step": 56150 }, { "epoch": 16.36577552565671, "grad_norm": 0.3748226463794708, "learning_rate": 0.0004038496503496503, "loss": 3.2659, "step": 56200 }, { "epoch": 16.38033665327043, "grad_norm": 0.4036701023578644, "learning_rate": 0.00040367482517482514, "loss": 3.2558, "step": 56250 }, { "epoch": 16.39489778088415, "grad_norm": 0.3921171724796295, "learning_rate": 0.00040349999999999994, "loss": 3.2668, "step": 56300 }, { "epoch": 16.409458908497875, "grad_norm": 0.3839941620826721, "learning_rate": 0.0004033251748251748, "loss": 3.2673, "step": 56350 }, { "epoch": 16.424020036111596, "grad_norm": 0.37228092551231384, "learning_rate": 0.0004031503496503496, "loss": 3.253, "step": 56400 }, { "epoch": 16.43858116372532, "grad_norm": 0.37965667247772217, "learning_rate": 0.00040297552447552445, "loss": 3.2689, "step": 56450 }, { "epoch": 16.45314229133904, "grad_norm": 0.37183481454849243, "learning_rate": 0.00040280069930069925, "loss": 3.2604, "step": 56500 }, { "epoch": 16.467703418952762, "grad_norm": 0.38374224305152893, "learning_rate": 0.0004026258741258741, "loss": 3.2681, "step": 56550 }, { "epoch": 16.482264546566487, "grad_norm": 0.36288636922836304, "learning_rate": 0.0004024510489510489, "loss": 3.2725, "step": 56600 }, { "epoch": 16.496825674180208, "grad_norm": 0.37157145142555237, "learning_rate": 0.00040227622377622376, "loss": 3.2709, "step": 56650 }, { "epoch": 16.511386801793932, "grad_norm": 0.42881858348846436, "learning_rate": 0.00040210139860139856, "loss": 3.2691, "step": 56700 }, { "epoch": 16.525947929407653, "grad_norm": 0.3978147804737091, "learning_rate": 0.0004019265734265734, "loss": 3.271, "step": 56750 }, { "epoch": 16.540509057021374, "grad_norm": 0.3714050352573395, "learning_rate": 0.0004017517482517482, "loss": 3.2847, "step": 56800 }, { "epoch": 16.5550701846351, "grad_norm": 0.385984867811203, "learning_rate": 0.00040157692307692306, "loss": 3.2766, "step": 56850 }, { "epoch": 16.56963131224882, "grad_norm": 0.3949538469314575, "learning_rate": 0.0004014020979020979, "loss": 3.2751, "step": 56900 }, { "epoch": 16.584192439862544, "grad_norm": 0.3426177501678467, "learning_rate": 0.00040122727272727266, "loss": 3.2844, "step": 56950 }, { "epoch": 16.598753567476265, "grad_norm": 0.367607980966568, "learning_rate": 0.0004010524475524475, "loss": 3.2776, "step": 57000 }, { "epoch": 16.598753567476265, "eval_accuracy": 0.3724155330300375, "eval_loss": 3.545069694519043, "eval_runtime": 176.6764, "eval_samples_per_second": 94.217, "eval_steps_per_second": 5.892, "step": 57000 }, { "epoch": 16.613314695089986, "grad_norm": 0.3597051799297333, "learning_rate": 0.0004008776223776223, "loss": 3.2801, "step": 57050 }, { "epoch": 16.62787582270371, "grad_norm": 0.4093649089336395, "learning_rate": 0.00040070279720279717, "loss": 3.2799, "step": 57100 }, { "epoch": 16.642436950317432, "grad_norm": 0.3761058747768402, "learning_rate": 0.00040052797202797197, "loss": 3.2746, "step": 57150 }, { "epoch": 16.656998077931156, "grad_norm": 0.3886961340904236, "learning_rate": 0.0004003531468531468, "loss": 3.2749, "step": 57200 }, { "epoch": 16.671559205544877, "grad_norm": 0.4015483260154724, "learning_rate": 0.0004001783216783216, "loss": 3.2848, "step": 57250 }, { "epoch": 16.6861203331586, "grad_norm": 0.32575860619544983, "learning_rate": 0.0004000034965034965, "loss": 3.2882, "step": 57300 }, { "epoch": 16.700681460772323, "grad_norm": 0.3802250027656555, "learning_rate": 0.0003998286713286713, "loss": 3.2752, "step": 57350 }, { "epoch": 16.715242588386044, "grad_norm": 0.40613871812820435, "learning_rate": 0.00039965384615384613, "loss": 3.282, "step": 57400 }, { "epoch": 16.72980371599977, "grad_norm": 0.3728645145893097, "learning_rate": 0.00039947902097902093, "loss": 3.2825, "step": 57450 }, { "epoch": 16.74436484361349, "grad_norm": 0.3911345899105072, "learning_rate": 0.0003993041958041958, "loss": 3.2838, "step": 57500 }, { "epoch": 16.75892597122721, "grad_norm": 0.3494943380355835, "learning_rate": 0.00039912937062937064, "loss": 3.2856, "step": 57550 }, { "epoch": 16.773487098840935, "grad_norm": 0.3775020241737366, "learning_rate": 0.00039895454545454544, "loss": 3.2892, "step": 57600 }, { "epoch": 16.788048226454656, "grad_norm": 0.38465675711631775, "learning_rate": 0.0003987797202797203, "loss": 3.2902, "step": 57650 }, { "epoch": 16.80260935406838, "grad_norm": 0.35246166586875916, "learning_rate": 0.00039860489510489504, "loss": 3.2878, "step": 57700 }, { "epoch": 16.8171704816821, "grad_norm": 0.39134299755096436, "learning_rate": 0.0003984300699300699, "loss": 3.2983, "step": 57750 }, { "epoch": 16.831731609295822, "grad_norm": 0.37794309854507446, "learning_rate": 0.0003982552447552447, "loss": 3.2789, "step": 57800 }, { "epoch": 16.846292736909547, "grad_norm": 0.3562477231025696, "learning_rate": 0.00039808041958041955, "loss": 3.283, "step": 57850 }, { "epoch": 16.860853864523268, "grad_norm": 0.3600902855396271, "learning_rate": 0.00039790559440559435, "loss": 3.2844, "step": 57900 }, { "epoch": 16.875414992136992, "grad_norm": 0.37436723709106445, "learning_rate": 0.0003977307692307692, "loss": 3.2983, "step": 57950 }, { "epoch": 16.889976119750713, "grad_norm": 0.3761354088783264, "learning_rate": 0.000397555944055944, "loss": 3.3012, "step": 58000 }, { "epoch": 16.889976119750713, "eval_accuracy": 0.3723873180042666, "eval_loss": 3.537418842315674, "eval_runtime": 176.6887, "eval_samples_per_second": 94.211, "eval_steps_per_second": 5.892, "step": 58000 }, { "epoch": 16.904537247364434, "grad_norm": 0.38446810841560364, "learning_rate": 0.00039738111888111885, "loss": 3.2982, "step": 58050 }, { "epoch": 16.91909837497816, "grad_norm": 0.3951273262500763, "learning_rate": 0.00039720629370629365, "loss": 3.2955, "step": 58100 }, { "epoch": 16.93365950259188, "grad_norm": 0.3898870348930359, "learning_rate": 0.0003970314685314685, "loss": 3.2935, "step": 58150 }, { "epoch": 16.948220630205604, "grad_norm": 0.42458799481391907, "learning_rate": 0.0003968566433566433, "loss": 3.2932, "step": 58200 }, { "epoch": 16.962781757819325, "grad_norm": 0.3850706219673157, "learning_rate": 0.00039668181818181816, "loss": 3.2934, "step": 58250 }, { "epoch": 16.977342885433046, "grad_norm": 0.3601333200931549, "learning_rate": 0.000396506993006993, "loss": 3.2967, "step": 58300 }, { "epoch": 16.99190401304677, "grad_norm": 0.3920387625694275, "learning_rate": 0.0003963321678321678, "loss": 3.291, "step": 58350 }, { "epoch": 17.006406896150036, "grad_norm": 0.356235533952713, "learning_rate": 0.00039615734265734267, "loss": 3.2548, "step": 58400 }, { "epoch": 17.02096802376376, "grad_norm": 0.3619932532310486, "learning_rate": 0.0003959825174825174, "loss": 3.1841, "step": 58450 }, { "epoch": 17.03552915137748, "grad_norm": 0.36357995867729187, "learning_rate": 0.00039580769230769227, "loss": 3.1947, "step": 58500 }, { "epoch": 17.050090278991206, "grad_norm": 0.3915276527404785, "learning_rate": 0.00039563286713286707, "loss": 3.197, "step": 58550 }, { "epoch": 17.064651406604927, "grad_norm": 0.38006600737571716, "learning_rate": 0.0003954580419580419, "loss": 3.2012, "step": 58600 }, { "epoch": 17.07921253421865, "grad_norm": 0.38852155208587646, "learning_rate": 0.0003952832167832167, "loss": 3.2075, "step": 58650 }, { "epoch": 17.093773661832373, "grad_norm": 0.39789924025535583, "learning_rate": 0.0003951083916083916, "loss": 3.2123, "step": 58700 }, { "epoch": 17.108334789446094, "grad_norm": 0.37218016386032104, "learning_rate": 0.0003949335664335664, "loss": 3.2188, "step": 58750 }, { "epoch": 17.122895917059818, "grad_norm": 0.4078844487667084, "learning_rate": 0.00039475874125874123, "loss": 3.2145, "step": 58800 }, { "epoch": 17.13745704467354, "grad_norm": 0.36902859807014465, "learning_rate": 0.00039458391608391603, "loss": 3.2187, "step": 58850 }, { "epoch": 17.152018172287264, "grad_norm": 0.3675275146961212, "learning_rate": 0.0003944090909090909, "loss": 3.214, "step": 58900 }, { "epoch": 17.166579299900985, "grad_norm": 0.3768998086452484, "learning_rate": 0.00039423426573426573, "loss": 3.2148, "step": 58950 }, { "epoch": 17.181140427514705, "grad_norm": 0.3707525134086609, "learning_rate": 0.00039405944055944053, "loss": 3.2162, "step": 59000 }, { "epoch": 17.181140427514705, "eval_accuracy": 0.3720486201324084, "eval_loss": 3.556793451309204, "eval_runtime": 176.7267, "eval_samples_per_second": 94.191, "eval_steps_per_second": 5.89, "step": 59000 }, { "epoch": 17.19570155512843, "grad_norm": 0.43334779143333435, "learning_rate": 0.0003938846153846154, "loss": 3.2422, "step": 59050 }, { "epoch": 17.21026268274215, "grad_norm": 0.3813783526420593, "learning_rate": 0.0003937097902097902, "loss": 3.2418, "step": 59100 }, { "epoch": 17.224823810355876, "grad_norm": 0.3798333704471588, "learning_rate": 0.00039353496503496504, "loss": 3.2242, "step": 59150 }, { "epoch": 17.239384937969596, "grad_norm": 0.3806883692741394, "learning_rate": 0.0003933601398601398, "loss": 3.2254, "step": 59200 }, { "epoch": 17.253946065583317, "grad_norm": 0.36410582065582275, "learning_rate": 0.00039318531468531464, "loss": 3.2408, "step": 59250 }, { "epoch": 17.268507193197042, "grad_norm": 0.35998713970184326, "learning_rate": 0.00039301048951048944, "loss": 3.2307, "step": 59300 }, { "epoch": 17.283068320810763, "grad_norm": 0.3704741597175598, "learning_rate": 0.0003928356643356643, "loss": 3.2397, "step": 59350 }, { "epoch": 17.297629448424487, "grad_norm": 0.3956085741519928, "learning_rate": 0.0003926608391608391, "loss": 3.2416, "step": 59400 }, { "epoch": 17.31219057603821, "grad_norm": 0.42266151309013367, "learning_rate": 0.00039248601398601395, "loss": 3.2274, "step": 59450 }, { "epoch": 17.32675170365193, "grad_norm": 0.3695532977581024, "learning_rate": 0.00039231118881118875, "loss": 3.2472, "step": 59500 }, { "epoch": 17.341312831265654, "grad_norm": 0.3581390380859375, "learning_rate": 0.0003921363636363636, "loss": 3.2371, "step": 59550 }, { "epoch": 17.355873958879375, "grad_norm": 0.3758191466331482, "learning_rate": 0.00039196153846153846, "loss": 3.2587, "step": 59600 }, { "epoch": 17.3704350864931, "grad_norm": 0.39191579818725586, "learning_rate": 0.00039178671328671326, "loss": 3.248, "step": 59650 }, { "epoch": 17.38499621410682, "grad_norm": 0.3746451735496521, "learning_rate": 0.0003916118881118881, "loss": 3.2459, "step": 59700 }, { "epoch": 17.39955734172054, "grad_norm": 0.4051074981689453, "learning_rate": 0.0003914370629370629, "loss": 3.2527, "step": 59750 }, { "epoch": 17.414118469334266, "grad_norm": 0.37997952103614807, "learning_rate": 0.00039126223776223776, "loss": 3.2562, "step": 59800 }, { "epoch": 17.428679596947987, "grad_norm": 0.39231473207473755, "learning_rate": 0.00039108741258741256, "loss": 3.2435, "step": 59850 }, { "epoch": 17.44324072456171, "grad_norm": 0.36344578862190247, "learning_rate": 0.0003909125874125874, "loss": 3.2429, "step": 59900 }, { "epoch": 17.457801852175432, "grad_norm": 0.3916427195072174, "learning_rate": 0.00039073776223776216, "loss": 3.2645, "step": 59950 }, { "epoch": 17.472362979789153, "grad_norm": 0.38155242800712585, "learning_rate": 0.000390562937062937, "loss": 3.2594, "step": 60000 }, { "epoch": 17.472362979789153, "eval_accuracy": 0.37229009372796434, "eval_loss": 3.546949625015259, "eval_runtime": 176.6688, "eval_samples_per_second": 94.222, "eval_steps_per_second": 5.892, "step": 60000 }, { "epoch": 17.486924107402878, "grad_norm": 0.379336953163147, "learning_rate": 0.0003903881118881118, "loss": 3.2501, "step": 60050 }, { "epoch": 17.5014852350166, "grad_norm": 0.38774874806404114, "learning_rate": 0.00039021328671328667, "loss": 3.2633, "step": 60100 }, { "epoch": 17.516046362630323, "grad_norm": 0.38635146617889404, "learning_rate": 0.00039003846153846147, "loss": 3.2788, "step": 60150 }, { "epoch": 17.530607490244044, "grad_norm": 0.3680366277694702, "learning_rate": 0.0003898636363636363, "loss": 3.2576, "step": 60200 }, { "epoch": 17.545168617857765, "grad_norm": 0.42120352387428284, "learning_rate": 0.0003896888111888111, "loss": 3.2614, "step": 60250 }, { "epoch": 17.55972974547149, "grad_norm": 0.3842788636684418, "learning_rate": 0.000389513986013986, "loss": 3.2616, "step": 60300 }, { "epoch": 17.57429087308521, "grad_norm": 0.41452935338020325, "learning_rate": 0.00038933916083916083, "loss": 3.2673, "step": 60350 }, { "epoch": 17.588852000698935, "grad_norm": 0.3977508246898651, "learning_rate": 0.00038916433566433563, "loss": 3.2756, "step": 60400 }, { "epoch": 17.603413128312656, "grad_norm": 0.3657890856266022, "learning_rate": 0.0003889895104895105, "loss": 3.2675, "step": 60450 }, { "epoch": 17.617974255926377, "grad_norm": 0.37939056754112244, "learning_rate": 0.0003888146853146853, "loss": 3.2713, "step": 60500 }, { "epoch": 17.6325353835401, "grad_norm": 0.37325090169906616, "learning_rate": 0.00038863986013986014, "loss": 3.2587, "step": 60550 }, { "epoch": 17.647096511153823, "grad_norm": 0.38156962394714355, "learning_rate": 0.00038846503496503494, "loss": 3.2694, "step": 60600 }, { "epoch": 17.661657638767547, "grad_norm": 0.3832074999809265, "learning_rate": 0.0003882902097902098, "loss": 3.2743, "step": 60650 }, { "epoch": 17.676218766381268, "grad_norm": 0.36399969458580017, "learning_rate": 0.00038811538461538454, "loss": 3.2614, "step": 60700 }, { "epoch": 17.690779893994993, "grad_norm": 0.37998104095458984, "learning_rate": 0.0003879405594405594, "loss": 3.2678, "step": 60750 }, { "epoch": 17.705341021608714, "grad_norm": 0.3782190978527069, "learning_rate": 0.0003877657342657342, "loss": 3.2588, "step": 60800 }, { "epoch": 17.719902149222435, "grad_norm": 0.3613925278186798, "learning_rate": 0.00038759090909090905, "loss": 3.272, "step": 60850 }, { "epoch": 17.73446327683616, "grad_norm": 0.4221477210521698, "learning_rate": 0.00038741608391608384, "loss": 3.2725, "step": 60900 }, { "epoch": 17.74902440444988, "grad_norm": 0.38485991954803467, "learning_rate": 0.0003872412587412587, "loss": 3.2701, "step": 60950 }, { "epoch": 17.763585532063605, "grad_norm": 0.4068686068058014, "learning_rate": 0.00038706643356643355, "loss": 3.262, "step": 61000 }, { "epoch": 17.763585532063605, "eval_accuracy": 0.37286426950240215, "eval_loss": 3.5396716594696045, "eval_runtime": 176.7294, "eval_samples_per_second": 94.189, "eval_steps_per_second": 5.89, "step": 61000 }, { "epoch": 17.778146659677326, "grad_norm": 0.408058226108551, "learning_rate": 0.00038689160839160835, "loss": 3.2739, "step": 61050 }, { "epoch": 17.792707787291047, "grad_norm": 0.3897380530834198, "learning_rate": 0.0003867167832167832, "loss": 3.2618, "step": 61100 }, { "epoch": 17.80726891490477, "grad_norm": 0.37916770577430725, "learning_rate": 0.000386541958041958, "loss": 3.2838, "step": 61150 }, { "epoch": 17.821830042518492, "grad_norm": 0.3925793468952179, "learning_rate": 0.00038636713286713286, "loss": 3.2773, "step": 61200 }, { "epoch": 17.836391170132217, "grad_norm": 0.3665933609008789, "learning_rate": 0.00038619230769230766, "loss": 3.2799, "step": 61250 }, { "epoch": 17.850952297745938, "grad_norm": 0.3553600311279297, "learning_rate": 0.0003860174825174825, "loss": 3.2711, "step": 61300 }, { "epoch": 17.86551342535966, "grad_norm": 0.392948180437088, "learning_rate": 0.0003858426573426573, "loss": 3.2847, "step": 61350 }, { "epoch": 17.880074552973383, "grad_norm": 0.3728949725627899, "learning_rate": 0.00038566783216783217, "loss": 3.2846, "step": 61400 }, { "epoch": 17.894635680587104, "grad_norm": 0.36359310150146484, "learning_rate": 0.0003854930069930069, "loss": 3.2798, "step": 61450 }, { "epoch": 17.90919680820083, "grad_norm": 0.38679951429367065, "learning_rate": 0.00038531818181818177, "loss": 3.2756, "step": 61500 }, { "epoch": 17.92375793581455, "grad_norm": 0.3753286898136139, "learning_rate": 0.00038514335664335657, "loss": 3.2761, "step": 61550 }, { "epoch": 17.93831906342827, "grad_norm": 0.38257867097854614, "learning_rate": 0.0003849685314685314, "loss": 3.2809, "step": 61600 }, { "epoch": 17.952880191041995, "grad_norm": 0.3879117965698242, "learning_rate": 0.0003847937062937062, "loss": 3.2732, "step": 61650 }, { "epoch": 17.967441318655716, "grad_norm": 0.3776424825191498, "learning_rate": 0.0003846188811188811, "loss": 3.2809, "step": 61700 }, { "epoch": 17.98200244626944, "grad_norm": 0.3808050751686096, "learning_rate": 0.00038444405594405593, "loss": 3.2822, "step": 61750 }, { "epoch": 17.99656357388316, "grad_norm": 0.4322393536567688, "learning_rate": 0.00038426923076923073, "loss": 3.2754, "step": 61800 }, { "epoch": 18.01106645698643, "grad_norm": 0.3647869825363159, "learning_rate": 0.0003840944055944056, "loss": 3.2042, "step": 61850 }, { "epoch": 18.02562758460015, "grad_norm": 0.3591197729110718, "learning_rate": 0.0003839195804195804, "loss": 3.1856, "step": 61900 }, { "epoch": 18.040188712213872, "grad_norm": 0.36585232615470886, "learning_rate": 0.00038374475524475523, "loss": 3.1831, "step": 61950 }, { "epoch": 18.054749839827597, "grad_norm": 0.37394168972969055, "learning_rate": 0.00038356993006993003, "loss": 3.1853, "step": 62000 }, { "epoch": 18.054749839827597, "eval_accuracy": 0.37270273847986374, "eval_loss": 3.549687623977661, "eval_runtime": 176.6151, "eval_samples_per_second": 94.25, "eval_steps_per_second": 5.894, "step": 62000 }, { "epoch": 18.069310967441318, "grad_norm": 0.3535243570804596, "learning_rate": 0.0003833951048951049, "loss": 3.1757, "step": 62050 }, { "epoch": 18.083872095055042, "grad_norm": 0.4245968163013458, "learning_rate": 0.0003832202797202797, "loss": 3.1926, "step": 62100 }, { "epoch": 18.098433222668763, "grad_norm": 0.3650285601615906, "learning_rate": 0.00038304545454545454, "loss": 3.1886, "step": 62150 }, { "epoch": 18.112994350282484, "grad_norm": 0.36255475878715515, "learning_rate": 0.0003828706293706293, "loss": 3.1969, "step": 62200 }, { "epoch": 18.12755547789621, "grad_norm": 0.4018917381763458, "learning_rate": 0.00038269580419580414, "loss": 3.2017, "step": 62250 }, { "epoch": 18.14211660550993, "grad_norm": 0.39976486563682556, "learning_rate": 0.00038252097902097894, "loss": 3.202, "step": 62300 }, { "epoch": 18.156677733123654, "grad_norm": 0.4026753306388855, "learning_rate": 0.0003823461538461538, "loss": 3.2164, "step": 62350 }, { "epoch": 18.171238860737375, "grad_norm": 0.3889211416244507, "learning_rate": 0.00038217132867132865, "loss": 3.2084, "step": 62400 }, { "epoch": 18.185799988351096, "grad_norm": 0.3884575664997101, "learning_rate": 0.00038199650349650345, "loss": 3.2041, "step": 62450 }, { "epoch": 18.20036111596482, "grad_norm": 0.4104594886302948, "learning_rate": 0.0003818216783216783, "loss": 3.2115, "step": 62500 }, { "epoch": 18.214922243578542, "grad_norm": 0.40058377385139465, "learning_rate": 0.0003816468531468531, "loss": 3.2133, "step": 62550 }, { "epoch": 18.229483371192266, "grad_norm": 0.38720259070396423, "learning_rate": 0.00038147202797202796, "loss": 3.2087, "step": 62600 }, { "epoch": 18.244044498805987, "grad_norm": 0.38542911410331726, "learning_rate": 0.00038129720279720276, "loss": 3.2212, "step": 62650 }, { "epoch": 18.25860562641971, "grad_norm": 0.39218926429748535, "learning_rate": 0.0003811223776223776, "loss": 3.2203, "step": 62700 }, { "epoch": 18.273166754033433, "grad_norm": 0.40636080503463745, "learning_rate": 0.0003809475524475524, "loss": 3.2261, "step": 62750 }, { "epoch": 18.287727881647154, "grad_norm": 0.3898778259754181, "learning_rate": 0.00038077272727272726, "loss": 3.2183, "step": 62800 }, { "epoch": 18.30228900926088, "grad_norm": 0.3944312334060669, "learning_rate": 0.00038059790209790206, "loss": 3.2277, "step": 62850 }, { "epoch": 18.3168501368746, "grad_norm": 0.38517752289772034, "learning_rate": 0.0003804230769230769, "loss": 3.2321, "step": 62900 }, { "epoch": 18.33141126448832, "grad_norm": 0.38035276532173157, "learning_rate": 0.00038024825174825166, "loss": 3.2303, "step": 62950 }, { "epoch": 18.345972392102045, "grad_norm": 0.3749290704727173, "learning_rate": 0.0003800734265734265, "loss": 3.239, "step": 63000 }, { "epoch": 18.345972392102045, "eval_accuracy": 0.3725924647541425, "eval_loss": 3.548588514328003, "eval_runtime": 176.7112, "eval_samples_per_second": 94.199, "eval_steps_per_second": 5.891, "step": 63000 }, { "epoch": 18.360533519715766, "grad_norm": 0.36092501878738403, "learning_rate": 0.0003798986013986013, "loss": 3.2431, "step": 63050 }, { "epoch": 18.37509464732949, "grad_norm": 0.36838671565055847, "learning_rate": 0.00037972377622377617, "loss": 3.2186, "step": 63100 }, { "epoch": 18.38965577494321, "grad_norm": 0.38395121693611145, "learning_rate": 0.000379548951048951, "loss": 3.2228, "step": 63150 }, { "epoch": 18.404216902556932, "grad_norm": 0.40639710426330566, "learning_rate": 0.0003793741258741258, "loss": 3.2346, "step": 63200 }, { "epoch": 18.418778030170657, "grad_norm": 0.3863862156867981, "learning_rate": 0.0003791993006993007, "loss": 3.2347, "step": 63250 }, { "epoch": 18.433339157784378, "grad_norm": 0.40013137459754944, "learning_rate": 0.0003790244755244755, "loss": 3.2429, "step": 63300 }, { "epoch": 18.447900285398102, "grad_norm": 0.40367773175239563, "learning_rate": 0.00037884965034965033, "loss": 3.2509, "step": 63350 }, { "epoch": 18.462461413011823, "grad_norm": 0.38813164830207825, "learning_rate": 0.00037867482517482513, "loss": 3.2466, "step": 63400 }, { "epoch": 18.477022540625548, "grad_norm": 0.3755754232406616, "learning_rate": 0.0003785, "loss": 3.2389, "step": 63450 }, { "epoch": 18.49158366823927, "grad_norm": 0.3727249503135681, "learning_rate": 0.0003783251748251748, "loss": 3.2401, "step": 63500 }, { "epoch": 18.50614479585299, "grad_norm": 0.3614943027496338, "learning_rate": 0.00037815034965034964, "loss": 3.2367, "step": 63550 }, { "epoch": 18.520705923466714, "grad_norm": 0.3896731436252594, "learning_rate": 0.00037797552447552444, "loss": 3.2412, "step": 63600 }, { "epoch": 18.535267051080435, "grad_norm": 0.3661154508590698, "learning_rate": 0.0003778006993006993, "loss": 3.2522, "step": 63650 }, { "epoch": 18.54982817869416, "grad_norm": 0.37164583802223206, "learning_rate": 0.00037762587412587404, "loss": 3.2627, "step": 63700 }, { "epoch": 18.56438930630788, "grad_norm": 0.4042896628379822, "learning_rate": 0.0003774510489510489, "loss": 3.2595, "step": 63750 }, { "epoch": 18.5789504339216, "grad_norm": 0.36743268370628357, "learning_rate": 0.0003772762237762238, "loss": 3.245, "step": 63800 }, { "epoch": 18.593511561535326, "grad_norm": 0.3608546853065491, "learning_rate": 0.00037710139860139854, "loss": 3.2389, "step": 63850 }, { "epoch": 18.608072689149047, "grad_norm": 0.37642180919647217, "learning_rate": 0.0003769265734265734, "loss": 3.248, "step": 63900 }, { "epoch": 18.62263381676277, "grad_norm": 0.3853236734867096, "learning_rate": 0.0003767517482517482, "loss": 3.253, "step": 63950 }, { "epoch": 18.637194944376493, "grad_norm": 0.39252620935440063, "learning_rate": 0.00037657692307692305, "loss": 3.2576, "step": 64000 }, { "epoch": 18.637194944376493, "eval_accuracy": 0.3727976115040184, "eval_loss": 3.543614149093628, "eval_runtime": 176.7187, "eval_samples_per_second": 94.195, "eval_steps_per_second": 5.891, "step": 64000 }, { "epoch": 18.651756071990214, "grad_norm": 0.3591403663158417, "learning_rate": 0.00037640209790209785, "loss": 3.2592, "step": 64050 }, { "epoch": 18.666317199603938, "grad_norm": 0.3907413184642792, "learning_rate": 0.0003762272727272727, "loss": 3.2667, "step": 64100 }, { "epoch": 18.68087832721766, "grad_norm": 0.37977805733680725, "learning_rate": 0.0003760524475524475, "loss": 3.2631, "step": 64150 }, { "epoch": 18.695439454831384, "grad_norm": 0.3672906160354614, "learning_rate": 0.00037587762237762236, "loss": 3.2624, "step": 64200 }, { "epoch": 18.710000582445105, "grad_norm": 0.3918323516845703, "learning_rate": 0.00037570279720279716, "loss": 3.2652, "step": 64250 }, { "epoch": 18.724561710058826, "grad_norm": 0.3797747790813446, "learning_rate": 0.000375527972027972, "loss": 3.2573, "step": 64300 }, { "epoch": 18.73912283767255, "grad_norm": 0.3932407796382904, "learning_rate": 0.0003753531468531468, "loss": 3.2666, "step": 64350 }, { "epoch": 18.75368396528627, "grad_norm": 0.3932470381259918, "learning_rate": 0.00037517832167832167, "loss": 3.261, "step": 64400 }, { "epoch": 18.768245092899996, "grad_norm": 0.42663368582725525, "learning_rate": 0.0003750034965034965, "loss": 3.2692, "step": 64450 }, { "epoch": 18.782806220513717, "grad_norm": 0.3719504773616791, "learning_rate": 0.00037482867132867127, "loss": 3.2746, "step": 64500 }, { "epoch": 18.797367348127437, "grad_norm": 0.3768567442893982, "learning_rate": 0.0003746538461538462, "loss": 3.263, "step": 64550 }, { "epoch": 18.811928475741162, "grad_norm": 0.38159942626953125, "learning_rate": 0.0003744790209790209, "loss": 3.2707, "step": 64600 }, { "epoch": 18.826489603354883, "grad_norm": 0.3998315632343292, "learning_rate": 0.0003743041958041958, "loss": 3.2678, "step": 64650 }, { "epoch": 18.841050730968607, "grad_norm": 0.40491533279418945, "learning_rate": 0.0003741293706293706, "loss": 3.2784, "step": 64700 }, { "epoch": 18.85561185858233, "grad_norm": 0.39045944809913635, "learning_rate": 0.0003739545454545454, "loss": 3.2642, "step": 64750 }, { "epoch": 18.87017298619605, "grad_norm": 0.3865518569946289, "learning_rate": 0.0003737797202797202, "loss": 3.278, "step": 64800 }, { "epoch": 18.884734113809774, "grad_norm": 0.40172427892684937, "learning_rate": 0.0003736048951048951, "loss": 3.2685, "step": 64850 }, { "epoch": 18.899295241423495, "grad_norm": 0.37912994623184204, "learning_rate": 0.0003734300699300699, "loss": 3.2602, "step": 64900 }, { "epoch": 18.91385636903722, "grad_norm": 0.36898526549339294, "learning_rate": 0.00037325524475524473, "loss": 3.2762, "step": 64950 }, { "epoch": 18.92841749665094, "grad_norm": 0.4474957287311554, "learning_rate": 0.00037308041958041953, "loss": 3.2736, "step": 65000 }, { "epoch": 18.92841749665094, "eval_accuracy": 0.3733603837055405, "eval_loss": 3.5318191051483154, "eval_runtime": 176.8986, "eval_samples_per_second": 94.099, "eval_steps_per_second": 5.885, "step": 65000 }, { "epoch": 18.94297862426466, "grad_norm": 0.3818124830722809, "learning_rate": 0.0003729055944055944, "loss": 3.2739, "step": 65050 }, { "epoch": 18.957539751878386, "grad_norm": 0.39567485451698303, "learning_rate": 0.0003727307692307692, "loss": 3.2638, "step": 65100 }, { "epoch": 18.972100879492107, "grad_norm": 0.3894628882408142, "learning_rate": 0.00037255594405594404, "loss": 3.2719, "step": 65150 }, { "epoch": 18.98666200710583, "grad_norm": 0.3980902135372162, "learning_rate": 0.0003723811188811189, "loss": 3.2853, "step": 65200 }, { "epoch": 19.001164890209097, "grad_norm": 0.40676209330558777, "learning_rate": 0.00037220629370629364, "loss": 3.2692, "step": 65250 }, { "epoch": 19.01572601782282, "grad_norm": 0.43653473258018494, "learning_rate": 0.00037203146853146855, "loss": 3.1594, "step": 65300 }, { "epoch": 19.030287145436542, "grad_norm": 0.3918614387512207, "learning_rate": 0.0003718566433566433, "loss": 3.1726, "step": 65350 }, { "epoch": 19.044848273050263, "grad_norm": 0.3767155110836029, "learning_rate": 0.00037168181818181815, "loss": 3.1696, "step": 65400 }, { "epoch": 19.059409400663988, "grad_norm": 0.3941446840763092, "learning_rate": 0.00037150699300699295, "loss": 3.1736, "step": 65450 }, { "epoch": 19.07397052827771, "grad_norm": 0.4108849763870239, "learning_rate": 0.0003713321678321678, "loss": 3.1941, "step": 65500 }, { "epoch": 19.088531655891433, "grad_norm": 0.40548184514045715, "learning_rate": 0.0003711573426573426, "loss": 3.186, "step": 65550 }, { "epoch": 19.103092783505154, "grad_norm": 0.3613770306110382, "learning_rate": 0.00037098251748251746, "loss": 3.1908, "step": 65600 }, { "epoch": 19.11765391111888, "grad_norm": 0.379202276468277, "learning_rate": 0.00037080769230769226, "loss": 3.1949, "step": 65650 }, { "epoch": 19.1322150387326, "grad_norm": 0.39599117636680603, "learning_rate": 0.0003706328671328671, "loss": 3.1999, "step": 65700 }, { "epoch": 19.14677616634632, "grad_norm": 0.36203324794769287, "learning_rate": 0.0003704580419580419, "loss": 3.2018, "step": 65750 }, { "epoch": 19.161337293960045, "grad_norm": 0.389349102973938, "learning_rate": 0.00037028321678321676, "loss": 3.1938, "step": 65800 }, { "epoch": 19.175898421573766, "grad_norm": 0.3745954930782318, "learning_rate": 0.0003701083916083916, "loss": 3.2047, "step": 65850 }, { "epoch": 19.19045954918749, "grad_norm": 0.41214945912361145, "learning_rate": 0.0003699335664335664, "loss": 3.209, "step": 65900 }, { "epoch": 19.20502067680121, "grad_norm": 0.411510169506073, "learning_rate": 0.00036975874125874127, "loss": 3.2059, "step": 65950 }, { "epoch": 19.219581804414933, "grad_norm": 0.3707365393638611, "learning_rate": 0.00036958391608391607, "loss": 3.1832, "step": 66000 }, { "epoch": 19.219581804414933, "eval_accuracy": 0.3726959198486358, "eval_loss": 3.5537943840026855, "eval_runtime": 176.7606, "eval_samples_per_second": 94.173, "eval_steps_per_second": 5.889, "step": 66000 }, { "epoch": 19.234142932028657, "grad_norm": 0.44211095571517944, "learning_rate": 0.0003694090909090909, "loss": 3.2018, "step": 66050 }, { "epoch": 19.248704059642378, "grad_norm": 0.3968811631202698, "learning_rate": 0.00036923426573426567, "loss": 3.2157, "step": 66100 }, { "epoch": 19.263265187256103, "grad_norm": 0.3941178321838379, "learning_rate": 0.0003690594405594405, "loss": 3.2102, "step": 66150 }, { "epoch": 19.277826314869824, "grad_norm": 0.36997464299201965, "learning_rate": 0.0003688846153846153, "loss": 3.2282, "step": 66200 }, { "epoch": 19.292387442483545, "grad_norm": 0.3910622000694275, "learning_rate": 0.0003687097902097902, "loss": 3.2084, "step": 66250 }, { "epoch": 19.30694857009727, "grad_norm": 0.4019615054130554, "learning_rate": 0.000368534965034965, "loss": 3.2072, "step": 66300 }, { "epoch": 19.32150969771099, "grad_norm": 0.4168713092803955, "learning_rate": 0.00036836013986013983, "loss": 3.2272, "step": 66350 }, { "epoch": 19.336070825324715, "grad_norm": 0.3616064190864563, "learning_rate": 0.00036818531468531463, "loss": 3.2187, "step": 66400 }, { "epoch": 19.350631952938436, "grad_norm": 0.38284698128700256, "learning_rate": 0.0003680104895104895, "loss": 3.2132, "step": 66450 }, { "epoch": 19.365193080552157, "grad_norm": 0.38134893774986267, "learning_rate": 0.0003678356643356643, "loss": 3.2144, "step": 66500 }, { "epoch": 19.37975420816588, "grad_norm": 0.37159982323646545, "learning_rate": 0.00036766083916083914, "loss": 3.2269, "step": 66550 }, { "epoch": 19.394315335779602, "grad_norm": 0.3975840210914612, "learning_rate": 0.000367486013986014, "loss": 3.2155, "step": 66600 }, { "epoch": 19.408876463393327, "grad_norm": 0.40514418482780457, "learning_rate": 0.0003673111888111888, "loss": 3.2185, "step": 66650 }, { "epoch": 19.423437591007048, "grad_norm": 0.9633327722549438, "learning_rate": 0.00036713636363636365, "loss": 3.2242, "step": 66700 }, { "epoch": 19.43799871862077, "grad_norm": 0.38619816303253174, "learning_rate": 0.00036696153846153844, "loss": 3.2197, "step": 66750 }, { "epoch": 19.452559846234493, "grad_norm": 0.38963261246681213, "learning_rate": 0.0003667867132867133, "loss": 3.2261, "step": 66800 }, { "epoch": 19.467120973848214, "grad_norm": 0.4200080931186676, "learning_rate": 0.00036661188811188804, "loss": 3.2345, "step": 66850 }, { "epoch": 19.48168210146194, "grad_norm": 0.3921654522418976, "learning_rate": 0.0003664370629370629, "loss": 3.2386, "step": 66900 }, { "epoch": 19.49624322907566, "grad_norm": 0.37555941939353943, "learning_rate": 0.0003662622377622377, "loss": 3.2454, "step": 66950 }, { "epoch": 19.51080435668938, "grad_norm": 0.38313743472099304, "learning_rate": 0.00036608741258741255, "loss": 3.2362, "step": 67000 }, { "epoch": 19.51080435668938, "eval_accuracy": 0.3727880889328207, "eval_loss": 3.547034740447998, "eval_runtime": 176.9088, "eval_samples_per_second": 94.094, "eval_steps_per_second": 5.884, "step": 67000 }, { "epoch": 19.525365484303105, "grad_norm": 0.39016804099082947, "learning_rate": 0.00036591258741258735, "loss": 3.2385, "step": 67050 }, { "epoch": 19.539926611916826, "grad_norm": 0.3868093490600586, "learning_rate": 0.0003657377622377622, "loss": 3.2346, "step": 67100 }, { "epoch": 19.55448773953055, "grad_norm": 0.3916149139404297, "learning_rate": 0.000365562937062937, "loss": 3.2407, "step": 67150 }, { "epoch": 19.56904886714427, "grad_norm": 0.41935995221138, "learning_rate": 0.00036538811188811186, "loss": 3.2306, "step": 67200 }, { "epoch": 19.583609994757992, "grad_norm": 0.37837082147598267, "learning_rate": 0.0003652132867132867, "loss": 3.2337, "step": 67250 }, { "epoch": 19.598171122371717, "grad_norm": 0.38619598746299744, "learning_rate": 0.0003650384615384615, "loss": 3.2514, "step": 67300 }, { "epoch": 19.612732249985438, "grad_norm": 0.399539053440094, "learning_rate": 0.00036486363636363637, "loss": 3.2456, "step": 67350 }, { "epoch": 19.627293377599162, "grad_norm": 0.41010504961013794, "learning_rate": 0.00036468881118881117, "loss": 3.2411, "step": 67400 }, { "epoch": 19.641854505212883, "grad_norm": 0.40376871824264526, "learning_rate": 0.000364513986013986, "loss": 3.25, "step": 67450 }, { "epoch": 19.656415632826604, "grad_norm": 0.3830404579639435, "learning_rate": 0.0003643391608391608, "loss": 3.2437, "step": 67500 }, { "epoch": 19.67097676044033, "grad_norm": 0.4260205030441284, "learning_rate": 0.0003641643356643357, "loss": 3.2468, "step": 67550 }, { "epoch": 19.68553788805405, "grad_norm": 0.386489599943161, "learning_rate": 0.0003639895104895104, "loss": 3.2339, "step": 67600 }, { "epoch": 19.700099015667774, "grad_norm": 0.3752810060977936, "learning_rate": 0.0003638146853146853, "loss": 3.2543, "step": 67650 }, { "epoch": 19.714660143281495, "grad_norm": 0.40031254291534424, "learning_rate": 0.00036363986013986007, "loss": 3.2511, "step": 67700 }, { "epoch": 19.729221270895216, "grad_norm": 0.38260605931282043, "learning_rate": 0.0003634650349650349, "loss": 3.2462, "step": 67750 }, { "epoch": 19.74378239850894, "grad_norm": 0.4128180742263794, "learning_rate": 0.0003632902097902097, "loss": 3.2518, "step": 67800 }, { "epoch": 19.758343526122662, "grad_norm": 0.3811860978603363, "learning_rate": 0.0003631153846153846, "loss": 3.2542, "step": 67850 }, { "epoch": 19.772904653736386, "grad_norm": 0.3886754512786865, "learning_rate": 0.00036294055944055943, "loss": 3.256, "step": 67900 }, { "epoch": 19.787465781350107, "grad_norm": 0.41729703545570374, "learning_rate": 0.00036276573426573423, "loss": 3.2541, "step": 67950 }, { "epoch": 19.802026908963832, "grad_norm": 0.37454211711883545, "learning_rate": 0.0003625909090909091, "loss": 3.2708, "step": 68000 }, { "epoch": 19.802026908963832, "eval_accuracy": 0.3732008512473275, "eval_loss": 3.5364534854888916, "eval_runtime": 176.7504, "eval_samples_per_second": 94.178, "eval_steps_per_second": 5.89, "step": 68000 }, { "epoch": 19.816588036577553, "grad_norm": 0.41786983609199524, "learning_rate": 0.0003624160839160839, "loss": 3.2541, "step": 68050 }, { "epoch": 19.831149164191274, "grad_norm": 0.3999040722846985, "learning_rate": 0.00036224125874125874, "loss": 3.2462, "step": 68100 }, { "epoch": 19.845710291805, "grad_norm": 0.38314878940582275, "learning_rate": 0.00036206643356643354, "loss": 3.2512, "step": 68150 }, { "epoch": 19.86027141941872, "grad_norm": 0.3930557370185852, "learning_rate": 0.0003618916083916084, "loss": 3.2551, "step": 68200 }, { "epoch": 19.874832547032444, "grad_norm": 0.38350459933280945, "learning_rate": 0.0003617167832167832, "loss": 3.2679, "step": 68250 }, { "epoch": 19.889393674646165, "grad_norm": 0.395722895860672, "learning_rate": 0.00036154195804195805, "loss": 3.2665, "step": 68300 }, { "epoch": 19.903954802259886, "grad_norm": 0.4258788526058197, "learning_rate": 0.0003613671328671328, "loss": 3.2693, "step": 68350 }, { "epoch": 19.91851592987361, "grad_norm": 0.408592164516449, "learning_rate": 0.00036119230769230765, "loss": 3.253, "step": 68400 }, { "epoch": 19.93307705748733, "grad_norm": 0.36436349153518677, "learning_rate": 0.00036101748251748245, "loss": 3.2564, "step": 68450 }, { "epoch": 19.947638185101056, "grad_norm": 0.36261311173439026, "learning_rate": 0.0003608426573426573, "loss": 3.2638, "step": 68500 }, { "epoch": 19.962199312714777, "grad_norm": 0.40500375628471375, "learning_rate": 0.0003606678321678321, "loss": 3.2594, "step": 68550 }, { "epoch": 19.976760440328498, "grad_norm": 0.400728702545166, "learning_rate": 0.00036049300699300696, "loss": 3.2556, "step": 68600 }, { "epoch": 19.991321567942222, "grad_norm": 0.38320621848106384, "learning_rate": 0.0003603181818181818, "loss": 3.2556, "step": 68650 }, { "epoch": 20.005824451045488, "grad_norm": 0.387529581785202, "learning_rate": 0.0003601433566433566, "loss": 3.2309, "step": 68700 }, { "epoch": 20.020385578659212, "grad_norm": 0.37018364667892456, "learning_rate": 0.00035996853146853146, "loss": 3.1433, "step": 68750 }, { "epoch": 20.034946706272933, "grad_norm": 0.4018804728984833, "learning_rate": 0.00035979370629370626, "loss": 3.1541, "step": 68800 }, { "epoch": 20.049507833886658, "grad_norm": 0.4254935681819916, "learning_rate": 0.0003596188811188811, "loss": 3.161, "step": 68850 }, { "epoch": 20.06406896150038, "grad_norm": 0.3899883031845093, "learning_rate": 0.0003594440559440559, "loss": 3.1717, "step": 68900 }, { "epoch": 20.0786300891141, "grad_norm": 0.41561880707740784, "learning_rate": 0.00035926923076923077, "loss": 3.1755, "step": 68950 }, { "epoch": 20.093191216727824, "grad_norm": 0.3727256655693054, "learning_rate": 0.00035909440559440557, "loss": 3.1756, "step": 69000 }, { "epoch": 20.093191216727824, "eval_accuracy": 0.37314242263146025, "eval_loss": 3.547905683517456, "eval_runtime": 176.1962, "eval_samples_per_second": 94.474, "eval_steps_per_second": 5.908, "step": 69000 }, { "epoch": 20.107752344341545, "grad_norm": 0.41738736629486084, "learning_rate": 0.0003589195804195804, "loss": 3.1825, "step": 69050 }, { "epoch": 20.12231347195527, "grad_norm": 0.38791269063949585, "learning_rate": 0.00035874475524475517, "loss": 3.1829, "step": 69100 }, { "epoch": 20.13687459956899, "grad_norm": 0.3935379981994629, "learning_rate": 0.00035856993006993, "loss": 3.1919, "step": 69150 }, { "epoch": 20.15143572718271, "grad_norm": 0.3939097821712494, "learning_rate": 0.0003583951048951048, "loss": 3.1868, "step": 69200 }, { "epoch": 20.165996854796436, "grad_norm": 0.41222381591796875, "learning_rate": 0.0003582202797202797, "loss": 3.1935, "step": 69250 }, { "epoch": 20.180557982410157, "grad_norm": 0.3910202085971832, "learning_rate": 0.00035804545454545453, "loss": 3.189, "step": 69300 }, { "epoch": 20.19511911002388, "grad_norm": 0.3945898711681366, "learning_rate": 0.00035787062937062933, "loss": 3.1866, "step": 69350 }, { "epoch": 20.209680237637603, "grad_norm": 0.41844651103019714, "learning_rate": 0.0003576958041958042, "loss": 3.1961, "step": 69400 }, { "epoch": 20.224241365251324, "grad_norm": 0.39506423473358154, "learning_rate": 0.000357520979020979, "loss": 3.2082, "step": 69450 }, { "epoch": 20.238802492865048, "grad_norm": 0.4030466079711914, "learning_rate": 0.00035734615384615384, "loss": 3.1928, "step": 69500 }, { "epoch": 20.25336362047877, "grad_norm": 0.3733743131160736, "learning_rate": 0.00035717132867132864, "loss": 3.1989, "step": 69550 }, { "epoch": 20.267924748092494, "grad_norm": 0.39910760521888733, "learning_rate": 0.0003569965034965035, "loss": 3.2017, "step": 69600 }, { "epoch": 20.282485875706215, "grad_norm": 0.4016686975955963, "learning_rate": 0.0003568216783216783, "loss": 3.208, "step": 69650 }, { "epoch": 20.297047003319935, "grad_norm": 0.4043295383453369, "learning_rate": 0.00035664685314685314, "loss": 3.1946, "step": 69700 }, { "epoch": 20.31160813093366, "grad_norm": 0.43771249055862427, "learning_rate": 0.00035647202797202794, "loss": 3.2096, "step": 69750 }, { "epoch": 20.32616925854738, "grad_norm": 0.40544554591178894, "learning_rate": 0.0003562972027972028, "loss": 3.2098, "step": 69800 }, { "epoch": 20.340730386161106, "grad_norm": 0.3885228931903839, "learning_rate": 0.00035612237762237754, "loss": 3.2138, "step": 69850 }, { "epoch": 20.355291513774826, "grad_norm": 0.375761479139328, "learning_rate": 0.0003559475524475524, "loss": 3.2018, "step": 69900 }, { "epoch": 20.369852641388547, "grad_norm": 0.3793325424194336, "learning_rate": 0.0003557727272727272, "loss": 3.2129, "step": 69950 }, { "epoch": 20.384413769002272, "grad_norm": 0.4034629762172699, "learning_rate": 0.00035559790209790205, "loss": 3.2256, "step": 70000 }, { "epoch": 20.384413769002272, "eval_accuracy": 0.3726505406821876, "eval_loss": 3.5475194454193115, "eval_runtime": 176.6033, "eval_samples_per_second": 94.256, "eval_steps_per_second": 5.895, "step": 70000 }, { "epoch": 20.398974896615993, "grad_norm": 0.378439724445343, "learning_rate": 0.0003554230769230769, "loss": 3.2109, "step": 70050 }, { "epoch": 20.413536024229717, "grad_norm": 0.3883298933506012, "learning_rate": 0.0003552482517482517, "loss": 3.2145, "step": 70100 }, { "epoch": 20.42809715184344, "grad_norm": 0.37452587485313416, "learning_rate": 0.00035507342657342656, "loss": 3.2067, "step": 70150 }, { "epoch": 20.442658279457163, "grad_norm": 0.408031165599823, "learning_rate": 0.00035489860139860136, "loss": 3.2018, "step": 70200 }, { "epoch": 20.457219407070884, "grad_norm": 0.39902517199516296, "learning_rate": 0.0003547237762237762, "loss": 3.213, "step": 70250 }, { "epoch": 20.471780534684605, "grad_norm": 0.41096633672714233, "learning_rate": 0.000354548951048951, "loss": 3.224, "step": 70300 }, { "epoch": 20.48634166229833, "grad_norm": 0.3833194673061371, "learning_rate": 0.00035437412587412587, "loss": 3.2183, "step": 70350 }, { "epoch": 20.50090278991205, "grad_norm": 0.43597906827926636, "learning_rate": 0.00035419930069930067, "loss": 3.2267, "step": 70400 }, { "epoch": 20.51546391752577, "grad_norm": 0.38731446862220764, "learning_rate": 0.0003540244755244755, "loss": 3.2153, "step": 70450 }, { "epoch": 20.530025045139496, "grad_norm": 0.40491819381713867, "learning_rate": 0.0003538496503496503, "loss": 3.231, "step": 70500 }, { "epoch": 20.544586172753217, "grad_norm": 0.394074946641922, "learning_rate": 0.0003536748251748252, "loss": 3.2364, "step": 70550 }, { "epoch": 20.55914730036694, "grad_norm": 0.38757994771003723, "learning_rate": 0.0003534999999999999, "loss": 3.2268, "step": 70600 }, { "epoch": 20.573708427980662, "grad_norm": 0.4309883415699005, "learning_rate": 0.00035332517482517477, "loss": 3.233, "step": 70650 }, { "epoch": 20.588269555594387, "grad_norm": 0.3867761194705963, "learning_rate": 0.0003531503496503496, "loss": 3.2245, "step": 70700 }, { "epoch": 20.602830683208108, "grad_norm": 0.40435120463371277, "learning_rate": 0.0003529755244755244, "loss": 3.2288, "step": 70750 }, { "epoch": 20.61739181082183, "grad_norm": 0.3826408088207245, "learning_rate": 0.0003528006993006993, "loss": 3.231, "step": 70800 }, { "epoch": 20.631952938435553, "grad_norm": 0.3879588842391968, "learning_rate": 0.0003526258741258741, "loss": 3.2358, "step": 70850 }, { "epoch": 20.646514066049274, "grad_norm": 0.3931640386581421, "learning_rate": 0.00035245104895104893, "loss": 3.2322, "step": 70900 }, { "epoch": 20.661075193663, "grad_norm": 0.42434677481651306, "learning_rate": 0.00035227622377622373, "loss": 3.2263, "step": 70950 }, { "epoch": 20.67563632127672, "grad_norm": 0.4020628333091736, "learning_rate": 0.0003521013986013986, "loss": 3.2426, "step": 71000 }, { "epoch": 20.67563632127672, "eval_accuracy": 0.37326292430402347, "eval_loss": 3.540471315383911, "eval_runtime": 176.6008, "eval_samples_per_second": 94.258, "eval_steps_per_second": 5.895, "step": 71000 }, { "epoch": 20.69019744889044, "grad_norm": 0.3879493474960327, "learning_rate": 0.0003519265734265734, "loss": 3.2328, "step": 71050 }, { "epoch": 20.704758576504165, "grad_norm": 0.41016706824302673, "learning_rate": 0.00035175174825174824, "loss": 3.2518, "step": 71100 }, { "epoch": 20.719319704117886, "grad_norm": 0.41510647535324097, "learning_rate": 0.00035157692307692304, "loss": 3.2364, "step": 71150 }, { "epoch": 20.73388083173161, "grad_norm": 0.37887150049209595, "learning_rate": 0.0003514020979020979, "loss": 3.2328, "step": 71200 }, { "epoch": 20.74844195934533, "grad_norm": 0.40134233236312866, "learning_rate": 0.0003512272727272727, "loss": 3.2413, "step": 71250 }, { "epoch": 20.763003086959053, "grad_norm": 0.3549582064151764, "learning_rate": 0.00035105244755244755, "loss": 3.2469, "step": 71300 }, { "epoch": 20.777564214572777, "grad_norm": 0.40901145339012146, "learning_rate": 0.0003508776223776223, "loss": 3.2349, "step": 71350 }, { "epoch": 20.792125342186498, "grad_norm": 0.3721853494644165, "learning_rate": 0.00035070279720279715, "loss": 3.2393, "step": 71400 }, { "epoch": 20.806686469800223, "grad_norm": 0.3537469208240509, "learning_rate": 0.000350527972027972, "loss": 3.246, "step": 71450 }, { "epoch": 20.821247597413944, "grad_norm": 0.3981519341468811, "learning_rate": 0.0003503531468531468, "loss": 3.2462, "step": 71500 }, { "epoch": 20.835808725027665, "grad_norm": 0.3936062455177307, "learning_rate": 0.00035017832167832166, "loss": 3.2562, "step": 71550 }, { "epoch": 20.85036985264139, "grad_norm": 0.4099869430065155, "learning_rate": 0.00035000349650349645, "loss": 3.2435, "step": 71600 }, { "epoch": 20.86493098025511, "grad_norm": 0.40036559104919434, "learning_rate": 0.0003498286713286713, "loss": 3.2392, "step": 71650 }, { "epoch": 20.879492107868835, "grad_norm": 0.3792988955974579, "learning_rate": 0.0003496538461538461, "loss": 3.245, "step": 71700 }, { "epoch": 20.894053235482556, "grad_norm": 0.3893389403820038, "learning_rate": 0.00034947902097902096, "loss": 3.258, "step": 71750 }, { "epoch": 20.908614363096277, "grad_norm": 0.40422552824020386, "learning_rate": 0.00034930419580419576, "loss": 3.2478, "step": 71800 }, { "epoch": 20.92317549071, "grad_norm": 0.36332860589027405, "learning_rate": 0.0003491293706293706, "loss": 3.2579, "step": 71850 }, { "epoch": 20.937736618323722, "grad_norm": 0.41438668966293335, "learning_rate": 0.0003489545454545454, "loss": 3.2507, "step": 71900 }, { "epoch": 20.952297745937447, "grad_norm": 0.3990573287010193, "learning_rate": 0.00034877972027972027, "loss": 3.2588, "step": 71950 }, { "epoch": 20.966858873551168, "grad_norm": 0.40813231468200684, "learning_rate": 0.00034860489510489507, "loss": 3.2544, "step": 72000 }, { "epoch": 20.966858873551168, "eval_accuracy": 0.3740354281971092, "eval_loss": 3.5308823585510254, "eval_runtime": 176.4619, "eval_samples_per_second": 94.332, "eval_steps_per_second": 5.899, "step": 72000 }, { "epoch": 20.98142000116489, "grad_norm": 0.40066397190093994, "learning_rate": 0.0003484300699300699, "loss": 3.246, "step": 72050 }, { "epoch": 20.995981128778613, "grad_norm": 0.36475440859794617, "learning_rate": 0.0003482552447552448, "loss": 3.2523, "step": 72100 }, { "epoch": 21.01048401188188, "grad_norm": 0.40291792154312134, "learning_rate": 0.0003480804195804195, "loss": 3.1823, "step": 72150 }, { "epoch": 21.025045139495603, "grad_norm": 0.39930352568626404, "learning_rate": 0.0003479055944055944, "loss": 3.1448, "step": 72200 }, { "epoch": 21.039606267109324, "grad_norm": 0.4272351861000061, "learning_rate": 0.0003477307692307692, "loss": 3.1528, "step": 72250 }, { "epoch": 21.05416739472305, "grad_norm": 0.37879079580307007, "learning_rate": 0.00034755594405594403, "loss": 3.1626, "step": 72300 }, { "epoch": 21.06872852233677, "grad_norm": 0.37789633870124817, "learning_rate": 0.00034738111888111883, "loss": 3.1653, "step": 72350 }, { "epoch": 21.08328964995049, "grad_norm": 0.3773847818374634, "learning_rate": 0.0003472062937062937, "loss": 3.1581, "step": 72400 }, { "epoch": 21.097850777564215, "grad_norm": 0.39947396516799927, "learning_rate": 0.0003470314685314685, "loss": 3.1668, "step": 72450 }, { "epoch": 21.112411905177936, "grad_norm": 0.42217087745666504, "learning_rate": 0.00034685664335664334, "loss": 3.1779, "step": 72500 }, { "epoch": 21.12697303279166, "grad_norm": 0.40650278329849243, "learning_rate": 0.00034668181818181814, "loss": 3.161, "step": 72550 }, { "epoch": 21.14153416040538, "grad_norm": 0.4245632588863373, "learning_rate": 0.000346506993006993, "loss": 3.1737, "step": 72600 }, { "epoch": 21.156095288019102, "grad_norm": 0.42671096324920654, "learning_rate": 0.0003463321678321678, "loss": 3.1808, "step": 72650 }, { "epoch": 21.170656415632827, "grad_norm": 0.38275977969169617, "learning_rate": 0.00034615734265734264, "loss": 3.179, "step": 72700 }, { "epoch": 21.185217543246548, "grad_norm": 0.37691304087638855, "learning_rate": 0.0003459825174825175, "loss": 3.1855, "step": 72750 }, { "epoch": 21.199778670860272, "grad_norm": 0.4288446009159088, "learning_rate": 0.0003458076923076923, "loss": 3.1851, "step": 72800 }, { "epoch": 21.214339798473993, "grad_norm": 0.39425280690193176, "learning_rate": 0.00034563286713286715, "loss": 3.1851, "step": 72850 }, { "epoch": 21.228900926087718, "grad_norm": 0.4057941138744354, "learning_rate": 0.0003454580419580419, "loss": 3.1758, "step": 72900 }, { "epoch": 21.24346205370144, "grad_norm": 0.43333491683006287, "learning_rate": 0.00034528321678321675, "loss": 3.1818, "step": 72950 }, { "epoch": 21.25802318131516, "grad_norm": 0.3756488859653473, "learning_rate": 0.00034510839160839155, "loss": 3.1981, "step": 73000 }, { "epoch": 21.25802318131516, "eval_accuracy": 0.3729420959484869, "eval_loss": 3.5505897998809814, "eval_runtime": 179.1, "eval_samples_per_second": 92.942, "eval_steps_per_second": 5.812, "step": 73000 }, { "epoch": 21.272584308928884, "grad_norm": 0.38150060176849365, "learning_rate": 0.0003449335664335664, "loss": 3.1967, "step": 73050 }, { "epoch": 21.287145436542605, "grad_norm": 0.38942277431488037, "learning_rate": 0.0003447587412587412, "loss": 3.197, "step": 73100 }, { "epoch": 21.30170656415633, "grad_norm": 0.40391889214515686, "learning_rate": 0.00034458391608391606, "loss": 3.1933, "step": 73150 }, { "epoch": 21.31626769177005, "grad_norm": 0.4259825646877289, "learning_rate": 0.00034440909090909086, "loss": 3.1901, "step": 73200 }, { "epoch": 21.330828819383772, "grad_norm": 0.4131545126438141, "learning_rate": 0.0003442342657342657, "loss": 3.1913, "step": 73250 }, { "epoch": 21.345389946997496, "grad_norm": 0.40731021761894226, "learning_rate": 0.0003440594405594405, "loss": 3.2106, "step": 73300 }, { "epoch": 21.359951074611217, "grad_norm": 0.39349207282066345, "learning_rate": 0.00034388461538461537, "loss": 3.2047, "step": 73350 }, { "epoch": 21.374512202224942, "grad_norm": 0.3948974907398224, "learning_rate": 0.00034370979020979017, "loss": 3.2043, "step": 73400 }, { "epoch": 21.389073329838663, "grad_norm": 0.3808380961418152, "learning_rate": 0.000343534965034965, "loss": 3.2066, "step": 73450 }, { "epoch": 21.403634457452384, "grad_norm": 0.3924180865287781, "learning_rate": 0.0003433601398601399, "loss": 3.2087, "step": 73500 }, { "epoch": 21.41819558506611, "grad_norm": 0.38892677426338196, "learning_rate": 0.0003431853146853147, "loss": 3.201, "step": 73550 }, { "epoch": 21.43275671267983, "grad_norm": 0.38498401641845703, "learning_rate": 0.0003430104895104895, "loss": 3.2173, "step": 73600 }, { "epoch": 21.447317840293554, "grad_norm": 0.4188741445541382, "learning_rate": 0.00034283566433566427, "loss": 3.2155, "step": 73650 }, { "epoch": 21.461878967907275, "grad_norm": 0.37732937932014465, "learning_rate": 0.0003426608391608391, "loss": 3.2243, "step": 73700 }, { "epoch": 21.476440095520996, "grad_norm": 0.41169071197509766, "learning_rate": 0.0003424860139860139, "loss": 3.2149, "step": 73750 }, { "epoch": 21.49100122313472, "grad_norm": 0.39298930764198303, "learning_rate": 0.0003423111888111888, "loss": 3.201, "step": 73800 }, { "epoch": 21.50556235074844, "grad_norm": 0.426394522190094, "learning_rate": 0.0003421363636363636, "loss": 3.2131, "step": 73850 }, { "epoch": 21.520123478362166, "grad_norm": 0.4090413451194763, "learning_rate": 0.00034196153846153843, "loss": 3.2134, "step": 73900 }, { "epoch": 21.534684605975887, "grad_norm": 0.41145437955856323, "learning_rate": 0.00034178671328671323, "loss": 3.207, "step": 73950 }, { "epoch": 21.549245733589608, "grad_norm": 0.4169312119483948, "learning_rate": 0.0003416118881118881, "loss": 3.2153, "step": 74000 }, { "epoch": 21.549245733589608, "eval_accuracy": 0.37356447239194995, "eval_loss": 3.5442752838134766, "eval_runtime": 178.9213, "eval_samples_per_second": 93.035, "eval_steps_per_second": 5.818, "step": 74000 }, { "epoch": 21.563806861203332, "grad_norm": 0.40608638525009155, "learning_rate": 0.0003414370629370629, "loss": 3.213, "step": 74050 }, { "epoch": 21.578367988817053, "grad_norm": 0.397708922624588, "learning_rate": 0.00034126223776223774, "loss": 3.2146, "step": 74100 }, { "epoch": 21.592929116430778, "grad_norm": 0.3892466723918915, "learning_rate": 0.0003410874125874126, "loss": 3.2289, "step": 74150 }, { "epoch": 21.6074902440445, "grad_norm": 0.39655137062072754, "learning_rate": 0.0003409125874125874, "loss": 3.2232, "step": 74200 }, { "epoch": 21.62205137165822, "grad_norm": 0.3863903284072876, "learning_rate": 0.00034073776223776225, "loss": 3.2263, "step": 74250 }, { "epoch": 21.636612499271944, "grad_norm": 0.3805066645145416, "learning_rate": 0.00034056293706293705, "loss": 3.2317, "step": 74300 }, { "epoch": 21.651173626885665, "grad_norm": 0.3918662965297699, "learning_rate": 0.0003403881118881119, "loss": 3.2376, "step": 74350 }, { "epoch": 21.66573475449939, "grad_norm": 0.40353602170944214, "learning_rate": 0.00034021328671328665, "loss": 3.2326, "step": 74400 }, { "epoch": 21.68029588211311, "grad_norm": 0.3726404309272766, "learning_rate": 0.0003400384615384615, "loss": 3.2354, "step": 74450 }, { "epoch": 21.69485700972683, "grad_norm": 0.38688957691192627, "learning_rate": 0.0003398636363636363, "loss": 3.2209, "step": 74500 }, { "epoch": 21.709418137340556, "grad_norm": 0.3922900855541229, "learning_rate": 0.00033968881118881115, "loss": 3.2291, "step": 74550 }, { "epoch": 21.723979264954277, "grad_norm": 0.39975613355636597, "learning_rate": 0.00033951398601398595, "loss": 3.2215, "step": 74600 }, { "epoch": 21.738540392568, "grad_norm": 0.4230974614620209, "learning_rate": 0.0003393391608391608, "loss": 3.2271, "step": 74650 }, { "epoch": 21.753101520181723, "grad_norm": 0.3907955586910248, "learning_rate": 0.0003391643356643356, "loss": 3.2258, "step": 74700 }, { "epoch": 21.767662647795444, "grad_norm": 0.4291480779647827, "learning_rate": 0.00033898951048951046, "loss": 3.2237, "step": 74750 }, { "epoch": 21.782223775409168, "grad_norm": 0.3837590515613556, "learning_rate": 0.00033881468531468526, "loss": 3.2264, "step": 74800 }, { "epoch": 21.79678490302289, "grad_norm": 0.4333752989768982, "learning_rate": 0.0003386398601398601, "loss": 3.223, "step": 74850 }, { "epoch": 21.811346030636614, "grad_norm": 0.41991353034973145, "learning_rate": 0.00033846503496503497, "loss": 3.2259, "step": 74900 }, { "epoch": 21.825907158250335, "grad_norm": 0.4055936634540558, "learning_rate": 0.00033829020979020977, "loss": 3.2374, "step": 74950 }, { "epoch": 21.840468285864056, "grad_norm": 0.39700213074684143, "learning_rate": 0.0003381153846153846, "loss": 3.2287, "step": 75000 }, { "epoch": 21.840468285864056, "eval_accuracy": 0.3741963714066107, "eval_loss": 3.534473180770874, "eval_runtime": 179.2224, "eval_samples_per_second": 92.879, "eval_steps_per_second": 5.808, "step": 75000 }, { "epoch": 21.85502941347778, "grad_norm": 0.43522578477859497, "learning_rate": 0.0003379405594405594, "loss": 3.2402, "step": 75050 }, { "epoch": 21.8695905410915, "grad_norm": 0.38809141516685486, "learning_rate": 0.0003377657342657343, "loss": 3.2153, "step": 75100 }, { "epoch": 21.884151668705226, "grad_norm": 0.3915172219276428, "learning_rate": 0.000337590909090909, "loss": 3.2482, "step": 75150 }, { "epoch": 21.898712796318947, "grad_norm": 0.3965000510215759, "learning_rate": 0.0003374160839160839, "loss": 3.2331, "step": 75200 }, { "epoch": 21.91327392393267, "grad_norm": 0.39834538102149963, "learning_rate": 0.0003372412587412587, "loss": 3.2402, "step": 75250 }, { "epoch": 21.927835051546392, "grad_norm": 0.3902565836906433, "learning_rate": 0.00033706643356643353, "loss": 3.2401, "step": 75300 }, { "epoch": 21.942396179160113, "grad_norm": 0.38109835982322693, "learning_rate": 0.00033689160839160833, "loss": 3.2298, "step": 75350 }, { "epoch": 21.956957306773838, "grad_norm": 0.3719768524169922, "learning_rate": 0.0003367167832167832, "loss": 3.2509, "step": 75400 }, { "epoch": 21.97151843438756, "grad_norm": 0.42772918939590454, "learning_rate": 0.000336541958041958, "loss": 3.2426, "step": 75450 }, { "epoch": 21.986079562001283, "grad_norm": 0.36617153882980347, "learning_rate": 0.00033636713286713284, "loss": 3.2474, "step": 75500 }, { "epoch": 22.00058244510455, "grad_norm": 0.39934152364730835, "learning_rate": 0.0003361923076923077, "loss": 3.238, "step": 75550 }, { "epoch": 22.015143572718273, "grad_norm": 0.4263432025909424, "learning_rate": 0.0003360174825174825, "loss": 3.1354, "step": 75600 }, { "epoch": 22.029704700331994, "grad_norm": 0.4135393500328064, "learning_rate": 0.00033584265734265734, "loss": 3.1378, "step": 75650 }, { "epoch": 22.044265827945715, "grad_norm": 0.41830018162727356, "learning_rate": 0.00033566783216783214, "loss": 3.1443, "step": 75700 }, { "epoch": 22.05882695555944, "grad_norm": 0.41102901101112366, "learning_rate": 0.000335493006993007, "loss": 3.1422, "step": 75750 }, { "epoch": 22.07338808317316, "grad_norm": 0.40229088068008423, "learning_rate": 0.0003353181818181818, "loss": 3.1599, "step": 75800 }, { "epoch": 22.087949210786885, "grad_norm": 0.3901878297328949, "learning_rate": 0.00033514335664335665, "loss": 3.139, "step": 75850 }, { "epoch": 22.102510338400606, "grad_norm": 0.3897287845611572, "learning_rate": 0.0003349685314685314, "loss": 3.1714, "step": 75900 }, { "epoch": 22.117071466014327, "grad_norm": 0.3760414719581604, "learning_rate": 0.00033479370629370625, "loss": 3.156, "step": 75950 }, { "epoch": 22.13163259362805, "grad_norm": 0.40278053283691406, "learning_rate": 0.00033461888111888105, "loss": 3.1662, "step": 76000 }, { "epoch": 22.13163259362805, "eval_accuracy": 0.3727959656275151, "eval_loss": 3.555933713912964, "eval_runtime": 178.8942, "eval_samples_per_second": 93.049, "eval_steps_per_second": 5.819, "step": 76000 }, { "epoch": 22.146193721241772, "grad_norm": 0.41445207595825195, "learning_rate": 0.0003344440559440559, "loss": 3.1631, "step": 76050 }, { "epoch": 22.160754848855497, "grad_norm": 0.3912239074707031, "learning_rate": 0.0003342692307692307, "loss": 3.1725, "step": 76100 }, { "epoch": 22.175315976469218, "grad_norm": 0.3929377794265747, "learning_rate": 0.00033409440559440556, "loss": 3.1578, "step": 76150 }, { "epoch": 22.18987710408294, "grad_norm": 0.40977317094802856, "learning_rate": 0.00033391958041958036, "loss": 3.1738, "step": 76200 }, { "epoch": 22.204438231696663, "grad_norm": 0.39516210556030273, "learning_rate": 0.0003337447552447552, "loss": 3.1761, "step": 76250 }, { "epoch": 22.218999359310384, "grad_norm": 0.3958999216556549, "learning_rate": 0.00033356993006993007, "loss": 3.1694, "step": 76300 }, { "epoch": 22.23356048692411, "grad_norm": 0.40409860014915466, "learning_rate": 0.00033339510489510487, "loss": 3.1816, "step": 76350 }, { "epoch": 22.24812161453783, "grad_norm": 0.43948835134506226, "learning_rate": 0.0003332202797202797, "loss": 3.1861, "step": 76400 }, { "epoch": 22.26268274215155, "grad_norm": 0.4331420660018921, "learning_rate": 0.0003330454545454545, "loss": 3.1809, "step": 76450 }, { "epoch": 22.277243869765275, "grad_norm": 0.4208989441394806, "learning_rate": 0.0003328706293706294, "loss": 3.1821, "step": 76500 }, { "epoch": 22.291804997378996, "grad_norm": 0.4600921869277954, "learning_rate": 0.00033269580419580417, "loss": 3.1938, "step": 76550 }, { "epoch": 22.30636612499272, "grad_norm": 0.4141092598438263, "learning_rate": 0.000332520979020979, "loss": 3.196, "step": 76600 }, { "epoch": 22.32092725260644, "grad_norm": 0.4022761881351471, "learning_rate": 0.00033234615384615377, "loss": 3.1862, "step": 76650 }, { "epoch": 22.335488380220163, "grad_norm": 0.39649698138237, "learning_rate": 0.0003321713286713286, "loss": 3.1896, "step": 76700 }, { "epoch": 22.350049507833887, "grad_norm": 0.4082602858543396, "learning_rate": 0.0003319965034965034, "loss": 3.1956, "step": 76750 }, { "epoch": 22.364610635447608, "grad_norm": 0.3875502943992615, "learning_rate": 0.0003318216783216783, "loss": 3.1923, "step": 76800 }, { "epoch": 22.379171763061333, "grad_norm": 0.40803077816963196, "learning_rate": 0.0003316468531468531, "loss": 3.1881, "step": 76850 }, { "epoch": 22.393732890675054, "grad_norm": 0.44484943151474, "learning_rate": 0.00033147202797202793, "loss": 3.1941, "step": 76900 }, { "epoch": 22.408294018288775, "grad_norm": 0.39733248949050903, "learning_rate": 0.0003312972027972028, "loss": 3.1924, "step": 76950 }, { "epoch": 22.4228551459025, "grad_norm": 0.41683346033096313, "learning_rate": 0.0003311223776223776, "loss": 3.1907, "step": 77000 }, { "epoch": 22.4228551459025, "eval_accuracy": 0.37371119052595864, "eval_loss": 3.5440673828125, "eval_runtime": 179.0008, "eval_samples_per_second": 92.994, "eval_steps_per_second": 5.816, "step": 77000 }, { "epoch": 22.43741627351622, "grad_norm": 0.3999550938606262, "learning_rate": 0.00033094755244755244, "loss": 3.1993, "step": 77050 }, { "epoch": 22.451977401129945, "grad_norm": 0.43484437465667725, "learning_rate": 0.00033077272727272724, "loss": 3.202, "step": 77100 }, { "epoch": 22.466538528743666, "grad_norm": 0.4053047001361847, "learning_rate": 0.0003305979020979021, "loss": 3.1986, "step": 77150 }, { "epoch": 22.481099656357387, "grad_norm": 0.42602503299713135, "learning_rate": 0.0003304230769230769, "loss": 3.1993, "step": 77200 }, { "epoch": 22.49566078397111, "grad_norm": 0.406269907951355, "learning_rate": 0.00033024825174825175, "loss": 3.201, "step": 77250 }, { "epoch": 22.510221911584832, "grad_norm": 0.38955157995224, "learning_rate": 0.00033007342657342655, "loss": 3.2068, "step": 77300 }, { "epoch": 22.524783039198557, "grad_norm": 0.4480753540992737, "learning_rate": 0.0003298986013986014, "loss": 3.2082, "step": 77350 }, { "epoch": 22.539344166812278, "grad_norm": 0.42242157459259033, "learning_rate": 0.00032972377622377615, "loss": 3.2064, "step": 77400 }, { "epoch": 22.553905294426002, "grad_norm": 0.4058876931667328, "learning_rate": 0.000329548951048951, "loss": 3.2034, "step": 77450 }, { "epoch": 22.568466422039723, "grad_norm": 0.4551337659358978, "learning_rate": 0.0003293741258741258, "loss": 3.2124, "step": 77500 }, { "epoch": 22.583027549653444, "grad_norm": 0.4230758547782898, "learning_rate": 0.00032919930069930065, "loss": 3.2141, "step": 77550 }, { "epoch": 22.59758867726717, "grad_norm": 0.3794611692428589, "learning_rate": 0.0003290244755244755, "loss": 3.2088, "step": 77600 }, { "epoch": 22.61214980488089, "grad_norm": 0.43101099133491516, "learning_rate": 0.0003288496503496503, "loss": 3.2062, "step": 77650 }, { "epoch": 22.626710932494614, "grad_norm": 0.3839839696884155, "learning_rate": 0.00032867482517482516, "loss": 3.2087, "step": 77700 }, { "epoch": 22.641272060108335, "grad_norm": 0.40371859073638916, "learning_rate": 0.00032849999999999996, "loss": 3.2173, "step": 77750 }, { "epoch": 22.655833187722056, "grad_norm": 0.41439804434776306, "learning_rate": 0.0003283251748251748, "loss": 3.2186, "step": 77800 }, { "epoch": 22.67039431533578, "grad_norm": 0.4533017575740814, "learning_rate": 0.0003281503496503496, "loss": 3.2081, "step": 77850 }, { "epoch": 22.6849554429495, "grad_norm": 0.40645161271095276, "learning_rate": 0.00032797552447552447, "loss": 3.2124, "step": 77900 }, { "epoch": 22.699516570563226, "grad_norm": 0.411747545003891, "learning_rate": 0.00032780069930069927, "loss": 3.2292, "step": 77950 }, { "epoch": 22.714077698176947, "grad_norm": 0.3930788040161133, "learning_rate": 0.0003276258741258741, "loss": 3.2248, "step": 78000 }, { "epoch": 22.714077698176947, "eval_accuracy": 0.3740154425538548, "eval_loss": 3.537041664123535, "eval_runtime": 177.136, "eval_samples_per_second": 93.973, "eval_steps_per_second": 5.877, "step": 78000 }, { "epoch": 22.728638825790668, "grad_norm": 0.39840519428253174, "learning_rate": 0.0003274510489510489, "loss": 3.2236, "step": 78050 }, { "epoch": 22.743199953404392, "grad_norm": 0.3945090174674988, "learning_rate": 0.0003272762237762238, "loss": 3.2243, "step": 78100 }, { "epoch": 22.757761081018113, "grad_norm": 0.40629005432128906, "learning_rate": 0.0003271013986013985, "loss": 3.2116, "step": 78150 }, { "epoch": 22.772322208631838, "grad_norm": 0.3936091363430023, "learning_rate": 0.0003269265734265734, "loss": 3.2266, "step": 78200 }, { "epoch": 22.78688333624556, "grad_norm": 0.4197155237197876, "learning_rate": 0.0003267517482517482, "loss": 3.2297, "step": 78250 }, { "epoch": 22.80144446385928, "grad_norm": 0.4412369132041931, "learning_rate": 0.00032657692307692303, "loss": 3.2289, "step": 78300 }, { "epoch": 22.816005591473004, "grad_norm": 0.3846014142036438, "learning_rate": 0.0003264020979020979, "loss": 3.2186, "step": 78350 }, { "epoch": 22.830566719086725, "grad_norm": 0.40719151496887207, "learning_rate": 0.0003262272727272727, "loss": 3.2149, "step": 78400 }, { "epoch": 22.84512784670045, "grad_norm": 0.39673471450805664, "learning_rate": 0.00032605244755244754, "loss": 3.2335, "step": 78450 }, { "epoch": 22.85968897431417, "grad_norm": 0.40791651606559753, "learning_rate": 0.00032587762237762234, "loss": 3.2268, "step": 78500 }, { "epoch": 22.874250101927892, "grad_norm": 0.3985656201839447, "learning_rate": 0.0003257027972027972, "loss": 3.2262, "step": 78550 }, { "epoch": 22.888811229541616, "grad_norm": 0.37513139843940735, "learning_rate": 0.000325527972027972, "loss": 3.2292, "step": 78600 }, { "epoch": 22.903372357155337, "grad_norm": 0.4130067825317383, "learning_rate": 0.00032535314685314684, "loss": 3.2287, "step": 78650 }, { "epoch": 22.917933484769062, "grad_norm": 0.40743470191955566, "learning_rate": 0.00032517832167832164, "loss": 3.2296, "step": 78700 }, { "epoch": 22.932494612382783, "grad_norm": 0.4178810119628906, "learning_rate": 0.0003250034965034965, "loss": 3.2217, "step": 78750 }, { "epoch": 22.947055739996504, "grad_norm": 0.3990694582462311, "learning_rate": 0.0003248286713286713, "loss": 3.2269, "step": 78800 }, { "epoch": 22.96161686761023, "grad_norm": 0.46170514822006226, "learning_rate": 0.00032465384615384615, "loss": 3.224, "step": 78850 }, { "epoch": 22.97617799522395, "grad_norm": 0.42402830719947815, "learning_rate": 0.0003244790209790209, "loss": 3.2342, "step": 78900 }, { "epoch": 22.990739122837674, "grad_norm": 0.37818992137908936, "learning_rate": 0.00032430419580419575, "loss": 3.2272, "step": 78950 }, { "epoch": 23.00524200594094, "grad_norm": 0.40692174434661865, "learning_rate": 0.00032412937062937066, "loss": 3.1964, "step": 79000 }, { "epoch": 23.00524200594094, "eval_accuracy": 0.3737579804436954, "eval_loss": 3.5501067638397217, "eval_runtime": 176.3895, "eval_samples_per_second": 94.371, "eval_steps_per_second": 5.902, "step": 79000 }, { "epoch": 23.019803133554664, "grad_norm": 0.3965844213962555, "learning_rate": 0.0003239545454545454, "loss": 3.1108, "step": 79050 }, { "epoch": 23.034364261168385, "grad_norm": 0.40269994735717773, "learning_rate": 0.00032377972027972026, "loss": 3.1373, "step": 79100 }, { "epoch": 23.048925388782106, "grad_norm": 0.4315015971660614, "learning_rate": 0.00032360489510489506, "loss": 3.1402, "step": 79150 }, { "epoch": 23.06348651639583, "grad_norm": 0.3808068037033081, "learning_rate": 0.0003234300699300699, "loss": 3.1429, "step": 79200 }, { "epoch": 23.07804764400955, "grad_norm": 0.4001609981060028, "learning_rate": 0.0003232552447552447, "loss": 3.1472, "step": 79250 }, { "epoch": 23.092608771623276, "grad_norm": 0.4144390821456909, "learning_rate": 0.00032308041958041957, "loss": 3.159, "step": 79300 }, { "epoch": 23.107169899236997, "grad_norm": 0.4327448308467865, "learning_rate": 0.00032290559440559437, "loss": 3.1542, "step": 79350 }, { "epoch": 23.121731026850718, "grad_norm": 0.3874760866165161, "learning_rate": 0.0003227307692307692, "loss": 3.158, "step": 79400 }, { "epoch": 23.136292154464442, "grad_norm": 0.40993136167526245, "learning_rate": 0.000322555944055944, "loss": 3.1433, "step": 79450 }, { "epoch": 23.150853282078163, "grad_norm": 0.421434611082077, "learning_rate": 0.00032238111888111887, "loss": 3.1411, "step": 79500 }, { "epoch": 23.165414409691888, "grad_norm": 0.4032493829727173, "learning_rate": 0.00032220629370629367, "loss": 3.1729, "step": 79550 }, { "epoch": 23.17997553730561, "grad_norm": 0.4178277552127838, "learning_rate": 0.0003220314685314685, "loss": 3.1528, "step": 79600 }, { "epoch": 23.19453666491933, "grad_norm": 0.3926478326320648, "learning_rate": 0.00032185664335664327, "loss": 3.1495, "step": 79650 }, { "epoch": 23.209097792533054, "grad_norm": 0.38440054655075073, "learning_rate": 0.0003216818181818181, "loss": 3.1602, "step": 79700 }, { "epoch": 23.223658920146775, "grad_norm": 0.42098525166511536, "learning_rate": 0.00032150699300699303, "loss": 3.1784, "step": 79750 }, { "epoch": 23.2382200477605, "grad_norm": 0.4336051046848297, "learning_rate": 0.0003213321678321678, "loss": 3.1667, "step": 79800 }, { "epoch": 23.25278117537422, "grad_norm": 0.4008271396160126, "learning_rate": 0.00032115734265734263, "loss": 3.1702, "step": 79850 }, { "epoch": 23.26734230298794, "grad_norm": 0.427082359790802, "learning_rate": 0.00032098251748251743, "loss": 3.1729, "step": 79900 }, { "epoch": 23.281903430601666, "grad_norm": 0.46416816115379333, "learning_rate": 0.0003208076923076923, "loss": 3.1725, "step": 79950 }, { "epoch": 23.296464558215387, "grad_norm": 0.37470319867134094, "learning_rate": 0.0003206328671328671, "loss": 3.1882, "step": 80000 }, { "epoch": 23.296464558215387, "eval_accuracy": 0.3735514229425309, "eval_loss": 3.5463123321533203, "eval_runtime": 176.9835, "eval_samples_per_second": 94.054, "eval_steps_per_second": 5.882, "step": 80000 }, { "epoch": 23.31102568582911, "grad_norm": 0.45111221075057983, "learning_rate": 0.00032045804195804194, "loss": 3.1322, "step": 80050 }, { "epoch": 23.325586813442833, "grad_norm": 0.427644819021225, "learning_rate": 0.00032028321678321674, "loss": 3.1348, "step": 80100 }, { "epoch": 23.340147941056557, "grad_norm": 0.4981081783771515, "learning_rate": 0.0003201083916083916, "loss": 3.1441, "step": 80150 }, { "epoch": 23.354709068670278, "grad_norm": 0.42692831158638, "learning_rate": 0.0003199335664335664, "loss": 3.1436, "step": 80200 }, { "epoch": 23.369270196284, "grad_norm": 0.43491700291633606, "learning_rate": 0.00031975874125874125, "loss": 3.144, "step": 80250 }, { "epoch": 23.383831323897724, "grad_norm": 0.4590279757976532, "learning_rate": 0.00031958391608391605, "loss": 3.1532, "step": 80300 }, { "epoch": 23.398392451511445, "grad_norm": 0.42716917395591736, "learning_rate": 0.0003194090909090909, "loss": 3.1646, "step": 80350 }, { "epoch": 23.41295357912517, "grad_norm": 0.4419504702091217, "learning_rate": 0.00031923426573426576, "loss": 3.156, "step": 80400 }, { "epoch": 23.42751470673889, "grad_norm": 0.42690083384513855, "learning_rate": 0.0003190594405594405, "loss": 3.1625, "step": 80450 }, { "epoch": 23.44207583435261, "grad_norm": 0.41010886430740356, "learning_rate": 0.0003188846153846154, "loss": 3.1491, "step": 80500 }, { "epoch": 23.456636961966336, "grad_norm": 0.41106072068214417, "learning_rate": 0.00031870979020979015, "loss": 3.1736, "step": 80550 }, { "epoch": 23.471198089580056, "grad_norm": 0.4519209861755371, "learning_rate": 0.000318534965034965, "loss": 3.1633, "step": 80600 }, { "epoch": 23.48575921719378, "grad_norm": 0.41818633675575256, "learning_rate": 0.0003183601398601398, "loss": 3.1721, "step": 80650 }, { "epoch": 23.500320344807502, "grad_norm": 0.38743162155151367, "learning_rate": 0.00031818531468531466, "loss": 3.1703, "step": 80700 }, { "epoch": 23.514881472421223, "grad_norm": 0.4087889790534973, "learning_rate": 0.00031801048951048946, "loss": 3.1617, "step": 80750 }, { "epoch": 23.529442600034947, "grad_norm": 0.4340457320213318, "learning_rate": 0.0003178356643356643, "loss": 3.1721, "step": 80800 }, { "epoch": 23.54400372764867, "grad_norm": 0.3856426477432251, "learning_rate": 0.0003176608391608391, "loss": 3.1715, "step": 80850 }, { "epoch": 23.558564855262393, "grad_norm": 0.3838791847229004, "learning_rate": 0.00031748601398601397, "loss": 3.1676, "step": 80900 }, { "epoch": 23.573125982876114, "grad_norm": 0.41913339495658875, "learning_rate": 0.00031731118881118877, "loss": 3.1758, "step": 80950 }, { "epoch": 23.587687110489835, "grad_norm": 0.4226961135864258, "learning_rate": 0.0003171363636363636, "loss": 3.1855, "step": 81000 }, { "epoch": 23.587687110489835, "eval_accuracy": 0.3734603119218124, "eval_loss": 3.5508110523223877, "eval_runtime": 182.9847, "eval_samples_per_second": 90.969, "eval_steps_per_second": 5.689, "step": 81000 }, { "epoch": 23.60224823810356, "grad_norm": 0.39404693245887756, "learning_rate": 0.0003169615384615385, "loss": 3.1797, "step": 81050 }, { "epoch": 23.61680936571728, "grad_norm": 0.41680702567100525, "learning_rate": 0.0003167867132867133, "loss": 3.1846, "step": 81100 }, { "epoch": 23.631370493331005, "grad_norm": 0.38685619831085205, "learning_rate": 0.00031661188811188813, "loss": 3.1678, "step": 81150 }, { "epoch": 23.645931620944726, "grad_norm": 0.42534857988357544, "learning_rate": 0.0003164370629370629, "loss": 3.1897, "step": 81200 }, { "epoch": 23.660492748558447, "grad_norm": 0.42006826400756836, "learning_rate": 0.0003162622377622378, "loss": 3.1834, "step": 81250 }, { "epoch": 23.67505387617217, "grad_norm": 0.40510988235473633, "learning_rate": 0.00031608741258741253, "loss": 3.1783, "step": 81300 }, { "epoch": 23.689615003785892, "grad_norm": 0.4101458191871643, "learning_rate": 0.0003159125874125874, "loss": 3.1803, "step": 81350 }, { "epoch": 23.704176131399617, "grad_norm": 0.40217939019203186, "learning_rate": 0.0003157377622377622, "loss": 3.1829, "step": 81400 }, { "epoch": 23.718737259013338, "grad_norm": 0.397582083940506, "learning_rate": 0.00031556293706293704, "loss": 3.1848, "step": 81450 }, { "epoch": 23.73329838662706, "grad_norm": 0.4027913808822632, "learning_rate": 0.00031538811188811184, "loss": 3.1967, "step": 81500 }, { "epoch": 23.747859514240783, "grad_norm": 0.43434393405914307, "learning_rate": 0.0003152132867132867, "loss": 3.1933, "step": 81550 }, { "epoch": 23.762420641854504, "grad_norm": 0.43189942836761475, "learning_rate": 0.0003150384615384615, "loss": 3.1976, "step": 81600 }, { "epoch": 23.77698176946823, "grad_norm": 0.4070805013179779, "learning_rate": 0.00031486363636363634, "loss": 3.1962, "step": 81650 }, { "epoch": 23.79154289708195, "grad_norm": 0.44222292304039, "learning_rate": 0.00031468881118881114, "loss": 3.1887, "step": 81700 }, { "epoch": 23.80610402469567, "grad_norm": 0.45916780829429626, "learning_rate": 0.000314513986013986, "loss": 3.1934, "step": 81750 }, { "epoch": 23.820665152309395, "grad_norm": 0.42806705832481384, "learning_rate": 0.00031433916083916085, "loss": 3.1996, "step": 81800 }, { "epoch": 23.835226279923116, "grad_norm": 0.42251259088516235, "learning_rate": 0.00031416433566433565, "loss": 3.2017, "step": 81850 }, { "epoch": 23.84978740753684, "grad_norm": 0.4552309811115265, "learning_rate": 0.0003139895104895105, "loss": 3.1962, "step": 81900 }, { "epoch": 23.86434853515056, "grad_norm": 0.41143572330474854, "learning_rate": 0.00031381468531468525, "loss": 3.1874, "step": 81950 }, { "epoch": 23.878909662764286, "grad_norm": 0.3958553075790405, "learning_rate": 0.00031363986013986016, "loss": 3.1989, "step": 82000 }, { "epoch": 23.878909662764286, "eval_accuracy": 0.3738077094266166, "eval_loss": 3.545875310897827, "eval_runtime": 182.4158, "eval_samples_per_second": 91.253, "eval_steps_per_second": 5.707, "step": 82000 }, { "epoch": 23.893470790378007, "grad_norm": 0.4320649802684784, "learning_rate": 0.0003134650349650349, "loss": 3.1949, "step": 82050 }, { "epoch": 23.908031917991728, "grad_norm": 0.4198630750179291, "learning_rate": 0.00031329020979020976, "loss": 3.2004, "step": 82100 }, { "epoch": 23.922593045605453, "grad_norm": 0.41041895747184753, "learning_rate": 0.00031311538461538456, "loss": 3.1914, "step": 82150 }, { "epoch": 23.937154173219174, "grad_norm": 0.3980288803577423, "learning_rate": 0.0003129405594405594, "loss": 3.2107, "step": 82200 }, { "epoch": 23.951715300832895, "grad_norm": 0.46288254857063293, "learning_rate": 0.0003127657342657342, "loss": 3.2086, "step": 82250 }, { "epoch": 23.96627642844662, "grad_norm": 0.4133715033531189, "learning_rate": 0.00031259090909090907, "loss": 3.1936, "step": 82300 }, { "epoch": 23.98083755606034, "grad_norm": 0.4262646734714508, "learning_rate": 0.00031241608391608386, "loss": 3.1925, "step": 82350 }, { "epoch": 23.995398683674065, "grad_norm": 0.39082765579223633, "learning_rate": 0.0003122412587412587, "loss": 3.2045, "step": 82400 }, { "epoch": 24.010192789329604, "grad_norm": 0.42849040031433105, "learning_rate": 0.00031206643356643357, "loss": 3.2134, "step": 82450 }, { "epoch": 24.02475391694333, "grad_norm": 0.41999033093452454, "learning_rate": 0.00031189160839160837, "loss": 3.1282, "step": 82500 }, { "epoch": 24.03931504455705, "grad_norm": 0.42225944995880127, "learning_rate": 0.0003117167832167832, "loss": 3.1347, "step": 82550 }, { "epoch": 24.053876172170774, "grad_norm": 0.3997180163860321, "learning_rate": 0.000311541958041958, "loss": 3.1311, "step": 82600 }, { "epoch": 24.068437299784495, "grad_norm": 0.4154646694660187, "learning_rate": 0.0003113671328671329, "loss": 3.1451, "step": 82650 }, { "epoch": 24.082998427398216, "grad_norm": 0.42067858576774597, "learning_rate": 0.0003111923076923076, "loss": 3.129, "step": 82700 }, { "epoch": 24.09755955501194, "grad_norm": 0.4242795705795288, "learning_rate": 0.00031101748251748253, "loss": 3.149, "step": 82750 }, { "epoch": 24.11212068262566, "grad_norm": 0.4412245452404022, "learning_rate": 0.0003108426573426573, "loss": 3.1334, "step": 82800 }, { "epoch": 24.126681810239386, "grad_norm": 0.4352100193500519, "learning_rate": 0.00031066783216783213, "loss": 3.1476, "step": 82850 }, { "epoch": 24.141242937853107, "grad_norm": 0.452750563621521, "learning_rate": 0.00031049300699300693, "loss": 3.1419, "step": 82900 }, { "epoch": 24.15580406546683, "grad_norm": 0.39354854822158813, "learning_rate": 0.0003103181818181818, "loss": 3.1623, "step": 82950 }, { "epoch": 24.170365193080553, "grad_norm": 0.41943788528442383, "learning_rate": 0.0003101433566433566, "loss": 3.1544, "step": 83000 }, { "epoch": 24.170365193080553, "eval_accuracy": 0.37311115097789754, "eval_loss": 3.558357000350952, "eval_runtime": 181.8256, "eval_samples_per_second": 91.549, "eval_steps_per_second": 5.725, "step": 83000 }, { "epoch": 24.184926320694274, "grad_norm": 0.40159091353416443, "learning_rate": 0.00030996853146853144, "loss": 3.1446, "step": 83050 }, { "epoch": 24.199487448308, "grad_norm": 0.39169952273368835, "learning_rate": 0.00030979370629370624, "loss": 3.1614, "step": 83100 }, { "epoch": 24.21404857592172, "grad_norm": 0.39745545387268066, "learning_rate": 0.0003096188811188811, "loss": 3.1627, "step": 83150 }, { "epoch": 24.22860970353544, "grad_norm": 0.4220676124095917, "learning_rate": 0.00030944405594405595, "loss": 3.1664, "step": 83200 }, { "epoch": 24.243170831149165, "grad_norm": 0.4526214301586151, "learning_rate": 0.00030926923076923075, "loss": 3.1581, "step": 83250 }, { "epoch": 24.257731958762886, "grad_norm": 0.43737250566482544, "learning_rate": 0.0003090944055944056, "loss": 3.1569, "step": 83300 }, { "epoch": 24.27229308637661, "grad_norm": 0.4575173556804657, "learning_rate": 0.0003089195804195804, "loss": 3.1739, "step": 83350 }, { "epoch": 24.28685421399033, "grad_norm": 0.4393577575683594, "learning_rate": 0.00030874475524475525, "loss": 3.1765, "step": 83400 }, { "epoch": 24.301415341604052, "grad_norm": 0.41313502192497253, "learning_rate": 0.00030856993006993, "loss": 3.1567, "step": 83450 }, { "epoch": 24.315976469217777, "grad_norm": 0.4203622043132782, "learning_rate": 0.0003083951048951049, "loss": 3.163, "step": 83500 }, { "epoch": 24.330537596831498, "grad_norm": 0.41048669815063477, "learning_rate": 0.00030822027972027965, "loss": 3.174, "step": 83550 }, { "epoch": 24.345098724445222, "grad_norm": 0.4165532886981964, "learning_rate": 0.0003080454545454545, "loss": 3.1816, "step": 83600 }, { "epoch": 24.359659852058943, "grad_norm": 0.3939666152000427, "learning_rate": 0.0003078706293706293, "loss": 3.1633, "step": 83650 }, { "epoch": 24.374220979672664, "grad_norm": 0.4266517460346222, "learning_rate": 0.00030769580419580416, "loss": 3.1838, "step": 83700 }, { "epoch": 24.38878210728639, "grad_norm": 0.4121813476085663, "learning_rate": 0.00030752097902097896, "loss": 3.1829, "step": 83750 }, { "epoch": 24.40334323490011, "grad_norm": 0.42097821831703186, "learning_rate": 0.0003073461538461538, "loss": 3.1885, "step": 83800 }, { "epoch": 24.417904362513834, "grad_norm": 0.4097153842449188, "learning_rate": 0.00030717132867132867, "loss": 3.1914, "step": 83850 }, { "epoch": 24.432465490127555, "grad_norm": 0.40335527062416077, "learning_rate": 0.00030699650349650347, "loss": 3.1849, "step": 83900 }, { "epoch": 24.44702661774128, "grad_norm": 0.4203415513038635, "learning_rate": 0.0003068216783216783, "loss": 3.1831, "step": 83950 }, { "epoch": 24.461587745355, "grad_norm": 0.4227919280529022, "learning_rate": 0.0003066468531468531, "loss": 3.1822, "step": 84000 }, { "epoch": 24.461587745355, "eval_accuracy": 0.3738135875569855, "eval_loss": 3.5504536628723145, "eval_runtime": 181.79, "eval_samples_per_second": 91.567, "eval_steps_per_second": 5.726, "step": 84000 }, { "epoch": 24.47614887296872, "grad_norm": 0.4112480580806732, "learning_rate": 0.000306472027972028, "loss": 3.1963, "step": 84050 }, { "epoch": 24.490710000582446, "grad_norm": 0.436646431684494, "learning_rate": 0.0003062972027972028, "loss": 3.1987, "step": 84100 }, { "epoch": 24.505271128196167, "grad_norm": 0.44917264580726624, "learning_rate": 0.00030612237762237763, "loss": 3.1892, "step": 84150 }, { "epoch": 24.51983225580989, "grad_norm": 0.40418100357055664, "learning_rate": 0.0003059475524475524, "loss": 3.1847, "step": 84200 }, { "epoch": 24.534393383423613, "grad_norm": 0.4214847981929779, "learning_rate": 0.0003057727272727273, "loss": 3.189, "step": 84250 }, { "epoch": 24.548954511037334, "grad_norm": 0.4034685790538788, "learning_rate": 0.00030559790209790203, "loss": 3.1885, "step": 84300 }, { "epoch": 24.563515638651058, "grad_norm": 0.42310208082199097, "learning_rate": 0.0003054230769230769, "loss": 3.1911, "step": 84350 }, { "epoch": 24.57807676626478, "grad_norm": 0.42262396216392517, "learning_rate": 0.0003052482517482517, "loss": 3.1973, "step": 84400 }, { "epoch": 24.592637893878504, "grad_norm": 0.42703932523727417, "learning_rate": 0.00030507342657342654, "loss": 3.2048, "step": 84450 }, { "epoch": 24.607199021492224, "grad_norm": 0.40316981077194214, "learning_rate": 0.00030489860139860134, "loss": 3.1964, "step": 84500 }, { "epoch": 24.621760149105945, "grad_norm": 0.4285880923271179, "learning_rate": 0.0003047237762237762, "loss": 3.2069, "step": 84550 }, { "epoch": 24.63632127671967, "grad_norm": 0.43559470772743225, "learning_rate": 0.00030454895104895104, "loss": 3.1981, "step": 84600 }, { "epoch": 24.65088240433339, "grad_norm": 0.424099862575531, "learning_rate": 0.00030437412587412584, "loss": 3.1876, "step": 84650 }, { "epoch": 24.665443531947115, "grad_norm": 0.4152607321739197, "learning_rate": 0.0003041993006993007, "loss": 3.2057, "step": 84700 }, { "epoch": 24.680004659560836, "grad_norm": 0.40943652391433716, "learning_rate": 0.0003040244755244755, "loss": 3.2007, "step": 84750 }, { "epoch": 24.694565787174557, "grad_norm": 0.3958180248737335, "learning_rate": 0.00030384965034965035, "loss": 3.2014, "step": 84800 }, { "epoch": 24.709126914788282, "grad_norm": 0.41481125354766846, "learning_rate": 0.00030367482517482515, "loss": 3.2057, "step": 84850 }, { "epoch": 24.723688042402003, "grad_norm": 0.3919677436351776, "learning_rate": 0.0003035, "loss": 3.2067, "step": 84900 }, { "epoch": 24.738249170015727, "grad_norm": 0.4104112386703491, "learning_rate": 0.00030332517482517475, "loss": 3.2026, "step": 84950 }, { "epoch": 24.75281029762945, "grad_norm": 0.40666452050209045, "learning_rate": 0.00030315034965034966, "loss": 3.2122, "step": 85000 }, { "epoch": 24.75281029762945, "eval_accuracy": 0.3740955026894798, "eval_loss": 3.5392696857452393, "eval_runtime": 182.2209, "eval_samples_per_second": 91.351, "eval_steps_per_second": 5.713, "step": 85000 }, { "epoch": 24.76737142524317, "grad_norm": 0.40630656480789185, "learning_rate": 0.0003029755244755244, "loss": 3.2014, "step": 85050 }, { "epoch": 24.781932552856894, "grad_norm": 0.42266345024108887, "learning_rate": 0.00030280069930069926, "loss": 3.1997, "step": 85100 }, { "epoch": 24.796493680470615, "grad_norm": 0.3945619463920593, "learning_rate": 0.00030262587412587406, "loss": 3.203, "step": 85150 }, { "epoch": 24.81105480808434, "grad_norm": 0.415470153093338, "learning_rate": 0.0003024510489510489, "loss": 3.1974, "step": 85200 }, { "epoch": 24.82561593569806, "grad_norm": 0.4282160997390747, "learning_rate": 0.00030227622377622377, "loss": 3.2002, "step": 85250 }, { "epoch": 24.84017706331178, "grad_norm": 0.4053383767604828, "learning_rate": 0.00030210139860139856, "loss": 3.2013, "step": 85300 }, { "epoch": 24.854738190925506, "grad_norm": 0.3949108421802521, "learning_rate": 0.0003019265734265734, "loss": 3.2068, "step": 85350 }, { "epoch": 24.869299318539227, "grad_norm": 0.4064263105392456, "learning_rate": 0.0003017517482517482, "loss": 3.1976, "step": 85400 }, { "epoch": 24.88386044615295, "grad_norm": 0.4585686922073364, "learning_rate": 0.00030157692307692307, "loss": 3.2119, "step": 85450 }, { "epoch": 24.898421573766672, "grad_norm": 0.424032062292099, "learning_rate": 0.00030140209790209787, "loss": 3.2085, "step": 85500 }, { "epoch": 24.912982701380393, "grad_norm": 0.4028070271015167, "learning_rate": 0.0003012272727272727, "loss": 3.2179, "step": 85550 }, { "epoch": 24.927543828994118, "grad_norm": 0.41038432717323303, "learning_rate": 0.0003010524475524475, "loss": 3.2107, "step": 85600 }, { "epoch": 24.94210495660784, "grad_norm": 0.4204416573047638, "learning_rate": 0.0003008776223776224, "loss": 3.2166, "step": 85650 }, { "epoch": 24.956666084221563, "grad_norm": 0.4086936116218567, "learning_rate": 0.0003007027972027972, "loss": 3.2159, "step": 85700 }, { "epoch": 24.971227211835284, "grad_norm": 0.39162677526474, "learning_rate": 0.00030052797202797203, "loss": 3.2221, "step": 85750 }, { "epoch": 24.985788339449005, "grad_norm": 0.4139779508113861, "learning_rate": 0.0003003531468531468, "loss": 3.2221, "step": 85800 }, { "epoch": 25.000291222552274, "grad_norm": 0.4680265784263611, "learning_rate": 0.00030017832167832163, "loss": 3.2179, "step": 85850 }, { "epoch": 25.014852350165995, "grad_norm": 0.4011513888835907, "learning_rate": 0.0003000034965034965, "loss": 3.1148, "step": 85900 }, { "epoch": 25.02941347777972, "grad_norm": 0.413959801197052, "learning_rate": 0.0002998286713286713, "loss": 3.113, "step": 85950 }, { "epoch": 25.04397460539344, "grad_norm": 0.41489461064338684, "learning_rate": 0.00029965384615384614, "loss": 3.1227, "step": 86000 }, { "epoch": 25.04397460539344, "eval_accuracy": 0.3735611806389434, "eval_loss": 3.550731897354126, "eval_runtime": 181.3258, "eval_samples_per_second": 91.802, "eval_steps_per_second": 5.741, "step": 86000 }, { "epoch": 25.058535733007165, "grad_norm": 0.4093635380268097, "learning_rate": 0.00029947902097902094, "loss": 3.1202, "step": 86050 }, { "epoch": 25.073096860620886, "grad_norm": 0.4468414783477783, "learning_rate": 0.0002993041958041958, "loss": 3.1258, "step": 86100 }, { "epoch": 25.087657988234607, "grad_norm": 0.452058881521225, "learning_rate": 0.0002991293706293706, "loss": 3.1351, "step": 86150 }, { "epoch": 25.10221911584833, "grad_norm": 0.44358283281326294, "learning_rate": 0.0002989545454545454, "loss": 3.1289, "step": 86200 }, { "epoch": 25.116780243462053, "grad_norm": 0.40111038088798523, "learning_rate": 0.00029877972027972025, "loss": 3.1312, "step": 86250 }, { "epoch": 25.131341371075777, "grad_norm": 0.4112999439239502, "learning_rate": 0.0002986048951048951, "loss": 3.1404, "step": 86300 }, { "epoch": 25.145902498689498, "grad_norm": 0.44098880887031555, "learning_rate": 0.0002984300699300699, "loss": 3.1299, "step": 86350 }, { "epoch": 25.160463626303223, "grad_norm": 0.4231182932853699, "learning_rate": 0.00029825524475524475, "loss": 3.1419, "step": 86400 }, { "epoch": 25.175024753916944, "grad_norm": 0.407163143157959, "learning_rate": 0.00029808041958041955, "loss": 3.1487, "step": 86450 }, { "epoch": 25.189585881530665, "grad_norm": 0.4498389959335327, "learning_rate": 0.0002979055944055944, "loss": 3.1463, "step": 86500 }, { "epoch": 25.20414700914439, "grad_norm": 0.4448164105415344, "learning_rate": 0.0002977307692307692, "loss": 3.1494, "step": 86550 }, { "epoch": 25.21870813675811, "grad_norm": 0.40641462802886963, "learning_rate": 0.000297555944055944, "loss": 3.1534, "step": 86600 }, { "epoch": 25.233269264371835, "grad_norm": 0.4092680811882019, "learning_rate": 0.00029738111888111886, "loss": 3.1535, "step": 86650 }, { "epoch": 25.247830391985556, "grad_norm": 0.41941606998443604, "learning_rate": 0.00029720629370629366, "loss": 3.1544, "step": 86700 }, { "epoch": 25.262391519599277, "grad_norm": 0.43551942706108093, "learning_rate": 0.0002970314685314685, "loss": 3.1584, "step": 86750 }, { "epoch": 25.276952647213, "grad_norm": 0.41969966888427734, "learning_rate": 0.0002968566433566433, "loss": 3.1581, "step": 86800 }, { "epoch": 25.291513774826722, "grad_norm": 0.4162365794181824, "learning_rate": 0.00029668181818181817, "loss": 3.1493, "step": 86850 }, { "epoch": 25.306074902440447, "grad_norm": 0.43325042724609375, "learning_rate": 0.00029650699300699297, "loss": 3.1591, "step": 86900 }, { "epoch": 25.320636030054168, "grad_norm": 0.4242747724056244, "learning_rate": 0.0002963321678321678, "loss": 3.1761, "step": 86950 }, { "epoch": 25.33519715766789, "grad_norm": 0.5050255656242371, "learning_rate": 0.0002961573426573426, "loss": 3.1596, "step": 87000 }, { "epoch": 25.33519715766789, "eval_accuracy": 0.37365311459791356, "eval_loss": 3.5509300231933594, "eval_runtime": 181.2051, "eval_samples_per_second": 91.863, "eval_steps_per_second": 5.745, "step": 87000 }, { "epoch": 25.349758285281613, "grad_norm": 0.43530839681625366, "learning_rate": 0.0002959825174825175, "loss": 3.16, "step": 87050 }, { "epoch": 25.364319412895334, "grad_norm": 0.4388258755207062, "learning_rate": 0.0002958076923076923, "loss": 3.1526, "step": 87100 }, { "epoch": 25.37888054050906, "grad_norm": 0.4287799596786499, "learning_rate": 0.00029563286713286713, "loss": 3.1797, "step": 87150 }, { "epoch": 25.39344166812278, "grad_norm": 0.44115379452705383, "learning_rate": 0.00029545804195804193, "loss": 3.1749, "step": 87200 }, { "epoch": 25.4080027957365, "grad_norm": 0.4438251256942749, "learning_rate": 0.0002952832167832168, "loss": 3.1676, "step": 87250 }, { "epoch": 25.422563923350225, "grad_norm": 0.41780340671539307, "learning_rate": 0.0002951083916083916, "loss": 3.1613, "step": 87300 }, { "epoch": 25.437125050963946, "grad_norm": 0.4080037772655487, "learning_rate": 0.0002949335664335664, "loss": 3.17, "step": 87350 }, { "epoch": 25.45168617857767, "grad_norm": 0.44067227840423584, "learning_rate": 0.00029475874125874124, "loss": 3.1738, "step": 87400 }, { "epoch": 25.46624730619139, "grad_norm": 0.4406086206436157, "learning_rate": 0.00029458391608391604, "loss": 3.1834, "step": 87450 }, { "epoch": 25.480808433805112, "grad_norm": 0.3974784016609192, "learning_rate": 0.0002944090909090909, "loss": 3.181, "step": 87500 }, { "epoch": 25.495369561418837, "grad_norm": 0.41356295347213745, "learning_rate": 0.0002942342657342657, "loss": 3.1728, "step": 87550 }, { "epoch": 25.509930689032558, "grad_norm": 0.4187457859516144, "learning_rate": 0.00029405944055944054, "loss": 3.1659, "step": 87600 }, { "epoch": 25.524491816646282, "grad_norm": 0.41289469599723816, "learning_rate": 0.0002938846153846154, "loss": 3.1753, "step": 87650 }, { "epoch": 25.539052944260003, "grad_norm": 0.3981243669986725, "learning_rate": 0.0002937097902097902, "loss": 3.1649, "step": 87700 }, { "epoch": 25.553614071873724, "grad_norm": 0.42982515692710876, "learning_rate": 0.000293534965034965, "loss": 3.1872, "step": 87750 }, { "epoch": 25.56817519948745, "grad_norm": 0.4279654324054718, "learning_rate": 0.00029336013986013985, "loss": 3.1956, "step": 87800 }, { "epoch": 25.58273632710117, "grad_norm": 0.38420966267585754, "learning_rate": 0.00029318531468531465, "loss": 3.1845, "step": 87850 }, { "epoch": 25.597297454714894, "grad_norm": 0.4386531114578247, "learning_rate": 0.0002930104895104895, "loss": 3.1866, "step": 87900 }, { "epoch": 25.611858582328615, "grad_norm": 0.4742956757545471, "learning_rate": 0.0002928356643356643, "loss": 3.1911, "step": 87950 }, { "epoch": 25.626419709942336, "grad_norm": 0.411289244890213, "learning_rate": 0.00029266083916083916, "loss": 3.1876, "step": 88000 }, { "epoch": 25.626419709942336, "eval_accuracy": 0.3742827799230341, "eval_loss": 3.541804313659668, "eval_runtime": 181.327, "eval_samples_per_second": 91.801, "eval_steps_per_second": 5.741, "step": 88000 }, { "epoch": 25.64098083755606, "grad_norm": 0.4100117087364197, "learning_rate": 0.00029248601398601396, "loss": 3.196, "step": 88050 }, { "epoch": 25.655541965169782, "grad_norm": 0.41513684391975403, "learning_rate": 0.00029231118881118876, "loss": 3.1885, "step": 88100 }, { "epoch": 25.670103092783506, "grad_norm": 0.40996086597442627, "learning_rate": 0.0002921363636363636, "loss": 3.1824, "step": 88150 }, { "epoch": 25.684664220397227, "grad_norm": 0.40860715508461, "learning_rate": 0.0002919615384615384, "loss": 3.1817, "step": 88200 }, { "epoch": 25.69922534801095, "grad_norm": 0.4030529260635376, "learning_rate": 0.00029178671328671326, "loss": 3.2061, "step": 88250 }, { "epoch": 25.713786475624673, "grad_norm": 0.40431034564971924, "learning_rate": 0.00029161188811188806, "loss": 3.2011, "step": 88300 }, { "epoch": 25.728347603238394, "grad_norm": 0.4311061501502991, "learning_rate": 0.0002914370629370629, "loss": 3.1881, "step": 88350 }, { "epoch": 25.74290873085212, "grad_norm": 0.4359666407108307, "learning_rate": 0.00029126223776223777, "loss": 3.1922, "step": 88400 }, { "epoch": 25.75746985846584, "grad_norm": 0.43195220828056335, "learning_rate": 0.00029108741258741257, "loss": 3.1969, "step": 88450 }, { "epoch": 25.772030986079564, "grad_norm": 0.4211742877960205, "learning_rate": 0.00029091258741258737, "loss": 3.1924, "step": 88500 }, { "epoch": 25.786592113693285, "grad_norm": 0.40398746728897095, "learning_rate": 0.0002907377622377622, "loss": 3.1982, "step": 88550 }, { "epoch": 25.801153241307006, "grad_norm": 0.4384838938713074, "learning_rate": 0.000290562937062937, "loss": 3.1991, "step": 88600 }, { "epoch": 25.81571436892073, "grad_norm": 0.431005597114563, "learning_rate": 0.0002903881118881119, "loss": 3.2046, "step": 88650 }, { "epoch": 25.83027549653445, "grad_norm": 0.4143125116825104, "learning_rate": 0.0002902132867132867, "loss": 3.1902, "step": 88700 }, { "epoch": 25.844836624148176, "grad_norm": 0.39128798246383667, "learning_rate": 0.00029003846153846153, "loss": 3.2102, "step": 88750 }, { "epoch": 25.859397751761897, "grad_norm": 0.41086676716804504, "learning_rate": 0.00028986363636363633, "loss": 3.1996, "step": 88800 }, { "epoch": 25.873958879375618, "grad_norm": 0.40953710675239563, "learning_rate": 0.00028968881118881113, "loss": 3.1907, "step": 88850 }, { "epoch": 25.888520006989342, "grad_norm": 0.4275113344192505, "learning_rate": 0.000289513986013986, "loss": 3.2055, "step": 88900 }, { "epoch": 25.903081134603063, "grad_norm": 0.4466639757156372, "learning_rate": 0.0002893391608391608, "loss": 3.1968, "step": 88950 }, { "epoch": 25.917642262216788, "grad_norm": 0.4273034632205963, "learning_rate": 0.00028916433566433564, "loss": 3.1966, "step": 89000 }, { "epoch": 25.917642262216788, "eval_accuracy": 0.37498956631859515, "eval_loss": 3.530644655227661, "eval_runtime": 181.3443, "eval_samples_per_second": 91.792, "eval_steps_per_second": 5.74, "step": 89000 }, { "epoch": 25.93220338983051, "grad_norm": 0.4303942322731018, "learning_rate": 0.0002889895104895105, "loss": 3.204, "step": 89050 }, { "epoch": 25.94676451744423, "grad_norm": 0.4104004502296448, "learning_rate": 0.0002888146853146853, "loss": 3.2073, "step": 89100 }, { "epoch": 25.961325645057954, "grad_norm": 0.4273345172405243, "learning_rate": 0.00028863986013986015, "loss": 3.1909, "step": 89150 }, { "epoch": 25.975886772671675, "grad_norm": 0.4050820469856262, "learning_rate": 0.00028846503496503495, "loss": 3.207, "step": 89200 }, { "epoch": 25.9904479002854, "grad_norm": 0.4026417136192322, "learning_rate": 0.00028829020979020975, "loss": 3.1941, "step": 89250 }, { "epoch": 26.004950783388665, "grad_norm": 0.4386459290981293, "learning_rate": 0.0002881153846153846, "loss": 3.1719, "step": 89300 }, { "epoch": 26.01951191100239, "grad_norm": 0.4158802330493927, "learning_rate": 0.0002879405594405594, "loss": 3.0844, "step": 89350 }, { "epoch": 26.03407303861611, "grad_norm": 0.44361788034439087, "learning_rate": 0.00028776573426573425, "loss": 3.1136, "step": 89400 }, { "epoch": 26.04863416622983, "grad_norm": 0.4118858277797699, "learning_rate": 0.00028759090909090905, "loss": 3.1005, "step": 89450 }, { "epoch": 26.063195293843556, "grad_norm": 0.4068799316883087, "learning_rate": 0.0002874160839160839, "loss": 3.0999, "step": 89500 }, { "epoch": 26.077756421457277, "grad_norm": 0.4220964312553406, "learning_rate": 0.0002872412587412587, "loss": 3.124, "step": 89550 }, { "epoch": 26.092317549071, "grad_norm": 0.4196264147758484, "learning_rate": 0.0002870664335664335, "loss": 3.1175, "step": 89600 }, { "epoch": 26.106878676684723, "grad_norm": 0.43385326862335205, "learning_rate": 0.00028689160839160836, "loss": 3.1222, "step": 89650 }, { "epoch": 26.121439804298443, "grad_norm": 0.45031943917274475, "learning_rate": 0.0002867167832167832, "loss": 3.1352, "step": 89700 }, { "epoch": 26.136000931912168, "grad_norm": 0.41545194387435913, "learning_rate": 0.000286541958041958, "loss": 3.1274, "step": 89750 }, { "epoch": 26.15056205952589, "grad_norm": 0.42113929986953735, "learning_rate": 0.00028636713286713287, "loss": 3.1266, "step": 89800 }, { "epoch": 26.165123187139613, "grad_norm": 0.4348122179508209, "learning_rate": 0.00028619230769230767, "loss": 3.1279, "step": 89850 }, { "epoch": 26.179684314753334, "grad_norm": 0.45158812403678894, "learning_rate": 0.0002860174825174825, "loss": 3.125, "step": 89900 }, { "epoch": 26.194245442367055, "grad_norm": 0.4448310136795044, "learning_rate": 0.0002858426573426573, "loss": 3.1502, "step": 89950 }, { "epoch": 26.20880656998078, "grad_norm": 0.4374328553676605, "learning_rate": 0.0002856678321678321, "loss": 3.1457, "step": 90000 }, { "epoch": 26.20880656998078, "eval_accuracy": 0.3740267285641632, "eval_loss": 3.5505359172821045, "eval_runtime": 181.0087, "eval_samples_per_second": 91.962, "eval_steps_per_second": 5.751, "step": 90000 }, { "epoch": 26.2233676975945, "grad_norm": 0.42174583673477173, "learning_rate": 0.000285493006993007, "loss": 3.1509, "step": 90050 }, { "epoch": 26.237928825208225, "grad_norm": 0.44193533062934875, "learning_rate": 0.0002853181818181818, "loss": 3.1557, "step": 90100 }, { "epoch": 26.252489952821946, "grad_norm": 0.42890018224716187, "learning_rate": 0.00028514335664335663, "loss": 3.1485, "step": 90150 }, { "epoch": 26.267051080435667, "grad_norm": 0.41337689757347107, "learning_rate": 0.00028496853146853143, "loss": 3.1492, "step": 90200 }, { "epoch": 26.281612208049392, "grad_norm": 0.4084251821041107, "learning_rate": 0.0002847937062937063, "loss": 3.1507, "step": 90250 }, { "epoch": 26.296173335663113, "grad_norm": 0.42145994305610657, "learning_rate": 0.0002846188811188811, "loss": 3.1506, "step": 90300 }, { "epoch": 26.310734463276837, "grad_norm": 0.4321393668651581, "learning_rate": 0.0002844440559440559, "loss": 3.162, "step": 90350 }, { "epoch": 26.32529559089056, "grad_norm": 0.4253658950328827, "learning_rate": 0.00028426923076923074, "loss": 3.1556, "step": 90400 }, { "epoch": 26.33985671850428, "grad_norm": 0.456359326839447, "learning_rate": 0.0002840944055944056, "loss": 3.1634, "step": 90450 }, { "epoch": 26.354417846118004, "grad_norm": 0.4094768166542053, "learning_rate": 0.0002839195804195804, "loss": 3.1461, "step": 90500 }, { "epoch": 26.368978973731725, "grad_norm": 0.4343591034412384, "learning_rate": 0.00028374475524475524, "loss": 3.1494, "step": 90550 }, { "epoch": 26.38354010134545, "grad_norm": 0.4192764163017273, "learning_rate": 0.00028356993006993004, "loss": 3.1609, "step": 90600 }, { "epoch": 26.39810122895917, "grad_norm": 0.41687625646591187, "learning_rate": 0.0002833951048951049, "loss": 3.1549, "step": 90650 }, { "epoch": 26.41266235657289, "grad_norm": 0.42564550042152405, "learning_rate": 0.0002832202797202797, "loss": 3.1641, "step": 90700 }, { "epoch": 26.427223484186616, "grad_norm": 0.4025818705558777, "learning_rate": 0.0002830454545454545, "loss": 3.1625, "step": 90750 }, { "epoch": 26.441784611800337, "grad_norm": 0.4263351261615753, "learning_rate": 0.00028287062937062935, "loss": 3.164, "step": 90800 }, { "epoch": 26.45634573941406, "grad_norm": 0.4589676558971405, "learning_rate": 0.00028269580419580415, "loss": 3.1734, "step": 90850 }, { "epoch": 26.470906867027782, "grad_norm": 0.45051443576812744, "learning_rate": 0.000282520979020979, "loss": 3.1644, "step": 90900 }, { "epoch": 26.485467994641503, "grad_norm": 0.4175613820552826, "learning_rate": 0.0002823461538461538, "loss": 3.174, "step": 90950 }, { "epoch": 26.500029122255228, "grad_norm": 0.4551471769809723, "learning_rate": 0.00028217132867132866, "loss": 3.1694, "step": 91000 }, { "epoch": 26.500029122255228, "eval_accuracy": 0.37420977354385193, "eval_loss": 3.5417368412017822, "eval_runtime": 180.9394, "eval_samples_per_second": 91.998, "eval_steps_per_second": 5.753, "step": 91000 }, { "epoch": 26.51459024986895, "grad_norm": 0.397365003824234, "learning_rate": 0.00028199650349650346, "loss": 3.166, "step": 91050 }, { "epoch": 26.529151377482673, "grad_norm": 0.4480145573616028, "learning_rate": 0.0002818216783216783, "loss": 3.1662, "step": 91100 }, { "epoch": 26.543712505096394, "grad_norm": 0.42891454696655273, "learning_rate": 0.0002816468531468531, "loss": 3.1695, "step": 91150 }, { "epoch": 26.55827363271012, "grad_norm": 0.41012558341026306, "learning_rate": 0.00028147202797202796, "loss": 3.1757, "step": 91200 }, { "epoch": 26.57283476032384, "grad_norm": 0.4216759502887726, "learning_rate": 0.00028129720279720276, "loss": 3.1814, "step": 91250 }, { "epoch": 26.58739588793756, "grad_norm": 0.41179201006889343, "learning_rate": 0.0002811223776223776, "loss": 3.1793, "step": 91300 }, { "epoch": 26.601957015551285, "grad_norm": 0.4413478374481201, "learning_rate": 0.0002809475524475524, "loss": 3.1764, "step": 91350 }, { "epoch": 26.616518143165006, "grad_norm": 0.39704814553260803, "learning_rate": 0.00028077272727272727, "loss": 3.1747, "step": 91400 }, { "epoch": 26.63107927077873, "grad_norm": 0.41823717951774597, "learning_rate": 0.00028059790209790207, "loss": 3.1668, "step": 91450 }, { "epoch": 26.64564039839245, "grad_norm": 0.43079426884651184, "learning_rate": 0.00028042307692307687, "loss": 3.1834, "step": 91500 }, { "epoch": 26.660201526006173, "grad_norm": 0.42470020055770874, "learning_rate": 0.0002802482517482517, "loss": 3.1938, "step": 91550 }, { "epoch": 26.674762653619897, "grad_norm": 0.42501482367515564, "learning_rate": 0.0002800734265734265, "loss": 3.1755, "step": 91600 }, { "epoch": 26.689323781233618, "grad_norm": 0.4389747679233551, "learning_rate": 0.0002798986013986014, "loss": 3.1832, "step": 91650 }, { "epoch": 26.703884908847343, "grad_norm": 0.40251684188842773, "learning_rate": 0.0002797237762237762, "loss": 3.171, "step": 91700 }, { "epoch": 26.718446036461064, "grad_norm": 0.4349626898765564, "learning_rate": 0.00027954895104895103, "loss": 3.1768, "step": 91750 }, { "epoch": 26.733007164074785, "grad_norm": 0.41921675205230713, "learning_rate": 0.0002793741258741259, "loss": 3.1882, "step": 91800 }, { "epoch": 26.74756829168851, "grad_norm": 0.41158103942871094, "learning_rate": 0.0002791993006993007, "loss": 3.1867, "step": 91850 }, { "epoch": 26.76212941930223, "grad_norm": 0.4082656502723694, "learning_rate": 0.0002790244755244755, "loss": 3.1754, "step": 91900 }, { "epoch": 26.776690546915955, "grad_norm": 0.44691285490989685, "learning_rate": 0.00027884965034965034, "loss": 3.1832, "step": 91950 }, { "epoch": 26.791251674529676, "grad_norm": 0.42668673396110535, "learning_rate": 0.00027867482517482514, "loss": 3.1857, "step": 92000 }, { "epoch": 26.791251674529676, "eval_accuracy": 0.3747800697522462, "eval_loss": 3.537883996963501, "eval_runtime": 181.0914, "eval_samples_per_second": 91.92, "eval_steps_per_second": 5.748, "step": 92000 }, { "epoch": 26.805812802143397, "grad_norm": 0.42242157459259033, "learning_rate": 0.0002785, "loss": 3.1905, "step": 92050 }, { "epoch": 26.82037392975712, "grad_norm": 0.42586851119995117, "learning_rate": 0.0002783251748251748, "loss": 3.1868, "step": 92100 }, { "epoch": 26.834935057370842, "grad_norm": 0.43328920006752014, "learning_rate": 0.00027815034965034965, "loss": 3.191, "step": 92150 }, { "epoch": 26.849496184984567, "grad_norm": 0.44402876496315, "learning_rate": 0.00027797552447552445, "loss": 3.187, "step": 92200 }, { "epoch": 26.864057312598288, "grad_norm": 0.450183242559433, "learning_rate": 0.00027780069930069925, "loss": 3.1948, "step": 92250 }, { "epoch": 26.87861844021201, "grad_norm": 0.42371490597724915, "learning_rate": 0.0002776258741258741, "loss": 3.1943, "step": 92300 }, { "epoch": 26.893179567825733, "grad_norm": 0.40242645144462585, "learning_rate": 0.0002774510489510489, "loss": 3.1945, "step": 92350 }, { "epoch": 26.907740695439454, "grad_norm": 0.4435162842273712, "learning_rate": 0.00027727622377622375, "loss": 3.1853, "step": 92400 }, { "epoch": 26.92230182305318, "grad_norm": 0.4274364411830902, "learning_rate": 0.00027710139860139855, "loss": 3.1863, "step": 92450 }, { "epoch": 26.9368629506669, "grad_norm": 0.4162507653236389, "learning_rate": 0.0002769265734265734, "loss": 3.1857, "step": 92500 }, { "epoch": 26.95142407828062, "grad_norm": 0.40001586079597473, "learning_rate": 0.00027675174825174826, "loss": 3.2034, "step": 92550 }, { "epoch": 26.965985205894345, "grad_norm": 0.4307563304901123, "learning_rate": 0.00027657692307692306, "loss": 3.183, "step": 92600 }, { "epoch": 26.980546333508066, "grad_norm": 0.4021975100040436, "learning_rate": 0.00027640209790209786, "loss": 3.1857, "step": 92650 }, { "epoch": 26.99510746112179, "grad_norm": 0.4598715901374817, "learning_rate": 0.0002762272727272727, "loss": 3.2017, "step": 92700 }, { "epoch": 27.009610344225056, "grad_norm": 0.4127676784992218, "learning_rate": 0.0002760524475524475, "loss": 3.1279, "step": 92750 }, { "epoch": 27.02417147183878, "grad_norm": 0.4335608184337616, "learning_rate": 0.00027587762237762237, "loss": 3.1031, "step": 92800 }, { "epoch": 27.0387325994525, "grad_norm": 0.41142764687538147, "learning_rate": 0.00027570279720279717, "loss": 3.1039, "step": 92850 }, { "epoch": 27.053293727066222, "grad_norm": 0.42291176319122314, "learning_rate": 0.000275527972027972, "loss": 3.0966, "step": 92900 }, { "epoch": 27.067854854679947, "grad_norm": 0.41084083914756775, "learning_rate": 0.0002753531468531468, "loss": 3.1131, "step": 92950 }, { "epoch": 27.082415982293668, "grad_norm": 0.45117121934890747, "learning_rate": 0.0002751783216783216, "loss": 3.1146, "step": 93000 }, { "epoch": 27.082415982293668, "eval_accuracy": 0.37372600341448836, "eval_loss": 3.5564258098602295, "eval_runtime": 181.0502, "eval_samples_per_second": 91.941, "eval_steps_per_second": 5.75, "step": 93000 }, { "epoch": 27.096977109907392, "grad_norm": 0.4476958215236664, "learning_rate": 0.0002750034965034965, "loss": 3.1217, "step": 93050 }, { "epoch": 27.111538237521113, "grad_norm": 0.43004608154296875, "learning_rate": 0.0002748286713286713, "loss": 3.1158, "step": 93100 }, { "epoch": 27.126099365134834, "grad_norm": 0.4326183795928955, "learning_rate": 0.00027465384615384613, "loss": 3.122, "step": 93150 }, { "epoch": 27.14066049274856, "grad_norm": 0.46706339716911316, "learning_rate": 0.000274479020979021, "loss": 3.1226, "step": 93200 }, { "epoch": 27.15522162036228, "grad_norm": 0.42264524102211, "learning_rate": 0.0002743041958041958, "loss": 3.1192, "step": 93250 }, { "epoch": 27.169782747976004, "grad_norm": 0.45010286569595337, "learning_rate": 0.00027412937062937064, "loss": 3.1194, "step": 93300 }, { "epoch": 27.184343875589725, "grad_norm": 0.44601142406463623, "learning_rate": 0.00027395454545454544, "loss": 3.117, "step": 93350 }, { "epoch": 27.19890500320345, "grad_norm": 0.44697099924087524, "learning_rate": 0.00027377972027972024, "loss": 3.1289, "step": 93400 }, { "epoch": 27.21346613081717, "grad_norm": 0.41724687814712524, "learning_rate": 0.0002736048951048951, "loss": 3.1254, "step": 93450 }, { "epoch": 27.228027258430892, "grad_norm": 0.4229699969291687, "learning_rate": 0.0002734300699300699, "loss": 3.1306, "step": 93500 }, { "epoch": 27.242588386044616, "grad_norm": 0.42787185311317444, "learning_rate": 0.00027325524475524474, "loss": 3.1234, "step": 93550 }, { "epoch": 27.257149513658337, "grad_norm": 0.43008866906166077, "learning_rate": 0.00027308041958041954, "loss": 3.1421, "step": 93600 }, { "epoch": 27.271710641272062, "grad_norm": 0.40983253717422485, "learning_rate": 0.0002729055944055944, "loss": 3.136, "step": 93650 }, { "epoch": 27.286271768885783, "grad_norm": 0.4102419316768646, "learning_rate": 0.0002727307692307692, "loss": 3.1395, "step": 93700 }, { "epoch": 27.300832896499504, "grad_norm": 0.447038471698761, "learning_rate": 0.000272555944055944, "loss": 3.1426, "step": 93750 }, { "epoch": 27.31539402411323, "grad_norm": 0.44245871901512146, "learning_rate": 0.00027238111888111885, "loss": 3.1448, "step": 93800 }, { "epoch": 27.32995515172695, "grad_norm": 0.442560613155365, "learning_rate": 0.0002722062937062937, "loss": 3.1514, "step": 93850 }, { "epoch": 27.344516279340674, "grad_norm": 0.4968518614768982, "learning_rate": 0.0002720314685314685, "loss": 3.1468, "step": 93900 }, { "epoch": 27.359077406954395, "grad_norm": 0.4427065849304199, "learning_rate": 0.00027185664335664336, "loss": 3.1354, "step": 93950 }, { "epoch": 27.373638534568116, "grad_norm": 0.44690605998039246, "learning_rate": 0.00027168181818181816, "loss": 3.1478, "step": 94000 }, { "epoch": 27.373638534568116, "eval_accuracy": 0.3741124317049423, "eval_loss": 3.5478107929229736, "eval_runtime": 181.2792, "eval_samples_per_second": 91.825, "eval_steps_per_second": 5.743, "step": 94000 }, { "epoch": 27.38819966218184, "grad_norm": 0.4106026291847229, "learning_rate": 0.000271506993006993, "loss": 3.1407, "step": 94050 }, { "epoch": 27.40276078979556, "grad_norm": 0.4304497539997101, "learning_rate": 0.0002713321678321678, "loss": 3.1485, "step": 94100 }, { "epoch": 27.417321917409286, "grad_norm": 0.4128531217575073, "learning_rate": 0.0002711573426573426, "loss": 3.1525, "step": 94150 }, { "epoch": 27.431883045023007, "grad_norm": 0.43160468339920044, "learning_rate": 0.00027098251748251746, "loss": 3.1553, "step": 94200 }, { "epoch": 27.446444172636728, "grad_norm": 0.42581573128700256, "learning_rate": 0.00027080769230769226, "loss": 3.1617, "step": 94250 }, { "epoch": 27.461005300250452, "grad_norm": 0.4796488285064697, "learning_rate": 0.0002706328671328671, "loss": 3.1466, "step": 94300 }, { "epoch": 27.475566427864173, "grad_norm": 0.45811060070991516, "learning_rate": 0.0002704580419580419, "loss": 3.1417, "step": 94350 }, { "epoch": 27.490127555477898, "grad_norm": 0.4167649447917938, "learning_rate": 0.00027028321678321677, "loss": 3.1777, "step": 94400 }, { "epoch": 27.50468868309162, "grad_norm": 0.43479686975479126, "learning_rate": 0.00027010839160839157, "loss": 3.1601, "step": 94450 }, { "epoch": 27.51924981070534, "grad_norm": 0.42193397879600525, "learning_rate": 0.00026993356643356637, "loss": 3.1669, "step": 94500 }, { "epoch": 27.533810938319064, "grad_norm": 0.47727319598197937, "learning_rate": 0.0002697587412587412, "loss": 3.1698, "step": 94550 }, { "epoch": 27.548372065932785, "grad_norm": 0.4227813482284546, "learning_rate": 0.0002695839160839161, "loss": 3.1649, "step": 94600 }, { "epoch": 27.56293319354651, "grad_norm": 0.4389137029647827, "learning_rate": 0.0002694090909090909, "loss": 3.1721, "step": 94650 }, { "epoch": 27.57749432116023, "grad_norm": 0.4371396601200104, "learning_rate": 0.00026923426573426573, "loss": 3.1665, "step": 94700 }, { "epoch": 27.59205544877395, "grad_norm": 0.45018795132637024, "learning_rate": 0.00026905944055944053, "loss": 3.1634, "step": 94750 }, { "epoch": 27.606616576387676, "grad_norm": 0.3988571763038635, "learning_rate": 0.0002688846153846154, "loss": 3.1685, "step": 94800 }, { "epoch": 27.621177704001397, "grad_norm": 0.43770983815193176, "learning_rate": 0.0002687097902097902, "loss": 3.1807, "step": 94850 }, { "epoch": 27.63573883161512, "grad_norm": 0.43427303433418274, "learning_rate": 0.000268534965034965, "loss": 3.1776, "step": 94900 }, { "epoch": 27.650299959228843, "grad_norm": 0.4287642538547516, "learning_rate": 0.00026836013986013984, "loss": 3.1615, "step": 94950 }, { "epoch": 27.664861086842564, "grad_norm": 0.42970359325408936, "learning_rate": 0.00026818531468531464, "loss": 3.1677, "step": 95000 }, { "epoch": 27.664861086842564, "eval_accuracy": 0.3748591893870121, "eval_loss": 3.537830114364624, "eval_runtime": 181.1281, "eval_samples_per_second": 91.902, "eval_steps_per_second": 5.747, "step": 95000 }, { "epoch": 27.679422214456288, "grad_norm": 0.4214085638523102, "learning_rate": 0.0002680104895104895, "loss": 3.1704, "step": 95050 }, { "epoch": 27.69398334207001, "grad_norm": 0.4056569039821625, "learning_rate": 0.0002678356643356643, "loss": 3.1684, "step": 95100 }, { "epoch": 27.708544469683734, "grad_norm": 0.4250808656215668, "learning_rate": 0.00026766083916083915, "loss": 3.1809, "step": 95150 }, { "epoch": 27.723105597297454, "grad_norm": 0.46413102746009827, "learning_rate": 0.00026748601398601395, "loss": 3.1842, "step": 95200 }, { "epoch": 27.737666724911175, "grad_norm": 0.4296405017375946, "learning_rate": 0.0002673111888111888, "loss": 3.1742, "step": 95250 }, { "epoch": 27.7522278525249, "grad_norm": 0.41784653067588806, "learning_rate": 0.0002671363636363636, "loss": 3.1853, "step": 95300 }, { "epoch": 27.76678898013862, "grad_norm": 0.42170286178588867, "learning_rate": 0.00026696153846153845, "loss": 3.173, "step": 95350 }, { "epoch": 27.781350107752345, "grad_norm": 0.4131033420562744, "learning_rate": 0.00026678671328671325, "loss": 3.1809, "step": 95400 }, { "epoch": 27.795911235366066, "grad_norm": 0.41183069348335266, "learning_rate": 0.0002666118881118881, "loss": 3.168, "step": 95450 }, { "epoch": 27.810472362979787, "grad_norm": 0.4307995140552521, "learning_rate": 0.0002664370629370629, "loss": 3.1832, "step": 95500 }, { "epoch": 27.825033490593512, "grad_norm": 0.4343326687812805, "learning_rate": 0.00026626223776223776, "loss": 3.1806, "step": 95550 }, { "epoch": 27.839594618207233, "grad_norm": 0.4288672208786011, "learning_rate": 0.00026608741258741256, "loss": 3.1826, "step": 95600 }, { "epoch": 27.854155745820957, "grad_norm": 0.45248332619667053, "learning_rate": 0.00026591258741258736, "loss": 3.1664, "step": 95650 }, { "epoch": 27.86871687343468, "grad_norm": 0.4320247769355774, "learning_rate": 0.0002657377622377622, "loss": 3.1905, "step": 95700 }, { "epoch": 27.883278001048403, "grad_norm": 0.4126224219799042, "learning_rate": 0.000265562937062937, "loss": 3.1741, "step": 95750 }, { "epoch": 27.897839128662124, "grad_norm": 0.43033668398857117, "learning_rate": 0.00026538811188811187, "loss": 3.1842, "step": 95800 }, { "epoch": 27.912400256275845, "grad_norm": 0.43303316831588745, "learning_rate": 0.00026521328671328667, "loss": 3.1813, "step": 95850 }, { "epoch": 27.92696138388957, "grad_norm": 0.4476587176322937, "learning_rate": 0.0002650384615384615, "loss": 3.1903, "step": 95900 }, { "epoch": 27.94152251150329, "grad_norm": 0.44143158197402954, "learning_rate": 0.0002648636363636364, "loss": 3.1735, "step": 95950 }, { "epoch": 27.956083639117015, "grad_norm": 0.4395497441291809, "learning_rate": 0.0002646888111888112, "loss": 3.1917, "step": 96000 }, { "epoch": 27.956083639117015, "eval_accuracy": 0.37492549469757375, "eval_loss": 3.533468008041382, "eval_runtime": 181.422, "eval_samples_per_second": 91.753, "eval_steps_per_second": 5.738, "step": 96000 }, { "epoch": 27.970644766730736, "grad_norm": 0.39390426874160767, "learning_rate": 0.000264513986013986, "loss": 3.1874, "step": 96050 }, { "epoch": 27.985205894344457, "grad_norm": 0.4274383783340454, "learning_rate": 0.00026433916083916083, "loss": 3.186, "step": 96100 }, { "epoch": 27.99976702195818, "grad_norm": 0.43289220333099365, "learning_rate": 0.00026416433566433563, "loss": 3.1902, "step": 96150 }, { "epoch": 28.014269905061447, "grad_norm": 0.46341463923454285, "learning_rate": 0.0002639895104895105, "loss": 3.0824, "step": 96200 }, { "epoch": 28.02883103267517, "grad_norm": 0.45257487893104553, "learning_rate": 0.0002638146853146853, "loss": 3.0785, "step": 96250 }, { "epoch": 28.043392160288892, "grad_norm": 0.4652392566204071, "learning_rate": 0.00026363986013986014, "loss": 3.0985, "step": 96300 }, { "epoch": 28.057953287902617, "grad_norm": 0.4572628140449524, "learning_rate": 0.00026346503496503494, "loss": 3.1004, "step": 96350 }, { "epoch": 28.072514415516338, "grad_norm": 0.4591698944568634, "learning_rate": 0.00026329020979020974, "loss": 3.0914, "step": 96400 }, { "epoch": 28.08707554313006, "grad_norm": 0.44280004501342773, "learning_rate": 0.0002631153846153846, "loss": 3.1029, "step": 96450 }, { "epoch": 28.101636670743783, "grad_norm": 0.4677381217479706, "learning_rate": 0.0002629405594405594, "loss": 3.1036, "step": 96500 }, { "epoch": 28.116197798357504, "grad_norm": 0.48905616998672485, "learning_rate": 0.00026276573426573424, "loss": 3.1068, "step": 96550 }, { "epoch": 28.13075892597123, "grad_norm": 0.456222802400589, "learning_rate": 0.00026259090909090904, "loss": 3.108, "step": 96600 }, { "epoch": 28.14532005358495, "grad_norm": 0.46212059259414673, "learning_rate": 0.0002624160839160839, "loss": 3.1184, "step": 96650 }, { "epoch": 28.15988118119867, "grad_norm": 0.4542124271392822, "learning_rate": 0.00026224125874125875, "loss": 3.1212, "step": 96700 }, { "epoch": 28.174442308812395, "grad_norm": 0.44424667954444885, "learning_rate": 0.00026206643356643355, "loss": 3.1177, "step": 96750 }, { "epoch": 28.189003436426116, "grad_norm": 0.47115257382392883, "learning_rate": 0.00026189160839160835, "loss": 3.1258, "step": 96800 }, { "epoch": 28.20356456403984, "grad_norm": 0.4265085458755493, "learning_rate": 0.0002617167832167832, "loss": 3.1237, "step": 96850 }, { "epoch": 28.21812569165356, "grad_norm": 0.44990769028663635, "learning_rate": 0.000261541958041958, "loss": 3.1388, "step": 96900 }, { "epoch": 28.232686819267283, "grad_norm": 0.44933226704597473, "learning_rate": 0.00026136713286713286, "loss": 3.1249, "step": 96950 }, { "epoch": 28.247247946881007, "grad_norm": 0.4495621919631958, "learning_rate": 0.00026119230769230766, "loss": 3.1284, "step": 97000 }, { "epoch": 28.247247946881007, "eval_accuracy": 0.37415416643056176, "eval_loss": 3.550610303878784, "eval_runtime": 181.4389, "eval_samples_per_second": 91.744, "eval_steps_per_second": 5.737, "step": 97000 }, { "epoch": 28.261809074494728, "grad_norm": 0.4604361653327942, "learning_rate": 0.0002610174825174825, "loss": 3.1232, "step": 97050 }, { "epoch": 28.276370202108453, "grad_norm": 0.4367489218711853, "learning_rate": 0.0002608426573426573, "loss": 3.1272, "step": 97100 }, { "epoch": 28.290931329722174, "grad_norm": 0.4408527612686157, "learning_rate": 0.0002606678321678321, "loss": 3.1239, "step": 97150 }, { "epoch": 28.305492457335895, "grad_norm": 0.4509691596031189, "learning_rate": 0.00026049300699300696, "loss": 3.1393, "step": 97200 }, { "epoch": 28.32005358494962, "grad_norm": 0.42577970027923584, "learning_rate": 0.00026031818181818176, "loss": 3.1271, "step": 97250 }, { "epoch": 28.33461471256334, "grad_norm": 0.4179809093475342, "learning_rate": 0.0002601433566433566, "loss": 3.1299, "step": 97300 }, { "epoch": 28.349175840177065, "grad_norm": 0.42162272334098816, "learning_rate": 0.00025996853146853147, "loss": 3.1311, "step": 97350 }, { "epoch": 28.363736967790786, "grad_norm": 0.4421674311161041, "learning_rate": 0.00025979370629370627, "loss": 3.1443, "step": 97400 }, { "epoch": 28.378298095404507, "grad_norm": 0.45821619033813477, "learning_rate": 0.0002596188811188811, "loss": 3.1496, "step": 97450 }, { "epoch": 28.39285922301823, "grad_norm": 0.4294704794883728, "learning_rate": 0.0002594440559440559, "loss": 3.1382, "step": 97500 }, { "epoch": 28.407420350631952, "grad_norm": 0.4437113106250763, "learning_rate": 0.0002592692307692307, "loss": 3.1473, "step": 97550 }, { "epoch": 28.421981478245677, "grad_norm": 0.449618399143219, "learning_rate": 0.0002590944055944056, "loss": 3.1572, "step": 97600 }, { "epoch": 28.436542605859398, "grad_norm": 0.43033522367477417, "learning_rate": 0.0002589195804195804, "loss": 3.1457, "step": 97650 }, { "epoch": 28.45110373347312, "grad_norm": 0.4679327607154846, "learning_rate": 0.00025874475524475523, "loss": 3.1525, "step": 97700 }, { "epoch": 28.465664861086843, "grad_norm": 0.4457078278064728, "learning_rate": 0.00025856993006993003, "loss": 3.1485, "step": 97750 }, { "epoch": 28.480225988700564, "grad_norm": 0.43366381525993347, "learning_rate": 0.0002583951048951049, "loss": 3.1554, "step": 97800 }, { "epoch": 28.49478711631429, "grad_norm": 0.44160208106040955, "learning_rate": 0.0002582202797202797, "loss": 3.1466, "step": 97850 }, { "epoch": 28.50934824392801, "grad_norm": 0.4116990268230438, "learning_rate": 0.0002580454545454545, "loss": 3.1515, "step": 97900 }, { "epoch": 28.523909371541734, "grad_norm": 0.4543076455593109, "learning_rate": 0.00025787062937062934, "loss": 3.1502, "step": 97950 }, { "epoch": 28.538470499155455, "grad_norm": 0.4376068711280823, "learning_rate": 0.0002576958041958042, "loss": 3.1637, "step": 98000 }, { "epoch": 28.538470499155455, "eval_accuracy": 0.374864127016522, "eval_loss": 3.543623924255371, "eval_runtime": 181.1432, "eval_samples_per_second": 91.894, "eval_steps_per_second": 5.747, "step": 98000 }, { "epoch": 28.553031626769176, "grad_norm": 0.41571149230003357, "learning_rate": 0.000257520979020979, "loss": 3.1527, "step": 98050 }, { "epoch": 28.5675927543829, "grad_norm": 0.42478272318840027, "learning_rate": 0.00025734615384615385, "loss": 3.1513, "step": 98100 }, { "epoch": 28.58215388199662, "grad_norm": 0.446841299533844, "learning_rate": 0.00025717132867132865, "loss": 3.1573, "step": 98150 }, { "epoch": 28.596715009610342, "grad_norm": 0.4337193965911865, "learning_rate": 0.0002569965034965035, "loss": 3.1565, "step": 98200 }, { "epoch": 28.611276137224067, "grad_norm": 0.41480013728141785, "learning_rate": 0.0002568216783216783, "loss": 3.1569, "step": 98250 }, { "epoch": 28.625837264837788, "grad_norm": 0.45146986842155457, "learning_rate": 0.0002566468531468531, "loss": 3.1727, "step": 98300 }, { "epoch": 28.640398392451512, "grad_norm": 0.44779402017593384, "learning_rate": 0.00025647202797202795, "loss": 3.1558, "step": 98350 }, { "epoch": 28.654959520065233, "grad_norm": 0.4365552067756653, "learning_rate": 0.00025629720279720275, "loss": 3.1651, "step": 98400 }, { "epoch": 28.669520647678958, "grad_norm": 0.4349055290222168, "learning_rate": 0.0002561223776223776, "loss": 3.1562, "step": 98450 }, { "epoch": 28.68408177529268, "grad_norm": 0.4297862946987152, "learning_rate": 0.0002559475524475524, "loss": 3.1711, "step": 98500 }, { "epoch": 28.6986429029064, "grad_norm": 0.44524985551834106, "learning_rate": 0.00025577272727272726, "loss": 3.1499, "step": 98550 }, { "epoch": 28.713204030520124, "grad_norm": 0.44011494517326355, "learning_rate": 0.00025559790209790206, "loss": 3.1627, "step": 98600 }, { "epoch": 28.727765158133845, "grad_norm": 0.44971513748168945, "learning_rate": 0.00025542307692307686, "loss": 3.1663, "step": 98650 }, { "epoch": 28.74232628574757, "grad_norm": 0.4642869532108307, "learning_rate": 0.00025524825174825177, "loss": 3.1668, "step": 98700 }, { "epoch": 28.75688741336129, "grad_norm": 0.43200162053108215, "learning_rate": 0.00025507342657342657, "loss": 3.177, "step": 98750 }, { "epoch": 28.771448540975012, "grad_norm": 0.43370288610458374, "learning_rate": 0.00025489860139860137, "loss": 3.1681, "step": 98800 }, { "epoch": 28.786009668588736, "grad_norm": 0.46173909306526184, "learning_rate": 0.0002547237762237762, "loss": 3.1736, "step": 98850 }, { "epoch": 28.800570796202457, "grad_norm": 0.4236147403717041, "learning_rate": 0.000254548951048951, "loss": 3.1683, "step": 98900 }, { "epoch": 28.815131923816182, "grad_norm": 0.439842164516449, "learning_rate": 0.0002543741258741259, "loss": 3.1618, "step": 98950 }, { "epoch": 28.829693051429903, "grad_norm": 0.41996750235557556, "learning_rate": 0.0002541993006993007, "loss": 3.1684, "step": 99000 }, { "epoch": 28.829693051429903, "eval_accuracy": 0.37464710644330085, "eval_loss": 3.5397422313690186, "eval_runtime": 180.731, "eval_samples_per_second": 92.104, "eval_steps_per_second": 5.76, "step": 99000 }, { "epoch": 28.844254179043624, "grad_norm": 0.4436616003513336, "learning_rate": 0.0002540244755244755, "loss": 3.1716, "step": 99050 }, { "epoch": 28.85881530665735, "grad_norm": 0.43761077523231506, "learning_rate": 0.00025384965034965033, "loss": 3.1711, "step": 99100 }, { "epoch": 28.87337643427107, "grad_norm": 0.4494776725769043, "learning_rate": 0.00025367482517482513, "loss": 3.1734, "step": 99150 }, { "epoch": 28.887937561884794, "grad_norm": 0.4295901954174042, "learning_rate": 0.0002535, "loss": 3.173, "step": 99200 }, { "epoch": 28.902498689498515, "grad_norm": 0.479999303817749, "learning_rate": 0.0002533251748251748, "loss": 3.1758, "step": 99250 }, { "epoch": 28.917059817112236, "grad_norm": 0.41707783937454224, "learning_rate": 0.00025315034965034964, "loss": 3.1793, "step": 99300 }, { "epoch": 28.93162094472596, "grad_norm": 0.41884753108024597, "learning_rate": 0.00025297552447552444, "loss": 3.1899, "step": 99350 }, { "epoch": 28.94618207233968, "grad_norm": 0.44001612067222595, "learning_rate": 0.0002528006993006993, "loss": 3.1776, "step": 99400 }, { "epoch": 28.960743199953406, "grad_norm": 0.44126561284065247, "learning_rate": 0.00025262587412587414, "loss": 3.1658, "step": 99450 }, { "epoch": 28.975304327567127, "grad_norm": 0.46320417523384094, "learning_rate": 0.00025245104895104894, "loss": 3.1692, "step": 99500 }, { "epoch": 28.989865455180848, "grad_norm": 0.4358890950679779, "learning_rate": 0.00025227622377622374, "loss": 3.1706, "step": 99550 }, { "epoch": 29.004368338284117, "grad_norm": 0.4340914189815521, "learning_rate": 0.0002521013986013986, "loss": 3.1496, "step": 99600 }, { "epoch": 29.018929465897838, "grad_norm": 0.4288674592971802, "learning_rate": 0.0002519265734265734, "loss": 3.086, "step": 99650 }, { "epoch": 29.033490593511562, "grad_norm": 0.44773054122924805, "learning_rate": 0.00025175174825174825, "loss": 3.0863, "step": 99700 }, { "epoch": 29.048051721125283, "grad_norm": 0.4277186095714569, "learning_rate": 0.00025157692307692305, "loss": 3.0906, "step": 99750 }, { "epoch": 29.062612848739008, "grad_norm": 0.44387462735176086, "learning_rate": 0.0002514020979020979, "loss": 3.0821, "step": 99800 }, { "epoch": 29.07717397635273, "grad_norm": 0.45624199509620667, "learning_rate": 0.0002512272727272727, "loss": 3.0912, "step": 99850 }, { "epoch": 29.09173510396645, "grad_norm": 0.4246789216995239, "learning_rate": 0.0002510524475524475, "loss": 3.107, "step": 99900 }, { "epoch": 29.106296231580174, "grad_norm": 0.43305352330207825, "learning_rate": 0.00025087762237762236, "loss": 3.1013, "step": 99950 }, { "epoch": 29.120857359193895, "grad_norm": 0.4623396694660187, "learning_rate": 0.00025070279720279716, "loss": 3.1085, "step": 100000 }, { "epoch": 29.120857359193895, "eval_accuracy": 0.3738908261900334, "eval_loss": 3.561875581741333, "eval_runtime": 180.7786, "eval_samples_per_second": 92.079, "eval_steps_per_second": 5.758, "step": 100000 }, { "epoch": 29.13541848680762, "grad_norm": 0.42507481575012207, "learning_rate": 0.000250527972027972, "loss": 3.116, "step": 100050 }, { "epoch": 29.14997961442134, "grad_norm": 0.44509923458099365, "learning_rate": 0.00025035314685314686, "loss": 3.1041, "step": 100100 }, { "epoch": 29.16454074203506, "grad_norm": 0.4786369800567627, "learning_rate": 0.00025017832167832166, "loss": 3.0974, "step": 100150 }, { "epoch": 29.179101869648786, "grad_norm": 0.4472801387310028, "learning_rate": 0.0002500034965034965, "loss": 3.1077, "step": 100200 }, { "epoch": 29.193662997262507, "grad_norm": 0.4288489818572998, "learning_rate": 0.0002498286713286713, "loss": 3.1073, "step": 100250 }, { "epoch": 29.20822412487623, "grad_norm": 0.43381935358047485, "learning_rate": 0.0002496538461538461, "loss": 3.1165, "step": 100300 }, { "epoch": 29.222785252489953, "grad_norm": 0.4412873089313507, "learning_rate": 0.00024947902097902097, "loss": 3.1316, "step": 100350 }, { "epoch": 29.237346380103673, "grad_norm": 0.4455450177192688, "learning_rate": 0.00024930419580419577, "loss": 3.1269, "step": 100400 }, { "epoch": 29.251907507717398, "grad_norm": 0.44914019107818604, "learning_rate": 0.0002491293706293706, "loss": 3.1267, "step": 100450 }, { "epoch": 29.26646863533112, "grad_norm": 0.4303150177001953, "learning_rate": 0.0002489545454545454, "loss": 3.1236, "step": 100500 }, { "epoch": 29.281029762944843, "grad_norm": 0.43328630924224854, "learning_rate": 0.0002487797202797203, "loss": 3.1383, "step": 100550 }, { "epoch": 29.295590890558564, "grad_norm": 0.45274579524993896, "learning_rate": 0.0002486048951048951, "loss": 3.1255, "step": 100600 }, { "epoch": 29.31015201817229, "grad_norm": 0.41539159417152405, "learning_rate": 0.0002484300699300699, "loss": 3.1345, "step": 100650 }, { "epoch": 29.32471314578601, "grad_norm": 0.4625714123249054, "learning_rate": 0.00024825524475524473, "loss": 3.1321, "step": 100700 }, { "epoch": 29.33927427339973, "grad_norm": 0.4518733024597168, "learning_rate": 0.00024808041958041953, "loss": 3.1221, "step": 100750 }, { "epoch": 29.353835401013455, "grad_norm": 0.44265568256378174, "learning_rate": 0.0002479055944055944, "loss": 3.137, "step": 100800 }, { "epoch": 29.368396528627176, "grad_norm": 0.45387569069862366, "learning_rate": 0.00024773076923076924, "loss": 3.1301, "step": 100850 }, { "epoch": 29.3829576562409, "grad_norm": 0.4308754503726959, "learning_rate": 0.00024755594405594404, "loss": 3.1258, "step": 100900 }, { "epoch": 29.397518783854622, "grad_norm": 0.45452380180358887, "learning_rate": 0.0002473811188811189, "loss": 3.1265, "step": 100950 }, { "epoch": 29.412079911468343, "grad_norm": 0.4379521906375885, "learning_rate": 0.0002472062937062937, "loss": 3.1363, "step": 101000 }, { "epoch": 29.412079911468343, "eval_accuracy": 0.37445783064542104, "eval_loss": 3.5464746952056885, "eval_runtime": 181.2606, "eval_samples_per_second": 91.835, "eval_steps_per_second": 5.743, "step": 101000 }, { "epoch": 29.426641039082067, "grad_norm": 0.4259090721607208, "learning_rate": 0.0002470314685314685, "loss": 3.1283, "step": 101050 }, { "epoch": 29.44120216669579, "grad_norm": 0.44320374727249146, "learning_rate": 0.00024685664335664335, "loss": 3.1338, "step": 101100 }, { "epoch": 29.455763294309513, "grad_norm": 0.42781728506088257, "learning_rate": 0.00024668181818181815, "loss": 3.1276, "step": 101150 }, { "epoch": 29.470324421923234, "grad_norm": 0.43914318084716797, "learning_rate": 0.000246506993006993, "loss": 3.1531, "step": 101200 }, { "epoch": 29.484885549536955, "grad_norm": 0.46921101212501526, "learning_rate": 0.0002463321678321678, "loss": 3.1328, "step": 101250 }, { "epoch": 29.49944667715068, "grad_norm": 0.48669424653053284, "learning_rate": 0.00024615734265734265, "loss": 3.1445, "step": 101300 }, { "epoch": 29.5140078047644, "grad_norm": 0.4008258879184723, "learning_rate": 0.00024598251748251745, "loss": 3.1475, "step": 101350 }, { "epoch": 29.528568932378125, "grad_norm": 0.44190502166748047, "learning_rate": 0.00024580769230769225, "loss": 3.147, "step": 101400 }, { "epoch": 29.543130059991846, "grad_norm": 0.4305746257305145, "learning_rate": 0.0002456328671328671, "loss": 3.1449, "step": 101450 }, { "epoch": 29.557691187605567, "grad_norm": 0.4507753551006317, "learning_rate": 0.00024545804195804196, "loss": 3.1421, "step": 101500 }, { "epoch": 29.57225231521929, "grad_norm": 0.45595842599868774, "learning_rate": 0.00024528321678321676, "loss": 3.1451, "step": 101550 }, { "epoch": 29.586813442833012, "grad_norm": 0.4408864378929138, "learning_rate": 0.0002451083916083916, "loss": 3.1427, "step": 101600 }, { "epoch": 29.601374570446737, "grad_norm": 0.45510685443878174, "learning_rate": 0.0002449335664335664, "loss": 3.146, "step": 101650 }, { "epoch": 29.615935698060458, "grad_norm": 0.4537654221057892, "learning_rate": 0.00024475874125874127, "loss": 3.1568, "step": 101700 }, { "epoch": 29.63049682567418, "grad_norm": 0.43733224272727966, "learning_rate": 0.00024458391608391607, "loss": 3.1603, "step": 101750 }, { "epoch": 29.645057953287903, "grad_norm": 0.464886873960495, "learning_rate": 0.00024440909090909087, "loss": 3.1502, "step": 101800 }, { "epoch": 29.659619080901624, "grad_norm": 0.436506450176239, "learning_rate": 0.0002442342657342657, "loss": 3.1477, "step": 101850 }, { "epoch": 29.67418020851535, "grad_norm": 0.4410904347896576, "learning_rate": 0.00024405944055944052, "loss": 3.1596, "step": 101900 }, { "epoch": 29.68874133612907, "grad_norm": 0.46896132826805115, "learning_rate": 0.00024388461538461535, "loss": 3.1555, "step": 101950 }, { "epoch": 29.70330246374279, "grad_norm": 0.44095733761787415, "learning_rate": 0.00024370979020979017, "loss": 3.1494, "step": 102000 }, { "epoch": 29.70330246374279, "eval_accuracy": 0.37494453983996906, "eval_loss": 3.5422708988189697, "eval_runtime": 182.8701, "eval_samples_per_second": 91.026, "eval_steps_per_second": 5.693, "step": 102000 }, { "epoch": 29.717863591356515, "grad_norm": 0.42940181493759155, "learning_rate": 0.000243534965034965, "loss": 3.1576, "step": 102050 }, { "epoch": 29.732424718970236, "grad_norm": 0.44519007205963135, "learning_rate": 0.00024336013986013983, "loss": 3.1612, "step": 102100 }, { "epoch": 29.74698584658396, "grad_norm": 0.4332563281059265, "learning_rate": 0.00024318531468531468, "loss": 3.1535, "step": 102150 }, { "epoch": 29.76154697419768, "grad_norm": 0.443339079618454, "learning_rate": 0.0002430104895104895, "loss": 3.1715, "step": 102200 }, { "epoch": 29.776108101811403, "grad_norm": 0.4404120147228241, "learning_rate": 0.00024283566433566434, "loss": 3.1508, "step": 102250 }, { "epoch": 29.790669229425127, "grad_norm": 0.4648321568965912, "learning_rate": 0.00024266083916083916, "loss": 3.1552, "step": 102300 }, { "epoch": 29.805230357038848, "grad_norm": 0.4375142753124237, "learning_rate": 0.00024248601398601396, "loss": 3.1549, "step": 102350 }, { "epoch": 29.819791484652573, "grad_norm": 0.4262080490589142, "learning_rate": 0.0002423111888111888, "loss": 3.1713, "step": 102400 }, { "epoch": 29.834352612266294, "grad_norm": 0.4613652229309082, "learning_rate": 0.00024213636363636362, "loss": 3.1598, "step": 102450 }, { "epoch": 29.848913739880015, "grad_norm": 0.4698253273963928, "learning_rate": 0.00024196153846153844, "loss": 3.1601, "step": 102500 }, { "epoch": 29.86347486749374, "grad_norm": 0.4185844361782074, "learning_rate": 0.00024178671328671327, "loss": 3.1459, "step": 102550 }, { "epoch": 29.87803599510746, "grad_norm": 0.43877002596855164, "learning_rate": 0.0002416118881118881, "loss": 3.1579, "step": 102600 }, { "epoch": 29.892597122721185, "grad_norm": 0.48648765683174133, "learning_rate": 0.0002414370629370629, "loss": 3.1628, "step": 102650 }, { "epoch": 29.907158250334906, "grad_norm": 0.44445931911468506, "learning_rate": 0.00024126223776223772, "loss": 3.1652, "step": 102700 }, { "epoch": 29.921719377948627, "grad_norm": 0.45707830786705017, "learning_rate": 0.00024108741258741255, "loss": 3.1594, "step": 102750 }, { "epoch": 29.93628050556235, "grad_norm": 0.4236975312232971, "learning_rate": 0.00024091258741258738, "loss": 3.1754, "step": 102800 }, { "epoch": 29.950841633176072, "grad_norm": 0.4243965148925781, "learning_rate": 0.00024073776223776223, "loss": 3.1653, "step": 102850 }, { "epoch": 29.965402760789797, "grad_norm": 0.46123450994491577, "learning_rate": 0.00024056293706293706, "loss": 3.163, "step": 102900 }, { "epoch": 29.979963888403518, "grad_norm": 0.4454895257949829, "learning_rate": 0.00024038811188811188, "loss": 3.1617, "step": 102950 }, { "epoch": 29.994525016017242, "grad_norm": 0.4245060384273529, "learning_rate": 0.0002402132867132867, "loss": 3.1697, "step": 103000 }, { "epoch": 29.994525016017242, "eval_accuracy": 0.37511112605462477, "eval_loss": 3.5357658863067627, "eval_runtime": 182.2662, "eval_samples_per_second": 91.328, "eval_steps_per_second": 5.711, "step": 103000 }, { "epoch": 30.009027899120507, "grad_norm": 0.4303753077983856, "learning_rate": 0.00024003846153846154, "loss": 3.1078, "step": 103050 }, { "epoch": 30.023589026734232, "grad_norm": 0.42869868874549866, "learning_rate": 0.00023986363636363634, "loss": 3.0725, "step": 103100 }, { "epoch": 30.038150154347953, "grad_norm": 0.4451257586479187, "learning_rate": 0.00023968881118881116, "loss": 3.0847, "step": 103150 }, { "epoch": 30.052711281961674, "grad_norm": 0.4753834903240204, "learning_rate": 0.000239513986013986, "loss": 3.0783, "step": 103200 }, { "epoch": 30.0672724095754, "grad_norm": 0.4139735996723175, "learning_rate": 0.00023933916083916082, "loss": 3.0859, "step": 103250 }, { "epoch": 30.08183353718912, "grad_norm": 0.4201156198978424, "learning_rate": 0.00023916433566433564, "loss": 3.0844, "step": 103300 }, { "epoch": 30.096394664802844, "grad_norm": 0.4758506417274475, "learning_rate": 0.00023898951048951047, "loss": 3.0864, "step": 103350 }, { "epoch": 30.110955792416565, "grad_norm": 0.42748937010765076, "learning_rate": 0.00023881468531468527, "loss": 3.0959, "step": 103400 }, { "epoch": 30.125516920030286, "grad_norm": 0.4744609594345093, "learning_rate": 0.0002386398601398601, "loss": 3.095, "step": 103450 }, { "epoch": 30.14007804764401, "grad_norm": 0.45903265476226807, "learning_rate": 0.00023846503496503492, "loss": 3.0994, "step": 103500 }, { "epoch": 30.15463917525773, "grad_norm": 0.46165403723716736, "learning_rate": 0.00023829020979020978, "loss": 3.1072, "step": 103550 }, { "epoch": 30.169200302871456, "grad_norm": 0.43678611516952515, "learning_rate": 0.0002381153846153846, "loss": 3.1052, "step": 103600 }, { "epoch": 30.183761430485177, "grad_norm": 0.4657299220561981, "learning_rate": 0.00023794055944055943, "loss": 3.099, "step": 103650 }, { "epoch": 30.198322558098898, "grad_norm": 0.4123986065387726, "learning_rate": 0.00023776573426573426, "loss": 3.1102, "step": 103700 }, { "epoch": 30.212883685712622, "grad_norm": 0.40457773208618164, "learning_rate": 0.00023759090909090909, "loss": 3.1176, "step": 103750 }, { "epoch": 30.227444813326343, "grad_norm": 0.46433863043785095, "learning_rate": 0.0002374160839160839, "loss": 3.106, "step": 103800 }, { "epoch": 30.242005940940068, "grad_norm": 0.45321351289749146, "learning_rate": 0.0002372412587412587, "loss": 3.1138, "step": 103850 }, { "epoch": 30.25656706855379, "grad_norm": 0.4226059019565582, "learning_rate": 0.00023706643356643354, "loss": 3.1195, "step": 103900 }, { "epoch": 30.27112819616751, "grad_norm": 0.43229028582572937, "learning_rate": 0.00023689160839160837, "loss": 3.118, "step": 103950 }, { "epoch": 30.285689323781234, "grad_norm": 0.43219342827796936, "learning_rate": 0.0002367167832167832, "loss": 3.1182, "step": 104000 }, { "epoch": 30.285689323781234, "eval_accuracy": 0.3742103613568888, "eval_loss": 3.5521020889282227, "eval_runtime": 182.9136, "eval_samples_per_second": 91.005, "eval_steps_per_second": 5.691, "step": 104000 }, { "epoch": 30.300250451394955, "grad_norm": 0.49742257595062256, "learning_rate": 0.00023654195804195802, "loss": 3.1162, "step": 104050 }, { "epoch": 30.31481157900868, "grad_norm": 0.45247024297714233, "learning_rate": 0.00023636713286713285, "loss": 3.1087, "step": 104100 }, { "epoch": 30.3293727066224, "grad_norm": 0.44636932015419006, "learning_rate": 0.00023619230769230765, "loss": 3.135, "step": 104150 }, { "epoch": 30.343933834236122, "grad_norm": 0.4359947144985199, "learning_rate": 0.00023601748251748247, "loss": 3.1223, "step": 104200 }, { "epoch": 30.358494961849846, "grad_norm": 0.45019811391830444, "learning_rate": 0.00023584265734265733, "loss": 3.1242, "step": 104250 }, { "epoch": 30.373056089463567, "grad_norm": 0.4336197078227997, "learning_rate": 0.00023566783216783215, "loss": 3.1102, "step": 104300 }, { "epoch": 30.387617217077292, "grad_norm": 0.45937708020210266, "learning_rate": 0.00023549300699300698, "loss": 3.126, "step": 104350 }, { "epoch": 30.402178344691013, "grad_norm": 0.46162450313568115, "learning_rate": 0.0002353181818181818, "loss": 3.1298, "step": 104400 }, { "epoch": 30.416739472304734, "grad_norm": 0.44591084122657776, "learning_rate": 0.00023514335664335663, "loss": 3.1202, "step": 104450 }, { "epoch": 30.43130059991846, "grad_norm": 0.42444440722465515, "learning_rate": 0.00023496853146853146, "loss": 3.1232, "step": 104500 }, { "epoch": 30.44586172753218, "grad_norm": 0.44376009702682495, "learning_rate": 0.0002347937062937063, "loss": 3.1209, "step": 104550 }, { "epoch": 30.460422855145904, "grad_norm": 0.4294810891151428, "learning_rate": 0.0002346188811188811, "loss": 3.1267, "step": 104600 }, { "epoch": 30.474983982759625, "grad_norm": 0.4419626295566559, "learning_rate": 0.0002344440559440559, "loss": 3.1297, "step": 104650 }, { "epoch": 30.489545110373346, "grad_norm": 0.4434393346309662, "learning_rate": 0.00023426923076923074, "loss": 3.1384, "step": 104700 }, { "epoch": 30.50410623798707, "grad_norm": 0.46597161889076233, "learning_rate": 0.00023409440559440557, "loss": 3.1336, "step": 104750 }, { "epoch": 30.51866736560079, "grad_norm": 0.42664727568626404, "learning_rate": 0.0002339195804195804, "loss": 3.1239, "step": 104800 }, { "epoch": 30.533228493214516, "grad_norm": 0.4466429352760315, "learning_rate": 0.00023374475524475522, "loss": 3.1521, "step": 104850 }, { "epoch": 30.547789620828237, "grad_norm": 0.44672852754592896, "learning_rate": 0.00023356993006993002, "loss": 3.1345, "step": 104900 }, { "epoch": 30.562350748441958, "grad_norm": 0.4642077386379242, "learning_rate": 0.0002333951048951049, "loss": 3.1459, "step": 104950 }, { "epoch": 30.576911876055682, "grad_norm": 0.4767882525920868, "learning_rate": 0.0002332202797202797, "loss": 3.123, "step": 105000 }, { "epoch": 30.576911876055682, "eval_accuracy": 0.3748681241451729, "eval_loss": 3.546727180480957, "eval_runtime": 182.7165, "eval_samples_per_second": 91.103, "eval_steps_per_second": 5.697, "step": 105000 }, { "epoch": 30.591473003669403, "grad_norm": 0.46989575028419495, "learning_rate": 0.00023304545454545453, "loss": 3.1409, "step": 105050 }, { "epoch": 30.606034131283128, "grad_norm": 0.4495246112346649, "learning_rate": 0.00023287062937062935, "loss": 3.1435, "step": 105100 }, { "epoch": 30.62059525889685, "grad_norm": 0.4512391984462738, "learning_rate": 0.00023269580419580418, "loss": 3.1473, "step": 105150 }, { "epoch": 30.635156386510573, "grad_norm": 0.4734998345375061, "learning_rate": 0.000232520979020979, "loss": 3.1405, "step": 105200 }, { "epoch": 30.649717514124294, "grad_norm": 0.4526105523109436, "learning_rate": 0.00023234615384615384, "loss": 3.1354, "step": 105250 }, { "epoch": 30.664278641738015, "grad_norm": 0.45943930745124817, "learning_rate": 0.00023217132867132866, "loss": 3.1388, "step": 105300 }, { "epoch": 30.67883976935174, "grad_norm": 0.42655590176582336, "learning_rate": 0.00023199650349650346, "loss": 3.1503, "step": 105350 }, { "epoch": 30.69340089696546, "grad_norm": 0.46072572469711304, "learning_rate": 0.0002318216783216783, "loss": 3.1436, "step": 105400 }, { "epoch": 30.707962024579185, "grad_norm": 0.4664587378501892, "learning_rate": 0.00023164685314685312, "loss": 3.1521, "step": 105450 }, { "epoch": 30.722523152192906, "grad_norm": 0.4768237769603729, "learning_rate": 0.00023147202797202794, "loss": 3.1544, "step": 105500 }, { "epoch": 30.737084279806627, "grad_norm": 0.4706445336341858, "learning_rate": 0.00023129720279720277, "loss": 3.1612, "step": 105550 }, { "epoch": 30.75164540742035, "grad_norm": 0.45245417952537537, "learning_rate": 0.0002311223776223776, "loss": 3.1573, "step": 105600 }, { "epoch": 30.766206535034073, "grad_norm": 0.4434870183467865, "learning_rate": 0.00023094755244755245, "loss": 3.1509, "step": 105650 }, { "epoch": 30.780767662647797, "grad_norm": 0.4726414680480957, "learning_rate": 0.00023077272727272728, "loss": 3.1527, "step": 105700 }, { "epoch": 30.795328790261518, "grad_norm": 0.4371493458747864, "learning_rate": 0.00023059790209790208, "loss": 3.1515, "step": 105750 }, { "epoch": 30.80988991787524, "grad_norm": 0.4734935462474823, "learning_rate": 0.0002304230769230769, "loss": 3.143, "step": 105800 }, { "epoch": 30.824451045488964, "grad_norm": 0.4717303216457367, "learning_rate": 0.00023024825174825173, "loss": 3.1503, "step": 105850 }, { "epoch": 30.839012173102684, "grad_norm": 0.46519219875335693, "learning_rate": 0.00023007342657342656, "loss": 3.1503, "step": 105900 }, { "epoch": 30.85357330071641, "grad_norm": 0.4358333349227905, "learning_rate": 0.00022989860139860138, "loss": 3.158, "step": 105950 }, { "epoch": 30.86813442833013, "grad_norm": 0.4397745132446289, "learning_rate": 0.0002297237762237762, "loss": 3.1642, "step": 106000 }, { "epoch": 30.86813442833013, "eval_accuracy": 0.37529217246998803, "eval_loss": 3.5388691425323486, "eval_runtime": 182.7835, "eval_samples_per_second": 91.07, "eval_steps_per_second": 5.695, "step": 106000 }, { "epoch": 30.88269555594385, "grad_norm": 0.4402293264865875, "learning_rate": 0.00022954895104895104, "loss": 3.158, "step": 106050 }, { "epoch": 30.897256683557575, "grad_norm": 0.4294586479663849, "learning_rate": 0.00022937412587412584, "loss": 3.1486, "step": 106100 }, { "epoch": 30.911817811171296, "grad_norm": 0.45872706174850464, "learning_rate": 0.00022919930069930066, "loss": 3.1578, "step": 106150 }, { "epoch": 30.92637893878502, "grad_norm": 0.44787290692329407, "learning_rate": 0.0002290244755244755, "loss": 3.1459, "step": 106200 }, { "epoch": 30.940940066398742, "grad_norm": 0.4345054626464844, "learning_rate": 0.00022884965034965032, "loss": 3.1525, "step": 106250 }, { "epoch": 30.955501194012463, "grad_norm": 0.4582189917564392, "learning_rate": 0.00022867482517482517, "loss": 3.164, "step": 106300 }, { "epoch": 30.970062321626187, "grad_norm": 0.4456637501716614, "learning_rate": 0.0002285, "loss": 3.1512, "step": 106350 }, { "epoch": 30.98462344923991, "grad_norm": 0.4364929497241974, "learning_rate": 0.00022832517482517482, "loss": 3.1603, "step": 106400 }, { "epoch": 30.999184576853633, "grad_norm": 0.4392761290073395, "learning_rate": 0.00022815034965034965, "loss": 3.1616, "step": 106450 }, { "epoch": 31.0136874599569, "grad_norm": 0.5046218037605286, "learning_rate": 0.00022797552447552445, "loss": 3.0821, "step": 106500 }, { "epoch": 31.028248587570623, "grad_norm": 0.4466058015823364, "learning_rate": 0.00022780069930069928, "loss": 3.0544, "step": 106550 }, { "epoch": 31.042809715184344, "grad_norm": 0.46955615282058716, "learning_rate": 0.0002276258741258741, "loss": 3.0667, "step": 106600 }, { "epoch": 31.057370842798065, "grad_norm": 0.44762715697288513, "learning_rate": 0.00022745104895104893, "loss": 3.0778, "step": 106650 }, { "epoch": 31.07193197041179, "grad_norm": 0.4619590640068054, "learning_rate": 0.00022727622377622376, "loss": 3.0813, "step": 106700 }, { "epoch": 31.08649309802551, "grad_norm": 0.4318313002586365, "learning_rate": 0.00022710139860139858, "loss": 3.0776, "step": 106750 }, { "epoch": 31.101054225639235, "grad_norm": 0.4294584095478058, "learning_rate": 0.0002269265734265734, "loss": 3.0869, "step": 106800 }, { "epoch": 31.115615353252956, "grad_norm": 0.487762987613678, "learning_rate": 0.0002267517482517482, "loss": 3.0852, "step": 106850 }, { "epoch": 31.130176480866677, "grad_norm": 0.46022409200668335, "learning_rate": 0.00022657692307692304, "loss": 3.089, "step": 106900 }, { "epoch": 31.1447376084804, "grad_norm": 0.4672282338142395, "learning_rate": 0.00022640209790209787, "loss": 3.084, "step": 106950 }, { "epoch": 31.159298736094122, "grad_norm": 0.46602603793144226, "learning_rate": 0.00022622727272727272, "loss": 3.1068, "step": 107000 }, { "epoch": 31.159298736094122, "eval_accuracy": 0.37467931859772263, "eval_loss": 3.555206060409546, "eval_runtime": 183.256, "eval_samples_per_second": 90.835, "eval_steps_per_second": 5.681, "step": 107000 }, { "epoch": 31.173859863707847, "grad_norm": 0.46551984548568726, "learning_rate": 0.00022605244755244755, "loss": 3.1023, "step": 107050 }, { "epoch": 31.188420991321568, "grad_norm": 0.44169747829437256, "learning_rate": 0.00022587762237762237, "loss": 3.1004, "step": 107100 }, { "epoch": 31.20298211893529, "grad_norm": 0.4496636688709259, "learning_rate": 0.0002257027972027972, "loss": 3.0991, "step": 107150 }, { "epoch": 31.217543246549013, "grad_norm": 0.459714412689209, "learning_rate": 0.00022552797202797203, "loss": 3.0932, "step": 107200 }, { "epoch": 31.232104374162734, "grad_norm": 0.47508567571640015, "learning_rate": 0.00022535314685314683, "loss": 3.0973, "step": 107250 }, { "epoch": 31.24666550177646, "grad_norm": 0.4399765431880951, "learning_rate": 0.00022517832167832165, "loss": 3.1033, "step": 107300 }, { "epoch": 31.26122662939018, "grad_norm": 0.4429820775985718, "learning_rate": 0.00022500349650349648, "loss": 3.1098, "step": 107350 }, { "epoch": 31.2757877570039, "grad_norm": 0.4285988509654999, "learning_rate": 0.0002248286713286713, "loss": 3.1182, "step": 107400 }, { "epoch": 31.290348884617625, "grad_norm": 0.443668931722641, "learning_rate": 0.00022465384615384613, "loss": 3.1011, "step": 107450 }, { "epoch": 31.304910012231346, "grad_norm": 0.4679843783378601, "learning_rate": 0.00022447902097902096, "loss": 3.1049, "step": 107500 }, { "epoch": 31.31947113984507, "grad_norm": 0.4384918510913849, "learning_rate": 0.0002243041958041958, "loss": 3.1097, "step": 107550 }, { "epoch": 31.33403226745879, "grad_norm": 0.4730290174484253, "learning_rate": 0.00022412937062937059, "loss": 3.123, "step": 107600 }, { "epoch": 31.348593395072513, "grad_norm": 0.4741573631763458, "learning_rate": 0.0002239545454545454, "loss": 3.125, "step": 107650 }, { "epoch": 31.363154522686237, "grad_norm": 0.4520423114299774, "learning_rate": 0.00022377972027972027, "loss": 3.1161, "step": 107700 }, { "epoch": 31.377715650299958, "grad_norm": 0.4628244936466217, "learning_rate": 0.0002236048951048951, "loss": 3.1193, "step": 107750 }, { "epoch": 31.392276777913683, "grad_norm": 0.48368245363235474, "learning_rate": 0.00022343006993006992, "loss": 3.115, "step": 107800 }, { "epoch": 31.406837905527404, "grad_norm": 0.4645143151283264, "learning_rate": 0.00022325524475524475, "loss": 3.1127, "step": 107850 }, { "epoch": 31.421399033141128, "grad_norm": 0.44322070479393005, "learning_rate": 0.00022308041958041957, "loss": 3.1188, "step": 107900 }, { "epoch": 31.43596016075485, "grad_norm": 0.44873929023742676, "learning_rate": 0.0002229055944055944, "loss": 3.1205, "step": 107950 }, { "epoch": 31.45052128836857, "grad_norm": 0.44997406005859375, "learning_rate": 0.0002227307692307692, "loss": 3.1252, "step": 108000 }, { "epoch": 31.45052128836857, "eval_accuracy": 0.37460748784461423, "eval_loss": 3.548530340194702, "eval_runtime": 181.6449, "eval_samples_per_second": 91.64, "eval_steps_per_second": 5.731, "step": 108000 }, { "epoch": 31.465082415982295, "grad_norm": 0.4543869197368622, "learning_rate": 0.00022255594405594403, "loss": 3.1266, "step": 108050 }, { "epoch": 31.479643543596016, "grad_norm": 0.47484591603279114, "learning_rate": 0.00022238111888111885, "loss": 3.1277, "step": 108100 }, { "epoch": 31.49420467120974, "grad_norm": 0.4543011784553528, "learning_rate": 0.00022220629370629368, "loss": 3.1312, "step": 108150 }, { "epoch": 31.50876579882346, "grad_norm": 0.44846051931381226, "learning_rate": 0.0002220314685314685, "loss": 3.1213, "step": 108200 }, { "epoch": 31.523326926437182, "grad_norm": 0.4550095498561859, "learning_rate": 0.00022185664335664333, "loss": 3.1252, "step": 108250 }, { "epoch": 31.537888054050907, "grad_norm": 0.47198644280433655, "learning_rate": 0.00022168181818181816, "loss": 3.1207, "step": 108300 }, { "epoch": 31.552449181664628, "grad_norm": 0.4757038354873657, "learning_rate": 0.00022150699300699296, "loss": 3.1373, "step": 108350 }, { "epoch": 31.567010309278352, "grad_norm": 0.42917540669441223, "learning_rate": 0.00022133216783216782, "loss": 3.1224, "step": 108400 }, { "epoch": 31.581571436892073, "grad_norm": 0.45333221554756165, "learning_rate": 0.00022115734265734264, "loss": 3.1347, "step": 108450 }, { "epoch": 31.596132564505794, "grad_norm": 0.441192626953125, "learning_rate": 0.00022098251748251747, "loss": 3.1299, "step": 108500 }, { "epoch": 31.61069369211952, "grad_norm": 0.4505492150783539, "learning_rate": 0.0002208076923076923, "loss": 3.1481, "step": 108550 }, { "epoch": 31.62525481973324, "grad_norm": 0.45278164744377136, "learning_rate": 0.00022063286713286712, "loss": 3.1286, "step": 108600 }, { "epoch": 31.639815947346964, "grad_norm": 0.45977330207824707, "learning_rate": 0.00022045804195804195, "loss": 3.1259, "step": 108650 }, { "epoch": 31.654377074960685, "grad_norm": 0.4866136908531189, "learning_rate": 0.00022028321678321678, "loss": 3.1395, "step": 108700 }, { "epoch": 31.668938202574406, "grad_norm": 0.4831617474555969, "learning_rate": 0.00022010839160839158, "loss": 3.131, "step": 108750 }, { "epoch": 31.68349933018813, "grad_norm": 0.43512704968452454, "learning_rate": 0.0002199335664335664, "loss": 3.1341, "step": 108800 }, { "epoch": 31.69806045780185, "grad_norm": 0.4579360783100128, "learning_rate": 0.00021975874125874123, "loss": 3.1367, "step": 108850 }, { "epoch": 31.712621585415576, "grad_norm": 0.44610920548439026, "learning_rate": 0.00021958391608391606, "loss": 3.1268, "step": 108900 }, { "epoch": 31.727182713029297, "grad_norm": 0.44888967275619507, "learning_rate": 0.00021940909090909088, "loss": 3.1372, "step": 108950 }, { "epoch": 31.741743840643018, "grad_norm": 0.4246661961078644, "learning_rate": 0.0002192342657342657, "loss": 3.1427, "step": 109000 }, { "epoch": 31.741743840643018, "eval_accuracy": 0.37508079490192103, "eval_loss": 3.542057514190674, "eval_runtime": 180.5188, "eval_samples_per_second": 92.212, "eval_steps_per_second": 5.767, "step": 109000 }, { "epoch": 31.741743840643018, "step": 109000, "total_flos": 2.27832542134272e+18, "train_loss": 0.8390047937795656, "train_runtime": 58351.8889, "train_samples_per_second": 235.382, "train_steps_per_second": 2.942 } ], "logging_steps": 50, "max_steps": 171700, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 10000, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 20, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 11 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.27832542134272e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }