diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,5009 +1,74 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.5000436033836225, + "epoch": 0.10136157337367625, "eval_steps": 100, - "global_step": 5734, + "global_step": 67, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0008720676724513823, - "grad_norm": 9.447980880737305, - "learning_rate": 3.4843205574912896e-07, - "loss": 3.1, - "mean_token_accuracy": 0.4386619344353676, + "epoch": 0.015128593040847202, + "grad_norm": 33.7647987029562, + "learning_rate": 1.9876883405951378e-05, + "loss": 7.125, + "mean_token_accuracy": 0.14012572318315505, "step": 10 }, { - "epoch": 0.0017441353449027645, - "grad_norm": 10.540608406066895, - "learning_rate": 6.968641114982579e-07, - "loss": 3.1164, - "mean_token_accuracy": 0.42992905974388124, + "epoch": 0.030257186081694403, + "grad_norm": 10.612758737576552, + "learning_rate": 1.777145961456971e-05, + "loss": 5.5062, + "mean_token_accuracy": 0.1818697050213814, "step": 20 }, { - "epoch": 0.002616203017354147, - "grad_norm": 12.786429405212402, - "learning_rate": 1.045296167247387e-06, - "loss": 3.1391, - "mean_token_accuracy": 0.4331947162747383, + "epoch": 0.0453857791225416, + "grad_norm": 5.133709750875046, + "learning_rate": 1.3583679495453e-05, + "loss": 4.6188, + "mean_token_accuracy": 0.24615719988942147, "step": 30 }, { - "epoch": 0.003488270689805529, - "grad_norm": 9.747709274291992, - "learning_rate": 1.3937282229965158e-06, - "loss": 3.0719, - "mean_token_accuracy": 0.4360322892665863, + "epoch": 0.060514372163388806, + "grad_norm": 3.7566168018221653, + "learning_rate": 8.43565534959769e-06, + "loss": 4.1492, + "mean_token_accuracy": 0.2883812889456749, "step": 40 }, { - "epoch": 0.004360338362256911, - "grad_norm": 8.716520309448242, - "learning_rate": 1.742160278745645e-06, - "loss": 3.0047, - "mean_token_accuracy": 0.4452299430966377, + "epoch": 0.07564296520423601, + "grad_norm": 3.0972835436341253, + "learning_rate": 3.7067960895016277e-06, + "loss": 3.9453, + "mean_token_accuracy": 0.30843553096055987, "step": 50 }, { - "epoch": 0.005232406034708294, - "grad_norm": 13.018689155578613, - "learning_rate": 2.090592334494774e-06, - "loss": 3.0695, - "mean_token_accuracy": 0.4376590013504028, + "epoch": 0.0907715582450832, + "grad_norm": 2.978044733724223, + "learning_rate": 6.641957350279838e-07, + "loss": 3.8914, + "mean_token_accuracy": 0.3149686068296432, "step": 60 }, { - "epoch": 0.0061044737071596755, - "grad_norm": 7.014005184173584, - "learning_rate": 2.4390243902439027e-06, - "loss": 3.075, - "mean_token_accuracy": 0.4348948121070862, - "step": 70 - }, - { - "epoch": 0.006976541379611058, - "grad_norm": 7.294459342956543, - "learning_rate": 2.7874564459930316e-06, - "loss": 3.0641, - "mean_token_accuracy": 0.43597113639116286, - "step": 80 - }, - { - "epoch": 0.00784860905206244, - "grad_norm": 7.461134910583496, - "learning_rate": 3.13588850174216e-06, - "loss": 3.0187, - "mean_token_accuracy": 0.4360322907567024, - "step": 90 - }, - { - "epoch": 0.008720676724513822, - "grad_norm": 12.412550926208496, - "learning_rate": 3.48432055749129e-06, - "loss": 2.9297, - "mean_token_accuracy": 0.4520425647497177, - "step": 100 - }, - { - "epoch": 0.008720676724513822, - "eval_runtime": 4.1147, - "eval_samples_per_second": 266.605, - "eval_steps_per_second": 66.833, - "step": 100 - }, - { - "epoch": 0.009592744396965204, - "grad_norm": 6.273736000061035, - "learning_rate": 3.832752613240418e-06, - "loss": 2.8766, - "mean_token_accuracy": 0.4566536217927933, - "step": 110 - }, - { - "epoch": 0.010464812069416587, - "grad_norm": 9.404972076416016, - "learning_rate": 4.181184668989548e-06, - "loss": 2.8844, - "mean_token_accuracy": 0.451137475669384, - "step": 120 - }, - { - "epoch": 0.01133687974186797, - "grad_norm": 12.689948081970215, - "learning_rate": 4.529616724738676e-06, - "loss": 2.8375, - "mean_token_accuracy": 0.4530821919441223, - "step": 130 - }, - { - "epoch": 0.012208947414319351, - "grad_norm": 17.347368240356445, - "learning_rate": 4.8780487804878055e-06, - "loss": 2.7617, - "mean_token_accuracy": 0.4619985356926918, - "step": 140 - }, - { - "epoch": 0.013081015086770733, - "grad_norm": 13.873700141906738, - "learning_rate": 5.226480836236935e-06, - "loss": 2.7266, - "mean_token_accuracy": 0.4662426620721817, - "step": 150 - }, - { - "epoch": 0.013953082759222116, - "grad_norm": 11.277148246765137, - "learning_rate": 5.574912891986063e-06, - "loss": 2.7508, - "mean_token_accuracy": 0.46210860908031465, - "step": 160 - }, - { - "epoch": 0.014825150431673498, - "grad_norm": 18.343442916870117, - "learning_rate": 5.923344947735193e-06, - "loss": 2.7602, - "mean_token_accuracy": 0.4614114463329315, - "step": 170 - }, - { - "epoch": 0.01569721810412488, - "grad_norm": 8.960555076599121, - "learning_rate": 6.27177700348432e-06, - "loss": 2.6992, - "mean_token_accuracy": 0.46870107650756837, - "step": 180 - }, - { - "epoch": 0.01656928577657626, - "grad_norm": 6.929897308349609, - "learning_rate": 6.62020905923345e-06, - "loss": 2.7313, - "mean_token_accuracy": 0.4594667300581932, - "step": 190 - }, - { - "epoch": 0.017441353449027645, - "grad_norm": 6.672097206115723, - "learning_rate": 6.96864111498258e-06, - "loss": 2.6555, - "mean_token_accuracy": 0.476895797252655, - "step": 200 - }, - { - "epoch": 0.017441353449027645, - "eval_runtime": 4.0899, - "eval_samples_per_second": 268.22, - "eval_steps_per_second": 67.238, - "step": 200 - }, - { - "epoch": 0.018313421121479028, - "grad_norm": 5.700255393981934, - "learning_rate": 7.317073170731707e-06, - "loss": 2.7086, - "mean_token_accuracy": 0.4632338538765907, - "step": 210 - }, - { - "epoch": 0.019185488793930408, - "grad_norm": 3.922362804412842, - "learning_rate": 7.665505226480837e-06, - "loss": 2.6164, - "mean_token_accuracy": 0.47902397215366366, - "step": 220 - }, - { - "epoch": 0.02005755646638179, - "grad_norm": 3.947789192199707, - "learning_rate": 8.013937282229966e-06, - "loss": 2.625, - "mean_token_accuracy": 0.4705968677997589, - "step": 230 - }, - { - "epoch": 0.020929624138833175, - "grad_norm": 8.670129776000977, - "learning_rate": 8.362369337979095e-06, - "loss": 2.6398, - "mean_token_accuracy": 0.46730675250291825, - "step": 240 - }, - { - "epoch": 0.021801691811284555, - "grad_norm": 3.2554168701171875, - "learning_rate": 8.710801393728223e-06, - "loss": 2.5383, - "mean_token_accuracy": 0.4835616409778595, - "step": 250 - }, - { - "epoch": 0.02267375948373594, - "grad_norm": 2.8314220905303955, - "learning_rate": 9.059233449477352e-06, - "loss": 2.5797, - "mean_token_accuracy": 0.4750244602560997, - "step": 260 - }, - { - "epoch": 0.023545827156187322, - "grad_norm": 2.4087440967559814, - "learning_rate": 9.407665505226482e-06, - "loss": 2.4922, - "mean_token_accuracy": 0.49085127413272855, - "step": 270 - }, - { - "epoch": 0.024417894828638702, - "grad_norm": 2.769911050796509, - "learning_rate": 9.756097560975611e-06, - "loss": 2.5422, - "mean_token_accuracy": 0.4785714283585548, - "step": 280 - }, - { - "epoch": 0.025289962501090085, - "grad_norm": 2.791604995727539, - "learning_rate": 1.0104529616724739e-05, - "loss": 2.5086, - "mean_token_accuracy": 0.4822040110826492, - "step": 290 - }, - { - "epoch": 0.026162030173541465, - "grad_norm": 2.704972267150879, - "learning_rate": 1.045296167247387e-05, - "loss": 2.5102, - "mean_token_accuracy": 0.4795988291501999, - "step": 300 - }, - { - "epoch": 0.026162030173541465, - "eval_runtime": 4.0813, - "eval_samples_per_second": 268.787, - "eval_steps_per_second": 67.38, - "step": 300 - }, - { - "epoch": 0.02703409784599285, - "grad_norm": 3.0186307430267334, - "learning_rate": 1.0801393728222997e-05, - "loss": 2.5078, - "mean_token_accuracy": 0.4815802350640297, - "step": 310 - }, - { - "epoch": 0.027906165518444232, - "grad_norm": 2.475856304168701, - "learning_rate": 1.1149825783972127e-05, - "loss": 2.5023, - "mean_token_accuracy": 0.47424168437719344, - "step": 320 - }, - { - "epoch": 0.028778233190895612, - "grad_norm": 4.115312099456787, - "learning_rate": 1.1498257839721256e-05, - "loss": 2.4344, - "mean_token_accuracy": 0.4911325842142105, - "step": 330 - }, - { - "epoch": 0.029650300863346996, - "grad_norm": 2.9208383560180664, - "learning_rate": 1.1846689895470385e-05, - "loss": 2.4531, - "mean_token_accuracy": 0.49026418924331666, - "step": 340 - }, - { - "epoch": 0.03052236853579838, - "grad_norm": 2.760439395904541, - "learning_rate": 1.2195121951219513e-05, - "loss": 2.4133, - "mean_token_accuracy": 0.49324853271245955, - "step": 350 - }, - { - "epoch": 0.03139443620824976, - "grad_norm": 2.626922130584717, - "learning_rate": 1.254355400696864e-05, - "loss": 2.4203, - "mean_token_accuracy": 0.4946550831198692, - "step": 360 - }, - { - "epoch": 0.03226650388070114, - "grad_norm": 2.866455316543579, - "learning_rate": 1.2891986062717772e-05, - "loss": 2.4109, - "mean_token_accuracy": 0.49146282225847243, - "step": 370 - }, - { - "epoch": 0.03313857155315252, - "grad_norm": 3.003437042236328, - "learning_rate": 1.32404181184669e-05, - "loss": 2.3906, - "mean_token_accuracy": 0.49352983981370924, - "step": 380 - }, - { - "epoch": 0.034010639225603906, - "grad_norm": 2.803565263748169, - "learning_rate": 1.3588850174216028e-05, - "loss": 2.3805, - "mean_token_accuracy": 0.5005626231431961, - "step": 390 - }, - { - "epoch": 0.03488270689805529, - "grad_norm": 2.4779348373413086, - "learning_rate": 1.393728222996516e-05, - "loss": 2.3168, - "mean_token_accuracy": 0.5088918745517731, - "step": 400 - }, - { - "epoch": 0.03488270689805529, - "eval_runtime": 4.0964, - "eval_samples_per_second": 267.798, - "eval_steps_per_second": 67.133, - "step": 400 - }, - { - "epoch": 0.03575477457050667, - "grad_norm": 2.852496862411499, - "learning_rate": 1.4285714285714287e-05, - "loss": 2.3742, - "mean_token_accuracy": 0.49708903580904007, - "step": 410 - }, - { - "epoch": 0.036626842242958056, - "grad_norm": 2.6764421463012695, - "learning_rate": 1.4634146341463415e-05, - "loss": 2.3203, - "mean_token_accuracy": 0.5075097888708114, - "step": 420 - }, - { - "epoch": 0.03749890991540943, - "grad_norm": 2.5645339488983154, - "learning_rate": 1.4982578397212544e-05, - "loss": 2.3375, - "mean_token_accuracy": 0.5012964725494384, - "step": 430 - }, - { - "epoch": 0.038370977587860816, - "grad_norm": 3.382863759994507, - "learning_rate": 1.5331010452961673e-05, - "loss": 2.3125, - "mean_token_accuracy": 0.5063478454947472, - "step": 440 - }, - { - "epoch": 0.0392430452603122, - "grad_norm": 2.7387797832489014, - "learning_rate": 1.5679442508710803e-05, - "loss": 2.2758, - "mean_token_accuracy": 0.5121942207217216, - "step": 450 - }, - { - "epoch": 0.04011511293276358, - "grad_norm": 2.809790849685669, - "learning_rate": 1.6027874564459932e-05, - "loss": 2.2953, - "mean_token_accuracy": 0.5054916888475418, - "step": 460 - }, - { - "epoch": 0.040987180605214966, - "grad_norm": 2.2248854637145996, - "learning_rate": 1.637630662020906e-05, - "loss": 2.3219, - "mean_token_accuracy": 0.5051492154598236, - "step": 470 - }, - { - "epoch": 0.04185924827766635, - "grad_norm": 2.1804261207580566, - "learning_rate": 1.672473867595819e-05, - "loss": 2.2859, - "mean_token_accuracy": 0.5125611573457718, - "step": 480 - }, - { - "epoch": 0.042731315950117726, - "grad_norm": 2.3304409980773926, - "learning_rate": 1.7073170731707317e-05, - "loss": 2.2621, - "mean_token_accuracy": 0.5109833672642707, - "step": 490 - }, - { - "epoch": 0.04360338362256911, - "grad_norm": 2.4848010540008545, - "learning_rate": 1.7421602787456446e-05, - "loss": 2.3117, - "mean_token_accuracy": 0.5048434436321259, - "step": 500 - }, - { - "epoch": 0.04360338362256911, - "eval_runtime": 4.0941, - "eval_samples_per_second": 267.946, - "eval_steps_per_second": 67.17, - "step": 500 - }, - { - "epoch": 0.04447545129502049, - "grad_norm": 2.473914623260498, - "learning_rate": 1.7770034843205575e-05, - "loss": 2.2977, - "mean_token_accuracy": 0.5050635993480682, - "step": 510 - }, - { - "epoch": 0.04534751896747188, - "grad_norm": 2.530372142791748, - "learning_rate": 1.8118466898954705e-05, - "loss": 2.2805, - "mean_token_accuracy": 0.5075097844004631, - "step": 520 - }, - { - "epoch": 0.04621958663992326, - "grad_norm": 3.098284959793091, - "learning_rate": 1.8466898954703834e-05, - "loss": 2.307, - "mean_token_accuracy": 0.5051003009080887, - "step": 530 - }, - { - "epoch": 0.047091654312374644, - "grad_norm": 2.2857539653778076, - "learning_rate": 1.8815331010452963e-05, - "loss": 2.2883, - "mean_token_accuracy": 0.5091365054249763, - "step": 540 - }, - { - "epoch": 0.04796372198482602, - "grad_norm": 2.7540576457977295, - "learning_rate": 1.9163763066202093e-05, - "loss": 2.2352, - "mean_token_accuracy": 0.5152764156460762, - "step": 550 - }, - { - "epoch": 0.048835789657277404, - "grad_norm": 2.501436233520508, - "learning_rate": 1.9512195121951222e-05, - "loss": 2.2785, - "mean_token_accuracy": 0.508378179371357, - "step": 560 - }, - { - "epoch": 0.04970785732972879, - "grad_norm": 2.658519983291626, - "learning_rate": 1.9860627177700348e-05, - "loss": 2.2617, - "mean_token_accuracy": 0.5109222128987312, - "step": 570 - }, - { - "epoch": 0.05057992500218017, - "grad_norm": 2.585157871246338, - "learning_rate": 1.9999933277491715e-05, - "loss": 2.243, - "mean_token_accuracy": 0.516621820628643, - "step": 580 - }, - { - "epoch": 0.051451992674631554, - "grad_norm": 2.668612480163574, - "learning_rate": 1.999952553205438e-05, - "loss": 2.2328, - "mean_token_accuracy": 0.515936890244484, - "step": 590 - }, - { - "epoch": 0.05232406034708293, - "grad_norm": 2.563499927520752, - "learning_rate": 1.999874712433585e-05, - "loss": 2.1949, - "mean_token_accuracy": 0.525648233294487, - "step": 600 - }, - { - "epoch": 0.05232406034708293, - "eval_runtime": 4.0875, - "eval_samples_per_second": 268.377, - "eval_steps_per_second": 67.278, - "step": 600 - }, - { - "epoch": 0.053196128019534314, - "grad_norm": 2.2233502864837646, - "learning_rate": 1.999759808319013e-05, - "loss": 2.198, - "mean_token_accuracy": 0.5231653615832329, - "step": 610 - }, - { - "epoch": 0.0540681956919857, - "grad_norm": 2.592988967895508, - "learning_rate": 1.9996078451209863e-05, - "loss": 2.2395, - "mean_token_accuracy": 0.514420248568058, - "step": 620 - }, - { - "epoch": 0.05494026336443708, - "grad_norm": 2.5479214191436768, - "learning_rate": 1.999418828472475e-05, - "loss": 2.2297, - "mean_token_accuracy": 0.5147994101047516, - "step": 630 - }, - { - "epoch": 0.055812331036888464, - "grad_norm": 2.3214004039764404, - "learning_rate": 1.9991927653799458e-05, - "loss": 2.1664, - "mean_token_accuracy": 0.5280332669615746, - "step": 640 - }, - { - "epoch": 0.05668439870933985, - "grad_norm": 2.183429002761841, - "learning_rate": 1.998929664223102e-05, - "loss": 2.2062, - "mean_token_accuracy": 0.5173067569732666, - "step": 650 - }, - { - "epoch": 0.057556466381791224, - "grad_norm": 2.5746781826019287, - "learning_rate": 1.9986295347545738e-05, - "loss": 2.1824, - "mean_token_accuracy": 0.5247309163212777, - "step": 660 - }, - { - "epoch": 0.05842853405424261, - "grad_norm": 2.3568668365478516, - "learning_rate": 1.998292388099557e-05, - "loss": 2.2656, - "mean_token_accuracy": 0.5104941308498383, - "step": 670 - }, - { - "epoch": 0.05930060172669399, - "grad_norm": 2.1310341358184814, - "learning_rate": 1.9979182367553994e-05, - "loss": 2.1855, - "mean_token_accuracy": 0.5254403069615364, - "step": 680 - }, - { - "epoch": 0.060172669399145375, - "grad_norm": 2.3862550258636475, - "learning_rate": 1.997507094591137e-05, - "loss": 2.1898, - "mean_token_accuracy": 0.5196428626775742, - "step": 690 - }, - { - "epoch": 0.06104473707159676, - "grad_norm": 2.506866931915283, - "learning_rate": 1.9970589768469833e-05, - "loss": 2.216, - "mean_token_accuracy": 0.5200587093830109, - "step": 700 - }, - { - "epoch": 0.06104473707159676, - "eval_runtime": 4.0966, - "eval_samples_per_second": 267.786, - "eval_steps_per_second": 67.129, - "step": 700 - }, - { - "epoch": 0.06191680474404814, - "grad_norm": 2.2180984020233154, - "learning_rate": 1.996573900133761e-05, - "loss": 2.1742, - "mean_token_accuracy": 0.5204133987426758, - "step": 710 - }, - { - "epoch": 0.06278887241649952, - "grad_norm": 2.4005072116851807, - "learning_rate": 1.996051882432286e-05, - "loss": 2.1625, - "mean_token_accuracy": 0.5273116439580917, - "step": 720 - }, - { - "epoch": 0.0636609400889509, - "grad_norm": 2.260364294052124, - "learning_rate": 1.995492943092705e-05, - "loss": 2.1422, - "mean_token_accuracy": 0.5257705479860306, - "step": 730 - }, - { - "epoch": 0.06453300776140228, - "grad_norm": 2.158372640609741, - "learning_rate": 1.9948971028337737e-05, - "loss": 2.1773, - "mean_token_accuracy": 0.5222725048661232, - "step": 740 - }, - { - "epoch": 0.06540507543385367, - "grad_norm": 2.3536641597747803, - "learning_rate": 1.9942643837420904e-05, - "loss": 2.1754, - "mean_token_accuracy": 0.5249021515250206, - "step": 750 - }, - { - "epoch": 0.06627714310630504, - "grad_norm": 2.374206066131592, - "learning_rate": 1.9935948092712792e-05, - "loss": 2.1254, - "mean_token_accuracy": 0.5323140889406204, - "step": 760 - }, - { - "epoch": 0.06714921077875644, - "grad_norm": 2.4303531646728516, - "learning_rate": 1.992888404241117e-05, - "loss": 2.2102, - "mean_token_accuracy": 0.5182363063097, - "step": 770 - }, - { - "epoch": 0.06802127845120781, - "grad_norm": 2.266817092895508, - "learning_rate": 1.992145194836616e-05, - "loss": 2.168, - "mean_token_accuracy": 0.5266511738300323, - "step": 780 - }, - { - "epoch": 0.0688933461236592, - "grad_norm": 2.6725378036499023, - "learning_rate": 1.9913652086070535e-05, - "loss": 2.1617, - "mean_token_accuracy": 0.5230308219790458, - "step": 790 - }, - { - "epoch": 0.06976541379611058, - "grad_norm": 2.221536874771118, - "learning_rate": 1.9905484744649484e-05, - "loss": 2.2039, - "mean_token_accuracy": 0.5158145785331726, - "step": 800 - }, - { - "epoch": 0.06976541379611058, - "eval_runtime": 4.0799, - "eval_samples_per_second": 268.88, - "eval_steps_per_second": 67.404, - "step": 800 - }, - { - "epoch": 0.07063748146856196, - "grad_norm": 2.5027382373809814, - "learning_rate": 1.989695022684991e-05, - "loss": 2.1441, - "mean_token_accuracy": 0.5249999985098839, - "step": 810 - }, - { - "epoch": 0.07150954914101335, - "grad_norm": 2.2361056804656982, - "learning_rate": 1.988804884902921e-05, - "loss": 2.1254, - "mean_token_accuracy": 0.5334760263562203, - "step": 820 - }, - { - "epoch": 0.07238161681346472, - "grad_norm": 2.2107291221618652, - "learning_rate": 1.9878780941143538e-05, - "loss": 2.1406, - "mean_token_accuracy": 0.5259662419557571, - "step": 830 - }, - { - "epoch": 0.07325368448591611, - "grad_norm": 2.1947028636932373, - "learning_rate": 1.9869146846735576e-05, - "loss": 2.1145, - "mean_token_accuracy": 0.5310420781373978, - "step": 840 - }, - { - "epoch": 0.07412575215836749, - "grad_norm": 2.0034220218658447, - "learning_rate": 1.985914692292182e-05, - "loss": 2.159, - "mean_token_accuracy": 0.5246453016996384, - "step": 850 - }, - { - "epoch": 0.07499781983081887, - "grad_norm": 2.0088765621185303, - "learning_rate": 1.9848781540379312e-05, - "loss": 2.109, - "mean_token_accuracy": 0.5350293487310409, - "step": 860 - }, - { - "epoch": 0.07586988750327026, - "grad_norm": 2.2562777996063232, - "learning_rate": 1.983805108333191e-05, - "loss": 2.1594, - "mean_token_accuracy": 0.5257460832595825, - "step": 870 - }, - { - "epoch": 0.07674195517572163, - "grad_norm": 2.2073802947998047, - "learning_rate": 1.9826955949536062e-05, - "loss": 2.1336, - "mean_token_accuracy": 0.5285591945052147, - "step": 880 - }, - { - "epoch": 0.07761402284817302, - "grad_norm": 1.9717388153076172, - "learning_rate": 1.9815496550266036e-05, - "loss": 2.1453, - "mean_token_accuracy": 0.5289995044469833, - "step": 890 - }, - { - "epoch": 0.0784860905206244, - "grad_norm": 2.2082736492156982, - "learning_rate": 1.98036733102987e-05, - "loss": 2.1738, - "mean_token_accuracy": 0.5225561872124672, - "step": 900 - }, - { - "epoch": 0.0784860905206244, - "eval_runtime": 4.0793, - "eval_samples_per_second": 268.916, - "eval_steps_per_second": 67.413, - "step": 900 - }, - { - "epoch": 0.07935815819307579, - "grad_norm": 2.0694751739501953, - "learning_rate": 1.979148666789775e-05, - "loss": 2.1297, - "mean_token_accuracy": 0.5318370833992958, - "step": 910 - }, - { - "epoch": 0.08023022586552717, - "grad_norm": 2.11228346824646, - "learning_rate": 1.9778937074797494e-05, - "loss": 2.1348, - "mean_token_accuracy": 0.5269605636596679, - "step": 920 - }, - { - "epoch": 0.08110229353797854, - "grad_norm": 1.9903819561004639, - "learning_rate": 1.976602499618608e-05, - "loss": 2.1156, - "mean_token_accuracy": 0.5305161446332931, - "step": 930 - }, - { - "epoch": 0.08197436121042993, - "grad_norm": 2.335895538330078, - "learning_rate": 1.9752750910688278e-05, - "loss": 2.127, - "mean_token_accuracy": 0.5351394325494766, - "step": 940 - }, - { - "epoch": 0.08284642888288131, - "grad_norm": 2.282573938369751, - "learning_rate": 1.9739115310347698e-05, - "loss": 2.0879, - "mean_token_accuracy": 0.5351516619324684, - "step": 950 - }, - { - "epoch": 0.0837184965553327, - "grad_norm": 2.094139814376831, - "learning_rate": 1.972511870060861e-05, - "loss": 2.1539, - "mean_token_accuracy": 0.5238869935274124, - "step": 960 - }, - { - "epoch": 0.08459056422778408, - "grad_norm": 2.2499539852142334, - "learning_rate": 1.9710761600297147e-05, - "loss": 2.1316, - "mean_token_accuracy": 0.5300513654947281, - "step": 970 - }, - { - "epoch": 0.08546263190023545, - "grad_norm": 2.038626194000244, - "learning_rate": 1.9696044541602126e-05, - "loss": 2.1668, - "mean_token_accuracy": 0.5199486300349235, - "step": 980 - }, - { - "epoch": 0.08633469957268684, - "grad_norm": 2.06842303276062, - "learning_rate": 1.968096807005528e-05, - "loss": 2.1141, - "mean_token_accuracy": 0.5352862074971199, - "step": 990 - }, - { - "epoch": 0.08720676724513822, - "grad_norm": 1.8886855840682983, - "learning_rate": 1.966553274451106e-05, - "loss": 2.0754, - "mean_token_accuracy": 0.5416462823748589, - "step": 1000 - }, - { - "epoch": 0.08720676724513822, - "eval_runtime": 4.0932, - "eval_samples_per_second": 268.008, - "eval_steps_per_second": 67.185, - "step": 1000 - }, - { - "epoch": 0.08807883491758961, - "grad_norm": 2.0366406440734863, - "learning_rate": 1.964973913712591e-05, - "loss": 2.1184, - "mean_token_accuracy": 0.5309564560651779, - "step": 1010 - }, - { - "epoch": 0.08895090259004099, - "grad_norm": 2.1524252891540527, - "learning_rate": 1.9633587833337064e-05, - "loss": 2.0801, - "mean_token_accuracy": 0.5336472600698471, - "step": 1020 - }, - { - "epoch": 0.08982297026249236, - "grad_norm": 2.390187978744507, - "learning_rate": 1.961707943184083e-05, - "loss": 2.1164, - "mean_token_accuracy": 0.5267612487077713, - "step": 1030 - }, - { - "epoch": 0.09069503793494375, - "grad_norm": 2.111487627029419, - "learning_rate": 1.9600214544570432e-05, - "loss": 2.0734, - "mean_token_accuracy": 0.539567020535469, - "step": 1040 - }, - { - "epoch": 0.09156710560739513, - "grad_norm": 2.078021287918091, - "learning_rate": 1.958299379667328e-05, - "loss": 2.1063, - "mean_token_accuracy": 0.5260029315948487, - "step": 1050 - }, - { - "epoch": 0.09243917327984652, - "grad_norm": 1.886529564857483, - "learning_rate": 1.9565417826487835e-05, - "loss": 2.0305, - "mean_token_accuracy": 0.5494373708963394, - "step": 1060 - }, - { - "epoch": 0.0933112409522979, - "grad_norm": 2.315904140472412, - "learning_rate": 1.9547487285519922e-05, - "loss": 2.1324, - "mean_token_accuracy": 0.5291678056120872, - "step": 1070 - }, - { - "epoch": 0.09418330862474929, - "grad_norm": 1.985088586807251, - "learning_rate": 1.952920283841861e-05, - "loss": 2.1047, - "mean_token_accuracy": 0.533170260488987, - "step": 1080 - }, - { - "epoch": 0.09505537629720066, - "grad_norm": 1.998026728630066, - "learning_rate": 1.9510565162951538e-05, - "loss": 2.082, - "mean_token_accuracy": 0.533549402654171, - "step": 1090 - }, - { - "epoch": 0.09592744396965204, - "grad_norm": 2.08966326713562, - "learning_rate": 1.9491574949979814e-05, - "loss": 2.1031, - "mean_token_accuracy": 0.534796966612339, - "step": 1100 - }, - { - "epoch": 0.09592744396965204, - "eval_runtime": 4.093, - "eval_samples_per_second": 268.018, - "eval_steps_per_second": 67.188, - "step": 1100 - }, - { - "epoch": 0.09679951164210343, - "grad_norm": 2.154848575592041, - "learning_rate": 1.9472232903432406e-05, - "loss": 2.0398, - "mean_token_accuracy": 0.5479085132479667, - "step": 1110 - }, - { - "epoch": 0.09767157931455481, - "grad_norm": 1.974173665046692, - "learning_rate": 1.945253974028004e-05, - "loss": 2.0895, - "mean_token_accuracy": 0.5327544078230858, - "step": 1120 - }, - { - "epoch": 0.0985436469870062, - "grad_norm": 2.005913257598877, - "learning_rate": 1.9432496190508633e-05, - "loss": 2.0445, - "mean_token_accuracy": 0.5410469681024551, - "step": 1130 - }, - { - "epoch": 0.09941571465945757, - "grad_norm": 1.9923633337020874, - "learning_rate": 1.941210299709222e-05, - "loss": 2.0875, - "mean_token_accuracy": 0.5345401093363762, - "step": 1140 - }, - { - "epoch": 0.10028778233190895, - "grad_norm": 1.9088701009750366, - "learning_rate": 1.9391360915965426e-05, - "loss": 2.0762, - "mean_token_accuracy": 0.5394814103841782, - "step": 1150 - }, - { - "epoch": 0.10115985000436034, - "grad_norm": 2.402848482131958, - "learning_rate": 1.9370270715995447e-05, - "loss": 2.1145, - "mean_token_accuracy": 0.5307363018393516, - "step": 1160 - }, - { - "epoch": 0.10203191767681172, - "grad_norm": 2.3139448165893555, - "learning_rate": 1.934883317895354e-05, - "loss": 2.0543, - "mean_token_accuracy": 0.5443982392549515, - "step": 1170 - }, - { - "epoch": 0.10290398534926311, - "grad_norm": 2.0338592529296875, - "learning_rate": 1.932704909948604e-05, - "loss": 2.0898, - "mean_token_accuracy": 0.5323018610477448, - "step": 1180 - }, - { - "epoch": 0.10377605302171448, - "grad_norm": 1.8027479648590088, - "learning_rate": 1.930491928508492e-05, - "loss": 2.0109, - "mean_token_accuracy": 0.5482876777648926, - "step": 1190 - }, - { - "epoch": 0.10464812069416586, - "grad_norm": 1.8102346658706665, - "learning_rate": 1.9282444556057855e-05, - "loss": 2.0559, - "mean_token_accuracy": 0.5385151654481888, - "step": 1200 - }, - { - "epoch": 0.10464812069416586, - "eval_runtime": 4.0911, - "eval_samples_per_second": 268.144, - "eval_steps_per_second": 67.219, - "step": 1200 - }, - { - "epoch": 0.10552018836661725, - "grad_norm": 2.1177432537078857, - "learning_rate": 1.9259625745497803e-05, - "loss": 2.0148, - "mean_token_accuracy": 0.5481898218393326, - "step": 1210 - }, - { - "epoch": 0.10639225603906863, - "grad_norm": 1.9435949325561523, - "learning_rate": 1.9236463699252136e-05, - "loss": 2.0629, - "mean_token_accuracy": 0.5404109612107277, - "step": 1220 - }, - { - "epoch": 0.10726432371152002, - "grad_norm": 2.074747085571289, - "learning_rate": 1.921295927589127e-05, - "loss": 2.0172, - "mean_token_accuracy": 0.5436521500349045, - "step": 1230 - }, - { - "epoch": 0.1081363913839714, - "grad_norm": 2.03115177154541, - "learning_rate": 1.9189113346676878e-05, - "loss": 2.0512, - "mean_token_accuracy": 0.5414138853549957, - "step": 1240 - }, - { - "epoch": 0.10900845905642279, - "grad_norm": 1.9985569715499878, - "learning_rate": 1.916492679552954e-05, - "loss": 2.0254, - "mean_token_accuracy": 0.5437255263328552, - "step": 1250 - }, - { - "epoch": 0.10988052672887416, - "grad_norm": 2.2252185344696045, - "learning_rate": 1.914040051899602e-05, - "loss": 2.0141, - "mean_token_accuracy": 0.5530821919441223, - "step": 1260 - }, - { - "epoch": 0.11075259440132554, - "grad_norm": 1.8892325162887573, - "learning_rate": 1.9115535426216018e-05, - "loss": 2.0156, - "mean_token_accuracy": 0.5499510824680328, - "step": 1270 - }, - { - "epoch": 0.11162466207377693, - "grad_norm": 1.7789946794509888, - "learning_rate": 1.9090332438888458e-05, - "loss": 2.027, - "mean_token_accuracy": 0.5451076358556748, - "step": 1280 - }, - { - "epoch": 0.1124967297462283, - "grad_norm": 1.7884410619735718, - "learning_rate": 1.906479249123735e-05, - "loss": 2.0156, - "mean_token_accuracy": 0.5463307216763497, - "step": 1290 - }, - { - "epoch": 0.1133687974186797, - "grad_norm": 1.8309537172317505, - "learning_rate": 1.9038916529977136e-05, - "loss": 2.0625, - "mean_token_accuracy": 0.541059197485447, - "step": 1300 - }, - { - "epoch": 0.1133687974186797, - "eval_runtime": 4.0859, - "eval_samples_per_second": 268.486, - "eval_steps_per_second": 67.305, - "step": 1300 - }, - { - "epoch": 0.11424086509113107, - "grad_norm": 1.9085613489151, - "learning_rate": 1.901270551427761e-05, - "loss": 2.0309, - "mean_token_accuracy": 0.5424412995576858, - "step": 1310 - }, - { - "epoch": 0.11511293276358245, - "grad_norm": 1.9727972745895386, - "learning_rate": 1.898616041572836e-05, - "loss": 2.0727, - "mean_token_accuracy": 0.5358610570430755, - "step": 1320 - }, - { - "epoch": 0.11598500043603384, - "grad_norm": 1.707030177116394, - "learning_rate": 1.8959282218302746e-05, - "loss": 1.9707, - "mean_token_accuracy": 0.5605675145983696, - "step": 1330 - }, - { - "epoch": 0.11685706810848522, - "grad_norm": 2.2144105434417725, - "learning_rate": 1.893207191832144e-05, - "loss": 2.0105, - "mean_token_accuracy": 0.5472480446100235, - "step": 1340 - }, - { - "epoch": 0.1177291357809366, - "grad_norm": 2.021352529525757, - "learning_rate": 1.8904530524415483e-05, - "loss": 2.0809, - "mean_token_accuracy": 0.533280324935913, - "step": 1350 - }, - { - "epoch": 0.11860120345338798, - "grad_norm": 2.0539307594299316, - "learning_rate": 1.8876659057488905e-05, - "loss": 2.0039, - "mean_token_accuracy": 0.5503424674272537, - "step": 1360 - }, - { - "epoch": 0.11947327112583936, - "grad_norm": 2.1200454235076904, - "learning_rate": 1.8848458550680875e-05, - "loss": 2.0027, - "mean_token_accuracy": 0.5509173214435578, - "step": 1370 - }, - { - "epoch": 0.12034533879829075, - "grad_norm": 1.8671112060546875, - "learning_rate": 1.8819930049327412e-05, - "loss": 2.0055, - "mean_token_accuracy": 0.5503302395343781, - "step": 1380 - }, - { - "epoch": 0.12121740647074213, - "grad_norm": 1.9496164321899414, - "learning_rate": 1.8791074610922624e-05, - "loss": 2.0184, - "mean_token_accuracy": 0.5513209402561188, - "step": 1390 - }, - { - "epoch": 0.12208947414319352, - "grad_norm": 1.8682540655136108, - "learning_rate": 1.8761893305079528e-05, - "loss": 2.0645, - "mean_token_accuracy": 0.5349070489406585, - "step": 1400 - }, - { - "epoch": 0.12208947414319352, - "eval_runtime": 4.0891, - "eval_samples_per_second": 268.275, - "eval_steps_per_second": 67.252, - "step": 1400 - }, - { - "epoch": 0.12296154181564489, - "grad_norm": 2.0305871963500977, - "learning_rate": 1.873238721349038e-05, - "loss": 2.0207, - "mean_token_accuracy": 0.54487524330616, - "step": 1410 - }, - { - "epoch": 0.12383360948809628, - "grad_norm": 2.1415932178497314, - "learning_rate": 1.8702557429886607e-05, - "loss": 2.0352, - "mean_token_accuracy": 0.5416462823748589, - "step": 1420 - }, - { - "epoch": 0.12470567716054766, - "grad_norm": 2.2434582710266113, - "learning_rate": 1.8672405059998228e-05, - "loss": 1.9727, - "mean_token_accuracy": 0.5558341443538666, - "step": 1430 - }, - { - "epoch": 0.12557774483299905, - "grad_norm": 1.7249352931976318, - "learning_rate": 1.8641931221512895e-05, - "loss": 2.0516, - "mean_token_accuracy": 0.5391511768102646, - "step": 1440 - }, - { - "epoch": 0.12644981250545043, - "grad_norm": 2.067841053009033, - "learning_rate": 1.8611137044034454e-05, - "loss": 2.0187, - "mean_token_accuracy": 0.5488992154598236, - "step": 1450 - }, - { - "epoch": 0.1273218801779018, - "grad_norm": 1.9240036010742188, - "learning_rate": 1.858002366904107e-05, - "loss": 2.0129, - "mean_token_accuracy": 0.5490704476833344, - "step": 1460 - }, - { - "epoch": 0.12819394785035318, - "grad_norm": 1.75105881690979, - "learning_rate": 1.854859224984292e-05, - "loss": 2.0531, - "mean_token_accuracy": 0.5430528372526169, - "step": 1470 - }, - { - "epoch": 0.12906601552280456, - "grad_norm": 1.7874717712402344, - "learning_rate": 1.851684395153944e-05, - "loss": 2.0074, - "mean_token_accuracy": 0.5472480416297912, - "step": 1480 - }, - { - "epoch": 0.12993808319525596, - "grad_norm": 2.0042412281036377, - "learning_rate": 1.8484779950976133e-05, - "loss": 2.0863, - "mean_token_accuracy": 0.5330234810709953, - "step": 1490 - }, - { - "epoch": 0.13081015086770734, - "grad_norm": 1.8129173517227173, - "learning_rate": 1.8452401436700954e-05, - "loss": 2.0531, - "mean_token_accuracy": 0.538759782910347, - "step": 1500 - }, - { - "epoch": 0.13081015086770734, - "eval_runtime": 4.0904, - "eval_samples_per_second": 268.189, - "eval_steps_per_second": 67.231, - "step": 1500 - }, - { - "epoch": 0.1316822185401587, - "grad_norm": 1.795745611190796, - "learning_rate": 1.8419709608920243e-05, - "loss": 2.0617, - "mean_token_accuracy": 0.5387597799301147, - "step": 1510 - }, - { - "epoch": 0.1325542862126101, - "grad_norm": 1.83107328414917, - "learning_rate": 1.8386705679454243e-05, - "loss": 1.9785, - "mean_token_accuracy": 0.5538282811641693, - "step": 1520 - }, - { - "epoch": 0.1334263538850615, - "grad_norm": 1.8500800132751465, - "learning_rate": 1.8353390871692176e-05, - "loss": 2.0379, - "mean_token_accuracy": 0.5416952073574066, - "step": 1530 - }, - { - "epoch": 0.13429842155751287, - "grad_norm": 1.7861286401748657, - "learning_rate": 1.8319766420546902e-05, - "loss": 2.023, - "mean_token_accuracy": 0.5422089010477066, - "step": 1540 - }, - { - "epoch": 0.13517048922996425, - "grad_norm": 1.8236924409866333, - "learning_rate": 1.8285833572409135e-05, - "loss": 2.0141, - "mean_token_accuracy": 0.5479941323399544, - "step": 1550 - }, - { - "epoch": 0.13604255690241562, - "grad_norm": 1.8954976797103882, - "learning_rate": 1.8251593585101243e-05, - "loss": 2.016, - "mean_token_accuracy": 0.5421477556228638, - "step": 1560 - }, - { - "epoch": 0.136914624574867, - "grad_norm": 1.7415026426315308, - "learning_rate": 1.821704772783063e-05, - "loss": 1.9969, - "mean_token_accuracy": 0.5512597829103469, - "step": 1570 - }, - { - "epoch": 0.1377866922473184, - "grad_norm": 1.6556979417800903, - "learning_rate": 1.818219728114267e-05, - "loss": 1.9957, - "mean_token_accuracy": 0.5486423671245575, - "step": 1580 - }, - { - "epoch": 0.13865875991976978, - "grad_norm": 1.6093050241470337, - "learning_rate": 1.8147043536873275e-05, - "loss": 2.0148, - "mean_token_accuracy": 0.5471379607915878, - "step": 1590 - }, - { - "epoch": 0.13953082759222116, - "grad_norm": 1.7889467477798462, - "learning_rate": 1.8111587798100974e-05, - "loss": 2.0305, - "mean_token_accuracy": 0.5468199610710144, - "step": 1600 - }, - { - "epoch": 0.13953082759222116, - "eval_runtime": 4.095, - "eval_samples_per_second": 267.891, - "eval_steps_per_second": 67.156, - "step": 1600 - }, - { - "epoch": 0.14040289526467253, - "grad_norm": 1.809481143951416, - "learning_rate": 1.807583137909862e-05, - "loss": 2.0063, - "mean_token_accuracy": 0.5502201586961746, - "step": 1610 - }, - { - "epoch": 0.1412749629371239, - "grad_norm": 1.8792319297790527, - "learning_rate": 1.8039775605284687e-05, - "loss": 1.9555, - "mean_token_accuracy": 0.5548312157392502, - "step": 1620 - }, - { - "epoch": 0.14214703060957531, - "grad_norm": 1.7202839851379395, - "learning_rate": 1.800342181317413e-05, - "loss": 2.0246, - "mean_token_accuracy": 0.5414016634225846, - "step": 1630 - }, - { - "epoch": 0.1430190982820267, - "grad_norm": 1.7062311172485352, - "learning_rate": 1.7966771350328825e-05, - "loss": 2.0168, - "mean_token_accuracy": 0.5433952987194062, - "step": 1640 - }, - { - "epoch": 0.14389116595447807, - "grad_norm": 1.8232218027114868, - "learning_rate": 1.7929825575307665e-05, - "loss": 2.0258, - "mean_token_accuracy": 0.5441658467054367, - "step": 1650 - }, - { - "epoch": 0.14476323362692944, - "grad_norm": 1.8427523374557495, - "learning_rate": 1.7892585857616144e-05, - "loss": 1.9937, - "mean_token_accuracy": 0.5491316005587578, - "step": 1660 - }, - { - "epoch": 0.14563530129938082, - "grad_norm": 1.8657242059707642, - "learning_rate": 1.785505357765563e-05, - "loss": 2.0148, - "mean_token_accuracy": 0.5445327788591385, - "step": 1670 - }, - { - "epoch": 0.14650736897183222, - "grad_norm": 1.589664101600647, - "learning_rate": 1.781723012667218e-05, - "loss": 1.9684, - "mean_token_accuracy": 0.5564212381839753, - "step": 1680 - }, - { - "epoch": 0.1473794366442836, - "grad_norm": 1.7547872066497803, - "learning_rate": 1.7779116906704986e-05, - "loss": 2.009, - "mean_token_accuracy": 0.5466242611408234, - "step": 1690 - }, - { - "epoch": 0.14825150431673498, - "grad_norm": 1.7682321071624756, - "learning_rate": 1.7740715330534383e-05, - "loss": 1.9891, - "mean_token_accuracy": 0.5518591061234475, - "step": 1700 - }, - { - "epoch": 0.14825150431673498, - "eval_runtime": 4.0897, - "eval_samples_per_second": 268.233, - "eval_steps_per_second": 67.242, - "step": 1700 - }, - { - "epoch": 0.14912357198918635, - "grad_norm": 1.9206633567810059, - "learning_rate": 1.770202682162949e-05, - "loss": 1.9539, - "mean_token_accuracy": 0.5583781778812409, - "step": 1710 - }, - { - "epoch": 0.14999563966163773, - "grad_norm": 1.6907927989959717, - "learning_rate": 1.7663052814095447e-05, - "loss": 1.9785, - "mean_token_accuracy": 0.5497431576251983, - "step": 1720 - }, - { - "epoch": 0.15086770733408914, - "grad_norm": 1.8036787509918213, - "learning_rate": 1.7623794752620255e-05, - "loss": 2.0391, - "mean_token_accuracy": 0.5395058691501617, - "step": 1730 - }, - { - "epoch": 0.1517397750065405, - "grad_norm": 1.7763701677322388, - "learning_rate": 1.7584254092421226e-05, - "loss": 2.0191, - "mean_token_accuracy": 0.5450587123632431, - "step": 1740 - }, - { - "epoch": 0.1526118426789919, - "grad_norm": 1.7862834930419922, - "learning_rate": 1.754443229919103e-05, - "loss": 1.9836, - "mean_token_accuracy": 0.5475905045866967, - "step": 1750 - }, - { - "epoch": 0.15348391035144326, - "grad_norm": 1.7891547679901123, - "learning_rate": 1.7504330849043373e-05, - "loss": 1.9965, - "mean_token_accuracy": 0.54887475669384, - "step": 1760 - }, - { - "epoch": 0.15435597802389464, - "grad_norm": 1.8181962966918945, - "learning_rate": 1.7463951228458288e-05, - "loss": 1.9965, - "mean_token_accuracy": 0.5471135050058364, - "step": 1770 - }, - { - "epoch": 0.15522804569634605, - "grad_norm": 1.6581006050109863, - "learning_rate": 1.7423294934227017e-05, - "loss": 2.0129, - "mean_token_accuracy": 0.543370833992958, - "step": 1780 - }, - { - "epoch": 0.15610011336879742, - "grad_norm": 1.7232521772384644, - "learning_rate": 1.7382363473396543e-05, - "loss": 2.0187, - "mean_token_accuracy": 0.546159490942955, - "step": 1790 - }, - { - "epoch": 0.1569721810412488, - "grad_norm": 1.7022972106933594, - "learning_rate": 1.734115836321372e-05, - "loss": 2.0207, - "mean_token_accuracy": 0.5500366926193238, - "step": 1800 - }, - { - "epoch": 0.1569721810412488, - "eval_runtime": 4.0834, - "eval_samples_per_second": 268.652, - "eval_steps_per_second": 67.347, - "step": 1800 - }, - { - "epoch": 0.15784424871370017, - "grad_norm": 1.8141206502914429, - "learning_rate": 1.7299681131069026e-05, - "loss": 1.9383, - "mean_token_accuracy": 0.5588184863328933, - "step": 1810 - }, - { - "epoch": 0.15871631638615158, - "grad_norm": 1.747899055480957, - "learning_rate": 1.725793331443996e-05, - "loss": 1.9492, - "mean_token_accuracy": 0.5605063617229462, - "step": 1820 - }, - { - "epoch": 0.15958838405860296, - "grad_norm": 1.6529823541641235, - "learning_rate": 1.7215916460834048e-05, - "loss": 2.0438, - "mean_token_accuracy": 0.5419153541326522, - "step": 1830 - }, - { - "epoch": 0.16046045173105433, - "grad_norm": 1.659896969795227, - "learning_rate": 1.7173632127731462e-05, - "loss": 2.0094, - "mean_token_accuracy": 0.544863010942936, - "step": 1840 - }, - { - "epoch": 0.1613325194035057, - "grad_norm": 1.7290623188018799, - "learning_rate": 1.7131081882527305e-05, - "loss": 1.9723, - "mean_token_accuracy": 0.551626718044281, - "step": 1850 - }, - { - "epoch": 0.16220458707595709, - "grad_norm": 1.8506168127059937, - "learning_rate": 1.708826730247351e-05, - "loss": 2.0238, - "mean_token_accuracy": 0.5429183006286621, - "step": 1860 - }, - { - "epoch": 0.1630766547484085, - "grad_norm": 1.7684937715530396, - "learning_rate": 1.704518997462037e-05, - "loss": 1.993, - "mean_token_accuracy": 0.5506360068917274, - "step": 1870 - }, - { - "epoch": 0.16394872242085987, - "grad_norm": 1.6402524709701538, - "learning_rate": 1.7001851495757708e-05, - "loss": 1.9516, - "mean_token_accuracy": 0.557167312502861, - "step": 1880 - }, - { - "epoch": 0.16482079009331124, - "grad_norm": 1.8503413200378418, - "learning_rate": 1.6958253472355687e-05, - "loss": 2.027, - "mean_token_accuracy": 0.5421355172991753, - "step": 1890 - }, - { - "epoch": 0.16569285776576262, - "grad_norm": 1.753427505493164, - "learning_rate": 1.6914397520505267e-05, - "loss": 2.0004, - "mean_token_accuracy": 0.5448997005820274, - "step": 1900 - }, - { - "epoch": 0.16569285776576262, - "eval_runtime": 4.0821, - "eval_samples_per_second": 268.731, - "eval_steps_per_second": 67.366, - "step": 1900 - }, - { - "epoch": 0.166564925438214, - "grad_norm": 1.6272205114364624, - "learning_rate": 1.6870285265858298e-05, - "loss": 1.9832, - "mean_token_accuracy": 0.5465875744819642, - "step": 1910 - }, - { - "epoch": 0.1674369931106654, - "grad_norm": 1.7397983074188232, - "learning_rate": 1.6825918343567257e-05, - "loss": 1.9766, - "mean_token_accuracy": 0.5532045066356659, - "step": 1920 - }, - { - "epoch": 0.16830906078311678, - "grad_norm": 1.591707468032837, - "learning_rate": 1.678129839822463e-05, - "loss": 1.9559, - "mean_token_accuracy": 0.5569227039813995, - "step": 1930 - }, - { - "epoch": 0.16918112845556815, - "grad_norm": 2.082350969314575, - "learning_rate": 1.673642708380198e-05, - "loss": 1.9668, - "mean_token_accuracy": 0.5507093966007233, - "step": 1940 - }, - { - "epoch": 0.17005319612801953, - "grad_norm": 1.788334608078003, - "learning_rate": 1.6691306063588583e-05, - "loss": 1.9852, - "mean_token_accuracy": 0.5441046953201294, - "step": 1950 - }, - { - "epoch": 0.1709252638004709, - "grad_norm": 1.6117647886276245, - "learning_rate": 1.6645937010129837e-05, - "loss": 1.9234, - "mean_token_accuracy": 0.5628913819789887, - "step": 1960 - }, - { - "epoch": 0.1717973314729223, - "grad_norm": 1.7404828071594238, - "learning_rate": 1.660032160516522e-05, - "loss": 1.9871, - "mean_token_accuracy": 0.5472969651222229, - "step": 1970 - }, - { - "epoch": 0.1726693991453737, - "grad_norm": 1.7610373497009277, - "learning_rate": 1.6554461539565953e-05, - "loss": 1.9852, - "mean_token_accuracy": 0.5505136966705322, - "step": 1980 - }, - { - "epoch": 0.17354146681782506, - "grad_norm": 1.7075769901275635, - "learning_rate": 1.650835851327236e-05, - "loss": 1.9492, - "mean_token_accuracy": 0.5550024449825287, - "step": 1990 - }, - { - "epoch": 0.17441353449027644, - "grad_norm": 1.6555006504058838, - "learning_rate": 1.6462014235230805e-05, - "loss": 1.9664, - "mean_token_accuracy": 0.5513454034924508, - "step": 2000 - }, - { - "epoch": 0.17441353449027644, - "eval_runtime": 4.0817, - "eval_samples_per_second": 268.76, - "eval_steps_per_second": 67.374, - "step": 2000 - }, - { - "epoch": 0.17528560216272782, - "grad_norm": 1.61020827293396, - "learning_rate": 1.641543042333038e-05, - "loss": 1.9262, - "mean_token_accuracy": 0.560702046751976, - "step": 2010 - }, - { - "epoch": 0.17615766983517922, - "grad_norm": 1.797007441520691, - "learning_rate": 1.636860880433922e-05, - "loss": 2.0184, - "mean_token_accuracy": 0.5454134061932564, - "step": 2020 - }, - { - "epoch": 0.1770297375076306, - "grad_norm": 1.764316201210022, - "learning_rate": 1.632155111384047e-05, - "loss": 2.0137, - "mean_token_accuracy": 0.5446917861700058, - "step": 2030 - }, - { - "epoch": 0.17790180518008197, - "grad_norm": 1.9471633434295654, - "learning_rate": 1.6274259096168e-05, - "loss": 1.9656, - "mean_token_accuracy": 0.5520547956228257, - "step": 2040 - }, - { - "epoch": 0.17877387285253335, - "grad_norm": 1.736385703086853, - "learning_rate": 1.622673450434169e-05, - "loss": 1.9652, - "mean_token_accuracy": 0.5534491121768952, - "step": 2050 - }, - { - "epoch": 0.17964594052498473, - "grad_norm": 1.6941977739334106, - "learning_rate": 1.6178979100002486e-05, - "loss": 1.9734, - "mean_token_accuracy": 0.5521893262863159, - "step": 2060 - }, - { - "epoch": 0.18051800819743613, - "grad_norm": 1.7591136693954468, - "learning_rate": 1.6130994653347096e-05, - "loss": 2.0137, - "mean_token_accuracy": 0.5424168288707734, - "step": 2070 - }, - { - "epoch": 0.1813900758698875, - "grad_norm": 1.646781325340271, - "learning_rate": 1.6082782943062355e-05, - "loss": 1.991, - "mean_token_accuracy": 0.5457681030035019, - "step": 2080 - }, - { - "epoch": 0.18226214354233888, - "grad_norm": 1.8365243673324585, - "learning_rate": 1.6034345756259303e-05, - "loss": 1.9172, - "mean_token_accuracy": 0.5654965698719024, - "step": 2090 - }, - { - "epoch": 0.18313421121479026, - "grad_norm": 1.591349720954895, - "learning_rate": 1.598568488840695e-05, - "loss": 1.9695, - "mean_token_accuracy": 0.5511130094528198, - "step": 2100 - }, - { - "epoch": 0.18313421121479026, - "eval_runtime": 4.085, - "eval_samples_per_second": 268.545, - "eval_steps_per_second": 67.32, - "step": 2100 - }, - { - "epoch": 0.18400627888724164, - "grad_norm": 1.6916700601577759, - "learning_rate": 1.5936802143265708e-05, - "loss": 1.984, - "mean_token_accuracy": 0.5461961805820466, - "step": 2110 - }, - { - "epoch": 0.18487834655969304, - "grad_norm": 1.5550907850265503, - "learning_rate": 1.5887699332820527e-05, - "loss": 1.9746, - "mean_token_accuracy": 0.5480171293020248, - "step": 2120 - }, - { - "epoch": 0.18575041423214442, - "grad_norm": 1.6250457763671875, - "learning_rate": 1.5838378277213745e-05, - "loss": 1.9629, - "mean_token_accuracy": 0.5539383560419082, - "step": 2130 - }, - { - "epoch": 0.1866224819045958, - "grad_norm": 1.5899889469146729, - "learning_rate": 1.57888408046776e-05, - "loss": 1.9738, - "mean_token_accuracy": 0.5491193801164627, - "step": 2140 - }, - { - "epoch": 0.18749454957704717, - "grad_norm": 1.6946805715560913, - "learning_rate": 1.573908875146648e-05, - "loss": 1.9484, - "mean_token_accuracy": 0.5561888456344605, - "step": 2150 - }, - { - "epoch": 0.18836661724949857, - "grad_norm": 1.6672033071517944, - "learning_rate": 1.5689123961788834e-05, - "loss": 1.9758, - "mean_token_accuracy": 0.551149708032608, - "step": 2160 - }, - { - "epoch": 0.18923868492194995, - "grad_norm": 1.6237884759902954, - "learning_rate": 1.563894828773883e-05, - "loss": 1.9566, - "mean_token_accuracy": 0.5554672122001648, - "step": 2170 - }, - { - "epoch": 0.19011075259440133, - "grad_norm": 1.603264570236206, - "learning_rate": 1.55885635892277e-05, - "loss": 2.0395, - "mean_token_accuracy": 0.5357754364609718, - "step": 2180 - }, - { - "epoch": 0.1909828202668527, - "grad_norm": 1.6045056581497192, - "learning_rate": 1.5537971733914784e-05, - "loss": 1.9953, - "mean_token_accuracy": 0.545070943236351, - "step": 2190 - }, - { - "epoch": 0.19185488793930408, - "grad_norm": 1.6375876665115356, - "learning_rate": 1.5487174597138314e-05, - "loss": 1.9297, - "mean_token_accuracy": 0.5666952043771744, - "step": 2200 - }, - { - "epoch": 0.19185488793930408, - "eval_runtime": 4.0865, - "eval_samples_per_second": 268.445, - "eval_steps_per_second": 67.295, - "step": 2200 - }, - { - "epoch": 0.19272695561175548, - "grad_norm": 1.5572760105133057, - "learning_rate": 1.543617406184589e-05, - "loss": 1.9449, - "mean_token_accuracy": 0.5586594969034195, - "step": 2210 - }, - { - "epoch": 0.19359902328420686, - "grad_norm": 1.6193562746047974, - "learning_rate": 1.5384972018524678e-05, - "loss": 1.9625, - "mean_token_accuracy": 0.5591487273573875, - "step": 2220 - }, - { - "epoch": 0.19447109095665824, - "grad_norm": 1.7012639045715332, - "learning_rate": 1.5333570365131353e-05, - "loss": 1.973, - "mean_token_accuracy": 0.5488502979278564, - "step": 2230 - }, - { - "epoch": 0.19534315862910961, - "grad_norm": 1.713308334350586, - "learning_rate": 1.5281971007021728e-05, - "loss": 1.9691, - "mean_token_accuracy": 0.5539383590221405, - "step": 2240 - }, - { - "epoch": 0.196215226301561, - "grad_norm": 1.577191710472107, - "learning_rate": 1.5230175856880132e-05, - "loss": 1.9426, - "mean_token_accuracy": 0.5545621335506439, - "step": 2250 - }, - { - "epoch": 0.1970872939740124, - "grad_norm": 1.4686049222946167, - "learning_rate": 1.5178186834648509e-05, - "loss": 1.9023, - "mean_token_accuracy": 0.5648116439580917, - "step": 2260 - }, - { - "epoch": 0.19795936164646377, - "grad_norm": 1.752318024635315, - "learning_rate": 1.5126005867455256e-05, - "loss": 1.907, - "mean_token_accuracy": 0.5621819943189621, - "step": 2270 - }, - { - "epoch": 0.19883142931891515, - "grad_norm": 1.5747108459472656, - "learning_rate": 1.5073634889543778e-05, - "loss": 2.0176, - "mean_token_accuracy": 0.5422945141792297, - "step": 2280 - }, - { - "epoch": 0.19970349699136652, - "grad_norm": 1.511744499206543, - "learning_rate": 1.5021075842200796e-05, - "loss": 1.9855, - "mean_token_accuracy": 0.5495352268218994, - "step": 2290 - }, - { - "epoch": 0.2005755646638179, - "grad_norm": 1.7432910203933716, - "learning_rate": 1.4968330673684387e-05, - "loss": 1.9703, - "mean_token_accuracy": 0.5514799416065216, - "step": 2300 - }, - { - "epoch": 0.2005755646638179, - "eval_runtime": 4.0831, - "eval_samples_per_second": 268.666, - "eval_steps_per_second": 67.35, - "step": 2300 - }, - { - "epoch": 0.2014476323362693, - "grad_norm": 1.5639376640319824, - "learning_rate": 1.4915401339151769e-05, - "loss": 1.9164, - "mean_token_accuracy": 0.5603595852851868, - "step": 2310 - }, - { - "epoch": 0.20231970000872068, - "grad_norm": 1.601038932800293, - "learning_rate": 1.486228980058682e-05, - "loss": 1.9656, - "mean_token_accuracy": 0.5523238748311996, - "step": 2320 - }, - { - "epoch": 0.20319176768117206, - "grad_norm": 1.513109803199768, - "learning_rate": 1.4808998026727348e-05, - "loss": 1.9836, - "mean_token_accuracy": 0.548984831571579, - "step": 2330 - }, - { - "epoch": 0.20406383535362344, - "grad_norm": 1.5771284103393555, - "learning_rate": 1.4755527992992133e-05, - "loss": 1.9187, - "mean_token_accuracy": 0.5605675190687179, - "step": 2340 - }, - { - "epoch": 0.2049359030260748, - "grad_norm": 1.529685378074646, - "learning_rate": 1.4701881681407684e-05, - "loss": 1.9469, - "mean_token_accuracy": 0.5570450097322464, - "step": 2350 - }, - { - "epoch": 0.20580797069852622, - "grad_norm": 1.7209432125091553, - "learning_rate": 1.464806108053477e-05, - "loss": 1.9492, - "mean_token_accuracy": 0.5532778888940811, - "step": 2360 - }, - { - "epoch": 0.2066800383709776, - "grad_norm": 1.703606367111206, - "learning_rate": 1.4594068185394723e-05, - "loss": 1.9629, - "mean_token_accuracy": 0.5488625228404999, - "step": 2370 - }, - { - "epoch": 0.20755210604342897, - "grad_norm": 1.7174687385559082, - "learning_rate": 1.4539904997395468e-05, - "loss": 1.9824, - "mean_token_accuracy": 0.5477005928754807, - "step": 2380 - }, - { - "epoch": 0.20842417371588035, - "grad_norm": 1.4723925590515137, - "learning_rate": 1.448557352425735e-05, - "loss": 1.9461, - "mean_token_accuracy": 0.5542441248893738, - "step": 2390 - }, - { - "epoch": 0.20929624138833172, - "grad_norm": 1.5282446146011353, - "learning_rate": 1.44310757799387e-05, - "loss": 1.9059, - "mean_token_accuracy": 0.562536695599556, - "step": 2400 - }, - { - "epoch": 0.20929624138833172, - "eval_runtime": 4.0828, - "eval_samples_per_second": 268.686, - "eval_steps_per_second": 67.355, - "step": 2400 - }, - { - "epoch": 0.21016830906078313, - "grad_norm": 1.4995638132095337, - "learning_rate": 1.437641378456119e-05, - "loss": 1.9898, - "mean_token_accuracy": 0.5491316050291062, - "step": 2410 - }, - { - "epoch": 0.2110403767332345, - "grad_norm": 1.6420363187789917, - "learning_rate": 1.4321589564334946e-05, - "loss": 1.9488, - "mean_token_accuracy": 0.5546110510826111, - "step": 2420 - }, - { - "epoch": 0.21191244440568588, - "grad_norm": 1.6747080087661743, - "learning_rate": 1.4266605151483444e-05, - "loss": 1.9758, - "mean_token_accuracy": 0.551602253317833, - "step": 2430 - }, - { - "epoch": 0.21278451207813726, - "grad_norm": 1.8265970945358276, - "learning_rate": 1.4211462584168178e-05, - "loss": 1.9332, - "mean_token_accuracy": 0.5583170250058174, - "step": 2440 - }, - { - "epoch": 0.21365657975058863, - "grad_norm": 1.528663992881775, - "learning_rate": 1.4156163906413113e-05, - "loss": 1.9191, - "mean_token_accuracy": 0.5639677077531815, - "step": 2450 - }, - { - "epoch": 0.21452864742304004, - "grad_norm": 1.5550668239593506, - "learning_rate": 1.4100711168028906e-05, - "loss": 1.948, - "mean_token_accuracy": 0.5544275999069214, - "step": 2460 - }, - { - "epoch": 0.2154007150954914, - "grad_norm": 1.6412177085876465, - "learning_rate": 1.4045106424536938e-05, - "loss": 1.9402, - "mean_token_accuracy": 0.5568615466356277, - "step": 2470 - }, - { - "epoch": 0.2162727827679428, - "grad_norm": 1.5339934825897217, - "learning_rate": 1.398935173709311e-05, - "loss": 1.9871, - "mean_token_accuracy": 0.5472847372293472, - "step": 2480 - }, - { - "epoch": 0.21714485044039417, - "grad_norm": 1.713802695274353, - "learning_rate": 1.3933449172411446e-05, - "loss": 1.9125, - "mean_token_accuracy": 0.5600538134574891, - "step": 2490 - }, - { - "epoch": 0.21801691811284557, - "grad_norm": 1.770918607711792, - "learning_rate": 1.387740080268748e-05, - "loss": 1.934, - "mean_token_accuracy": 0.5601272076368332, - "step": 2500 - }, - { - "epoch": 0.21801691811284557, - "eval_runtime": 4.0863, - "eval_samples_per_second": 268.46, - "eval_steps_per_second": 67.298, - "step": 2500 - }, - { - "epoch": 0.21888898578529695, - "grad_norm": 1.5284432172775269, - "learning_rate": 1.3821208705521442e-05, - "loss": 1.9656, - "mean_token_accuracy": 0.550281310081482, - "step": 2510 - }, - { - "epoch": 0.21976105345774832, - "grad_norm": 1.5161765813827515, - "learning_rate": 1.3764874963841255e-05, - "loss": 2.0004, - "mean_token_accuracy": 0.5463551864027977, - "step": 2520 - }, - { - "epoch": 0.2206331211301997, - "grad_norm": 1.6041077375411987, - "learning_rate": 1.3708401665825319e-05, - "loss": 1.9125, - "mean_token_accuracy": 0.5601272046566009, - "step": 2530 - }, - { - "epoch": 0.22150518880265108, - "grad_norm": 1.6916691064834595, - "learning_rate": 1.36517909048251e-05, - "loss": 1.9496, - "mean_token_accuracy": 0.5540117353200913, - "step": 2540 - }, - { - "epoch": 0.22237725647510248, - "grad_norm": 1.92775559425354, - "learning_rate": 1.3595044779287543e-05, - "loss": 1.9449, - "mean_token_accuracy": 0.5540606647729873, - "step": 2550 - }, - { - "epoch": 0.22324932414755386, - "grad_norm": 1.696441650390625, - "learning_rate": 1.3538165392677288e-05, - "loss": 1.9805, - "mean_token_accuracy": 0.5501467704772949, - "step": 2560 - }, - { - "epoch": 0.22412139182000523, - "grad_norm": 1.5236026048660278, - "learning_rate": 1.3481154853398686e-05, - "loss": 1.9695, - "mean_token_accuracy": 0.5457803398370743, - "step": 2570 - }, - { - "epoch": 0.2249934594924566, - "grad_norm": 1.6779955625534058, - "learning_rate": 1.3424015274717665e-05, - "loss": 1.9391, - "mean_token_accuracy": 0.5563723117113113, - "step": 2580 - }, - { - "epoch": 0.225865527164908, - "grad_norm": 1.5860055685043335, - "learning_rate": 1.3366748774683376e-05, - "loss": 1.9191, - "mean_token_accuracy": 0.561411452293396, - "step": 2590 - }, - { - "epoch": 0.2267375948373594, - "grad_norm": 1.7570159435272217, - "learning_rate": 1.3309357476049686e-05, - "loss": 1.9176, - "mean_token_accuracy": 0.5621452987194061, - "step": 2600 - }, - { - "epoch": 0.2267375948373594, - "eval_runtime": 4.0838, - "eval_samples_per_second": 268.625, - "eval_steps_per_second": 67.34, - "step": 2600 - }, - { - "epoch": 0.22760966250981077, - "grad_norm": 1.6816412210464478, - "learning_rate": 1.3251843506196508e-05, - "loss": 1.9297, - "mean_token_accuracy": 0.5580846339464187, - "step": 2610 - }, - { - "epoch": 0.22848173018226214, - "grad_norm": 1.6356149911880493, - "learning_rate": 1.3194208997050915e-05, - "loss": 1.9023, - "mean_token_accuracy": 0.5675758302211762, - "step": 2620 - }, - { - "epoch": 0.22935379785471352, - "grad_norm": 1.6503158807754517, - "learning_rate": 1.313645608500814e-05, - "loss": 1.941, - "mean_token_accuracy": 0.5584637999534607, - "step": 2630 - }, - { - "epoch": 0.2302258655271649, - "grad_norm": 1.5784369707107544, - "learning_rate": 1.3078586910852364e-05, - "loss": 1.9395, - "mean_token_accuracy": 0.5558708399534226, - "step": 2640 - }, - { - "epoch": 0.2310979331996163, - "grad_norm": 1.5842807292938232, - "learning_rate": 1.3020603619677378e-05, - "loss": 1.9355, - "mean_token_accuracy": 0.5551736712455749, - "step": 2650 - }, - { - "epoch": 0.23197000087206768, - "grad_norm": 1.5968657732009888, - "learning_rate": 1.296250836080706e-05, - "loss": 1.9301, - "mean_token_accuracy": 0.5569104701280594, - "step": 2660 - }, - { - "epoch": 0.23284206854451905, - "grad_norm": 1.5821949243545532, - "learning_rate": 1.2904303287715702e-05, - "loss": 1.9555, - "mean_token_accuracy": 0.5515655547380447, - "step": 2670 - }, - { - "epoch": 0.23371413621697043, - "grad_norm": 1.4713672399520874, - "learning_rate": 1.284599055794819e-05, - "loss": 1.934, - "mean_token_accuracy": 0.553840509057045, - "step": 2680 - }, - { - "epoch": 0.2345862038894218, - "grad_norm": 1.5115606784820557, - "learning_rate": 1.2787572333040022e-05, - "loss": 1.9703, - "mean_token_accuracy": 0.5483855247497559, - "step": 2690 - }, - { - "epoch": 0.2354582715618732, - "grad_norm": 1.545019507408142, - "learning_rate": 1.2729050778437197e-05, - "loss": 1.9535, - "mean_token_accuracy": 0.5512720137834549, - "step": 2700 - }, - { - "epoch": 0.2354582715618732, - "eval_runtime": 4.0782, - "eval_samples_per_second": 268.989, - "eval_steps_per_second": 67.431, - "step": 2700 - }, - { - "epoch": 0.2363303392343246, - "grad_norm": 1.5752264261245728, - "learning_rate": 1.2670428063415932e-05, - "loss": 1.9473, - "mean_token_accuracy": 0.5551736801862717, - "step": 2710 - }, - { - "epoch": 0.23720240690677596, - "grad_norm": 1.5344406366348267, - "learning_rate": 1.2611706361002254e-05, - "loss": 1.8836, - "mean_token_accuracy": 0.5689212352037429, - "step": 2720 - }, - { - "epoch": 0.23807447457922734, - "grad_norm": 1.4946064949035645, - "learning_rate": 1.2552887847891462e-05, - "loss": 1.9746, - "mean_token_accuracy": 0.5502690762281418, - "step": 2730 - }, - { - "epoch": 0.23894654225167872, - "grad_norm": 1.5531439781188965, - "learning_rate": 1.2493974704367427e-05, - "loss": 1.9402, - "mean_token_accuracy": 0.5537793606519699, - "step": 2740 - }, - { - "epoch": 0.23981860992413012, - "grad_norm": 1.50486421585083, - "learning_rate": 1.2434969114221777e-05, - "loss": 1.9164, - "mean_token_accuracy": 0.5588429629802704, - "step": 2750 - }, - { - "epoch": 0.2406906775965815, - "grad_norm": 1.7339839935302734, - "learning_rate": 1.237587326467296e-05, - "loss": 1.957, - "mean_token_accuracy": 0.5529231876134872, - "step": 2760 - }, - { - "epoch": 0.24156274526903287, - "grad_norm": 1.5544097423553467, - "learning_rate": 1.2316689346285146e-05, - "loss": 1.9391, - "mean_token_accuracy": 0.5560053795576095, - "step": 2770 - }, - { - "epoch": 0.24243481294148425, - "grad_norm": 1.4663689136505127, - "learning_rate": 1.2257419552887047e-05, - "loss": 1.95, - "mean_token_accuracy": 0.5576320916414261, - "step": 2780 - }, - { - "epoch": 0.24330688061393563, - "grad_norm": 1.5417518615722656, - "learning_rate": 1.2198066081490585e-05, - "loss": 1.9477, - "mean_token_accuracy": 0.5545132160186768, - "step": 2790 - }, - { - "epoch": 0.24417894828638703, - "grad_norm": 1.6155837774276733, - "learning_rate": 1.213863113220946e-05, - "loss": 1.9059, - "mean_token_accuracy": 0.5640166342258454, - "step": 2800 - }, - { - "epoch": 0.24417894828638703, - "eval_runtime": 4.0898, - "eval_samples_per_second": 268.23, - "eval_steps_per_second": 67.241, - "step": 2800 - }, - { - "epoch": 0.2450510159588384, - "grad_norm": 1.5956717729568481, - "learning_rate": 1.2079116908177592e-05, - "loss": 1.9336, - "mean_token_accuracy": 0.559344419836998, - "step": 2810 - }, - { - "epoch": 0.24592308363128978, - "grad_norm": 1.401995301246643, - "learning_rate": 1.2019525615467462e-05, - "loss": 1.8941, - "mean_token_accuracy": 0.5678449153900147, - "step": 2820 - }, - { - "epoch": 0.24679515130374116, - "grad_norm": 1.5462545156478882, - "learning_rate": 1.1959859463008316e-05, - "loss": 1.9523, - "mean_token_accuracy": 0.5523727923631668, - "step": 2830 - }, - { - "epoch": 0.24766721897619257, - "grad_norm": 1.7832167148590088, - "learning_rate": 1.1900120662504315e-05, - "loss": 1.9289, - "mean_token_accuracy": 0.5563723117113113, - "step": 2840 - }, - { - "epoch": 0.24853928664864394, - "grad_norm": 1.44082510471344, - "learning_rate": 1.1840311428352536e-05, - "loss": 1.9203, - "mean_token_accuracy": 0.562059685587883, - "step": 2850 - }, - { - "epoch": 0.24941135432109532, - "grad_norm": 1.4894593954086304, - "learning_rate": 1.1780433977560879e-05, - "loss": 1.9094, - "mean_token_accuracy": 0.5629647731781006, - "step": 2860 - }, - { - "epoch": 0.2502834219935467, - "grad_norm": 1.464408278465271, - "learning_rate": 1.1720490529665904e-05, - "loss": 1.9258, - "mean_token_accuracy": 0.5583292603492737, - "step": 2870 - }, - { - "epoch": 0.2511554896659981, - "grad_norm": 1.5469698905944824, - "learning_rate": 1.1660483306650558e-05, - "loss": 1.8949, - "mean_token_accuracy": 0.5638209402561187, - "step": 2880 - }, - { - "epoch": 0.25202755733844945, - "grad_norm": 1.3923354148864746, - "learning_rate": 1.160041453286179e-05, - "loss": 1.9855, - "mean_token_accuracy": 0.5512720167636871, - "step": 2890 - }, - { - "epoch": 0.25289962501090085, - "grad_norm": 1.3185718059539795, - "learning_rate": 1.154028643492812e-05, - "loss": 1.9332, - "mean_token_accuracy": 0.5567881554365158, - "step": 2900 - }, - { - "epoch": 0.25289962501090085, - "eval_runtime": 4.0827, - "eval_samples_per_second": 268.692, - "eval_steps_per_second": 67.357, - "step": 2900 - }, - { - "epoch": 0.2537716926833522, - "grad_norm": 1.5093870162963867, - "learning_rate": 1.1480101241677097e-05, - "loss": 1.9281, - "mean_token_accuracy": 0.5590264230966568, - "step": 2910 - }, - { - "epoch": 0.2546437603558036, - "grad_norm": 1.588794231414795, - "learning_rate": 1.1419861184052669e-05, - "loss": 1.9012, - "mean_token_accuracy": 0.5649339586496354, - "step": 2920 - }, - { - "epoch": 0.255515828028255, - "grad_norm": 1.4751886129379272, - "learning_rate": 1.1359568495032505e-05, - "loss": 1.9609, - "mean_token_accuracy": 0.5513331741094589, - "step": 2930 - }, - { - "epoch": 0.25638789570070636, - "grad_norm": 1.4998481273651123, - "learning_rate": 1.1299225409545207e-05, - "loss": 1.9059, - "mean_token_accuracy": 0.5650682866573333, - "step": 2940 - }, - { - "epoch": 0.25725996337315776, - "grad_norm": 1.6425520181655884, - "learning_rate": 1.123883416438748e-05, - "loss": 1.9359, - "mean_token_accuracy": 0.5557974576950073, - "step": 2950 - }, - { - "epoch": 0.2581320310456091, - "grad_norm": 1.7462652921676636, - "learning_rate": 1.1178396998141206e-05, - "loss": 1.9055, - "mean_token_accuracy": 0.5619740784168243, - "step": 2960 - }, - { - "epoch": 0.2590040987180605, - "grad_norm": 1.4744683504104614, - "learning_rate": 1.1117916151090469e-05, - "loss": 1.8906, - "mean_token_accuracy": 0.5662304311990738, - "step": 2970 - }, - { - "epoch": 0.2598761663905119, - "grad_norm": 1.4409739971160889, - "learning_rate": 1.1057393865138513e-05, - "loss": 1.9062, - "mean_token_accuracy": 0.5584882616996765, - "step": 2980 - }, - { - "epoch": 0.26074823406296327, - "grad_norm": 1.658846139907837, - "learning_rate": 1.099683238372464e-05, - "loss": 1.9102, - "mean_token_accuracy": 0.5615215301513672, - "step": 2990 - }, - { - "epoch": 0.2616203017354147, - "grad_norm": 1.5255091190338135, - "learning_rate": 1.0936233951741052e-05, - "loss": 1.891, - "mean_token_accuracy": 0.5645181030035019, - "step": 3000 - }, - { - "epoch": 0.2616203017354147, - "eval_runtime": 4.0846, - "eval_samples_per_second": 268.567, - "eval_steps_per_second": 67.325, - "step": 3000 - }, - { - "epoch": 0.2624923694078661, - "grad_norm": 1.4099208116531372, - "learning_rate": 1.0875600815449624e-05, - "loss": 1.8602, - "mean_token_accuracy": 0.5746575325727463, - "step": 3010 - }, - { - "epoch": 0.2633644370803174, - "grad_norm": 1.534428596496582, - "learning_rate": 1.081493522239866e-05, - "loss": 1.934, - "mean_token_accuracy": 0.5573140889406204, - "step": 3020 - }, - { - "epoch": 0.26423650475276883, - "grad_norm": 1.6368540525436401, - "learning_rate": 1.075423942133957e-05, - "loss": 1.893, - "mean_token_accuracy": 0.5663527399301529, - "step": 3030 - }, - { - "epoch": 0.2651085724252202, - "grad_norm": 1.5446292161941528, - "learning_rate": 1.0693515662143505e-05, - "loss": 1.9332, - "mean_token_accuracy": 0.5582069456577301, - "step": 3040 - }, - { - "epoch": 0.2659806400976716, - "grad_norm": 1.5439326763153076, - "learning_rate": 1.0632766195717979e-05, - "loss": 1.9027, - "mean_token_accuracy": 0.5597602754831315, - "step": 3050 - }, - { - "epoch": 0.266852707770123, - "grad_norm": 1.5670526027679443, - "learning_rate": 1.0571993273923412e-05, - "loss": 1.9098, - "mean_token_accuracy": 0.5607387512922287, - "step": 3060 - }, - { - "epoch": 0.26772477544257434, - "grad_norm": 1.442647933959961, - "learning_rate": 1.0511199149489673e-05, - "loss": 1.8949, - "mean_token_accuracy": 0.5646159499883652, - "step": 3070 - }, - { - "epoch": 0.26859684311502574, - "grad_norm": 1.4678329229354858, - "learning_rate": 1.0450386075932571e-05, - "loss": 1.898, - "mean_token_accuracy": 0.5630503982305527, - "step": 3080 - }, - { - "epoch": 0.2694689107874771, - "grad_norm": 1.5092103481292725, - "learning_rate": 1.0389556307470316e-05, - "loss": 1.9227, - "mean_token_accuracy": 0.5559319972991943, - "step": 3090 - }, - { - "epoch": 0.2703409784599285, - "grad_norm": 1.6279524564743042, - "learning_rate": 1.0328712098939968e-05, - "loss": 1.9723, - "mean_token_accuracy": 0.5526785761117935, - "step": 3100 - }, - { - "epoch": 0.2703409784599285, - "eval_runtime": 4.0896, - "eval_samples_per_second": 268.24, - "eval_steps_per_second": 67.243, - "step": 3100 - }, - { - "epoch": 0.2712130461323799, - "grad_norm": 1.4583823680877686, - "learning_rate": 1.0267855705713854e-05, - "loss": 1.8852, - "mean_token_accuracy": 0.5667196691036225, - "step": 3110 - }, - { - "epoch": 0.27208511380483125, - "grad_norm": 1.5141912698745728, - "learning_rate": 1.020698938361595e-05, - "loss": 1.9297, - "mean_token_accuracy": 0.5533879637718201, - "step": 3120 - }, - { - "epoch": 0.27295718147728265, - "grad_norm": 1.6877249479293823, - "learning_rate": 1.0146115388838293e-05, - "loss": 1.8852, - "mean_token_accuracy": 0.5637475490570069, - "step": 3130 - }, - { - "epoch": 0.273829249149734, - "grad_norm": 1.6511071920394897, - "learning_rate": 1.0085235977857322e-05, - "loss": 1.8508, - "mean_token_accuracy": 0.572296965122223, - "step": 3140 - }, - { - "epoch": 0.2747013168221854, - "grad_norm": 1.5826408863067627, - "learning_rate": 1.002435340735024e-05, - "loss": 1.8996, - "mean_token_accuracy": 0.5607876688241958, - "step": 3150 - }, - { - "epoch": 0.2755733844946368, - "grad_norm": 1.5323454141616821, - "learning_rate": 9.963469934111374e-06, - "loss": 1.8852, - "mean_token_accuracy": 0.5662671208381653, - "step": 3160 - }, - { - "epoch": 0.27644545216708816, - "grad_norm": 1.5647304058074951, - "learning_rate": 9.90258781496851e-06, - "loss": 1.9348, - "mean_token_accuracy": 0.5581213295459747, - "step": 3170 - }, - { - "epoch": 0.27731751983953956, - "grad_norm": 1.5409218072891235, - "learning_rate": 9.841709306699245e-06, - "loss": 1.9219, - "mean_token_accuracy": 0.5589041084051132, - "step": 3180 - }, - { - "epoch": 0.2781895875119909, - "grad_norm": 1.5608783960342407, - "learning_rate": 9.78083666594732e-06, - "loss": 1.9379, - "mean_token_accuracy": 0.5579764008522033, - "step": 3190 - }, - { - "epoch": 0.2790616551844423, - "grad_norm": 1.413813591003418, - "learning_rate": 9.719972149138985e-06, - "loss": 1.9078, - "mean_token_accuracy": 0.5645792603492736, - "step": 3200 - }, - { - "epoch": 0.2790616551844423, - "eval_runtime": 4.0877, - "eval_samples_per_second": 268.365, - "eval_steps_per_second": 67.275, - "step": 3200 - }, - { - "epoch": 0.2799337228568937, - "grad_norm": 1.5967097282409668, - "learning_rate": 9.659118012399352e-06, - "loss": 1.9207, - "mean_token_accuracy": 0.5589774966239929, - "step": 3210 - }, - { - "epoch": 0.28080579052934507, - "grad_norm": 1.454176902770996, - "learning_rate": 9.598276511468763e-06, - "loss": 1.8473, - "mean_token_accuracy": 0.5716364949941635, - "step": 3220 - }, - { - "epoch": 0.28167785820179647, - "grad_norm": 1.6636513471603394, - "learning_rate": 9.537449901619174e-06, - "loss": 1.9395, - "mean_token_accuracy": 0.5527641892433166, - "step": 3230 - }, - { - "epoch": 0.2825499258742478, - "grad_norm": 1.5804682970046997, - "learning_rate": 9.476640437570562e-06, - "loss": 1.9648, - "mean_token_accuracy": 0.5519936442375183, - "step": 3240 - }, - { - "epoch": 0.2834219935466992, - "grad_norm": 1.4875261783599854, - "learning_rate": 9.415850373407342e-06, - "loss": 1.9031, - "mean_token_accuracy": 0.5627935409545899, - "step": 3250 - }, - { - "epoch": 0.28429406121915063, - "grad_norm": 1.7054407596588135, - "learning_rate": 9.355081962494815e-06, - "loss": 1.8504, - "mean_token_accuracy": 0.5755754441022873, - "step": 3260 - }, - { - "epoch": 0.285166128891602, - "grad_norm": 1.5067362785339355, - "learning_rate": 9.294337457395638e-06, - "loss": 1.8762, - "mean_token_accuracy": 0.5702788710594178, - "step": 3270 - }, - { - "epoch": 0.2860381965640534, - "grad_norm": 1.6777541637420654, - "learning_rate": 9.233619109786332e-06, - "loss": 1.8887, - "mean_token_accuracy": 0.5692392379045487, - "step": 3280 - }, - { - "epoch": 0.28691026423650473, - "grad_norm": 1.443976879119873, - "learning_rate": 9.172929170373804e-06, - "loss": 1.9289, - "mean_token_accuracy": 0.5607509702444077, - "step": 3290 - }, - { - "epoch": 0.28778233190895613, - "grad_norm": 1.5747430324554443, - "learning_rate": 9.112269888811934e-06, - "loss": 1.927, - "mean_token_accuracy": 0.5572040110826493, - "step": 3300 - }, - { - "epoch": 0.28778233190895613, - "eval_runtime": 4.0885, - "eval_samples_per_second": 268.315, - "eval_steps_per_second": 67.262, - "step": 3300 - }, - { - "epoch": 0.28865439958140754, - "grad_norm": 1.484473705291748, - "learning_rate": 9.051643513618176e-06, - "loss": 1.8969, - "mean_token_accuracy": 0.5657656580209732, - "step": 3310 - }, - { - "epoch": 0.2895264672538589, - "grad_norm": 1.4190107583999634, - "learning_rate": 8.99105229209021e-06, - "loss": 1.941, - "mean_token_accuracy": 0.5517367869615555, - "step": 3320 - }, - { - "epoch": 0.2903985349263103, - "grad_norm": 1.5658514499664307, - "learning_rate": 8.930498470222641e-06, - "loss": 1.9676, - "mean_token_accuracy": 0.5517367869615555, - "step": 3330 - }, - { - "epoch": 0.29127060259876164, - "grad_norm": 1.5152161121368408, - "learning_rate": 8.86998429262374e-06, - "loss": 1.9152, - "mean_token_accuracy": 0.5605430513620376, - "step": 3340 - }, - { - "epoch": 0.29214267027121305, - "grad_norm": 1.4670003652572632, - "learning_rate": 8.809512002432252e-06, - "loss": 1.8652, - "mean_token_accuracy": 0.569899708032608, - "step": 3350 - }, - { - "epoch": 0.29301473794366445, - "grad_norm": 1.599631667137146, - "learning_rate": 8.749083841234235e-06, - "loss": 1.934, - "mean_token_accuracy": 0.5619740664958954, - "step": 3360 - }, - { - "epoch": 0.2938868056161158, - "grad_norm": 1.6647261381149292, - "learning_rate": 8.688702048979974e-06, - "loss": 1.925, - "mean_token_accuracy": 0.5590508818626404, - "step": 3370 - }, - { - "epoch": 0.2947588732885672, - "grad_norm": 1.4344950914382935, - "learning_rate": 8.628368863900954e-06, - "loss": 1.9102, - "mean_token_accuracy": 0.5580724090337753, - "step": 3380 - }, - { - "epoch": 0.29563094096101855, - "grad_norm": 1.475479245185852, - "learning_rate": 8.568086522426884e-06, - "loss": 1.9246, - "mean_token_accuracy": 0.5588674157857895, - "step": 3390 - }, - { - "epoch": 0.29650300863346996, - "grad_norm": 1.309008240699768, - "learning_rate": 8.507857259102814e-06, - "loss": 1.8875, - "mean_token_accuracy": 0.5669765114784241, - "step": 3400 - }, - { - "epoch": 0.29650300863346996, - "eval_runtime": 4.0896, - "eval_samples_per_second": 268.242, - "eval_steps_per_second": 67.244, - "step": 3400 - }, - { - "epoch": 0.29737507630592136, - "grad_norm": 1.4824247360229492, - "learning_rate": 8.447683306506279e-06, - "loss": 1.932, - "mean_token_accuracy": 0.5547700643539428, - "step": 3410 - }, - { - "epoch": 0.2982471439783727, - "grad_norm": 1.4326711893081665, - "learning_rate": 8.387566895164566e-06, - "loss": 1.8969, - "mean_token_accuracy": 0.5620596885681153, - "step": 3420 - }, - { - "epoch": 0.2991192116508241, - "grad_norm": 1.6375669240951538, - "learning_rate": 8.327510253472023e-06, - "loss": 1.9453, - "mean_token_accuracy": 0.553816044330597, - "step": 3430 - }, - { - "epoch": 0.29999127932327546, - "grad_norm": 1.4804935455322266, - "learning_rate": 8.267515607607458e-06, - "loss": 1.8715, - "mean_token_accuracy": 0.569043543934822, - "step": 3440 - }, - { - "epoch": 0.30086334699572687, - "grad_norm": 1.5628015995025635, - "learning_rate": 8.207585181451611e-06, - "loss": 1.882, - "mean_token_accuracy": 0.564811646938324, - "step": 3450 - }, - { - "epoch": 0.30173541466817827, - "grad_norm": 1.6417442560195923, - "learning_rate": 8.147721196504736e-06, - "loss": 1.875, - "mean_token_accuracy": 0.5665484309196472, - "step": 3460 - }, - { - "epoch": 0.3026074823406296, - "grad_norm": 1.7955617904663086, - "learning_rate": 8.08792587180424e-06, - "loss": 1.9273, - "mean_token_accuracy": 0.5594300329685211, - "step": 3470 - }, - { - "epoch": 0.303479550013081, - "grad_norm": 1.4699718952178955, - "learning_rate": 8.028201423842437e-06, - "loss": 1.909, - "mean_token_accuracy": 0.5575097858905792, - "step": 3480 - }, - { - "epoch": 0.30435161768553237, - "grad_norm": 1.3800170421600342, - "learning_rate": 7.96855006648438e-06, - "loss": 1.8832, - "mean_token_accuracy": 0.5657167315483094, - "step": 3490 - }, - { - "epoch": 0.3052236853579838, - "grad_norm": 1.587877869606018, - "learning_rate": 7.908974010885795e-06, - "loss": 1.9406, - "mean_token_accuracy": 0.5579867869615555, - "step": 3500 - }, - { - "epoch": 0.3052236853579838, - "eval_runtime": 4.0852, - "eval_samples_per_second": 268.533, - "eval_steps_per_second": 67.317, - "step": 3500 - }, - { - "epoch": 0.3060957530304352, - "grad_norm": 1.3948945999145508, - "learning_rate": 7.849475465411136e-06, - "loss": 1.8719, - "mean_token_accuracy": 0.5669398188591004, - "step": 3510 - }, - { - "epoch": 0.30696782070288653, - "grad_norm": 1.3805464506149292, - "learning_rate": 7.790056635551704e-06, - "loss": 1.9012, - "mean_token_accuracy": 0.5630626201629638, - "step": 3520 - }, - { - "epoch": 0.30783988837533793, - "grad_norm": 1.3941526412963867, - "learning_rate": 7.730719723843903e-06, - "loss": 1.9355, - "mean_token_accuracy": 0.5606409013271332, - "step": 3530 - }, - { - "epoch": 0.3087119560477893, - "grad_norm": 1.4487614631652832, - "learning_rate": 7.671466929787598e-06, - "loss": 1.8922, - "mean_token_accuracy": 0.5645058721303939, - "step": 3540 - }, - { - "epoch": 0.3095840237202407, - "grad_norm": 1.5826810598373413, - "learning_rate": 7.61230044976458e-06, - "loss": 1.9207, - "mean_token_accuracy": 0.5601516604423523, - "step": 3550 - }, - { - "epoch": 0.3104560913926921, - "grad_norm": 1.4143035411834717, - "learning_rate": 7.553222476957157e-06, - "loss": 1.9074, - "mean_token_accuracy": 0.5620963841676712, - "step": 3560 - }, - { - "epoch": 0.31132815906514344, - "grad_norm": 1.5634782314300537, - "learning_rate": 7.494235201266849e-06, - "loss": 1.8957, - "mean_token_accuracy": 0.5654965758323669, - "step": 3570 - }, - { - "epoch": 0.31220022673759484, - "grad_norm": 1.4505248069763184, - "learning_rate": 7.435340809233218e-06, - "loss": 1.9094, - "mean_token_accuracy": 0.562348335981369, - "step": 3580 - }, - { - "epoch": 0.3130722944100462, - "grad_norm": 1.361753225326538, - "learning_rate": 7.376541483952811e-06, - "loss": 1.9109, - "mean_token_accuracy": 0.5665729016065597, - "step": 3590 - }, - { - "epoch": 0.3139443620824976, - "grad_norm": 1.4534916877746582, - "learning_rate": 7.3178394049982485e-06, - "loss": 1.8863, - "mean_token_accuracy": 0.5663649708032608, - "step": 3600 - }, - { - "epoch": 0.3139443620824976, - "eval_runtime": 4.0837, - "eval_samples_per_second": 268.628, - "eval_steps_per_second": 67.341, - "step": 3600 - }, - { - "epoch": 0.314816429754949, - "grad_norm": 1.6131386756896973, - "learning_rate": 7.259236748337421e-06, - "loss": 1.8797, - "mean_token_accuracy": 0.5715753436088562, - "step": 3610 - }, - { - "epoch": 0.31568849742740035, - "grad_norm": 1.647533655166626, - "learning_rate": 7.20073568625284e-06, - "loss": 1.9297, - "mean_token_accuracy": 0.5601149737834931, - "step": 3620 - }, - { - "epoch": 0.31656056509985175, - "grad_norm": 1.5809109210968018, - "learning_rate": 7.1423383872611045e-06, - "loss": 1.9238, - "mean_token_accuracy": 0.5594422727823257, - "step": 3630 - }, - { - "epoch": 0.31743263277230316, - "grad_norm": 1.4708706140518188, - "learning_rate": 7.084047016032528e-06, - "loss": 1.9172, - "mean_token_accuracy": 0.5604818999767304, - "step": 3640 - }, - { - "epoch": 0.3183047004447545, - "grad_norm": 1.5346980094909668, - "learning_rate": 7.025863733310894e-06, - "loss": 1.8594, - "mean_token_accuracy": 0.570914876461029, - "step": 3650 - }, - { - "epoch": 0.3191767681172059, - "grad_norm": 1.497436761856079, - "learning_rate": 6.967790695833363e-06, - "loss": 1.9203, - "mean_token_accuracy": 0.5600171208381652, - "step": 3660 - }, - { - "epoch": 0.32004883578965726, - "grad_norm": 1.4364548921585083, - "learning_rate": 6.909830056250527e-06, - "loss": 1.8621, - "mean_token_accuracy": 0.5694456309080124, - "step": 3670 - }, - { - "epoch": 0.32092090346210866, - "grad_norm": 1.4897515773773193, - "learning_rate": 6.851983963046612e-06, - "loss": 1.8941, - "mean_token_accuracy": 0.5614848345518112, - "step": 3680 - }, - { - "epoch": 0.32179297113456007, - "grad_norm": 1.4270612001419067, - "learning_rate": 6.794254560459843e-06, - "loss": 1.8844, - "mean_token_accuracy": 0.56588796377182, - "step": 3690 - }, - { - "epoch": 0.3226650388070114, - "grad_norm": 1.4720197916030884, - "learning_rate": 6.736643988402958e-06, - "loss": 1.893, - "mean_token_accuracy": 0.5601761192083359, - "step": 3700 - }, - { - "epoch": 0.3226650388070114, - "eval_runtime": 4.0882, - "eval_samples_per_second": 268.334, - "eval_steps_per_second": 67.267, - "step": 3700 - }, - { - "epoch": 0.3235371064794628, - "grad_norm": 1.4100483655929565, - "learning_rate": 6.679154382383883e-06, - "loss": 1.9289, - "mean_token_accuracy": 0.5587695688009262, - "step": 3710 - }, - { - "epoch": 0.32440917415191417, - "grad_norm": 1.46821928024292, - "learning_rate": 6.621787873426581e-06, - "loss": 1.9387, - "mean_token_accuracy": 0.5547578275203705, - "step": 3720 - }, - { - "epoch": 0.3252812418243656, - "grad_norm": 1.479946494102478, - "learning_rate": 6.564546587992054e-06, - "loss": 1.8813, - "mean_token_accuracy": 0.5644814103841782, - "step": 3730 - }, - { - "epoch": 0.326153309496817, - "grad_norm": 1.4300014972686768, - "learning_rate": 6.507432647899519e-06, - "loss": 1.9535, - "mean_token_accuracy": 0.552580726146698, - "step": 3740 - }, - { - "epoch": 0.3270253771692683, - "grad_norm": 1.514469861984253, - "learning_rate": 6.450448170247757e-06, - "loss": 1.866, - "mean_token_accuracy": 0.5673067539930343, - "step": 3750 - }, - { - "epoch": 0.32789744484171973, - "grad_norm": 1.5405138731002808, - "learning_rate": 6.393595267336639e-06, - "loss": 1.8977, - "mean_token_accuracy": 0.5651541113853454, - "step": 3760 - }, - { - "epoch": 0.3287695125141711, - "grad_norm": 1.5160272121429443, - "learning_rate": 6.3368760465888226e-06, - "loss": 1.923, - "mean_token_accuracy": 0.5567025423049927, - "step": 3770 - }, - { - "epoch": 0.3296415801866225, - "grad_norm": 1.5919586420059204, - "learning_rate": 6.280292610471639e-06, - "loss": 1.9129, - "mean_token_accuracy": 0.5610078275203705, - "step": 3780 - }, - { - "epoch": 0.3305136478590739, - "grad_norm": 1.5915533304214478, - "learning_rate": 6.223847056419154e-06, - "loss": 1.918, - "mean_token_accuracy": 0.5577421814203263, - "step": 3790 - }, - { - "epoch": 0.33138571553152524, - "grad_norm": 1.5471558570861816, - "learning_rate": 6.1675414767544285e-06, - "loss": 1.9059, - "mean_token_accuracy": 0.557852241396904, - "step": 3800 - }, - { - "epoch": 0.33138571553152524, - "eval_runtime": 4.086, - "eval_samples_per_second": 268.477, - "eval_steps_per_second": 67.303, - "step": 3800 - }, - { - "epoch": 0.33225778320397664, - "grad_norm": 1.4880732297897339, - "learning_rate": 6.111377958611948e-06, - "loss": 1.8887, - "mean_token_accuracy": 0.5641022503376008, - "step": 3810 - }, - { - "epoch": 0.333129850876428, - "grad_norm": 1.516377329826355, - "learning_rate": 6.055358583860267e-06, - "loss": 1.8793, - "mean_token_accuracy": 0.5662304311990738, - "step": 3820 - }, - { - "epoch": 0.3340019185488794, - "grad_norm": 1.4484655857086182, - "learning_rate": 5.99948542902483e-06, - "loss": 1.8742, - "mean_token_accuracy": 0.5674779891967774, - "step": 3830 - }, - { - "epoch": 0.3348739862213308, - "grad_norm": 1.6849968433380127, - "learning_rate": 5.943760565211011e-06, - "loss": 1.932, - "mean_token_accuracy": 0.5551247507333755, - "step": 3840 - }, - { - "epoch": 0.33574605389378215, - "grad_norm": 1.680128574371338, - "learning_rate": 5.8881860580273285e-06, - "loss": 1.9238, - "mean_token_accuracy": 0.5592343389987946, - "step": 3850 - }, - { - "epoch": 0.33661812156623355, - "grad_norm": 1.450142502784729, - "learning_rate": 5.832763967508885e-06, - "loss": 1.973, - "mean_token_accuracy": 0.5471624255180358, - "step": 3860 - }, - { - "epoch": 0.3374901892386849, - "grad_norm": 1.4337846040725708, - "learning_rate": 5.777496348041009e-06, - "loss": 1.8746, - "mean_token_accuracy": 0.5702299416065216, - "step": 3870 - }, - { - "epoch": 0.3383622569111363, - "grad_norm": 1.4043320417404175, - "learning_rate": 5.722385248283092e-06, - "loss": 1.9008, - "mean_token_accuracy": 0.5623899191617966, - "step": 3880 - }, - { - "epoch": 0.3392343245835877, - "grad_norm": 1.4860856533050537, - "learning_rate": 5.667432711092651e-06, - "loss": 1.9168, - "mean_token_accuracy": 0.5559686869382858, - "step": 3890 - }, - { - "epoch": 0.34010639225603906, - "grad_norm": 1.5466890335083008, - "learning_rate": 5.61264077344962e-06, - "loss": 1.9285, - "mean_token_accuracy": 0.557815557718277, - "step": 3900 - }, - { - "epoch": 0.34010639225603906, - "eval_runtime": 4.0828, - "eval_samples_per_second": 268.69, - "eval_steps_per_second": 67.356, - "step": 3900 - }, - { - "epoch": 0.34097845992849046, - "grad_norm": 1.5171886682510376, - "learning_rate": 5.558011466380824e-06, - "loss": 1.8691, - "mean_token_accuracy": 0.5628180056810379, - "step": 3910 - }, - { - "epoch": 0.3418505276009418, - "grad_norm": 1.4577383995056152, - "learning_rate": 5.5035468148846926e-06, - "loss": 1.9047, - "mean_token_accuracy": 0.5601516634225845, - "step": 3920 - }, - { - "epoch": 0.3427225952733932, - "grad_norm": 1.3760961294174194, - "learning_rate": 5.449248837856224e-06, - "loss": 1.868, - "mean_token_accuracy": 0.569593933224678, - "step": 3930 - }, - { - "epoch": 0.3435946629458446, - "grad_norm": 1.5944592952728271, - "learning_rate": 5.395119548012112e-06, - "loss": 1.9316, - "mean_token_accuracy": 0.5559564530849457, - "step": 3940 - }, - { - "epoch": 0.34446673061829597, - "grad_norm": 1.44200599193573, - "learning_rate": 5.34116095181616e-06, - "loss": 1.9008, - "mean_token_accuracy": 0.5622920751571655, - "step": 3950 - }, - { - "epoch": 0.3453387982907474, - "grad_norm": 1.4984644651412964, - "learning_rate": 5.287375049404909e-06, - "loss": 1.893, - "mean_token_accuracy": 0.5642979443073273, - "step": 3960 - }, - { - "epoch": 0.3462108659631987, - "grad_norm": 1.4511080980300903, - "learning_rate": 5.233763834513479e-06, - "loss": 1.8684, - "mean_token_accuracy": 0.5677959859371186, - "step": 3970 - }, - { - "epoch": 0.3470829336356501, - "grad_norm": 1.465245246887207, - "learning_rate": 5.180329294401685e-06, - "loss": 1.8496, - "mean_token_accuracy": 0.5706091016530991, - "step": 3980 - }, - { - "epoch": 0.34795500130810153, - "grad_norm": 1.5774682760238647, - "learning_rate": 5.127073409780352e-06, - "loss": 1.9094, - "mean_token_accuracy": 0.5600171282887458, - "step": 3990 - }, - { - "epoch": 0.3488270689805529, - "grad_norm": 1.5574545860290527, - "learning_rate": 5.0739981547379215e-06, - "loss": 1.9187, - "mean_token_accuracy": 0.558231407403946, - "step": 4000 - }, - { - "epoch": 0.3488270689805529, - "eval_runtime": 4.0915, - "eval_samples_per_second": 268.119, - "eval_steps_per_second": 67.213, - "step": 4000 - }, - { - "epoch": 0.3496991366530043, - "grad_norm": 1.382649302482605, - "learning_rate": 5.02110549666724e-06, - "loss": 1.8625, - "mean_token_accuracy": 0.57120840549469, - "step": 4010 - }, - { - "epoch": 0.35057120432545563, - "grad_norm": 1.4300912618637085, - "learning_rate": 4.968397396192675e-06, - "loss": 1.8613, - "mean_token_accuracy": 0.5701810210943222, - "step": 4020 - }, - { - "epoch": 0.35144327199790704, - "grad_norm": 1.5044147968292236, - "learning_rate": 4.91587580709739e-06, - "loss": 1.8813, - "mean_token_accuracy": 0.5653620332479476, - "step": 4030 - }, - { - "epoch": 0.35231533967035844, - "grad_norm": 1.5464123487472534, - "learning_rate": 4.863542676250972e-06, - "loss": 1.8027, - "mean_token_accuracy": 0.5821183949708939, - "step": 4040 - }, - { - "epoch": 0.3531874073428098, - "grad_norm": 1.4657748937606812, - "learning_rate": 4.811399943537223e-06, - "loss": 1.9129, - "mean_token_accuracy": 0.5564579248428345, - "step": 4050 - }, - { - "epoch": 0.3540594750152612, - "grad_norm": 1.4244955778121948, - "learning_rate": 4.759449541782272e-06, - "loss": 1.8906, - "mean_token_accuracy": 0.5680772989988327, - "step": 4060 - }, - { - "epoch": 0.35493154268771254, - "grad_norm": 1.4630274772644043, - "learning_rate": 4.707693396682936e-06, - "loss": 1.8793, - "mean_token_accuracy": 0.569165849685669, - "step": 4070 - }, - { - "epoch": 0.35580361036016395, - "grad_norm": 1.6317942142486572, - "learning_rate": 4.656133426735315e-06, - "loss": 1.8848, - "mean_token_accuracy": 0.5674779832363128, - "step": 4080 - }, - { - "epoch": 0.35667567803261535, - "grad_norm": 1.4215569496154785, - "learning_rate": 4.604771543163706e-06, - "loss": 1.9164, - "mean_token_accuracy": 0.5565435469150544, - "step": 4090 - }, - { - "epoch": 0.3575477457050667, - "grad_norm": 1.4870840311050415, - "learning_rate": 4.5536096498497295e-06, - "loss": 1.9422, - "mean_token_accuracy": 0.5528620347380638, - "step": 4100 - }, - { - "epoch": 0.3575477457050667, - "eval_runtime": 4.0926, - "eval_samples_per_second": 268.048, - "eval_steps_per_second": 67.195, - "step": 4100 - }, - { - "epoch": 0.3584198133775181, - "grad_norm": 1.449483871459961, - "learning_rate": 4.502649643261779e-06, - "loss": 1.8629, - "mean_token_accuracy": 0.569055762887001, - "step": 4110 - }, - { - "epoch": 0.35929188104996945, - "grad_norm": 1.6074910163879395, - "learning_rate": 4.451893412384707e-06, - "loss": 1.941, - "mean_token_accuracy": 0.5552593022584915, - "step": 4120 - }, - { - "epoch": 0.36016394872242086, - "grad_norm": 1.6915830373764038, - "learning_rate": 4.401342838649818e-06, - "loss": 1.8355, - "mean_token_accuracy": 0.5769202530384063, - "step": 4130 - }, - { - "epoch": 0.36103601639487226, - "grad_norm": 1.7034832239151, - "learning_rate": 4.350999795865109e-06, - "loss": 1.8648, - "mean_token_accuracy": 0.572859588265419, - "step": 4140 - }, - { - "epoch": 0.3619080840673236, - "grad_norm": 1.4516830444335938, - "learning_rate": 4.300866150145837e-06, - "loss": 1.8941, - "mean_token_accuracy": 0.5672455906867981, - "step": 4150 - }, - { - "epoch": 0.362780151739775, - "grad_norm": 1.553286075592041, - "learning_rate": 4.250943759845316e-06, - "loss": 1.8625, - "mean_token_accuracy": 0.5689823925495148, - "step": 4160 - }, - { - "epoch": 0.36365221941222636, - "grad_norm": 1.4989771842956543, - "learning_rate": 4.201234475486063e-06, - "loss": 1.8707, - "mean_token_accuracy": 0.5673312187194824, - "step": 4170 - }, - { - "epoch": 0.36452428708467777, - "grad_norm": 1.4032716751098633, - "learning_rate": 4.1517401396911725e-06, - "loss": 1.9254, - "mean_token_accuracy": 0.5564334630966187, - "step": 4180 - }, - { - "epoch": 0.36539635475712917, - "grad_norm": 1.3520058393478394, - "learning_rate": 4.1024625871160325e-06, - "loss": 1.8656, - "mean_token_accuracy": 0.5695327788591384, - "step": 4190 - }, - { - "epoch": 0.3662684224295805, - "grad_norm": 1.4250023365020752, - "learning_rate": 4.053403644380321e-06, - "loss": 1.8586, - "mean_token_accuracy": 0.566585123538971, - "step": 4200 - }, - { - "epoch": 0.3662684224295805, - "eval_runtime": 4.0855, - "eval_samples_per_second": 268.511, - "eval_steps_per_second": 67.311, - "step": 4200 - }, - { - "epoch": 0.3671404901020319, - "grad_norm": 1.4445308446884155, - "learning_rate": 4.004565130000277e-06, - "loss": 1.8926, - "mean_token_accuracy": 0.56280577480793, - "step": 4210 - }, - { - "epoch": 0.3680125577744833, - "grad_norm": 1.4670929908752441, - "learning_rate": 3.955948854321321e-06, - "loss": 1.891, - "mean_token_accuracy": 0.5640044003725052, - "step": 4220 - }, - { - "epoch": 0.3688846254469347, - "grad_norm": 1.4516441822052002, - "learning_rate": 3.907556619450909e-06, - "loss": 1.8914, - "mean_token_accuracy": 0.5648605704307557, - "step": 4230 - }, - { - "epoch": 0.3697566931193861, - "grad_norm": 1.5820462703704834, - "learning_rate": 3.859390219191775e-06, - "loss": 1.8887, - "mean_token_accuracy": 0.5644569545984268, - "step": 4240 - }, - { - "epoch": 0.37062876079183743, - "grad_norm": 1.4450770616531372, - "learning_rate": 3.8114514389754098e-06, - "loss": 1.8836, - "mean_token_accuracy": 0.5650807201862336, - "step": 4250 - }, - { - "epoch": 0.37150082846428883, - "grad_norm": 1.4532335996627808, - "learning_rate": 3.7637420557958927e-06, - "loss": 1.8766, - "mean_token_accuracy": 0.5693982392549515, - "step": 4260 - }, - { - "epoch": 0.3723728961367402, - "grad_norm": 1.3702110052108765, - "learning_rate": 3.7162638381440077e-06, - "loss": 1.875, - "mean_token_accuracy": 0.5667074292898178, - "step": 4270 - }, - { - "epoch": 0.3732449638091916, - "grad_norm": 1.4957563877105713, - "learning_rate": 3.6690185459417107e-06, - "loss": 1.8914, - "mean_token_accuracy": 0.5666585087776184, - "step": 4280 - }, - { - "epoch": 0.374117031481643, - "grad_norm": 1.4075829982757568, - "learning_rate": 3.622007930476865e-06, - "loss": 1.8344, - "mean_token_accuracy": 0.5794031322002411, - "step": 4290 - }, - { - "epoch": 0.37498909915409434, - "grad_norm": 1.4577142000198364, - "learning_rate": 3.575233734338356e-06, - "loss": 1.9273, - "mean_token_accuracy": 0.5555650681257248, - "step": 4300 - }, - { - "epoch": 0.37498909915409434, - "eval_runtime": 4.0876, - "eval_samples_per_second": 268.375, - "eval_steps_per_second": 67.277, - "step": 4300 - }, - { - "epoch": 0.37586116682654575, - "grad_norm": 1.4766334295272827, - "learning_rate": 3.528697691351465e-06, - "loss": 1.9238, - "mean_token_accuracy": 0.5578400224447251, - "step": 4310 - }, - { - "epoch": 0.37673323449899715, - "grad_norm": 1.4638077020645142, - "learning_rate": 3.4824015265136278e-06, - "loss": 1.9078, - "mean_token_accuracy": 0.563307237625122, - "step": 4320 - }, - { - "epoch": 0.3776053021714485, - "grad_norm": 1.4318853616714478, - "learning_rate": 3.436346955930472e-06, - "loss": 1.8813, - "mean_token_accuracy": 0.5675513714551925, - "step": 4330 - }, - { - "epoch": 0.3784773698438999, - "grad_norm": 1.6565748453140259, - "learning_rate": 3.3905356867522187e-06, - "loss": 1.9586, - "mean_token_accuracy": 0.5517000943422318, - "step": 4340 - }, - { - "epoch": 0.37934943751635125, - "grad_norm": 1.5385055541992188, - "learning_rate": 3.344969417110391e-06, - "loss": 1.9031, - "mean_token_accuracy": 0.5641022533178329, - "step": 4350 - }, - { - "epoch": 0.38022150518880266, - "grad_norm": 1.421630620956421, - "learning_rate": 3.29964983605487e-06, - "loss": 1.9352, - "mean_token_accuracy": 0.5561643779277802, - "step": 4360 - }, - { - "epoch": 0.38109357286125406, - "grad_norm": 1.5509594678878784, - "learning_rate": 3.2545786234913e-06, - "loss": 1.8438, - "mean_token_accuracy": 0.571159490942955, - "step": 4370 - }, - { - "epoch": 0.3819656405337054, - "grad_norm": 1.433976411819458, - "learning_rate": 3.2097574501187877e-06, - "loss": 1.8953, - "mean_token_accuracy": 0.5654965758323669, - "step": 4380 - }, - { - "epoch": 0.3828377082061568, - "grad_norm": 1.4957337379455566, - "learning_rate": 3.165187977368007e-06, - "loss": 1.8609, - "mean_token_accuracy": 0.5669642895460129, - "step": 4390 - }, - { - "epoch": 0.38370977587860816, - "grad_norm": 1.774539589881897, - "learning_rate": 3.120871857339582e-06, - "loss": 1.932, - "mean_token_accuracy": 0.5543542072176934, - "step": 4400 - }, - { - "epoch": 0.38370977587860816, - "eval_runtime": 4.0884, - "eval_samples_per_second": 268.317, - "eval_steps_per_second": 67.263, - "step": 4400 - }, - { - "epoch": 0.38458184355105957, - "grad_norm": 1.4590060710906982, - "learning_rate": 3.0768107327428766e-06, - "loss": 1.8539, - "mean_token_accuracy": 0.5772137969732285, - "step": 4410 - }, - { - "epoch": 0.38545391122351097, - "grad_norm": 1.431147575378418, - "learning_rate": 3.033006236835071e-06, - "loss": 1.8816, - "mean_token_accuracy": 0.5620719224214554, - "step": 4420 - }, - { - "epoch": 0.3863259788959623, - "grad_norm": 1.6103287935256958, - "learning_rate": 2.9894599933606518e-06, - "loss": 1.8766, - "mean_token_accuracy": 0.5686521530151367, - "step": 4430 - }, - { - "epoch": 0.3871980465684137, - "grad_norm": 1.3800914287567139, - "learning_rate": 2.9461736164911934e-06, - "loss": 1.8262, - "mean_token_accuracy": 0.5783757269382477, - "step": 4440 - }, - { - "epoch": 0.38807011424086507, - "grad_norm": 1.6143025159835815, - "learning_rate": 2.903148710765552e-06, - "loss": 1.9012, - "mean_token_accuracy": 0.5581090986728668, - "step": 4450 - }, - { - "epoch": 0.3889421819133165, - "grad_norm": 1.535288691520691, - "learning_rate": 2.8603868710303662e-06, - "loss": 1.8422, - "mean_token_accuracy": 0.5744496077299118, - "step": 4460 - }, - { - "epoch": 0.3898142495857679, - "grad_norm": 1.5556520223617554, - "learning_rate": 2.8178896823809465e-06, - "loss": 1.893, - "mean_token_accuracy": 0.5614114463329315, - "step": 4470 - }, - { - "epoch": 0.39068631725821923, - "grad_norm": 1.542545199394226, - "learning_rate": 2.7756587201025297e-06, - "loss": 1.8801, - "mean_token_accuracy": 0.5641389399766922, - "step": 4480 - }, - { - "epoch": 0.39155838493067063, - "grad_norm": 1.5595227479934692, - "learning_rate": 2.7336955496118666e-06, - "loss": 1.9055, - "mean_token_accuracy": 0.5606286689639092, - "step": 4490 - }, - { - "epoch": 0.392430452603122, - "grad_norm": 1.4849331378936768, - "learning_rate": 2.692001726399215e-06, - "loss": 1.8949, - "mean_token_accuracy": 0.5600293561816215, - "step": 4500 - }, - { - "epoch": 0.392430452603122, - "eval_runtime": 4.0889, - "eval_samples_per_second": 268.29, - "eval_steps_per_second": 67.256, - "step": 4500 - }, - { - "epoch": 0.3933025202755734, - "grad_norm": 1.4943605661392212, - "learning_rate": 2.6505787959706607e-06, - "loss": 1.8523, - "mean_token_accuracy": 0.5698752492666245, - "step": 4510 - }, - { - "epoch": 0.3941745879480248, - "grad_norm": 1.4366259574890137, - "learning_rate": 2.609428293790852e-06, - "loss": 1.8289, - "mean_token_accuracy": 0.5753302335739136, - "step": 4520 - }, - { - "epoch": 0.39504665562047614, - "grad_norm": 1.7184029817581177, - "learning_rate": 2.5685517452260566e-06, - "loss": 1.9219, - "mean_token_accuracy": 0.5587573379278183, - "step": 4530 - }, - { - "epoch": 0.39591872329292754, - "grad_norm": 1.3564153909683228, - "learning_rate": 2.5279506654876473e-06, - "loss": 1.8469, - "mean_token_accuracy": 0.5691046923398971, - "step": 4540 - }, - { - "epoch": 0.3967907909653789, - "grad_norm": 1.5561156272888184, - "learning_rate": 2.487626559575911e-06, - "loss": 1.907, - "mean_token_accuracy": 0.5614848345518112, - "step": 4550 - }, - { - "epoch": 0.3976628586378303, - "grad_norm": 1.5362129211425781, - "learning_rate": 2.4475809222242775e-06, - "loss": 1.8957, - "mean_token_accuracy": 0.5658757358789444, - "step": 4560 - }, - { - "epoch": 0.3985349263102817, - "grad_norm": 1.4398785829544067, - "learning_rate": 2.4078152378439033e-06, - "loss": 1.9109, - "mean_token_accuracy": 0.5580479472875595, - "step": 4570 - }, - { - "epoch": 0.39940699398273305, - "grad_norm": 1.7573672533035278, - "learning_rate": 2.3683309804686604e-06, - "loss": 1.857, - "mean_token_accuracy": 0.5705724030733108, - "step": 4580 - }, - { - "epoch": 0.40027906165518445, - "grad_norm": 1.3595718145370483, - "learning_rate": 2.329129613700478e-06, - "loss": 1.9227, - "mean_token_accuracy": 0.5525807231664658, - "step": 4590 - }, - { - "epoch": 0.4011511293276358, - "grad_norm": 1.5244477987289429, - "learning_rate": 2.29021259065511e-06, - "loss": 1.8891, - "mean_token_accuracy": 0.5619251430034637, - "step": 4600 - }, - { - "epoch": 0.4011511293276358, - "eval_runtime": 4.0833, - "eval_samples_per_second": 268.656, - "eval_steps_per_second": 67.348, - "step": 4600 - }, - { - "epoch": 0.4020231970000872, - "grad_norm": 1.3871146440505981, - "learning_rate": 2.251581353908252e-06, - "loss": 1.9031, - "mean_token_accuracy": 0.56024951338768, - "step": 4610 - }, - { - "epoch": 0.4028952646725386, - "grad_norm": 1.5421710014343262, - "learning_rate": 2.2132373354420833e-06, - "loss": 1.8973, - "mean_token_accuracy": 0.5607142865657806, - "step": 4620 - }, - { - "epoch": 0.40376733234498996, - "grad_norm": 1.5073237419128418, - "learning_rate": 2.1751819565921774e-06, - "loss": 1.9176, - "mean_token_accuracy": 0.5590753436088562, - "step": 4630 - }, - { - "epoch": 0.40463940001744136, - "grad_norm": 1.3509526252746582, - "learning_rate": 2.137416627994814e-06, - "loss": 1.8664, - "mean_token_accuracy": 0.5698507905006409, - "step": 4640 - }, - { - "epoch": 0.4055114676898927, - "grad_norm": 1.6109211444854736, - "learning_rate": 2.0999427495347035e-06, - "loss": 1.8957, - "mean_token_accuracy": 0.5636007845401764, - "step": 4650 - }, - { - "epoch": 0.4063835353623441, - "grad_norm": 1.4406919479370117, - "learning_rate": 2.0627617102930753e-06, - "loss": 1.9035, - "mean_token_accuracy": 0.5599682003259658, - "step": 4660 - }, - { - "epoch": 0.4072556030347955, - "grad_norm": 1.4089229106903076, - "learning_rate": 2.02587488849621e-06, - "loss": 1.8594, - "mean_token_accuracy": 0.5714530318975448, - "step": 4670 - }, - { - "epoch": 0.40812767070724687, - "grad_norm": 1.4895331859588623, - "learning_rate": 1.989283651464329e-06, - "loss": 1.9047, - "mean_token_accuracy": 0.5572162419557571, - "step": 4680 - }, - { - "epoch": 0.4089997383796983, - "grad_norm": 1.4899771213531494, - "learning_rate": 1.952989355560929e-06, - "loss": 1.8523, - "mean_token_accuracy": 0.5744985342025757, - "step": 4690 - }, - { - "epoch": 0.4098718060521496, - "grad_norm": 1.5392106771469116, - "learning_rate": 1.9169933461424928e-06, - "loss": 1.9023, - "mean_token_accuracy": 0.5581702530384064, - "step": 4700 - }, - { - "epoch": 0.4098718060521496, - "eval_runtime": 4.0847, - "eval_samples_per_second": 268.562, - "eval_steps_per_second": 67.324, - "step": 4700 - }, - { - "epoch": 0.410743873724601, - "grad_norm": 1.3449276685714722, - "learning_rate": 1.8812969575086272e-06, - "loss": 1.8813, - "mean_token_accuracy": 0.5668175131082535, - "step": 4710 - }, - { - "epoch": 0.41161594139705243, - "grad_norm": 1.4662140607833862, - "learning_rate": 1.8459015128525937e-06, - "loss": 1.807, - "mean_token_accuracy": 0.5794887483119965, - "step": 4720 - }, - { - "epoch": 0.4124880090695038, - "grad_norm": 1.5249788761138916, - "learning_rate": 1.8108083242122764e-06, - "loss": 1.8656, - "mean_token_accuracy": 0.5667318940162659, - "step": 4730 - }, - { - "epoch": 0.4133600767419552, - "grad_norm": 1.5550411939620972, - "learning_rate": 1.7760186924215239e-06, - "loss": 1.8168, - "mean_token_accuracy": 0.5799168288707733, - "step": 4740 - }, - { - "epoch": 0.41423214441440653, - "grad_norm": 1.4331603050231934, - "learning_rate": 1.7415339070619586e-06, - "loss": 1.8926, - "mean_token_accuracy": 0.5634173184633255, - "step": 4750 - }, - { - "epoch": 0.41510421208685794, - "grad_norm": 1.5571794509887695, - "learning_rate": 1.7073552464151465e-06, - "loss": 1.8789, - "mean_token_accuracy": 0.5671355128288269, - "step": 4760 - }, - { - "epoch": 0.41597627975930934, - "grad_norm": 1.4224915504455566, - "learning_rate": 1.6734839774152322e-06, - "loss": 1.923, - "mean_token_accuracy": 0.5605430483818055, - "step": 4770 - }, - { - "epoch": 0.4168483474317607, - "grad_norm": 1.370436668395996, - "learning_rate": 1.6399213556019732e-06, - "loss": 1.8527, - "mean_token_accuracy": 0.5706702560186386, - "step": 4780 - }, - { - "epoch": 0.4177204151042121, - "grad_norm": 1.4772405624389648, - "learning_rate": 1.6066686250741904e-06, - "loss": 1.8605, - "mean_token_accuracy": 0.5731531292200088, - "step": 4790 - }, - { - "epoch": 0.41859248277666344, - "grad_norm": 1.499083399772644, - "learning_rate": 1.573727018443667e-06, - "loss": 1.8875, - "mean_token_accuracy": 0.5655944168567657, - "step": 4800 - }, - { - "epoch": 0.41859248277666344, - "eval_runtime": 4.0836, - "eval_samples_per_second": 268.638, - "eval_steps_per_second": 67.343, - "step": 4800 - }, - { - "epoch": 0.41946455044911485, - "grad_norm": 1.4531036615371704, - "learning_rate": 1.5410977567894403e-06, - "loss": 1.9074, - "mean_token_accuracy": 0.5571550875902176, - "step": 4810 - }, - { - "epoch": 0.42033661812156625, - "grad_norm": 1.4231221675872803, - "learning_rate": 1.5087820496125595e-06, - "loss": 1.9078, - "mean_token_accuracy": 0.5635396242141724, - "step": 4820 - }, - { - "epoch": 0.4212086857940176, - "grad_norm": 1.5508164167404175, - "learning_rate": 1.4767810947912275e-06, - "loss": 1.8996, - "mean_token_accuracy": 0.5575464755296707, - "step": 4830 - }, - { - "epoch": 0.422080753466469, - "grad_norm": 1.4093987941741943, - "learning_rate": 1.4450960785364244e-06, - "loss": 1.9098, - "mean_token_accuracy": 0.5573507815599441, - "step": 4840 - }, - { - "epoch": 0.42295282113892035, - "grad_norm": 1.468575358390808, - "learning_rate": 1.4137281753479092e-06, - "loss": 1.9238, - "mean_token_accuracy": 0.560665363073349, - "step": 4850 - }, - { - "epoch": 0.42382488881137176, - "grad_norm": 1.482642412185669, - "learning_rate": 1.3826785479707128e-06, - "loss": 1.9027, - "mean_token_accuracy": 0.5603473573923111, - "step": 4860 - }, - { - "epoch": 0.42469695648382316, - "grad_norm": 1.5216703414916992, - "learning_rate": 1.3519483473520124e-06, - "loss": 1.9207, - "mean_token_accuracy": 0.554244127869606, - "step": 4870 - }, - { - "epoch": 0.4255690241562745, - "grad_norm": 1.4411274194717407, - "learning_rate": 1.3215387125984813e-06, - "loss": 1.8895, - "mean_token_accuracy": 0.5643101736903191, - "step": 4880 - }, - { - "epoch": 0.4264410918287259, - "grad_norm": 1.301592469215393, - "learning_rate": 1.2914507709340596e-06, - "loss": 1.8734, - "mean_token_accuracy": 0.5655454933643341, - "step": 4890 - }, - { - "epoch": 0.42731315950117726, - "grad_norm": 1.4577182531356812, - "learning_rate": 1.2616856376581766e-06, - "loss": 1.8902, - "mean_token_accuracy": 0.562267616391182, - "step": 4900 - }, - { - "epoch": 0.42731315950117726, - "eval_runtime": 4.0907, - "eval_samples_per_second": 268.168, - "eval_steps_per_second": 67.225, - "step": 4900 - }, - { - "epoch": 0.42818522717362867, - "grad_norm": 1.4009668827056885, - "learning_rate": 1.2322444161044e-06, - "loss": 1.9051, - "mean_token_accuracy": 0.5627079278230667, - "step": 4910 - }, - { - "epoch": 0.4290572948460801, - "grad_norm": 1.520477294921875, - "learning_rate": 1.2031281975995467e-06, - "loss": 1.8891, - "mean_token_accuracy": 0.5616927653551101, - "step": 4920 - }, - { - "epoch": 0.4299293625185314, - "grad_norm": 1.5954930782318115, - "learning_rate": 1.1743380614232213e-06, - "loss": 1.9023, - "mean_token_accuracy": 0.5604085087776184, - "step": 4930 - }, - { - "epoch": 0.4308014301909828, - "grad_norm": 1.5380669832229614, - "learning_rate": 1.1458750747678105e-06, - "loss": 1.859, - "mean_token_accuracy": 0.5733243674039841, - "step": 4940 - }, - { - "epoch": 0.43167349786343423, - "grad_norm": 1.46197509765625, - "learning_rate": 1.1177402926989345e-06, - "loss": 1.8902, - "mean_token_accuracy": 0.5661203533411026, - "step": 4950 - }, - { - "epoch": 0.4325455655358856, - "grad_norm": 1.3501838445663452, - "learning_rate": 1.0899347581163222e-06, - "loss": 1.8992, - "mean_token_accuracy": 0.5643957883119584, - "step": 4960 - }, - { - "epoch": 0.433417633208337, - "grad_norm": 1.4949750900268555, - "learning_rate": 1.0624595017151685e-06, - "loss": 1.9086, - "mean_token_accuracy": 0.5587818026542664, - "step": 4970 - }, - { - "epoch": 0.43428970088078833, - "grad_norm": 1.6622486114501953, - "learning_rate": 1.0353155419479122e-06, - "loss": 1.8758, - "mean_token_accuracy": 0.5664995104074478, - "step": 4980 - }, - { - "epoch": 0.43516176855323974, - "grad_norm": 1.6827987432479858, - "learning_rate": 1.0085038849865025e-06, - "loss": 1.8867, - "mean_token_accuracy": 0.5640166342258454, - "step": 4990 - }, - { - "epoch": 0.43603383622569114, - "grad_norm": 1.3393261432647705, - "learning_rate": 9.820255246850853e-07, - "loss": 1.8934, - "mean_token_accuracy": 0.5630259305238724, - "step": 5000 - }, - { - "epoch": 0.43603383622569114, - "eval_runtime": 4.0872, - "eval_samples_per_second": 268.401, - "eval_steps_per_second": 67.284, - "step": 5000 - }, - { - "epoch": 0.4369059038981425, - "grad_norm": 1.4590463638305664, - "learning_rate": 9.55881442543174e-07, - "loss": 1.9066, - "mean_token_accuracy": 0.5621086120605469, - "step": 5010 - }, - { - "epoch": 0.4377779715705939, - "grad_norm": 1.3824725151062012, - "learning_rate": 9.30072607669259e-07, - "loss": 1.9039, - "mean_token_accuracy": 0.5605308175086975, - "step": 5020 - }, - { - "epoch": 0.43865003924304524, - "grad_norm": 1.4778671264648438, - "learning_rate": 9.045999767448988e-07, - "loss": 1.8813, - "mean_token_accuracy": 0.5673067539930343, - "step": 5030 - }, - { - "epoch": 0.43952210691549665, - "grad_norm": 1.4248597621917725, - "learning_rate": 8.794644939892361e-07, - "loss": 1.9184, - "mean_token_accuracy": 0.5599559783935547, - "step": 5040 - }, - { - "epoch": 0.44039417458794805, - "grad_norm": 1.588160514831543, - "learning_rate": 8.546670911240196e-07, - "loss": 1.9059, - "mean_token_accuracy": 0.5637475490570069, - "step": 5050 - }, - { - "epoch": 0.4412662422603994, - "grad_norm": 1.4497967958450317, - "learning_rate": 8.302086873390536e-07, - "loss": 1.9164, - "mean_token_accuracy": 0.5602005869150162, - "step": 5060 - }, - { - "epoch": 0.4421383099328508, - "grad_norm": 1.405897855758667, - "learning_rate": 8.060901892581241e-07, - "loss": 1.9043, - "mean_token_accuracy": 0.5620474576950073, - "step": 5070 - }, - { - "epoch": 0.44301037760530215, - "grad_norm": 1.4370694160461426, - "learning_rate": 7.82312490905407e-07, - "loss": 1.8742, - "mean_token_accuracy": 0.5676859140396118, - "step": 5080 - }, - { - "epoch": 0.44388244527775356, - "grad_norm": 1.4547231197357178, - "learning_rate": 7.588764736723086e-07, - "loss": 1.8926, - "mean_token_accuracy": 0.5644447177648544, - "step": 5090 - }, - { - "epoch": 0.44475451295020496, - "grad_norm": 1.5096557140350342, - "learning_rate": 7.357830062848114e-07, - "loss": 1.866, - "mean_token_accuracy": 0.5678693801164627, - "step": 5100 - }, - { - "epoch": 0.44475451295020496, - "eval_runtime": 4.0849, - "eval_samples_per_second": 268.55, - "eval_steps_per_second": 67.321, - "step": 5100 - }, - { - "epoch": 0.4456265806226563, - "grad_norm": 1.6047755479812622, - "learning_rate": 7.130329447712581e-07, - "loss": 1.8086, - "mean_token_accuracy": 0.5806017607450485, - "step": 5110 - }, - { - "epoch": 0.4464986482951077, - "grad_norm": 1.3617113828659058, - "learning_rate": 6.906271324306335e-07, - "loss": 1.8957, - "mean_token_accuracy": 0.5590753436088562, - "step": 5120 - }, - { - "epoch": 0.44737071596755906, - "grad_norm": 1.5302565097808838, - "learning_rate": 6.685663998012926e-07, - "loss": 1.8848, - "mean_token_accuracy": 0.5646404147148132, - "step": 5130 - }, - { - "epoch": 0.44824278364001047, - "grad_norm": 1.4842449426651, - "learning_rate": 6.468515646301865e-07, - "loss": 1.8988, - "mean_token_accuracy": 0.5620352268218994, - "step": 5140 - }, - { - "epoch": 0.44911485131246187, - "grad_norm": 1.552765130996704, - "learning_rate": 6.254834318425363e-07, - "loss": 1.8988, - "mean_token_accuracy": 0.5594422698020936, - "step": 5150 - }, - { - "epoch": 0.4499869189849132, - "grad_norm": 1.4039723873138428, - "learning_rate": 6.044627935120107e-07, - "loss": 1.8828, - "mean_token_accuracy": 0.5647994130849838, - "step": 5160 - }, - { - "epoch": 0.4508589866573646, - "grad_norm": 1.4126039743423462, - "learning_rate": 5.837904288313545e-07, - "loss": 1.8734, - "mean_token_accuracy": 0.567465752363205, - "step": 5170 - }, - { - "epoch": 0.451731054329816, - "grad_norm": 1.4614925384521484, - "learning_rate": 5.634671040835104e-07, - "loss": 1.9406, - "mean_token_accuracy": 0.5528742641210556, - "step": 5180 - }, - { - "epoch": 0.4526031220022674, - "grad_norm": 1.54082453250885, - "learning_rate": 5.43493572613214e-07, - "loss": 1.9375, - "mean_token_accuracy": 0.5544275879859925, - "step": 5190 - }, - { - "epoch": 0.4534751896747188, - "grad_norm": 1.5252362489700317, - "learning_rate": 5.238705747990669e-07, - "loss": 1.8316, - "mean_token_accuracy": 0.5732020527124405, - "step": 5200 - }, - { - "epoch": 0.4534751896747188, - "eval_runtime": 4.0849, - "eval_samples_per_second": 268.548, - "eval_steps_per_second": 67.321, - "step": 5200 - }, - { - "epoch": 0.45434725734717013, - "grad_norm": 1.409277319908142, - "learning_rate": 5.045988380260935e-07, - "loss": 1.8574, - "mean_token_accuracy": 0.568272989988327, - "step": 5210 - }, - { - "epoch": 0.45521932501962153, - "grad_norm": 1.4178364276885986, - "learning_rate": 4.856790766587815e-07, - "loss": 1.9172, - "mean_token_accuracy": 0.5640533313155174, - "step": 5220 - }, - { - "epoch": 0.4560913926920729, - "grad_norm": 1.3952265977859497, - "learning_rate": 4.6711199201459833e-07, - "loss": 1.8938, - "mean_token_accuracy": 0.5614359140396118, - "step": 5230 - }, - { - "epoch": 0.4569634603645243, - "grad_norm": 1.577718734741211, - "learning_rate": 4.488982723379887e-07, - "loss": 1.8551, - "mean_token_accuracy": 0.5694104731082916, - "step": 5240 - }, - { - "epoch": 0.4578355280369757, - "grad_norm": 1.529725193977356, - "learning_rate": 4.3103859277488056e-07, - "loss": 1.8492, - "mean_token_accuracy": 0.5718933463096618, - "step": 5250 - }, - { - "epoch": 0.45870759570942704, - "grad_norm": 1.4993305206298828, - "learning_rate": 4.1353361534763657e-07, - "loss": 1.8855, - "mean_token_accuracy": 0.5651663422584534, - "step": 5260 - }, - { - "epoch": 0.45957966338187844, - "grad_norm": 1.5709185600280762, - "learning_rate": 3.963839889305343e-07, - "loss": 1.8855, - "mean_token_accuracy": 0.563551864027977, - "step": 5270 - }, - { - "epoch": 0.4604517310543298, - "grad_norm": 1.550584077835083, - "learning_rate": 3.7959034922569804e-07, - "loss": 1.9148, - "mean_token_accuracy": 0.5563723057508468, - "step": 5280 - }, - { - "epoch": 0.4613237987267812, - "grad_norm": 1.3886584043502808, - "learning_rate": 3.631533187395453e-07, - "loss": 1.8949, - "mean_token_accuracy": 0.561753910779953, - "step": 5290 - }, - { - "epoch": 0.4621958663992326, - "grad_norm": 1.4985154867172241, - "learning_rate": 3.470735067597053e-07, - "loss": 1.882, - "mean_token_accuracy": 0.5627935439348221, - "step": 5300 - }, - { - "epoch": 0.4621958663992326, - "eval_runtime": 4.0805, - "eval_samples_per_second": 268.842, - "eval_steps_per_second": 67.394, - "step": 5300 - }, - { - "epoch": 0.46306793407168395, - "grad_norm": 1.5550390481948853, - "learning_rate": 3.313515093324393e-07, - "loss": 1.8559, - "mean_token_accuracy": 0.5729207396507263, - "step": 5310 - }, - { - "epoch": 0.46394000174413536, - "grad_norm": 1.4984419345855713, - "learning_rate": 3.1598790924053936e-07, - "loss": 1.8711, - "mean_token_accuracy": 0.5680161476135254, - "step": 5320 - }, - { - "epoch": 0.4648120694165867, - "grad_norm": 1.5940117835998535, - "learning_rate": 3.009832759817344e-07, - "loss": 1.9152, - "mean_token_accuracy": 0.5571550905704499, - "step": 5330 - }, - { - "epoch": 0.4656841370890381, - "grad_norm": 1.379276990890503, - "learning_rate": 2.8633816574757166e-07, - "loss": 1.8621, - "mean_token_accuracy": 0.5682240694761276, - "step": 5340 - }, - { - "epoch": 0.4665562047614895, - "grad_norm": 1.4445042610168457, - "learning_rate": 2.720531214028055e-07, - "loss": 1.8508, - "mean_token_accuracy": 0.5736301362514495, - "step": 5350 - }, - { - "epoch": 0.46742827243394086, - "grad_norm": 1.5308756828308105, - "learning_rate": 2.5812867246527207e-07, - "loss": 1.8645, - "mean_token_accuracy": 0.5661448121070862, - "step": 5360 - }, - { - "epoch": 0.46830034010639227, - "grad_norm": 1.438707947731018, - "learning_rate": 2.445653350862609e-07, - "loss": 1.9, - "mean_token_accuracy": 0.5600905090570449, - "step": 5370 - }, - { - "epoch": 0.4691724077788436, - "grad_norm": 1.507072925567627, - "learning_rate": 2.3136361203138668e-07, - "loss": 1.9008, - "mean_token_accuracy": 0.5647015661001206, - "step": 5380 - }, - { - "epoch": 0.470044475451295, - "grad_norm": 1.5116603374481201, - "learning_rate": 2.1852399266194312e-07, - "loss": 1.8871, - "mean_token_accuracy": 0.5647260278463364, - "step": 5390 - }, - { - "epoch": 0.4709165431237464, - "grad_norm": 1.4377670288085938, - "learning_rate": 2.0604695291677523e-07, - "loss": 1.8707, - "mean_token_accuracy": 0.566646283864975, - "step": 5400 - }, - { - "epoch": 0.4709165431237464, - "eval_runtime": 4.085, - "eval_samples_per_second": 268.545, - "eval_steps_per_second": 67.32, - "step": 5400 - }, - { - "epoch": 0.47178861079619777, - "grad_norm": 1.5869266986846924, - "learning_rate": 1.9393295529462674e-07, - "loss": 1.8711, - "mean_token_accuracy": 0.5661203533411026, - "step": 5410 - }, - { - "epoch": 0.4726606784686492, - "grad_norm": 1.4018718004226685, - "learning_rate": 1.8218244883700386e-07, - "loss": 1.9012, - "mean_token_accuracy": 0.5612279862165451, - "step": 5420 - }, - { - "epoch": 0.4735327461411005, - "grad_norm": 1.4257837533950806, - "learning_rate": 1.7079586911152413e-07, - "loss": 1.8687, - "mean_token_accuracy": 0.5651785671710968, - "step": 5430 - }, - { - "epoch": 0.47440481381355193, - "grad_norm": 1.3777456283569336, - "learning_rate": 1.597736381957782e-07, - "loss": 1.8613, - "mean_token_accuracy": 0.5744863003492355, - "step": 5440 - }, - { - "epoch": 0.47527688148600333, - "grad_norm": 1.4538086652755737, - "learning_rate": 1.4911616466167345e-07, - "loss": 1.9023, - "mean_token_accuracy": 0.5583414852619171, - "step": 5450 - }, - { - "epoch": 0.4761489491584547, - "grad_norm": 1.3854527473449707, - "learning_rate": 1.3882384356030066e-07, - "loss": 1.943, - "mean_token_accuracy": 0.5537181943655014, - "step": 5460 - }, - { - "epoch": 0.4770210168309061, - "grad_norm": 1.4162297248840332, - "learning_rate": 1.2889705640728445e-07, - "loss": 1.8441, - "mean_token_accuracy": 0.5747064501047134, - "step": 5470 - }, - { - "epoch": 0.47789308450335743, - "grad_norm": 1.4150612354278564, - "learning_rate": 1.1933617116863805e-07, - "loss": 1.9055, - "mean_token_accuracy": 0.5604941263794899, - "step": 5480 - }, - { - "epoch": 0.47876515217580884, - "grad_norm": 1.6404210329055786, - "learning_rate": 1.1014154224713302e-07, - "loss": 1.8938, - "mean_token_accuracy": 0.5667685896158219, - "step": 5490 - }, - { - "epoch": 0.47963721984826024, - "grad_norm": 1.6611881256103516, - "learning_rate": 1.0131351046915094e-07, - "loss": 1.8387, - "mean_token_accuracy": 0.5724070489406585, - "step": 5500 - }, - { - "epoch": 0.47963721984826024, - "eval_runtime": 4.0906, - "eval_samples_per_second": 268.176, - "eval_steps_per_second": 67.227, - "step": 5500 - }, - { - "epoch": 0.4805092875207116, - "grad_norm": 1.3786742687225342, - "learning_rate": 9.285240307206123e-08, - "loss": 1.8906, - "mean_token_accuracy": 0.563209393620491, - "step": 5510 - }, - { - "epoch": 0.481381355193163, - "grad_norm": 1.4849506616592407, - "learning_rate": 8.475853369207753e-08, - "loss": 1.9164, - "mean_token_accuracy": 0.5593566566705703, - "step": 5520 - }, - { - "epoch": 0.48225342286561435, - "grad_norm": 1.3948898315429688, - "learning_rate": 7.703220235264708e-08, - "loss": 1.893, - "mean_token_accuracy": 0.5631482422351837, - "step": 5530 - }, - { - "epoch": 0.48312549053806575, - "grad_norm": 1.41569983959198, - "learning_rate": 6.967369545331615e-08, - "loss": 1.8961, - "mean_token_accuracy": 0.5623043060302735, - "step": 5540 - }, - { - "epoch": 0.48399755821051715, - "grad_norm": 1.363344430923462, - "learning_rate": 6.26832857591242e-08, - "loss": 1.8785, - "mean_token_accuracy": 0.562940314412117, - "step": 5550 - }, - { - "epoch": 0.4848696258829685, - "grad_norm": 1.4277682304382324, - "learning_rate": 5.606123239048522e-08, - "loss": 1.8809, - "mean_token_accuracy": 0.5643276035785675, - "step": 5560 - }, - { - "epoch": 0.4857416935554199, - "grad_norm": 1.4068211317062378, - "learning_rate": 4.9807780813586615e-08, - "loss": 1.8551, - "mean_token_accuracy": 0.5700464755296707, - "step": 5570 - }, - { - "epoch": 0.48661376122787126, - "grad_norm": 1.4484305381774902, - "learning_rate": 4.392316283128861e-08, - "loss": 1.8262, - "mean_token_accuracy": 0.5742539197206498, - "step": 5580 - }, - { - "epoch": 0.48748582890032266, - "grad_norm": 1.5154151916503906, - "learning_rate": 3.840759657453452e-08, - "loss": 1.8965, - "mean_token_accuracy": 0.5596379607915878, - "step": 5590 - }, - { - "epoch": 0.48835789657277406, - "grad_norm": 1.4292099475860596, - "learning_rate": 3.326128649426053e-08, - "loss": 1.8988, - "mean_token_accuracy": 0.5628179997205734, - "step": 5600 - }, - { - "epoch": 0.48835789657277406, - "eval_runtime": 4.0919, - "eval_samples_per_second": 268.088, - "eval_steps_per_second": 67.205, - "step": 5600 - }, - { - "epoch": 0.4892299642452254, - "grad_norm": 1.4347695112228394, - "learning_rate": 2.8484423353822842e-08, - "loss": 1.8422, - "mean_token_accuracy": 0.5740337610244751, - "step": 5610 - }, - { - "epoch": 0.4901020319176768, - "grad_norm": 1.5624750852584839, - "learning_rate": 2.4077184221920068e-08, - "loss": 1.8184, - "mean_token_accuracy": 0.5777030318975449, - "step": 5620 - }, - { - "epoch": 0.4909740995901282, - "grad_norm": 1.4097241163253784, - "learning_rate": 2.003973246603508e-08, - "loss": 1.8371, - "mean_token_accuracy": 0.5728473573923111, - "step": 5630 - }, - { - "epoch": 0.49184616726257957, - "grad_norm": 1.5872358083724976, - "learning_rate": 1.637221774637765e-08, - "loss": 1.9125, - "mean_token_accuracy": 0.559552350640297, - "step": 5640 - }, - { - "epoch": 0.492718234935031, - "grad_norm": 1.5018359422683716, - "learning_rate": 1.3074776010334466e-08, - "loss": 1.85, - "mean_token_accuracy": 0.5692636936902999, - "step": 5650 - }, - { - "epoch": 0.4935903026074823, - "grad_norm": 1.3956242799758911, - "learning_rate": 1.0147529487432028e-08, - "loss": 1.841, - "mean_token_accuracy": 0.5753180056810379, - "step": 5660 - }, - { - "epoch": 0.4944623702799337, - "grad_norm": 1.4925743341445923, - "learning_rate": 7.590586684805834e-09, - "loss": 1.9199, - "mean_token_accuracy": 0.5554916799068451, - "step": 5670 - }, - { - "epoch": 0.49533443795238513, - "grad_norm": 1.4429186582565308, - "learning_rate": 5.4040423831802635e-09, - "loss": 1.8715, - "mean_token_accuracy": 0.5679549932479858, - "step": 5680 - }, - { - "epoch": 0.4962065056248365, - "grad_norm": 1.4584972858428955, - "learning_rate": 3.587977633348061e-09, - "loss": 1.8531, - "mean_token_accuracy": 0.5728473573923111, - "step": 5690 - }, - { - "epoch": 0.4970785732972879, - "grad_norm": 1.384555697441101, - "learning_rate": 2.1424597531749524e-09, - "loss": 1.9008, - "mean_token_accuracy": 0.5612769097089767, - "step": 5700 - }, - { - "epoch": 0.4970785732972879, - "eval_runtime": 4.086, - "eval_samples_per_second": 268.478, - "eval_steps_per_second": 67.303, - "step": 5700 - }, - { - "epoch": 0.49795064096973923, - "grad_norm": 1.3602359294891357, - "learning_rate": 1.0675423250994244e-09, - "loss": 1.868, - "mean_token_accuracy": 0.5703155636787415, - "step": 5710 - }, - { - "epoch": 0.49882270864219064, - "grad_norm": 1.521691083908081, - "learning_rate": 3.6326519414431327e-10, - "loss": 1.8859, - "mean_token_accuracy": 0.5646893322467804, - "step": 5720 - }, - { - "epoch": 0.49969477631464204, - "grad_norm": 1.449621319770813, - "learning_rate": 2.965446644798142e-11, - "loss": 1.898, - "mean_token_accuracy": 0.5618395298719406, - "step": 5730 - }, - { - "epoch": 0.5000436033836225, - "mean_token_accuracy": 0.5644263699650764, - "step": 5734, + "epoch": 0.10136157337367625, + "mean_token_accuracy": 0.3118449641125543, + "step": 67, "total_flos": 0.0, - "train_loss": 2.0137617991367285, - "train_runtime": 996.2065, - "train_samples_per_second": 92.083, - "train_steps_per_second": 5.756 + "train_loss": 4.772621268656716, + "train_runtime": 29.2884, + "train_samples_per_second": 36.093, + "train_steps_per_second": 2.288 } ], "logging_steps": 10, - "max_steps": 5734, + "max_steps": 67, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500,