{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9990717600856835, "eval_steps": 500, "global_step": 7002, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004284184219921457, "grad_norm": 7.21875, "learning_rate": 1.0688836104513063e-05, "loss": 2.91, "mean_token_accuracy": 0.5702525054415067, "step": 10 }, { "epoch": 0.008568368439842914, "grad_norm": 3.453125, "learning_rate": 2.2565320665083135e-05, "loss": 1.6569, "mean_token_accuracy": 0.6329187631607056, "step": 20 }, { "epoch": 0.01285255265976437, "grad_norm": 2.203125, "learning_rate": 3.44418052256532e-05, "loss": 1.4104, "mean_token_accuracy": 0.657142640153567, "step": 30 }, { "epoch": 0.017136736879685827, "grad_norm": 2.375, "learning_rate": 4.631828978622328e-05, "loss": 1.4205, "mean_token_accuracy": 0.657102554043134, "step": 40 }, { "epoch": 0.021420921099607283, "grad_norm": 2.0625, "learning_rate": 5.819477434679335e-05, "loss": 1.4479, "mean_token_accuracy": 0.6437717239061992, "step": 50 }, { "epoch": 0.02570510531952874, "grad_norm": 1.921875, "learning_rate": 7.007125890736342e-05, "loss": 1.4744, "mean_token_accuracy": 0.6392371306816736, "step": 60 }, { "epoch": 0.029989289539450195, "grad_norm": 1.8984375, "learning_rate": 8.194774346793349e-05, "loss": 1.4946, "mean_token_accuracy": 0.6394953320423762, "step": 70 }, { "epoch": 0.034273473759371655, "grad_norm": 1.9140625, "learning_rate": 9.382422802850355e-05, "loss": 1.5123, "mean_token_accuracy": 0.6348034431536992, "step": 80 }, { "epoch": 0.03855765797929311, "grad_norm": 1.7265625, "learning_rate": 0.00010570071258907364, "loss": 1.5446, "mean_token_accuracy": 0.6331378062566121, "step": 90 }, { "epoch": 0.042841842199214566, "grad_norm": 1.671875, "learning_rate": 0.00011757719714964371, "loss": 1.6083, "mean_token_accuracy": 0.6170119682947794, "step": 100 }, { "epoch": 0.047126026419136026, "grad_norm": 1.6484375, "learning_rate": 0.00012945368171021377, "loss": 1.6014, "mean_token_accuracy": 0.6203826546669007, "step": 110 }, { "epoch": 0.05141021063905748, "grad_norm": 6.375, "learning_rate": 0.00014133016627078385, "loss": 1.65, "mean_token_accuracy": 0.6120437135299047, "step": 120 }, { "epoch": 0.05569439485897894, "grad_norm": 1.7265625, "learning_rate": 0.00015320665083135392, "loss": 1.7805, "mean_token_accuracy": 0.5932602127393086, "step": 130 }, { "epoch": 0.05997857907890039, "grad_norm": 1.484375, "learning_rate": 0.000165083135391924, "loss": 1.6786, "mean_token_accuracy": 0.605377835035324, "step": 140 }, { "epoch": 0.06426276329882184, "grad_norm": 1.7109375, "learning_rate": 0.00017695961995249407, "loss": 1.6794, "mean_token_accuracy": 0.6040138930082322, "step": 150 }, { "epoch": 0.06854694751874331, "grad_norm": 1.5625, "learning_rate": 0.00018883610451306412, "loss": 1.6891, "mean_token_accuracy": 0.6048624157905579, "step": 160 }, { "epoch": 0.07283113173866476, "grad_norm": 1.5625, "learning_rate": 0.00020071258907363422, "loss": 1.7496, "mean_token_accuracy": 0.5943053344885508, "step": 170 }, { "epoch": 0.07711531595858621, "grad_norm": 1.6796875, "learning_rate": 0.00021258907363420426, "loss": 1.8166, "mean_token_accuracy": 0.5864472662409147, "step": 180 }, { "epoch": 0.08139950017850768, "grad_norm": 1.5, "learning_rate": 0.00022446555819477434, "loss": 1.804, "mean_token_accuracy": 0.5849504172801971, "step": 190 }, { "epoch": 0.08568368439842913, "grad_norm": 1.2734375, "learning_rate": 0.00023634204275534444, "loss": 1.8061, "mean_token_accuracy": 0.5850104331970215, "step": 200 }, { "epoch": 0.08996786861835059, "grad_norm": 1.5390625, "learning_rate": 0.0002482185273159145, "loss": 1.8503, "mean_token_accuracy": 0.5780546754598618, "step": 210 }, { "epoch": 0.09425205283827205, "grad_norm": 1.2578125, "learning_rate": 0.00026009501187648456, "loss": 1.7982, "mean_token_accuracy": 0.5852496673663458, "step": 220 }, { "epoch": 0.0985362370581935, "grad_norm": 1.25, "learning_rate": 0.00027197149643705463, "loss": 1.8589, "mean_token_accuracy": 0.5725395048658053, "step": 230 }, { "epoch": 0.10282042127811496, "grad_norm": 1.28125, "learning_rate": 0.0002838479809976247, "loss": 1.8616, "mean_token_accuracy": 0.5743008524179458, "step": 240 }, { "epoch": 0.10710460549803641, "grad_norm": 5.71875, "learning_rate": 0.0002957244655581948, "loss": 1.8569, "mean_token_accuracy": 0.5772680620352427, "step": 250 }, { "epoch": 0.11138878971795788, "grad_norm": 1.234375, "learning_rate": 0.00030760095011876486, "loss": 1.883, "mean_token_accuracy": 0.5712258398532868, "step": 260 }, { "epoch": 0.11567297393787933, "grad_norm": 1.328125, "learning_rate": 0.00031947743467933493, "loss": 1.9089, "mean_token_accuracy": 0.5683296079436938, "step": 270 }, { "epoch": 0.11995715815780078, "grad_norm": 1.140625, "learning_rate": 0.000331353919239905, "loss": 1.8845, "mean_token_accuracy": 0.5682273174325625, "step": 280 }, { "epoch": 0.12424134237772225, "grad_norm": 1.046875, "learning_rate": 0.000343230403800475, "loss": 1.8897, "mean_token_accuracy": 0.5696620404720306, "step": 290 }, { "epoch": 0.12852552659764369, "grad_norm": 1.1640625, "learning_rate": 0.00035510688836104515, "loss": 1.8809, "mean_token_accuracy": 0.571748511493206, "step": 300 }, { "epoch": 0.13280971081756515, "grad_norm": 2.640625, "learning_rate": 0.0003669833729216152, "loss": 1.9614, "mean_token_accuracy": 0.566138303776582, "step": 310 }, { "epoch": 0.13709389503748662, "grad_norm": 1.109375, "learning_rate": 0.00037885985748218525, "loss": 1.969, "mean_token_accuracy": 0.5570171405871709, "step": 320 }, { "epoch": 0.14137807925740806, "grad_norm": 1.1640625, "learning_rate": 0.0003907363420427554, "loss": 1.902, "mean_token_accuracy": 0.5694692318638166, "step": 330 }, { "epoch": 0.14566226347732952, "grad_norm": 1.1171875, "learning_rate": 0.00040261282660332545, "loss": 1.9013, "mean_token_accuracy": 0.5666690587997436, "step": 340 }, { "epoch": 0.149946447697251, "grad_norm": 0.96484375, "learning_rate": 0.00041448931116389547, "loss": 1.9115, "mean_token_accuracy": 0.5653235624233882, "step": 350 }, { "epoch": 0.15423063191717243, "grad_norm": 1.2109375, "learning_rate": 0.00042636579572446554, "loss": 1.9273, "mean_token_accuracy": 0.5625237961610158, "step": 360 }, { "epoch": 0.1585148161370939, "grad_norm": 0.9140625, "learning_rate": 0.00043824228028503567, "loss": 1.9705, "mean_token_accuracy": 0.5571655298272769, "step": 370 }, { "epoch": 0.16279900035701536, "grad_norm": 1.1015625, "learning_rate": 0.0004501187648456057, "loss": 1.9421, "mean_token_accuracy": 0.5626261870066325, "step": 380 }, { "epoch": 0.1670831845769368, "grad_norm": 0.89453125, "learning_rate": 0.00046199524940617576, "loss": 1.9847, "mean_token_accuracy": 0.5572134067614873, "step": 390 }, { "epoch": 0.17136736879685827, "grad_norm": 0.96875, "learning_rate": 0.0004738717339667459, "loss": 1.9701, "mean_token_accuracy": 0.5576820741097133, "step": 400 }, { "epoch": 0.17565155301677973, "grad_norm": 0.875, "learning_rate": 0.0004857482185273159, "loss": 1.9652, "mean_token_accuracy": 0.5562737271189689, "step": 410 }, { "epoch": 0.17993573723670117, "grad_norm": 0.984375, "learning_rate": 0.000497624703087886, "loss": 1.9838, "mean_token_accuracy": 0.553861757616202, "step": 420 }, { "epoch": 0.18421992145662264, "grad_norm": 0.90625, "learning_rate": 0.0004999981769212751, "loss": 1.9842, "mean_token_accuracy": 0.5521314447124799, "step": 430 }, { "epoch": 0.1885041056765441, "grad_norm": 0.76171875, "learning_rate": 0.0004999907707095249, "loss": 1.9573, "mean_token_accuracy": 0.5590350007017454, "step": 440 }, { "epoch": 0.19278828989646554, "grad_norm": 0.79296875, "learning_rate": 0.0004999776675909755, "loss": 1.9179, "mean_token_accuracy": 0.5650409559408823, "step": 450 }, { "epoch": 0.197072474116387, "grad_norm": 0.8203125, "learning_rate": 0.000499958867864227, "loss": 1.9485, "mean_token_accuracy": 0.5609820206960042, "step": 460 }, { "epoch": 0.20135665833630847, "grad_norm": 0.7890625, "learning_rate": 0.0004999343719576963, "loss": 1.9483, "mean_token_accuracy": 0.5612422108650208, "step": 470 }, { "epoch": 0.2056408425562299, "grad_norm": 0.82421875, "learning_rate": 0.0004999041804296074, "loss": 1.9683, "mean_token_accuracy": 0.561284634967645, "step": 480 }, { "epoch": 0.20992502677615138, "grad_norm": 0.76953125, "learning_rate": 0.0004998682939679794, "loss": 1.9333, "mean_token_accuracy": 0.56248503079017, "step": 490 }, { "epoch": 0.21420921099607282, "grad_norm": 0.72265625, "learning_rate": 0.0004998267133906095, "loss": 1.8902, "mean_token_accuracy": 0.568443168203036, "step": 500 }, { "epoch": 0.21849339521599428, "grad_norm": 0.8125, "learning_rate": 0.0004997794396450555, "loss": 1.9334, "mean_token_accuracy": 0.5650926142930984, "step": 510 }, { "epoch": 0.22277757943591575, "grad_norm": 0.79296875, "learning_rate": 0.0004997264738086136, "loss": 1.9271, "mean_token_accuracy": 0.5642434308926264, "step": 520 }, { "epoch": 0.2270617636558372, "grad_norm": 0.7109375, "learning_rate": 0.0004996678170882941, "loss": 1.9352, "mean_token_accuracy": 0.5623646408319474, "step": 530 }, { "epoch": 0.23134594787575866, "grad_norm": 0.8125, "learning_rate": 0.000499603470820794, "loss": 1.9214, "mean_token_accuracy": 0.5678030242522557, "step": 540 }, { "epoch": 0.23563013209568012, "grad_norm": 0.65625, "learning_rate": 0.0004995334364724658, "loss": 1.9247, "mean_token_accuracy": 0.5662606621781985, "step": 550 }, { "epoch": 0.23991431631560156, "grad_norm": 0.64453125, "learning_rate": 0.0004994577156392854, "loss": 1.9279, "mean_token_accuracy": 0.5633541295925776, "step": 560 }, { "epoch": 0.24419850053552303, "grad_norm": 0.640625, "learning_rate": 0.0004993763100468144, "loss": 1.8886, "mean_token_accuracy": 0.5679133623838425, "step": 570 }, { "epoch": 0.2484826847554445, "grad_norm": 0.6953125, "learning_rate": 0.0004992892215501618, "loss": 1.8938, "mean_token_accuracy": 0.5673948327700297, "step": 580 }, { "epoch": 0.25276686897536593, "grad_norm": 0.671875, "learning_rate": 0.0004991964521339408, "loss": 1.8795, "mean_token_accuracy": 0.5750111728906632, "step": 590 }, { "epoch": 0.25705105319528737, "grad_norm": 0.640625, "learning_rate": 0.0004990980039122245, "loss": 1.8579, "mean_token_accuracy": 0.5755017310380935, "step": 600 }, { "epoch": 0.26133523741520887, "grad_norm": 0.65234375, "learning_rate": 0.0004989938791284971, "loss": 1.8579, "mean_token_accuracy": 0.5754131337006887, "step": 610 }, { "epoch": 0.2656194216351303, "grad_norm": 0.69921875, "learning_rate": 0.0004988840801556029, "loss": 1.9225, "mean_token_accuracy": 0.5679664716124535, "step": 620 }, { "epoch": 0.26990360585505174, "grad_norm": 0.625, "learning_rate": 0.0004987686094956922, "loss": 1.8843, "mean_token_accuracy": 0.5681375061472257, "step": 630 }, { "epoch": 0.27418779007497324, "grad_norm": 0.69140625, "learning_rate": 0.0004986474697801647, "loss": 1.8747, "mean_token_accuracy": 0.5750736912091573, "step": 640 }, { "epoch": 0.2784719742948947, "grad_norm": 0.65234375, "learning_rate": 0.000498520663769609, "loss": 1.8875, "mean_token_accuracy": 0.568752312163512, "step": 650 }, { "epoch": 0.2827561585148161, "grad_norm": 0.609375, "learning_rate": 0.0004983881943537396, "loss": 1.8393, "mean_token_accuracy": 0.5768392990032832, "step": 660 }, { "epoch": 0.2870403427347376, "grad_norm": 0.59765625, "learning_rate": 0.0004982500645513319, "loss": 1.8389, "mean_token_accuracy": 0.5770889202753703, "step": 670 }, { "epoch": 0.29132452695465905, "grad_norm": 0.62890625, "learning_rate": 0.0004981062775101524, "loss": 1.812, "mean_token_accuracy": 0.5807933017611504, "step": 680 }, { "epoch": 0.2956087111745805, "grad_norm": 0.58984375, "learning_rate": 0.0004979568365068878, "loss": 1.8463, "mean_token_accuracy": 0.571879476805528, "step": 690 }, { "epoch": 0.299892895394502, "grad_norm": 0.60546875, "learning_rate": 0.0004978017449470692, "loss": 1.8659, "mean_token_accuracy": 0.575709896783034, "step": 700 }, { "epoch": 0.3041770796144234, "grad_norm": 0.55859375, "learning_rate": 0.0004976410063649963, "loss": 1.8381, "mean_token_accuracy": 0.577228785554568, "step": 710 }, { "epoch": 0.30846126383434486, "grad_norm": 0.56640625, "learning_rate": 0.0004974746244236546, "loss": 1.7498, "mean_token_accuracy": 0.5971198469400406, "step": 720 }, { "epoch": 0.31274544805426635, "grad_norm": 0.56640625, "learning_rate": 0.0004973026029146343, "loss": 1.797, "mean_token_accuracy": 0.5817218641440074, "step": 730 }, { "epoch": 0.3170296322741878, "grad_norm": 0.55859375, "learning_rate": 0.0004971249457580418, "loss": 1.8144, "mean_token_accuracy": 0.5801170518000921, "step": 740 }, { "epoch": 0.32131381649410923, "grad_norm": 0.6015625, "learning_rate": 0.0004969416570024118, "loss": 1.8323, "mean_token_accuracy": 0.5770084033409755, "step": 750 }, { "epoch": 0.3255980007140307, "grad_norm": 0.5859375, "learning_rate": 0.0004967527408246142, "loss": 1.8112, "mean_token_accuracy": 0.5806623538335164, "step": 760 }, { "epoch": 0.32988218493395216, "grad_norm": 0.55078125, "learning_rate": 0.0004965582015297593, "loss": 1.7814, "mean_token_accuracy": 0.590585180123647, "step": 770 }, { "epoch": 0.3341663691538736, "grad_norm": 0.5703125, "learning_rate": 0.0004963580435510999, "loss": 1.7749, "mean_token_accuracy": 0.5853669941425323, "step": 780 }, { "epoch": 0.3384505533737951, "grad_norm": 0.57421875, "learning_rate": 0.0004961522714499296, "loss": 1.8028, "mean_token_accuracy": 0.5840177754561107, "step": 790 }, { "epoch": 0.34273473759371653, "grad_norm": 0.53125, "learning_rate": 0.0004959408899154796, "loss": 1.8328, "mean_token_accuracy": 0.5787926654020945, "step": 800 }, { "epoch": 0.34701892181363797, "grad_norm": 0.546875, "learning_rate": 0.0004957239037648111, "loss": 1.8215, "mean_token_accuracy": 0.5796164035797119, "step": 810 }, { "epoch": 0.35130310603355946, "grad_norm": 0.546875, "learning_rate": 0.0004955013179427064, "loss": 1.813, "mean_token_accuracy": 0.5805202250679334, "step": 820 }, { "epoch": 0.3555872902534809, "grad_norm": 0.515625, "learning_rate": 0.0004952731375215554, "loss": 1.7699, "mean_token_accuracy": 0.5885967900355656, "step": 830 }, { "epoch": 0.35987147447340234, "grad_norm": 0.52734375, "learning_rate": 0.0004950393677012406, "loss": 1.7783, "mean_token_accuracy": 0.5840276171763737, "step": 840 }, { "epoch": 0.36415565869332384, "grad_norm": 0.53125, "learning_rate": 0.0004948000138090178, "loss": 1.758, "mean_token_accuracy": 0.5915057013432184, "step": 850 }, { "epoch": 0.3684398429132453, "grad_norm": 0.56640625, "learning_rate": 0.000494555081299396, "loss": 1.7502, "mean_token_accuracy": 0.5921384165684382, "step": 860 }, { "epoch": 0.3727240271331667, "grad_norm": 0.55859375, "learning_rate": 0.0004943045757540116, "loss": 1.7696, "mean_token_accuracy": 0.5876839280128479, "step": 870 }, { "epoch": 0.3770082113530882, "grad_norm": 0.4765625, "learning_rate": 0.0004940485028815028, "loss": 1.7923, "mean_token_accuracy": 0.5851048608620961, "step": 880 }, { "epoch": 0.38129239557300965, "grad_norm": 0.462890625, "learning_rate": 0.0004937868685173779, "loss": 1.7734, "mean_token_accuracy": 0.5879452233513196, "step": 890 }, { "epoch": 0.3855765797929311, "grad_norm": 0.51171875, "learning_rate": 0.0004935196786238832, "loss": 1.7807, "mean_token_accuracy": 0.5856667757034302, "step": 900 }, { "epoch": 0.3898607640128526, "grad_norm": 0.470703125, "learning_rate": 0.0004932469392898675, "loss": 1.7573, "mean_token_accuracy": 0.589621431628863, "step": 910 }, { "epoch": 0.394144948232774, "grad_norm": 0.48046875, "learning_rate": 0.0004929686567306424, "loss": 1.7764, "mean_token_accuracy": 0.5870031158129374, "step": 920 }, { "epoch": 0.39842913245269546, "grad_norm": 0.51171875, "learning_rate": 0.0004926848372878412, "loss": 1.7499, "mean_token_accuracy": 0.5911330193281173, "step": 930 }, { "epoch": 0.40271331667261695, "grad_norm": 0.51171875, "learning_rate": 0.0004923954874292743, "loss": 1.7605, "mean_token_accuracy": 0.5885690222183864, "step": 940 }, { "epoch": 0.4069975008925384, "grad_norm": 0.45703125, "learning_rate": 0.0004921006137487819, "loss": 1.7249, "mean_token_accuracy": 0.5983182614048322, "step": 950 }, { "epoch": 0.4112816851124598, "grad_norm": 0.5, "learning_rate": 0.0004918002229660836, "loss": 1.7411, "mean_token_accuracy": 0.5920816282431285, "step": 960 }, { "epoch": 0.41556586933238127, "grad_norm": 0.45703125, "learning_rate": 0.000491494321926625, "loss": 1.7022, "mean_token_accuracy": 0.6039901932080587, "step": 970 }, { "epoch": 0.41985005355230276, "grad_norm": 0.470703125, "learning_rate": 0.0004911829176014227, "loss": 1.7546, "mean_token_accuracy": 0.5963041971127192, "step": 980 }, { "epoch": 0.4241342377722242, "grad_norm": 0.484375, "learning_rate": 0.0004908660170869041, "loss": 1.7394, "mean_token_accuracy": 0.5946206981937091, "step": 990 }, { "epoch": 0.42841842199214564, "grad_norm": 0.48828125, "learning_rate": 0.0004905436276047468, "loss": 1.7065, "mean_token_accuracy": 0.5999080528815587, "step": 1000 }, { "epoch": 0.43270260621206713, "grad_norm": 0.45703125, "learning_rate": 0.0004902157565017131, "loss": 1.7317, "mean_token_accuracy": 0.5951716423034668, "step": 1010 }, { "epoch": 0.43698679043198857, "grad_norm": 0.455078125, "learning_rate": 0.0004898824112494834, "loss": 1.7353, "mean_token_accuracy": 0.5931165516376495, "step": 1020 }, { "epoch": 0.44127097465191, "grad_norm": 0.4296875, "learning_rate": 0.0004895435994444855, "loss": 1.7234, "mean_token_accuracy": 0.592176150282224, "step": 1030 }, { "epoch": 0.4455551588718315, "grad_norm": 0.478515625, "learning_rate": 0.0004891993288077216, "loss": 1.7373, "mean_token_accuracy": 0.5937283883492152, "step": 1040 }, { "epoch": 0.44983934309175294, "grad_norm": 0.48046875, "learning_rate": 0.0004888496071845921, "loss": 1.7339, "mean_token_accuracy": 0.5963114351034164, "step": 1050 }, { "epoch": 0.4541235273116744, "grad_norm": 0.515625, "learning_rate": 0.0004884944425447174, "loss": 1.7075, "mean_token_accuracy": 0.5985465884208679, "step": 1060 }, { "epoch": 0.4584077115315959, "grad_norm": 0.43359375, "learning_rate": 0.00048813384298175533, "loss": 1.7108, "mean_token_accuracy": 0.5949873934189479, "step": 1070 }, { "epoch": 0.4626918957515173, "grad_norm": 0.4296875, "learning_rate": 0.000487767816713218, "loss": 1.7319, "mean_token_accuracy": 0.5978627324104309, "step": 1080 }, { "epoch": 0.46697607997143875, "grad_norm": 0.515625, "learning_rate": 0.00048739637208028343, "loss": 1.7215, "mean_token_accuracy": 0.5949975033601125, "step": 1090 }, { "epoch": 0.47126026419136025, "grad_norm": 0.41796875, "learning_rate": 0.0004870195175476059, "loss": 1.6915, "mean_token_accuracy": 0.6006935626268387, "step": 1100 }, { "epoch": 0.4755444484112817, "grad_norm": 0.45703125, "learning_rate": 0.00048663726170312304, "loss": 1.7224, "mean_token_accuracy": 0.5965896293520927, "step": 1110 }, { "epoch": 0.4798286326312031, "grad_norm": 0.46484375, "learning_rate": 0.0004862496132578601, "loss": 1.7188, "mean_token_accuracy": 0.5954587419827779, "step": 1120 }, { "epoch": 0.4841128168511246, "grad_norm": 0.46875, "learning_rate": 0.0004858565810457315, "loss": 1.7029, "mean_token_accuracy": 0.5984790166219075, "step": 1130 }, { "epoch": 0.48839700107104606, "grad_norm": 0.46875, "learning_rate": 0.00048545817402333944, "loss": 1.6707, "mean_token_accuracy": 0.6039733906586965, "step": 1140 }, { "epoch": 0.4926811852909675, "grad_norm": 0.458984375, "learning_rate": 0.00048505440126976975, "loss": 1.7063, "mean_token_accuracy": 0.5990351974964142, "step": 1150 }, { "epoch": 0.496965369510889, "grad_norm": 0.443359375, "learning_rate": 0.000484645271986385, "loss": 1.7019, "mean_token_accuracy": 0.5967398678263028, "step": 1160 }, { "epoch": 0.5012495537308104, "grad_norm": 0.4453125, "learning_rate": 0.00048423079549661513, "loss": 1.7185, "mean_token_accuracy": 0.5945661654074986, "step": 1170 }, { "epoch": 0.5055337379507319, "grad_norm": 0.45703125, "learning_rate": 0.00048381098124574453, "loss": 1.6953, "mean_token_accuracy": 0.5991637865702312, "step": 1180 }, { "epoch": 0.5098179221706534, "grad_norm": 0.458984375, "learning_rate": 0.000483385838800697, "loss": 1.6724, "mean_token_accuracy": 0.6055593649546306, "step": 1190 }, { "epoch": 0.5141021063905747, "grad_norm": 0.453125, "learning_rate": 0.0004829553778498177, "loss": 1.69, "mean_token_accuracy": 0.6024374475081762, "step": 1200 }, { "epoch": 0.5183862906104962, "grad_norm": 0.419921875, "learning_rate": 0.0004825196082026525, "loss": 1.6896, "mean_token_accuracy": 0.6015072325865428, "step": 1210 }, { "epoch": 0.5226704748304177, "grad_norm": 0.470703125, "learning_rate": 0.00048207853978972425, "loss": 1.6911, "mean_token_accuracy": 0.5975394507249197, "step": 1220 }, { "epoch": 0.5269546590503391, "grad_norm": 0.470703125, "learning_rate": 0.00048163218266230657, "loss": 1.6506, "mean_token_accuracy": 0.6080829570690791, "step": 1230 }, { "epoch": 0.5312388432702606, "grad_norm": 0.482421875, "learning_rate": 0.00048118054699219486, "loss": 1.7084, "mean_token_accuracy": 0.5956538558006287, "step": 1240 }, { "epoch": 0.5355230274901821, "grad_norm": 0.4296875, "learning_rate": 0.00048072364307147434, "loss": 1.6769, "mean_token_accuracy": 0.6048657476902009, "step": 1250 }, { "epoch": 0.5398072117101035, "grad_norm": 0.412109375, "learning_rate": 0.00048026148131228544, "loss": 1.6391, "mean_token_accuracy": 0.6106082250674566, "step": 1260 }, { "epoch": 0.544091395930025, "grad_norm": 0.4140625, "learning_rate": 0.00047979407224658704, "loss": 1.652, "mean_token_accuracy": 0.609614963332812, "step": 1270 }, { "epoch": 0.5483755801499465, "grad_norm": 0.42578125, "learning_rate": 0.0004793214265259158, "loss": 1.6686, "mean_token_accuracy": 0.6056814392407736, "step": 1280 }, { "epoch": 0.5526597643698679, "grad_norm": 0.435546875, "learning_rate": 0.0004788435549211439, "loss": 1.7061, "mean_token_accuracy": 0.600483645995458, "step": 1290 }, { "epoch": 0.5569439485897894, "grad_norm": 0.4765625, "learning_rate": 0.00047836046832223336, "loss": 1.6521, "mean_token_accuracy": 0.607241137822469, "step": 1300 }, { "epoch": 0.5612281328097108, "grad_norm": 0.443359375, "learning_rate": 0.00047787217773798775, "loss": 1.6408, "mean_token_accuracy": 0.6109602769215902, "step": 1310 }, { "epoch": 0.5655123170296322, "grad_norm": 0.416015625, "learning_rate": 0.00047737869429580177, "loss": 1.6651, "mean_token_accuracy": 0.6036148915688196, "step": 1320 }, { "epoch": 0.5697965012495537, "grad_norm": 0.39453125, "learning_rate": 0.0004768800292414073, "loss": 1.6612, "mean_token_accuracy": 0.6085895299911499, "step": 1330 }, { "epoch": 0.5740806854694752, "grad_norm": 0.419921875, "learning_rate": 0.00047637619393861726, "loss": 1.6645, "mean_token_accuracy": 0.6032974988222122, "step": 1340 }, { "epoch": 0.5783648696893966, "grad_norm": 0.439453125, "learning_rate": 0.00047586719986906644, "loss": 1.6206, "mean_token_accuracy": 0.6132790346940359, "step": 1350 }, { "epoch": 0.5826490539093181, "grad_norm": 0.423828125, "learning_rate": 0.00047535305863195023, "loss": 1.6523, "mean_token_accuracy": 0.6070671379566193, "step": 1360 }, { "epoch": 0.5869332381292396, "grad_norm": 0.404296875, "learning_rate": 0.00047483378194376004, "loss": 1.6394, "mean_token_accuracy": 0.608396037419637, "step": 1370 }, { "epoch": 0.591217422349161, "grad_norm": 0.412109375, "learning_rate": 0.00047430938163801623, "loss": 1.6279, "mean_token_accuracy": 0.6083019236723582, "step": 1380 }, { "epoch": 0.5955016065690825, "grad_norm": 0.404296875, "learning_rate": 0.00047377986966499867, "loss": 1.6447, "mean_token_accuracy": 0.6057626972595851, "step": 1390 }, { "epoch": 0.599785790789004, "grad_norm": 0.41015625, "learning_rate": 0.00047324525809147437, "loss": 1.6669, "mean_token_accuracy": 0.6031167497237523, "step": 1400 }, { "epoch": 0.6040699750089253, "grad_norm": 0.3984375, "learning_rate": 0.0004727055591004221, "loss": 1.6684, "mean_token_accuracy": 0.6013229255874951, "step": 1410 }, { "epoch": 0.6083541592288468, "grad_norm": 0.453125, "learning_rate": 0.00047216078499075556, "loss": 1.6047, "mean_token_accuracy": 0.613938628633817, "step": 1420 }, { "epoch": 0.6126383434487683, "grad_norm": 0.392578125, "learning_rate": 0.0004716109481770422, "loss": 1.646, "mean_token_accuracy": 0.6082295358181, "step": 1430 }, { "epoch": 0.6169225276686897, "grad_norm": 0.380859375, "learning_rate": 0.000471056061189221, "loss": 1.5866, "mean_token_accuracy": 0.6149321556091308, "step": 1440 }, { "epoch": 0.6212067118886112, "grad_norm": 0.392578125, "learning_rate": 0.0004704961366723165, "loss": 1.6708, "mean_token_accuracy": 0.607517758011818, "step": 1450 }, { "epoch": 0.6254908961085327, "grad_norm": 0.40234375, "learning_rate": 0.000469931187386151, "loss": 1.6741, "mean_token_accuracy": 0.607824856042862, "step": 1460 }, { "epoch": 0.6297750803284541, "grad_norm": 0.4375, "learning_rate": 0.0004693612262050535, "loss": 1.6384, "mean_token_accuracy": 0.6065444548924764, "step": 1470 }, { "epoch": 0.6340592645483756, "grad_norm": 0.3984375, "learning_rate": 0.0004687862661175664, "loss": 1.6302, "mean_token_accuracy": 0.611606694261233, "step": 1480 }, { "epoch": 0.6383434487682971, "grad_norm": 0.404296875, "learning_rate": 0.0004682063202261495, "loss": 1.608, "mean_token_accuracy": 0.617574452360471, "step": 1490 }, { "epoch": 0.6426276329882185, "grad_norm": 0.392578125, "learning_rate": 0.0004676214017468815, "loss": 1.6193, "mean_token_accuracy": 0.6125218907992045, "step": 1500 }, { "epoch": 0.64691181720814, "grad_norm": 0.388671875, "learning_rate": 0.00046703152400915873, "loss": 1.6094, "mean_token_accuracy": 0.6132543057203292, "step": 1510 }, { "epoch": 0.6511960014280614, "grad_norm": 0.431640625, "learning_rate": 0.0004664367004553914, "loss": 1.644, "mean_token_accuracy": 0.6055496126413346, "step": 1520 }, { "epoch": 0.6554801856479828, "grad_norm": 0.4296875, "learning_rate": 0.0004658369446406974, "loss": 1.6027, "mean_token_accuracy": 0.6137914508581161, "step": 1530 }, { "epoch": 0.6597643698679043, "grad_norm": 0.373046875, "learning_rate": 0.000465232270232593, "loss": 1.5611, "mean_token_accuracy": 0.6206978172063827, "step": 1540 }, { "epoch": 0.6640485540878258, "grad_norm": 0.404296875, "learning_rate": 0.0004646226910106821, "loss": 1.5668, "mean_token_accuracy": 0.6236391812562943, "step": 1550 }, { "epoch": 0.6683327383077472, "grad_norm": 0.423828125, "learning_rate": 0.0004640082208663415, "loss": 1.6394, "mean_token_accuracy": 0.6060982346534729, "step": 1560 }, { "epoch": 0.6726169225276687, "grad_norm": 0.3828125, "learning_rate": 0.0004633888738024048, "loss": 1.5921, "mean_token_accuracy": 0.6156363248825073, "step": 1570 }, { "epoch": 0.6769011067475902, "grad_norm": 0.400390625, "learning_rate": 0.00046276466393284295, "loss": 1.6083, "mean_token_accuracy": 0.6118316878875096, "step": 1580 }, { "epoch": 0.6811852909675116, "grad_norm": 0.392578125, "learning_rate": 0.00046213560548244296, "loss": 1.5921, "mean_token_accuracy": 0.6157896757125855, "step": 1590 }, { "epoch": 0.6854694751874331, "grad_norm": 0.416015625, "learning_rate": 0.0004615017127864834, "loss": 1.612, "mean_token_accuracy": 0.6124424457550048, "step": 1600 }, { "epoch": 0.6897536594073546, "grad_norm": 0.3828125, "learning_rate": 0.00046086300029040805, "loss": 1.5575, "mean_token_accuracy": 0.620477185646693, "step": 1610 }, { "epoch": 0.6940378436272759, "grad_norm": 0.392578125, "learning_rate": 0.0004602194825494965, "loss": 1.5634, "mean_token_accuracy": 0.623723766207695, "step": 1620 }, { "epoch": 0.6983220278471974, "grad_norm": 0.419921875, "learning_rate": 0.00045957117422853257, "loss": 1.5947, "mean_token_accuracy": 0.6120685537656149, "step": 1630 }, { "epoch": 0.7026062120671189, "grad_norm": 0.365234375, "learning_rate": 0.0004589180901014699, "loss": 1.5554, "mean_token_accuracy": 0.6264029294252396, "step": 1640 }, { "epoch": 0.7068903962870403, "grad_norm": 0.37890625, "learning_rate": 0.0004582602450510955, "loss": 1.6142, "mean_token_accuracy": 0.6149922668933868, "step": 1650 }, { "epoch": 0.7111745805069618, "grad_norm": 0.375, "learning_rate": 0.00045759765406869077, "loss": 1.5881, "mean_token_accuracy": 0.6184295862913132, "step": 1660 }, { "epoch": 0.7154587647268833, "grad_norm": 0.423828125, "learning_rate": 0.00045693033225368917, "loss": 1.5637, "mean_token_accuracy": 0.618203051884969, "step": 1670 }, { "epoch": 0.7197429489468047, "grad_norm": 0.40234375, "learning_rate": 0.0004562582948133331, "loss": 1.5764, "mean_token_accuracy": 0.6192536850770315, "step": 1680 }, { "epoch": 0.7240271331667262, "grad_norm": 0.376953125, "learning_rate": 0.0004555815570623264, "loss": 1.6028, "mean_token_accuracy": 0.61400941212972, "step": 1690 }, { "epoch": 0.7283113173866477, "grad_norm": 0.361328125, "learning_rate": 0.000454900134422486, "loss": 1.5918, "mean_token_accuracy": 0.6173286845286687, "step": 1700 }, { "epoch": 0.732595501606569, "grad_norm": 0.380859375, "learning_rate": 0.0004542140424223904, "loss": 1.5623, "mean_token_accuracy": 0.6222762515147527, "step": 1710 }, { "epoch": 0.7368796858264905, "grad_norm": 0.40234375, "learning_rate": 0.0004535232966970253, "loss": 1.5609, "mean_token_accuracy": 0.6209416131178538, "step": 1720 }, { "epoch": 0.741163870046412, "grad_norm": 0.45703125, "learning_rate": 0.0004528279129874281, "loss": 1.6066, "mean_token_accuracy": 0.6145075420538585, "step": 1730 }, { "epoch": 0.7454480542663334, "grad_norm": 0.376953125, "learning_rate": 0.00045212790714032843, "loss": 1.5767, "mean_token_accuracy": 0.6191084752480189, "step": 1740 }, { "epoch": 0.7497322384862549, "grad_norm": 0.40234375, "learning_rate": 0.0004514232951077875, "loss": 1.5814, "mean_token_accuracy": 0.612170214454333, "step": 1750 }, { "epoch": 0.7540164227061764, "grad_norm": 0.37109375, "learning_rate": 0.00045071409294683443, "loss": 1.5696, "mean_token_accuracy": 0.6165617436170578, "step": 1760 }, { "epoch": 0.7583006069260978, "grad_norm": 0.408203125, "learning_rate": 0.00045000031681910024, "loss": 1.5513, "mean_token_accuracy": 0.6239033748706182, "step": 1770 }, { "epoch": 0.7625847911460193, "grad_norm": 0.392578125, "learning_rate": 0.0004492819829904498, "loss": 1.6032, "mean_token_accuracy": 0.614315361281236, "step": 1780 }, { "epoch": 0.7668689753659408, "grad_norm": 0.3671875, "learning_rate": 0.0004485591078306109, "loss": 1.5622, "mean_token_accuracy": 0.6223886062701544, "step": 1790 }, { "epoch": 0.7711531595858622, "grad_norm": 0.392578125, "learning_rate": 0.0004478317078128013, "loss": 1.5891, "mean_token_accuracy": 0.6170624762773513, "step": 1800 }, { "epoch": 0.7754373438057837, "grad_norm": 0.359375, "learning_rate": 0.0004470997995133534, "loss": 1.5589, "mean_token_accuracy": 0.6224645217259724, "step": 1810 }, { "epoch": 0.7797215280257052, "grad_norm": 0.392578125, "learning_rate": 0.0004463633996113365, "loss": 1.5597, "mean_token_accuracy": 0.6208334664503733, "step": 1820 }, { "epoch": 0.7840057122456265, "grad_norm": 0.41015625, "learning_rate": 0.00044562252488817644, "loss": 1.5748, "mean_token_accuracy": 0.6175821512937546, "step": 1830 }, { "epoch": 0.788289896465548, "grad_norm": 0.400390625, "learning_rate": 0.00044487719222727353, "loss": 1.5298, "mean_token_accuracy": 0.6264814734458923, "step": 1840 }, { "epoch": 0.7925740806854695, "grad_norm": 0.396484375, "learning_rate": 0.0004441274186136176, "loss": 1.5153, "mean_token_accuracy": 0.6300516416629155, "step": 1850 }, { "epoch": 0.7968582649053909, "grad_norm": 0.40234375, "learning_rate": 0.0004433732211334011, "loss": 1.556, "mean_token_accuracy": 0.6228625237941742, "step": 1860 }, { "epoch": 0.8011424491253124, "grad_norm": 0.375, "learning_rate": 0.0004426146169736295, "loss": 1.5473, "mean_token_accuracy": 0.6202453782161077, "step": 1870 }, { "epoch": 0.8054266333452339, "grad_norm": 0.365234375, "learning_rate": 0.0004418516234217297, "loss": 1.5485, "mean_token_accuracy": 0.6223239193360011, "step": 1880 }, { "epoch": 0.8097108175651553, "grad_norm": 0.37890625, "learning_rate": 0.00044108425786515626, "loss": 1.5764, "mean_token_accuracy": 0.6173768778642018, "step": 1890 }, { "epoch": 0.8139950017850768, "grad_norm": 0.373046875, "learning_rate": 0.00044031253779099505, "loss": 1.5518, "mean_token_accuracy": 0.6269906918207805, "step": 1900 }, { "epoch": 0.8182791860049982, "grad_norm": 0.3359375, "learning_rate": 0.00043953648078556465, "loss": 1.53, "mean_token_accuracy": 0.6300149967273077, "step": 1910 }, { "epoch": 0.8225633702249197, "grad_norm": 0.361328125, "learning_rate": 0.0004387561045340155, "loss": 1.5431, "mean_token_accuracy": 0.6231356263160706, "step": 1920 }, { "epoch": 0.8268475544448411, "grad_norm": 0.337890625, "learning_rate": 0.00043797142681992744, "loss": 1.5386, "mean_token_accuracy": 0.626959690451622, "step": 1930 }, { "epoch": 0.8311317386647625, "grad_norm": 0.36328125, "learning_rate": 0.0004371824655249037, "loss": 1.5707, "mean_token_accuracy": 0.6185725013415019, "step": 1940 }, { "epoch": 0.835415922884684, "grad_norm": 0.404296875, "learning_rate": 0.0004363892386281639, "loss": 1.5308, "mean_token_accuracy": 0.6247758358716965, "step": 1950 }, { "epoch": 0.8397001071046055, "grad_norm": 0.34765625, "learning_rate": 0.0004355917642061342, "loss": 1.5746, "mean_token_accuracy": 0.6213243504365286, "step": 1960 }, { "epoch": 0.8439842913245269, "grad_norm": 0.345703125, "learning_rate": 0.0004347900604320353, "loss": 1.5369, "mean_token_accuracy": 0.6267731686433157, "step": 1970 }, { "epoch": 0.8482684755444484, "grad_norm": 0.365234375, "learning_rate": 0.0004339841455754684, "loss": 1.5302, "mean_token_accuracy": 0.6294100970029831, "step": 1980 }, { "epoch": 0.8525526597643699, "grad_norm": 0.36328125, "learning_rate": 0.0004331740380019988, "loss": 1.5668, "mean_token_accuracy": 0.6167016873757044, "step": 1990 }, { "epoch": 0.8568368439842913, "grad_norm": 0.36328125, "learning_rate": 0.0004323597561727374, "loss": 1.5192, "mean_token_accuracy": 0.6278371940056483, "step": 2000 }, { "epoch": 0.8611210282042128, "grad_norm": 0.361328125, "learning_rate": 0.0004315413186439201, "loss": 1.5214, "mean_token_accuracy": 0.6299047251542409, "step": 2010 }, { "epoch": 0.8654052124241343, "grad_norm": 0.3671875, "learning_rate": 0.0004307187440664846, "loss": 1.5366, "mean_token_accuracy": 0.6220711261034012, "step": 2020 }, { "epoch": 0.8696893966440556, "grad_norm": 0.373046875, "learning_rate": 0.00042989205118564575, "loss": 1.5766, "mean_token_accuracy": 0.6220267007748286, "step": 2030 }, { "epoch": 0.8739735808639771, "grad_norm": 0.333984375, "learning_rate": 0.00042906125884046827, "loss": 1.5452, "mean_token_accuracy": 0.628257621328036, "step": 2040 }, { "epoch": 0.8782577650838986, "grad_norm": 0.3984375, "learning_rate": 0.00042822638596343735, "loss": 1.5187, "mean_token_accuracy": 0.6320155799388886, "step": 2050 }, { "epoch": 0.88254194930382, "grad_norm": 0.373046875, "learning_rate": 0.0004273874515800271, "loss": 1.4877, "mean_token_accuracy": 0.6335129290819168, "step": 2060 }, { "epoch": 0.8868261335237415, "grad_norm": 0.37109375, "learning_rate": 0.0004265444748082674, "loss": 1.5414, "mean_token_accuracy": 0.6232473621765773, "step": 2070 }, { "epoch": 0.891110317743663, "grad_norm": 0.3671875, "learning_rate": 0.00042569747485830784, "loss": 1.5354, "mean_token_accuracy": 0.6265609403451283, "step": 2080 }, { "epoch": 0.8953945019635844, "grad_norm": 0.36328125, "learning_rate": 0.00042484647103198007, "loss": 1.5011, "mean_token_accuracy": 0.632336409886678, "step": 2090 }, { "epoch": 0.8996786861835059, "grad_norm": 0.3515625, "learning_rate": 0.0004239914827223579, "loss": 1.5261, "mean_token_accuracy": 0.6247933119535446, "step": 2100 }, { "epoch": 0.9039628704034274, "grad_norm": 0.333984375, "learning_rate": 0.0004231325294133155, "loss": 1.5346, "mean_token_accuracy": 0.6297541747490565, "step": 2110 }, { "epoch": 0.9082470546233488, "grad_norm": 0.34375, "learning_rate": 0.0004222696306790833, "loss": 1.5457, "mean_token_accuracy": 0.6180067032575607, "step": 2120 }, { "epoch": 0.9125312388432703, "grad_norm": 0.384765625, "learning_rate": 0.000421402806183802, "loss": 1.4974, "mean_token_accuracy": 0.6331047038237254, "step": 2130 }, { "epoch": 0.9168154230631917, "grad_norm": 0.35546875, "learning_rate": 0.00042053207568107414, "loss": 1.5353, "mean_token_accuracy": 0.6293119370937348, "step": 2140 }, { "epoch": 0.9210996072831131, "grad_norm": 0.3359375, "learning_rate": 0.0004196574590135144, "loss": 1.5305, "mean_token_accuracy": 0.6253014832735062, "step": 2150 }, { "epoch": 0.9253837915030346, "grad_norm": 0.416015625, "learning_rate": 0.0004187789761122972, "loss": 1.5138, "mean_token_accuracy": 0.6260040392478307, "step": 2160 }, { "epoch": 0.9296679757229561, "grad_norm": 0.396484375, "learning_rate": 0.0004178966469967024, "loss": 1.5205, "mean_token_accuracy": 0.6270245303710301, "step": 2170 }, { "epoch": 0.9339521599428775, "grad_norm": 0.353515625, "learning_rate": 0.0004170104917736591, "loss": 1.5428, "mean_token_accuracy": 0.6261491070191065, "step": 2180 }, { "epoch": 0.938236344162799, "grad_norm": 0.35546875, "learning_rate": 0.00041612053063728793, "loss": 1.4962, "mean_token_accuracy": 0.6338411172231039, "step": 2190 }, { "epoch": 0.9425205283827205, "grad_norm": 0.3671875, "learning_rate": 0.00041522678386844003, "loss": 1.5215, "mean_token_accuracy": 0.6297219822804133, "step": 2200 }, { "epoch": 0.9468047126026419, "grad_norm": 0.341796875, "learning_rate": 0.0004143292718342355, "loss": 1.5052, "mean_token_accuracy": 0.6313861280679702, "step": 2210 }, { "epoch": 0.9510888968225634, "grad_norm": 0.373046875, "learning_rate": 0.0004134280149875991, "loss": 1.5522, "mean_token_accuracy": 0.6244584927956264, "step": 2220 }, { "epoch": 0.9553730810424849, "grad_norm": 0.34375, "learning_rate": 0.000412523033866794, "loss": 1.5116, "mean_token_accuracy": 0.6311194211244583, "step": 2230 }, { "epoch": 0.9596572652624062, "grad_norm": 0.349609375, "learning_rate": 0.000411614349094954, "loss": 1.5068, "mean_token_accuracy": 0.6359375943740209, "step": 2240 }, { "epoch": 0.9639414494823277, "grad_norm": 0.34375, "learning_rate": 0.00041070198137961334, "loss": 1.5228, "mean_token_accuracy": 0.630420845746994, "step": 2250 }, { "epoch": 0.9682256337022492, "grad_norm": 0.3515625, "learning_rate": 0.00040978595151223496, "loss": 1.5304, "mean_token_accuracy": 0.6266944895188014, "step": 2260 }, { "epoch": 0.9725098179221706, "grad_norm": 0.34375, "learning_rate": 0.00040886628036773665, "loss": 1.4539, "mean_token_accuracy": 0.6405547618865967, "step": 2270 }, { "epoch": 0.9767940021420921, "grad_norm": 0.38671875, "learning_rate": 0.0004079429889040153, "loss": 1.4749, "mean_token_accuracy": 0.6339232285817464, "step": 2280 }, { "epoch": 0.9810781863620136, "grad_norm": 0.36328125, "learning_rate": 0.0004070160981614693, "loss": 1.4821, "mean_token_accuracy": 0.6370660841464997, "step": 2290 }, { "epoch": 0.985362370581935, "grad_norm": 0.396484375, "learning_rate": 0.00040608562926251914, "loss": 1.5096, "mean_token_accuracy": 0.6285079121589661, "step": 2300 }, { "epoch": 0.9896465548018565, "grad_norm": 0.33984375, "learning_rate": 0.000405151603411126, "loss": 1.5167, "mean_token_accuracy": 0.6253816932439804, "step": 2310 }, { "epoch": 0.993930739021778, "grad_norm": 0.33984375, "learning_rate": 0.0004042140418923085, "loss": 1.487, "mean_token_accuracy": 0.6365177949269613, "step": 2320 }, { "epoch": 0.9982149232416994, "grad_norm": 0.373046875, "learning_rate": 0.0004032729660716579, "loss": 1.4693, "mean_token_accuracy": 0.6372685492038727, "step": 2330 }, { "epoch": 1.0021420921099606, "grad_norm": 0.35546875, "learning_rate": 0.00040232839739485067, "loss": 1.4216, "mean_token_accuracy": 0.6438040440732783, "step": 2340 }, { "epoch": 1.0064262763298821, "grad_norm": 0.369140625, "learning_rate": 0.0004013803573871605, "loss": 1.3392, "mean_token_accuracy": 0.6590171555678049, "step": 2350 }, { "epoch": 1.0107104605498036, "grad_norm": 0.357421875, "learning_rate": 0.00040042886765296714, "loss": 1.3092, "mean_token_accuracy": 0.6659611910581589, "step": 2360 }, { "epoch": 1.0149946447697251, "grad_norm": 0.349609375, "learning_rate": 0.0003994739498752645, "loss": 1.338, "mean_token_accuracy": 0.6604650288820266, "step": 2370 }, { "epoch": 1.0192788289896466, "grad_norm": 0.34765625, "learning_rate": 0.0003985156258151662, "loss": 1.2883, "mean_token_accuracy": 0.6699223518371582, "step": 2380 }, { "epoch": 1.023563013209568, "grad_norm": 0.3359375, "learning_rate": 0.00039755391731140986, "loss": 1.3335, "mean_token_accuracy": 0.6621675411860148, "step": 2390 }, { "epoch": 1.0278471974294894, "grad_norm": 0.37890625, "learning_rate": 0.00039658884627985947, "loss": 1.3185, "mean_token_accuracy": 0.6633899579445521, "step": 2400 }, { "epoch": 1.0321313816494109, "grad_norm": 0.330078125, "learning_rate": 0.00039562043471300573, "loss": 1.3408, "mean_token_accuracy": 0.6567703276872635, "step": 2410 }, { "epoch": 1.0364155658693324, "grad_norm": 0.333984375, "learning_rate": 0.00039464870467946516, "loss": 1.3053, "mean_token_accuracy": 0.6645698845386505, "step": 2420 }, { "epoch": 1.0406997500892539, "grad_norm": 0.3359375, "learning_rate": 0.00039367367832347707, "loss": 1.3634, "mean_token_accuracy": 0.6515278299649556, "step": 2430 }, { "epoch": 1.0449839343091754, "grad_norm": 0.341796875, "learning_rate": 0.00039269537786439866, "loss": 1.3179, "mean_token_accuracy": 0.6596842000881831, "step": 2440 }, { "epoch": 1.0492681185290968, "grad_norm": 0.341796875, "learning_rate": 0.0003917138255961993, "loss": 1.3042, "mean_token_accuracy": 0.6642681717872619, "step": 2450 }, { "epoch": 1.0535523027490181, "grad_norm": 0.373046875, "learning_rate": 0.0003907290438869517, "loss": 1.328, "mean_token_accuracy": 0.6599580854177475, "step": 2460 }, { "epoch": 1.0578364869689396, "grad_norm": 0.32421875, "learning_rate": 0.00038974105517832315, "loss": 1.304, "mean_token_accuracy": 0.6646572331587474, "step": 2470 }, { "epoch": 1.062120671188861, "grad_norm": 0.326171875, "learning_rate": 0.00038874988198506287, "loss": 1.3355, "mean_token_accuracy": 0.6583087474107743, "step": 2480 }, { "epoch": 1.0664048554087826, "grad_norm": 0.337890625, "learning_rate": 0.00038775554689449013, "loss": 1.3209, "mean_token_accuracy": 0.6637792021036149, "step": 2490 }, { "epoch": 1.070689039628704, "grad_norm": 0.390625, "learning_rate": 0.00038675807256597863, "loss": 1.3437, "mean_token_accuracy": 0.6541990071535111, "step": 2500 }, { "epoch": 1.0749732238486256, "grad_norm": 0.333984375, "learning_rate": 0.0003857574817304407, "loss": 1.3368, "mean_token_accuracy": 0.6589954853057861, "step": 2510 }, { "epoch": 1.0792574080685469, "grad_norm": 0.31640625, "learning_rate": 0.0003847537971898093, "loss": 1.3431, "mean_token_accuracy": 0.6575679163138072, "step": 2520 }, { "epoch": 1.0835415922884684, "grad_norm": 0.333984375, "learning_rate": 0.0003837470418165176, "loss": 1.3559, "mean_token_accuracy": 0.6553231467803319, "step": 2530 }, { "epoch": 1.0878257765083899, "grad_norm": 0.34765625, "learning_rate": 0.000382737238552979, "loss": 1.3401, "mean_token_accuracy": 0.6568175663550695, "step": 2540 }, { "epoch": 1.0921099607283113, "grad_norm": 0.3203125, "learning_rate": 0.00038172441041106316, "loss": 1.3171, "mean_token_accuracy": 0.6621425936619441, "step": 2550 }, { "epoch": 1.0963941449482328, "grad_norm": 0.349609375, "learning_rate": 0.0003807085804715723, "loss": 1.3206, "mean_token_accuracy": 0.6625715295473734, "step": 2560 }, { "epoch": 1.1006783291681543, "grad_norm": 0.31640625, "learning_rate": 0.000379689771883715, "loss": 1.3234, "mean_token_accuracy": 0.660184133052826, "step": 2570 }, { "epoch": 1.1049625133880756, "grad_norm": 0.341796875, "learning_rate": 0.00037866800786457864, "loss": 1.3085, "mean_token_accuracy": 0.6675328105688095, "step": 2580 }, { "epoch": 1.109246697607997, "grad_norm": 0.34375, "learning_rate": 0.00037764331169860046, "loss": 1.3138, "mean_token_accuracy": 0.6605868438879648, "step": 2590 }, { "epoch": 1.1135308818279186, "grad_norm": 0.34765625, "learning_rate": 0.0003766157067370366, "loss": 1.3211, "mean_token_accuracy": 0.6628186653057734, "step": 2600 }, { "epoch": 1.11781506604784, "grad_norm": 0.322265625, "learning_rate": 0.00037558521639743036, "loss": 1.3113, "mean_token_accuracy": 0.6645319203535716, "step": 2610 }, { "epoch": 1.1220992502677616, "grad_norm": 0.326171875, "learning_rate": 0.0003745518641630785, "loss": 1.3303, "mean_token_accuracy": 0.659632471203804, "step": 2620 }, { "epoch": 1.1263834344876829, "grad_norm": 0.365234375, "learning_rate": 0.0003735156735824957, "loss": 1.3333, "mean_token_accuracy": 0.6590901345014573, "step": 2630 }, { "epoch": 1.1306676187076043, "grad_norm": 0.35546875, "learning_rate": 0.0003724766682688784, "loss": 1.3414, "mean_token_accuracy": 0.6546030879020691, "step": 2640 }, { "epoch": 1.1349518029275258, "grad_norm": 0.37109375, "learning_rate": 0.00037143487189956635, "loss": 1.3257, "mean_token_accuracy": 0.6572333713372548, "step": 2650 }, { "epoch": 1.1392359871474473, "grad_norm": 0.37109375, "learning_rate": 0.0003703903082155035, "loss": 1.3372, "mean_token_accuracy": 0.6556382944186528, "step": 2660 }, { "epoch": 1.1435201713673688, "grad_norm": 0.318359375, "learning_rate": 0.0003693430010206962, "loss": 1.321, "mean_token_accuracy": 0.6620635330677033, "step": 2670 }, { "epoch": 1.1478043555872903, "grad_norm": 0.318359375, "learning_rate": 0.0003682929741816717, "loss": 1.3186, "mean_token_accuracy": 0.6653014043966929, "step": 2680 }, { "epoch": 1.1520885398072118, "grad_norm": 0.404296875, "learning_rate": 0.00036724025162693317, "loss": 1.3106, "mean_token_accuracy": 0.666445321838061, "step": 2690 }, { "epoch": 1.156372724027133, "grad_norm": 0.345703125, "learning_rate": 0.00036618485734641584, "loss": 1.3426, "mean_token_accuracy": 0.6579491098721822, "step": 2700 }, { "epoch": 1.1606569082470546, "grad_norm": 0.36328125, "learning_rate": 0.0003651268153909386, "loss": 1.3593, "mean_token_accuracy": 0.6558250943819682, "step": 2710 }, { "epoch": 1.164941092466976, "grad_norm": 0.380859375, "learning_rate": 0.00036406614987165737, "loss": 1.3385, "mean_token_accuracy": 0.6573931157588959, "step": 2720 }, { "epoch": 1.1692252766868976, "grad_norm": 0.35546875, "learning_rate": 0.00036300288495951487, "loss": 1.303, "mean_token_accuracy": 0.6656403839588165, "step": 2730 }, { "epoch": 1.173509460906819, "grad_norm": 0.306640625, "learning_rate": 0.0003619370448846901, "loss": 1.321, "mean_token_accuracy": 0.6617445478836695, "step": 2740 }, { "epoch": 1.1777936451267403, "grad_norm": 0.380859375, "learning_rate": 0.000360868653936046, "loss": 1.3403, "mean_token_accuracy": 0.6576227575540543, "step": 2750 }, { "epoch": 1.1820778293466618, "grad_norm": 0.34765625, "learning_rate": 0.00035979773646057603, "loss": 1.3161, "mean_token_accuracy": 0.6614106744527817, "step": 2760 }, { "epoch": 1.1863620135665833, "grad_norm": 0.318359375, "learning_rate": 0.0003587243168628491, "loss": 1.3549, "mean_token_accuracy": 0.6531448741753896, "step": 2770 }, { "epoch": 1.1906461977865048, "grad_norm": 0.341796875, "learning_rate": 0.00035764841960445433, "loss": 1.3016, "mean_token_accuracy": 0.6694233824809392, "step": 2780 }, { "epoch": 1.1949303820064263, "grad_norm": 0.322265625, "learning_rate": 0.0003565700692034421, "loss": 1.3076, "mean_token_accuracy": 0.6626476556062698, "step": 2790 }, { "epoch": 1.1992145662263478, "grad_norm": 0.322265625, "learning_rate": 0.00035548929023376677, "loss": 1.2832, "mean_token_accuracy": 0.6730219264825185, "step": 2800 }, { "epoch": 1.2034987504462693, "grad_norm": 0.365234375, "learning_rate": 0.00035440610732472564, "loss": 1.3105, "mean_token_accuracy": 0.6644942373037338, "step": 2810 }, { "epoch": 1.2077829346661906, "grad_norm": 0.365234375, "learning_rate": 0.00035332054516039834, "loss": 1.3084, "mean_token_accuracy": 0.6659467816352844, "step": 2820 }, { "epoch": 1.212067118886112, "grad_norm": 0.357421875, "learning_rate": 0.0003522326284790842, "loss": 1.3263, "mean_token_accuracy": 0.6606633335351944, "step": 2830 }, { "epoch": 1.2163513031060336, "grad_norm": 0.341796875, "learning_rate": 0.000351142382072738, "loss": 1.3269, "mean_token_accuracy": 0.6555161933104198, "step": 2840 }, { "epoch": 1.220635487325955, "grad_norm": 0.328125, "learning_rate": 0.0003500498307864057, "loss": 1.3094, "mean_token_accuracy": 0.6611082951227824, "step": 2850 }, { "epoch": 1.2249196715458766, "grad_norm": 0.337890625, "learning_rate": 0.00034895499951765805, "loss": 1.3298, "mean_token_accuracy": 0.6599132279555003, "step": 2860 }, { "epoch": 1.2292038557657978, "grad_norm": 0.357421875, "learning_rate": 0.0003478579132160226, "loss": 1.3194, "mean_token_accuracy": 0.6632148762543996, "step": 2870 }, { "epoch": 1.2334880399857193, "grad_norm": 0.34765625, "learning_rate": 0.00034675859688241607, "loss": 1.3216, "mean_token_accuracy": 0.6630672425031662, "step": 2880 }, { "epoch": 1.2377722242056408, "grad_norm": 0.33203125, "learning_rate": 0.00034565707556857405, "loss": 1.3175, "mean_token_accuracy": 0.6651228954394658, "step": 2890 }, { "epoch": 1.2420564084255623, "grad_norm": 0.357421875, "learning_rate": 0.0003445533743764804, "loss": 1.2983, "mean_token_accuracy": 0.6665589342514674, "step": 2900 }, { "epoch": 1.2463405926454838, "grad_norm": 0.408203125, "learning_rate": 0.00034344751845779485, "loss": 1.332, "mean_token_accuracy": 0.6578145096699397, "step": 2910 }, { "epoch": 1.250624776865405, "grad_norm": 0.341796875, "learning_rate": 0.00034233953301328026, "loss": 1.3221, "mean_token_accuracy": 0.6605254203081131, "step": 2920 }, { "epoch": 1.2549089610853268, "grad_norm": 0.31640625, "learning_rate": 0.0003412294432922278, "loss": 1.3093, "mean_token_accuracy": 0.665352334578832, "step": 2930 }, { "epoch": 1.259193145305248, "grad_norm": 0.349609375, "learning_rate": 0.00034011727459188224, "loss": 1.2938, "mean_token_accuracy": 0.6671251505613327, "step": 2940 }, { "epoch": 1.2634773295251696, "grad_norm": 0.33203125, "learning_rate": 0.000339003052256865, "loss": 1.283, "mean_token_accuracy": 0.666038570801417, "step": 2950 }, { "epoch": 1.267761513745091, "grad_norm": 0.3828125, "learning_rate": 0.0003378868016785966, "loss": 1.3174, "mean_token_accuracy": 0.6619399875402451, "step": 2960 }, { "epoch": 1.2720456979650125, "grad_norm": 0.3203125, "learning_rate": 0.000336768548294718, "loss": 1.3422, "mean_token_accuracy": 0.6565162986516953, "step": 2970 }, { "epoch": 1.276329882184934, "grad_norm": 0.3515625, "learning_rate": 0.00033564831758851145, "loss": 1.2966, "mean_token_accuracy": 0.6664889295895894, "step": 2980 }, { "epoch": 1.2806140664048553, "grad_norm": 0.32421875, "learning_rate": 0.0003345261350883189, "loss": 1.3002, "mean_token_accuracy": 0.6681903342405955, "step": 2990 }, { "epoch": 1.2848982506247768, "grad_norm": 0.318359375, "learning_rate": 0.00033340202636696103, "loss": 1.3375, "mean_token_accuracy": 0.6571354617675146, "step": 3000 }, { "epoch": 1.2891824348446983, "grad_norm": 0.322265625, "learning_rate": 0.0003322760170411539, "loss": 1.3163, "mean_token_accuracy": 0.6622751196225484, "step": 3010 }, { "epoch": 1.2934666190646198, "grad_norm": 0.337890625, "learning_rate": 0.00033114813277092557, "loss": 1.3095, "mean_token_accuracy": 0.6604131867488225, "step": 3020 }, { "epoch": 1.2977508032845413, "grad_norm": 0.337890625, "learning_rate": 0.00033001839925903123, "loss": 1.269, "mean_token_accuracy": 0.670791240533193, "step": 3030 }, { "epoch": 1.3020349875044626, "grad_norm": 0.31640625, "learning_rate": 0.00032888684225036735, "loss": 1.324, "mean_token_accuracy": 0.6640015542507172, "step": 3040 }, { "epoch": 1.3063191717243843, "grad_norm": 0.31640625, "learning_rate": 0.0003277534875313851, "loss": 1.2934, "mean_token_accuracy": 0.6645639598369598, "step": 3050 }, { "epoch": 1.3106033559443055, "grad_norm": 0.314453125, "learning_rate": 0.000326618360929503, "loss": 1.2911, "mean_token_accuracy": 0.662736972173055, "step": 3060 }, { "epoch": 1.314887540164227, "grad_norm": 0.314453125, "learning_rate": 0.0003254814883125176, "loss": 1.3102, "mean_token_accuracy": 0.6641101191441218, "step": 3070 }, { "epoch": 1.3191717243841485, "grad_norm": 0.34375, "learning_rate": 0.00032434289558801486, "loss": 1.2946, "mean_token_accuracy": 0.669797545671463, "step": 3080 }, { "epoch": 1.32345590860407, "grad_norm": 0.34765625, "learning_rate": 0.00032320260870277907, "loss": 1.3234, "mean_token_accuracy": 0.662629238764445, "step": 3090 }, { "epoch": 1.3277400928239915, "grad_norm": 0.341796875, "learning_rate": 0.00032206065364220204, "loss": 1.2875, "mean_token_accuracy": 0.6695479412873586, "step": 3100 }, { "epoch": 1.3320242770439128, "grad_norm": 0.314453125, "learning_rate": 0.0003209170564296907, "loss": 1.3185, "mean_token_accuracy": 0.6618772675593694, "step": 3110 }, { "epoch": 1.3363084612638343, "grad_norm": 0.337890625, "learning_rate": 0.00031977184312607406, "loss": 1.2956, "mean_token_accuracy": 0.6677195648352305, "step": 3120 }, { "epoch": 1.3405926454837558, "grad_norm": 0.34375, "learning_rate": 0.0003186250398290093, "loss": 1.31, "mean_token_accuracy": 0.662284501393636, "step": 3130 }, { "epoch": 1.3448768297036773, "grad_norm": 0.322265625, "learning_rate": 0.0003174766726723873, "loss": 1.3053, "mean_token_accuracy": 0.6651754269997279, "step": 3140 }, { "epoch": 1.3491610139235988, "grad_norm": 0.326171875, "learning_rate": 0.000316326767825737, "loss": 1.3318, "mean_token_accuracy": 0.658694452047348, "step": 3150 }, { "epoch": 1.35344519814352, "grad_norm": 0.3046875, "learning_rate": 0.0003151753514936285, "loss": 1.2949, "mean_token_accuracy": 0.6696987201770147, "step": 3160 }, { "epoch": 1.3577293823634418, "grad_norm": 0.3125, "learning_rate": 0.00031402244991507656, "loss": 1.3184, "mean_token_accuracy": 0.6619691550731659, "step": 3170 }, { "epoch": 1.362013566583363, "grad_norm": 0.328125, "learning_rate": 0.00031286808936294266, "loss": 1.2695, "mean_token_accuracy": 0.6740480273962021, "step": 3180 }, { "epoch": 1.3662977508032845, "grad_norm": 0.330078125, "learning_rate": 0.00031171229614333567, "loss": 1.2973, "mean_token_accuracy": 0.6682296107212703, "step": 3190 }, { "epoch": 1.370581935023206, "grad_norm": 0.330078125, "learning_rate": 0.0003105550965950132, "loss": 1.2879, "mean_token_accuracy": 0.6695340464512507, "step": 3200 }, { "epoch": 1.3748661192431275, "grad_norm": 0.33984375, "learning_rate": 0.0003093965170887804, "loss": 1.3382, "mean_token_accuracy": 0.6577111542224884, "step": 3210 }, { "epoch": 1.379150303463049, "grad_norm": 0.3359375, "learning_rate": 0.00030823658402689004, "loss": 1.3005, "mean_token_accuracy": 0.6613259871800741, "step": 3220 }, { "epoch": 1.3834344876829703, "grad_norm": 0.33203125, "learning_rate": 0.0003070753238424401, "loss": 1.2846, "mean_token_accuracy": 0.6712899088859559, "step": 3230 }, { "epoch": 1.3877186719028918, "grad_norm": 0.3125, "learning_rate": 0.0003059127629987715, "loss": 1.3022, "mean_token_accuracy": 0.6612801601489385, "step": 3240 }, { "epoch": 1.3920028561228133, "grad_norm": 0.328125, "learning_rate": 0.00030474892798886574, "loss": 1.2673, "mean_token_accuracy": 0.6732950339714686, "step": 3250 }, { "epoch": 1.3962870403427348, "grad_norm": 0.33984375, "learning_rate": 0.00030358384533473993, "loss": 1.2745, "mean_token_accuracy": 0.6716413120428721, "step": 3260 }, { "epoch": 1.4005712245626563, "grad_norm": 0.36328125, "learning_rate": 0.0003024175415868436, "loss": 1.2964, "mean_token_accuracy": 0.6657003333171209, "step": 3270 }, { "epoch": 1.4048554087825775, "grad_norm": 0.326171875, "learning_rate": 0.00030125004332345293, "loss": 1.3154, "mean_token_accuracy": 0.6634772340456645, "step": 3280 }, { "epoch": 1.409139593002499, "grad_norm": 0.306640625, "learning_rate": 0.0003000813771500652, "loss": 1.2857, "mean_token_accuracy": 0.6668003877003987, "step": 3290 }, { "epoch": 1.4134237772224205, "grad_norm": 0.33984375, "learning_rate": 0.00029891156969879276, "loss": 1.2846, "mean_token_accuracy": 0.6660936713218689, "step": 3300 }, { "epoch": 1.417707961442342, "grad_norm": 0.3515625, "learning_rate": 0.00029774064762775584, "loss": 1.3119, "mean_token_accuracy": 0.6623692701260249, "step": 3310 }, { "epoch": 1.4219921456622635, "grad_norm": 0.330078125, "learning_rate": 0.00029656863762047507, "loss": 1.2528, "mean_token_accuracy": 0.6746878981590271, "step": 3320 }, { "epoch": 1.426276329882185, "grad_norm": 0.326171875, "learning_rate": 0.0002953955663852637, "loss": 1.2755, "mean_token_accuracy": 0.6691719969113668, "step": 3330 }, { "epoch": 1.4305605141021065, "grad_norm": 0.291015625, "learning_rate": 0.00029422146065461846, "loss": 1.291, "mean_token_accuracy": 0.6642946968475978, "step": 3340 }, { "epoch": 1.4348446983220278, "grad_norm": 0.3203125, "learning_rate": 0.0002930463471846109, "loss": 1.2858, "mean_token_accuracy": 0.6680060108502706, "step": 3350 }, { "epoch": 1.4391288825419493, "grad_norm": 0.3515625, "learning_rate": 0.00029187025275427726, "loss": 1.3238, "mean_token_accuracy": 0.6617356767257054, "step": 3360 }, { "epoch": 1.4434130667618708, "grad_norm": 0.326171875, "learning_rate": 0.0002906932041650083, "loss": 1.2809, "mean_token_accuracy": 0.6699890126784642, "step": 3370 }, { "epoch": 1.4476972509817922, "grad_norm": 0.31640625, "learning_rate": 0.00028951522823993884, "loss": 1.2368, "mean_token_accuracy": 0.6795000006755193, "step": 3380 }, { "epoch": 1.4519814352017137, "grad_norm": 0.322265625, "learning_rate": 0.0002883363518233361, "loss": 1.2939, "mean_token_accuracy": 0.6654811183611552, "step": 3390 }, { "epoch": 1.456265619421635, "grad_norm": 0.318359375, "learning_rate": 0.0002871566017799881, "loss": 1.2918, "mean_token_accuracy": 0.6694387882947922, "step": 3400 }, { "epoch": 1.4605498036415565, "grad_norm": 0.322265625, "learning_rate": 0.0002859760049945915, "loss": 1.2996, "mean_token_accuracy": 0.6622726023197174, "step": 3410 }, { "epoch": 1.464833987861478, "grad_norm": 0.333984375, "learning_rate": 0.00028479458837113886, "loss": 1.2907, "mean_token_accuracy": 0.6693145235379537, "step": 3420 }, { "epoch": 1.4691181720813995, "grad_norm": 0.328125, "learning_rate": 0.00028361237883230595, "loss": 1.2589, "mean_token_accuracy": 0.6747862150271734, "step": 3430 }, { "epoch": 1.473402356301321, "grad_norm": 0.310546875, "learning_rate": 0.00028242940331883726, "loss": 1.301, "mean_token_accuracy": 0.665789802869161, "step": 3440 }, { "epoch": 1.4776865405212425, "grad_norm": 0.302734375, "learning_rate": 0.00028124568878893323, "loss": 1.2522, "mean_token_accuracy": 0.6749315430720647, "step": 3450 }, { "epoch": 1.481970724741164, "grad_norm": 0.330078125, "learning_rate": 0.000280061262217635, "loss": 1.2831, "mean_token_accuracy": 0.6701455026865005, "step": 3460 }, { "epoch": 1.4862549089610853, "grad_norm": 0.33203125, "learning_rate": 0.0002788761505962102, "loss": 1.2951, "mean_token_accuracy": 0.6682408134142558, "step": 3470 }, { "epoch": 1.4905390931810067, "grad_norm": 0.337890625, "learning_rate": 0.00027769038093153765, "loss": 1.2898, "mean_token_accuracy": 0.6709642231464386, "step": 3480 }, { "epoch": 1.4948232774009282, "grad_norm": 0.318359375, "learning_rate": 0.000276503980245492, "loss": 1.2709, "mean_token_accuracy": 0.6721013089021047, "step": 3490 }, { "epoch": 1.4991074616208497, "grad_norm": 0.326171875, "learning_rate": 0.0002753169755743277, "loss": 1.2677, "mean_token_accuracy": 0.6691457152366638, "step": 3500 }, { "epoch": 1.5033916458407712, "grad_norm": 0.30859375, "learning_rate": 0.0002741293939680637, "loss": 1.2841, "mean_token_accuracy": 0.6697168608506521, "step": 3510 }, { "epoch": 1.5076758300606925, "grad_norm": 0.30078125, "learning_rate": 0.00027294126248986563, "loss": 1.2824, "mean_token_accuracy": 0.6717678278684616, "step": 3520 }, { "epoch": 1.5119600142806142, "grad_norm": 0.33203125, "learning_rate": 0.0002717526082154304, "loss": 1.2587, "mean_token_accuracy": 0.6737531036138534, "step": 3530 }, { "epoch": 1.5162441985005355, "grad_norm": 0.30859375, "learning_rate": 0.00027056345823236837, "loss": 1.265, "mean_token_accuracy": 0.6732283522685368, "step": 3540 }, { "epoch": 1.520528382720457, "grad_norm": 0.318359375, "learning_rate": 0.0002693738396395866, "loss": 1.2674, "mean_token_accuracy": 0.671101305882136, "step": 3550 }, { "epoch": 1.5248125669403785, "grad_norm": 0.326171875, "learning_rate": 0.00026818377954667083, "loss": 1.294, "mean_token_accuracy": 0.6672694971164067, "step": 3560 }, { "epoch": 1.5290967511602997, "grad_norm": 0.3203125, "learning_rate": 0.0002669933050732679, "loss": 1.2372, "mean_token_accuracy": 0.6788262248039245, "step": 3570 }, { "epoch": 1.5333809353802215, "grad_norm": 0.3203125, "learning_rate": 0.000265802443348468, "loss": 1.2665, "mean_token_accuracy": 0.6717091699441274, "step": 3580 }, { "epoch": 1.5376651196001427, "grad_norm": 0.30859375, "learning_rate": 0.0002646112215101858, "loss": 1.2526, "mean_token_accuracy": 0.6772685488065083, "step": 3590 }, { "epoch": 1.5419493038200642, "grad_norm": 0.322265625, "learning_rate": 0.0002634196667045428, "loss": 1.2361, "mean_token_accuracy": 0.6766181766986847, "step": 3600 }, { "epoch": 1.5462334880399857, "grad_norm": 0.314453125, "learning_rate": 0.0002622278060852481, "loss": 1.2519, "mean_token_accuracy": 0.6780080666144689, "step": 3610 }, { "epoch": 1.5505176722599072, "grad_norm": 0.30078125, "learning_rate": 0.00026103566681297973, "loss": 1.2568, "mean_token_accuracy": 0.6755953232447306, "step": 3620 }, { "epoch": 1.5548018564798287, "grad_norm": 0.3125, "learning_rate": 0.00025984327605476607, "loss": 1.2607, "mean_token_accuracy": 0.6782088299592336, "step": 3630 }, { "epoch": 1.55908604069975, "grad_norm": 0.32421875, "learning_rate": 0.0002586506609833662, "loss": 1.2753, "mean_token_accuracy": 0.6692374924818675, "step": 3640 }, { "epoch": 1.5633702249196717, "grad_norm": 0.314453125, "learning_rate": 0.00025745784877665123, "loss": 1.2738, "mean_token_accuracy": 0.6710573007663091, "step": 3650 }, { "epoch": 1.567654409139593, "grad_norm": 0.29296875, "learning_rate": 0.00025626486661698447, "loss": 1.2333, "mean_token_accuracy": 0.6815130422512691, "step": 3660 }, { "epoch": 1.5719385933595145, "grad_norm": 0.306640625, "learning_rate": 0.0002550717416906022, "loss": 1.2424, "mean_token_accuracy": 0.6788204977909724, "step": 3670 }, { "epoch": 1.576222777579436, "grad_norm": 0.30859375, "learning_rate": 0.00025387850118699433, "loss": 1.2701, "mean_token_accuracy": 0.6694417973359426, "step": 3680 }, { "epoch": 1.5805069617993572, "grad_norm": 0.32421875, "learning_rate": 0.00025268517229828436, "loss": 1.2538, "mean_token_accuracy": 0.6743627349535625, "step": 3690 }, { "epoch": 1.584791146019279, "grad_norm": 0.328125, "learning_rate": 0.00025149178221861015, "loss": 1.2538, "mean_token_accuracy": 0.6739106665054957, "step": 3700 }, { "epoch": 1.5890753302392002, "grad_norm": 0.3046875, "learning_rate": 0.0002502983581435038, "loss": 1.2443, "mean_token_accuracy": 0.6750338355700175, "step": 3710 }, { "epoch": 1.5933595144591217, "grad_norm": 0.306640625, "learning_rate": 0.00024910492726927237, "loss": 1.2376, "mean_token_accuracy": 0.6762576440970103, "step": 3720 }, { "epoch": 1.5976436986790432, "grad_norm": 0.31640625, "learning_rate": 0.0002479115167923776, "loss": 1.2506, "mean_token_accuracy": 0.6760666708151499, "step": 3730 }, { "epoch": 1.6019278828989647, "grad_norm": 0.33203125, "learning_rate": 0.0002467181539088166, "loss": 1.2515, "mean_token_accuracy": 0.6759268889824549, "step": 3740 }, { "epoch": 1.6062120671188862, "grad_norm": 0.318359375, "learning_rate": 0.0002455248658135018, "loss": 1.2563, "mean_token_accuracy": 0.6760497013727824, "step": 3750 }, { "epoch": 1.6104962513388075, "grad_norm": 0.330078125, "learning_rate": 0.0002443316796996414, "loss": 1.2115, "mean_token_accuracy": 0.6829163481791815, "step": 3760 }, { "epoch": 1.6147804355587292, "grad_norm": 0.314453125, "learning_rate": 0.00024313862275811954, "loss": 1.2524, "mean_token_accuracy": 0.6750467717647552, "step": 3770 }, { "epoch": 1.6190646197786505, "grad_norm": 0.310546875, "learning_rate": 0.00024194572217687657, "loss": 1.2558, "mean_token_accuracy": 0.6752542575200399, "step": 3780 }, { "epoch": 1.623348803998572, "grad_norm": 0.3359375, "learning_rate": 0.00024075300514028996, "loss": 1.2014, "mean_token_accuracy": 0.6858539601167043, "step": 3790 }, { "epoch": 1.6276329882184934, "grad_norm": 0.298828125, "learning_rate": 0.00023956049882855435, "loss": 1.2438, "mean_token_accuracy": 0.6779787172873815, "step": 3800 }, { "epoch": 1.6319171724384147, "grad_norm": 0.34375, "learning_rate": 0.00023836823041706214, "loss": 1.2752, "mean_token_accuracy": 0.6702134589354197, "step": 3810 }, { "epoch": 1.6362013566583364, "grad_norm": 0.310546875, "learning_rate": 0.00023717622707578444, "loss": 1.2694, "mean_token_accuracy": 0.6743356436491013, "step": 3820 }, { "epoch": 1.6404855408782577, "grad_norm": 0.3046875, "learning_rate": 0.00023598451596865185, "loss": 1.2574, "mean_token_accuracy": 0.6762406408786774, "step": 3830 }, { "epoch": 1.6447697250981792, "grad_norm": 0.330078125, "learning_rate": 0.00023479312425293532, "loss": 1.2388, "mean_token_accuracy": 0.6752854565779368, "step": 3840 }, { "epoch": 1.6490539093181007, "grad_norm": 0.326171875, "learning_rate": 0.00023360207907862753, "loss": 1.2227, "mean_token_accuracy": 0.6849334806203842, "step": 3850 }, { "epoch": 1.653338093538022, "grad_norm": 0.314453125, "learning_rate": 0.00023241140758782387, "loss": 1.2652, "mean_token_accuracy": 0.674446169535319, "step": 3860 }, { "epoch": 1.6576222777579437, "grad_norm": 0.302734375, "learning_rate": 0.00023122113691410396, "loss": 1.2314, "mean_token_accuracy": 0.6808192272981007, "step": 3870 }, { "epoch": 1.661906461977865, "grad_norm": 0.30859375, "learning_rate": 0.00023003129418191356, "loss": 1.2636, "mean_token_accuracy": 0.6753036012252172, "step": 3880 }, { "epoch": 1.6661906461977865, "grad_norm": 0.322265625, "learning_rate": 0.00022884190650594648, "loss": 1.2169, "mean_token_accuracy": 0.6835887253284454, "step": 3890 }, { "epoch": 1.670474830417708, "grad_norm": 0.298828125, "learning_rate": 0.00022765300099052607, "loss": 1.2232, "mean_token_accuracy": 0.679744150241216, "step": 3900 }, { "epoch": 1.6747590146376294, "grad_norm": 0.310546875, "learning_rate": 0.00022646460472898824, "loss": 1.2182, "mean_token_accuracy": 0.6807852476835251, "step": 3910 }, { "epoch": 1.679043198857551, "grad_norm": 0.3125, "learning_rate": 0.00022527674480306382, "loss": 1.2026, "mean_token_accuracy": 0.6893678406874338, "step": 3920 }, { "epoch": 1.6833273830774722, "grad_norm": 0.294921875, "learning_rate": 0.00022408944828226113, "loss": 1.234, "mean_token_accuracy": 0.6771794279416402, "step": 3930 }, { "epoch": 1.687611567297394, "grad_norm": 0.296875, "learning_rate": 0.00022290274222324971, "loss": 1.235, "mean_token_accuracy": 0.6769244899352391, "step": 3940 }, { "epoch": 1.6918957515173152, "grad_norm": 0.318359375, "learning_rate": 0.00022171665366924303, "loss": 1.2525, "mean_token_accuracy": 0.6755237986644109, "step": 3950 }, { "epoch": 1.6961799357372367, "grad_norm": 0.302734375, "learning_rate": 0.0002205312096493829, "loss": 1.2201, "mean_token_accuracy": 0.6836084206899007, "step": 3960 }, { "epoch": 1.7004641199571582, "grad_norm": 0.296875, "learning_rate": 0.00021934643717812281, "loss": 1.2055, "mean_token_accuracy": 0.685323445002238, "step": 3970 }, { "epoch": 1.7047483041770795, "grad_norm": 0.31640625, "learning_rate": 0.0002181623632546129, "loss": 1.2321, "mean_token_accuracy": 0.6818149715662003, "step": 3980 }, { "epoch": 1.7090324883970012, "grad_norm": 0.31640625, "learning_rate": 0.00021697901486208458, "loss": 1.2101, "mean_token_accuracy": 0.6855930884679159, "step": 3990 }, { "epoch": 1.7133166726169224, "grad_norm": 0.3046875, "learning_rate": 0.0002157964189672353, "loss": 1.2379, "mean_token_accuracy": 0.6786120980978012, "step": 4000 }, { "epoch": 1.717600856836844, "grad_norm": 0.302734375, "learning_rate": 0.0002146146025196144, "loss": 1.2587, "mean_token_accuracy": 0.676125301917394, "step": 4010 }, { "epoch": 1.7218850410567654, "grad_norm": 0.291015625, "learning_rate": 0.00021343359245100873, "loss": 1.2412, "mean_token_accuracy": 0.6767153064409892, "step": 4020 }, { "epoch": 1.726169225276687, "grad_norm": 0.30078125, "learning_rate": 0.0002122534156748289, "loss": 1.2144, "mean_token_accuracy": 0.68453202744325, "step": 4030 }, { "epoch": 1.7304534094966084, "grad_norm": 0.31640625, "learning_rate": 0.00021107409908549632, "loss": 1.2382, "mean_token_accuracy": 0.6767129331827164, "step": 4040 }, { "epoch": 1.7347375937165297, "grad_norm": 0.30078125, "learning_rate": 0.00020989566955782992, "loss": 1.2235, "mean_token_accuracy": 0.6808336655298869, "step": 4050 }, { "epoch": 1.7390217779364514, "grad_norm": 0.314453125, "learning_rate": 0.00020871815394643385, "loss": 1.2331, "mean_token_accuracy": 0.6781981885433197, "step": 4060 }, { "epoch": 1.7433059621563727, "grad_norm": 0.29296875, "learning_rate": 0.00020754157908508536, "loss": 1.2358, "mean_token_accuracy": 0.6778120845556259, "step": 4070 }, { "epoch": 1.7475901463762942, "grad_norm": 0.322265625, "learning_rate": 0.00020636597178612365, "loss": 1.2209, "mean_token_accuracy": 0.6791574577490489, "step": 4080 }, { "epoch": 1.7518743305962157, "grad_norm": 0.333984375, "learning_rate": 0.00020519135883983878, "loss": 1.2202, "mean_token_accuracy": 0.6824594676494599, "step": 4090 }, { "epoch": 1.756158514816137, "grad_norm": 0.322265625, "learning_rate": 0.0002040177670138607, "loss": 1.2279, "mean_token_accuracy": 0.6825836052497228, "step": 4100 }, { "epoch": 1.7604426990360587, "grad_norm": 0.296875, "learning_rate": 0.0002028452230525497, "loss": 1.1914, "mean_token_accuracy": 0.6880497256914775, "step": 4110 }, { "epoch": 1.76472688325598, "grad_norm": 0.29296875, "learning_rate": 0.00020167375367638707, "loss": 1.2336, "mean_token_accuracy": 0.6778925875822703, "step": 4120 }, { "epoch": 1.7690110674759014, "grad_norm": 0.318359375, "learning_rate": 0.0002005033855813655, "loss": 1.248, "mean_token_accuracy": 0.6761407842238744, "step": 4130 }, { "epoch": 1.773295251695823, "grad_norm": 0.314453125, "learning_rate": 0.0001993341454383817, "loss": 1.2239, "mean_token_accuracy": 0.6766098787387212, "step": 4140 }, { "epoch": 1.7775794359157444, "grad_norm": 0.318359375, "learning_rate": 0.0001981660598926277, "loss": 1.2034, "mean_token_accuracy": 0.6846154699722926, "step": 4150 }, { "epoch": 1.781863620135666, "grad_norm": 0.32421875, "learning_rate": 0.00019699915556298413, "loss": 1.1972, "mean_token_accuracy": 0.6866144865751267, "step": 4160 }, { "epoch": 1.7861478043555872, "grad_norm": 0.298828125, "learning_rate": 0.0001958334590414136, "loss": 1.2012, "mean_token_accuracy": 0.6865098079045614, "step": 4170 }, { "epoch": 1.790431988575509, "grad_norm": 0.306640625, "learning_rate": 0.00019466899689235434, "loss": 1.2154, "mean_token_accuracy": 0.6773028880357742, "step": 4180 }, { "epoch": 1.7947161727954302, "grad_norm": 0.322265625, "learning_rate": 0.00019350579565211563, "loss": 1.2082, "mean_token_accuracy": 0.6842084477345148, "step": 4190 }, { "epoch": 1.7990003570153517, "grad_norm": 0.3203125, "learning_rate": 0.0001923438818282721, "loss": 1.2321, "mean_token_accuracy": 0.680533907810847, "step": 4200 }, { "epoch": 1.8032845412352732, "grad_norm": 0.275390625, "learning_rate": 0.00019118328189906037, "loss": 1.2276, "mean_token_accuracy": 0.6793491671482722, "step": 4210 }, { "epoch": 1.8075687254551944, "grad_norm": 0.27734375, "learning_rate": 0.00019002402231277533, "loss": 1.1895, "mean_token_accuracy": 0.6883792887131374, "step": 4220 }, { "epoch": 1.8118529096751161, "grad_norm": 0.298828125, "learning_rate": 0.00018886612948716737, "loss": 1.196, "mean_token_accuracy": 0.6824484934409459, "step": 4230 }, { "epoch": 1.8161370938950374, "grad_norm": 0.33203125, "learning_rate": 0.00018770962980884086, "loss": 1.2068, "mean_token_accuracy": 0.6837878266970316, "step": 4240 }, { "epoch": 1.820421278114959, "grad_norm": 0.283203125, "learning_rate": 0.0001865545496326523, "loss": 1.2161, "mean_token_accuracy": 0.6800723781188329, "step": 4250 }, { "epoch": 1.8247054623348804, "grad_norm": 0.322265625, "learning_rate": 0.00018540091528110973, "loss": 1.1971, "mean_token_accuracy": 0.6874994953473409, "step": 4260 }, { "epoch": 1.828989646554802, "grad_norm": 0.298828125, "learning_rate": 0.0001842487530437732, "loss": 1.1987, "mean_token_accuracy": 0.6840162913004557, "step": 4270 }, { "epoch": 1.8332738307747234, "grad_norm": 0.294921875, "learning_rate": 0.00018309808917665562, "loss": 1.1892, "mean_token_accuracy": 0.6898854543765386, "step": 4280 }, { "epoch": 1.8375580149946447, "grad_norm": 0.271484375, "learning_rate": 0.00018194894990162424, "loss": 1.1957, "mean_token_accuracy": 0.6860898315906525, "step": 4290 }, { "epoch": 1.8418421992145664, "grad_norm": 0.298828125, "learning_rate": 0.00018080136140580328, "loss": 1.2323, "mean_token_accuracy": 0.6818597843249639, "step": 4300 }, { "epoch": 1.8461263834344876, "grad_norm": 0.31640625, "learning_rate": 0.00017965534984097696, "loss": 1.2124, "mean_token_accuracy": 0.6838676472504933, "step": 4310 }, { "epoch": 1.8504105676544091, "grad_norm": 0.3203125, "learning_rate": 0.00017851094132299362, "loss": 1.1997, "mean_token_accuracy": 0.6875764499107997, "step": 4320 }, { "epoch": 1.8546947518743306, "grad_norm": 0.291015625, "learning_rate": 0.00017736816193117066, "loss": 1.1956, "mean_token_accuracy": 0.6873005121946335, "step": 4330 }, { "epoch": 1.858978936094252, "grad_norm": 0.28515625, "learning_rate": 0.0001762270377077005, "loss": 1.1768, "mean_token_accuracy": 0.6917450726032257, "step": 4340 }, { "epoch": 1.8632631203141736, "grad_norm": 0.296875, "learning_rate": 0.0001750875946570564, "loss": 1.2165, "mean_token_accuracy": 0.6850099762280782, "step": 4350 }, { "epoch": 1.867547304534095, "grad_norm": 0.287109375, "learning_rate": 0.00017394985874540032, "loss": 1.1952, "mean_token_accuracy": 0.6887429515520732, "step": 4360 }, { "epoch": 1.8718314887540164, "grad_norm": 0.29296875, "learning_rate": 0.00017281385589999133, "loss": 1.1916, "mean_token_accuracy": 0.6882669021685918, "step": 4370 }, { "epoch": 1.8761156729739379, "grad_norm": 0.296875, "learning_rate": 0.00017167961200859432, "loss": 1.2191, "mean_token_accuracy": 0.6836802691221238, "step": 4380 }, { "epoch": 1.8803998571938594, "grad_norm": 0.2890625, "learning_rate": 0.00017054715291889072, "loss": 1.1771, "mean_token_accuracy": 0.689866092801094, "step": 4390 }, { "epoch": 1.8846840414137809, "grad_norm": 0.287109375, "learning_rate": 0.00016941650443788857, "loss": 1.1708, "mean_token_accuracy": 0.6898455142974853, "step": 4400 }, { "epoch": 1.8889682256337021, "grad_norm": 0.30859375, "learning_rate": 0.00016828769233133528, "loss": 1.2134, "mean_token_accuracy": 0.6872386793295543, "step": 4410 }, { "epoch": 1.8932524098536239, "grad_norm": 0.310546875, "learning_rate": 0.00016716074232312993, "loss": 1.2143, "mean_token_accuracy": 0.6822999050219853, "step": 4420 }, { "epoch": 1.8975365940735451, "grad_norm": 0.310546875, "learning_rate": 0.00016603568009473715, "loss": 1.1919, "mean_token_accuracy": 0.6847109029690425, "step": 4430 }, { "epoch": 1.9018207782934666, "grad_norm": 0.283203125, "learning_rate": 0.00016491253128460222, "loss": 1.1944, "mean_token_accuracy": 0.6889555603265762, "step": 4440 }, { "epoch": 1.9061049625133881, "grad_norm": 0.3125, "learning_rate": 0.00016379132148756638, "loss": 1.2015, "mean_token_accuracy": 0.6848962704340616, "step": 4450 }, { "epoch": 1.9103891467333094, "grad_norm": 0.294921875, "learning_rate": 0.00016267207625428375, "loss": 1.2066, "mean_token_accuracy": 0.6844677517811457, "step": 4460 }, { "epoch": 1.9146733309532311, "grad_norm": 0.287109375, "learning_rate": 0.00016155482109063898, "loss": 1.1895, "mean_token_accuracy": 0.689970392982165, "step": 4470 }, { "epoch": 1.9189575151731524, "grad_norm": 0.294921875, "learning_rate": 0.00016043958145716615, "loss": 1.1808, "mean_token_accuracy": 0.6905927946170171, "step": 4480 }, { "epoch": 1.9232416993930739, "grad_norm": 0.302734375, "learning_rate": 0.00015932638276846853, "loss": 1.1895, "mean_token_accuracy": 0.6876069366931915, "step": 4490 }, { "epoch": 1.9275258836129954, "grad_norm": 0.291015625, "learning_rate": 0.00015821525039263945, "loss": 1.1847, "mean_token_accuracy": 0.6875915179649988, "step": 4500 }, { "epoch": 1.9318100678329169, "grad_norm": 0.283203125, "learning_rate": 0.00015710620965068395, "loss": 1.2023, "mean_token_accuracy": 0.6880812575419744, "step": 4510 }, { "epoch": 1.9360942520528384, "grad_norm": 0.326171875, "learning_rate": 0.00015599928581594197, "loss": 1.1944, "mean_token_accuracy": 0.6877446641524633, "step": 4520 }, { "epoch": 1.9403784362727596, "grad_norm": 0.28515625, "learning_rate": 0.00015489450411351247, "loss": 1.2015, "mean_token_accuracy": 0.6858119696378708, "step": 4530 }, { "epoch": 1.9446626204926813, "grad_norm": 0.306640625, "learning_rate": 0.00015379188971967854, "loss": 1.1747, "mean_token_accuracy": 0.6891631742318471, "step": 4540 }, { "epoch": 1.9489468047126026, "grad_norm": 0.306640625, "learning_rate": 0.00015269146776133346, "loss": 1.1804, "mean_token_accuracy": 0.6899756520986557, "step": 4550 }, { "epoch": 1.9532309889325241, "grad_norm": 0.310546875, "learning_rate": 0.00015159326331540835, "loss": 1.1935, "mean_token_accuracy": 0.6860665520032246, "step": 4560 }, { "epoch": 1.9575151731524456, "grad_norm": 0.28515625, "learning_rate": 0.00015049730140830064, "loss": 1.1958, "mean_token_accuracy": 0.6895552823940913, "step": 4570 }, { "epoch": 1.9617993573723669, "grad_norm": 0.294921875, "learning_rate": 0.0001494036070153036, "loss": 1.1593, "mean_token_accuracy": 0.6954157501459122, "step": 4580 }, { "epoch": 1.9660835415922886, "grad_norm": 0.326171875, "learning_rate": 0.0001483122050600376, "loss": 1.1817, "mean_token_accuracy": 0.6880984852711359, "step": 4590 }, { "epoch": 1.9703677258122099, "grad_norm": 0.302734375, "learning_rate": 0.00014722312041388162, "loss": 1.1708, "mean_token_accuracy": 0.6934361755847931, "step": 4600 }, { "epoch": 1.9746519100321314, "grad_norm": 0.28515625, "learning_rate": 0.00014613637789540683, "loss": 1.1526, "mean_token_accuracy": 0.6956786572933197, "step": 4610 }, { "epoch": 1.9789360942520529, "grad_norm": 0.30859375, "learning_rate": 0.0001450520022698108, "loss": 1.1659, "mean_token_accuracy": 0.6891820311546326, "step": 4620 }, { "epoch": 1.9832202784719741, "grad_norm": 0.30078125, "learning_rate": 0.0001439700182483532, "loss": 1.1926, "mean_token_accuracy": 0.6883834769328435, "step": 4630 }, { "epoch": 1.9875044626918958, "grad_norm": 0.287109375, "learning_rate": 0.00014289045048779316, "loss": 1.179, "mean_token_accuracy": 0.6905747185150782, "step": 4640 }, { "epoch": 1.9917886469118171, "grad_norm": 0.306640625, "learning_rate": 0.00014181332358982615, "loss": 1.1518, "mean_token_accuracy": 0.6962420682112376, "step": 4650 }, { "epoch": 1.9960728311317386, "grad_norm": 0.28515625, "learning_rate": 0.00014073866210052478, "loss": 1.1798, "mean_token_accuracy": 0.6898052622874578, "step": 4660 }, { "epoch": 2.0, "grad_norm": 0.7421875, "learning_rate": 0.00013966649050977853, "loss": 1.1623, "mean_token_accuracy": 0.6901761282574047, "step": 4670 }, { "epoch": 2.0042841842199213, "grad_norm": 0.314453125, "learning_rate": 0.00013859683325073563, "loss": 0.848, "mean_token_accuracy": 0.7650491803884506, "step": 4680 }, { "epoch": 2.008568368439843, "grad_norm": 0.287109375, "learning_rate": 0.00013752971469924727, "loss": 0.863, "mean_token_accuracy": 0.7565648088852565, "step": 4690 }, { "epoch": 2.0128525526597643, "grad_norm": 0.31640625, "learning_rate": 0.00013646515917331055, "loss": 0.8361, "mean_token_accuracy": 0.7650597403446834, "step": 4700 }, { "epoch": 2.017136736879686, "grad_norm": 0.28515625, "learning_rate": 0.00013540319093251565, "loss": 0.8601, "mean_token_accuracy": 0.7594423244396845, "step": 4710 }, { "epoch": 2.0214209210996072, "grad_norm": 0.28125, "learning_rate": 0.00013434383417749247, "loss": 0.8135, "mean_token_accuracy": 0.7731389890114466, "step": 4720 }, { "epoch": 2.0257051053195285, "grad_norm": 0.296875, "learning_rate": 0.0001332871130493587, "loss": 0.8556, "mean_token_accuracy": 0.7604803055524826, "step": 4730 }, { "epoch": 2.0299892895394502, "grad_norm": 0.333984375, "learning_rate": 0.0001322330516291709, "loss": 0.842, "mean_token_accuracy": 0.7640973548094432, "step": 4740 }, { "epoch": 2.0342734737593715, "grad_norm": 0.298828125, "learning_rate": 0.0001311816739373742, "loss": 0.8381, "mean_token_accuracy": 0.7674211412668228, "step": 4750 }, { "epoch": 2.0385576579792932, "grad_norm": 0.296875, "learning_rate": 0.00013013300393325611, "loss": 0.8126, "mean_token_accuracy": 0.7714175979296366, "step": 4760 }, { "epoch": 2.0428418421992145, "grad_norm": 0.30078125, "learning_rate": 0.00012908706551440004, "loss": 0.8481, "mean_token_accuracy": 0.7640273501475652, "step": 4770 }, { "epoch": 2.047126026419136, "grad_norm": 0.3125, "learning_rate": 0.00012804388251614037, "loss": 0.8311, "mean_token_accuracy": 0.7653455446163814, "step": 4780 }, { "epoch": 2.0514102106390575, "grad_norm": 0.310546875, "learning_rate": 0.00012700347871102036, "loss": 0.8258, "mean_token_accuracy": 0.7644449760516484, "step": 4790 }, { "epoch": 2.0556943948589788, "grad_norm": 0.283203125, "learning_rate": 0.00012596587780824923, "loss": 0.8364, "mean_token_accuracy": 0.7657805611689885, "step": 4800 }, { "epoch": 2.0599785790789005, "grad_norm": 0.29296875, "learning_rate": 0.0001249311034531623, "loss": 0.8471, "mean_token_accuracy": 0.7625262240568796, "step": 4810 }, { "epoch": 2.0642627632988217, "grad_norm": 0.3125, "learning_rate": 0.00012389917922668245, "loss": 0.823, "mean_token_accuracy": 0.769603114326795, "step": 4820 }, { "epoch": 2.0685469475187435, "grad_norm": 0.328125, "learning_rate": 0.0001228701286447824, "loss": 0.8242, "mean_token_accuracy": 0.7667522599299749, "step": 4830 }, { "epoch": 2.0728311317386647, "grad_norm": 0.314453125, "learning_rate": 0.00012184397515794888, "loss": 0.843, "mean_token_accuracy": 0.7642757395903269, "step": 4840 }, { "epoch": 2.077115315958586, "grad_norm": 0.283203125, "learning_rate": 0.00012082074215064836, "loss": 0.8298, "mean_token_accuracy": 0.7669574290513992, "step": 4850 }, { "epoch": 2.0813995001785077, "grad_norm": 0.287109375, "learning_rate": 0.00011980045294079384, "loss": 0.8241, "mean_token_accuracy": 0.7687270969152451, "step": 4860 }, { "epoch": 2.085683684398429, "grad_norm": 0.326171875, "learning_rate": 0.00011878313077921388, "loss": 0.8395, "mean_token_accuracy": 0.76190112332503, "step": 4870 }, { "epoch": 2.0899678686183507, "grad_norm": 0.30078125, "learning_rate": 0.00011776879884912247, "loss": 0.8234, "mean_token_accuracy": 0.7668329626321793, "step": 4880 }, { "epoch": 2.094252052838272, "grad_norm": 0.3125, "learning_rate": 0.00011675748026559091, "loss": 0.8221, "mean_token_accuracy": 0.7703782876332601, "step": 4890 }, { "epoch": 2.0985362370581937, "grad_norm": 0.322265625, "learning_rate": 0.00011574919807502091, "loss": 0.8332, "mean_token_accuracy": 0.7663482298453649, "step": 4900 }, { "epoch": 2.102820421278115, "grad_norm": 0.298828125, "learning_rate": 0.00011474397525461919, "loss": 0.8166, "mean_token_accuracy": 0.7704643438259761, "step": 4910 }, { "epoch": 2.1071046054980362, "grad_norm": 0.30078125, "learning_rate": 0.0001137418347118744, "loss": 0.8254, "mean_token_accuracy": 0.7690837909777959, "step": 4920 }, { "epoch": 2.111388789717958, "grad_norm": 0.326171875, "learning_rate": 0.00011274279928403475, "loss": 0.8058, "mean_token_accuracy": 0.7719497382640839, "step": 4930 }, { "epoch": 2.1156729739378792, "grad_norm": 0.314453125, "learning_rate": 0.00011174689173758759, "loss": 0.8294, "mean_token_accuracy": 0.7664976229270299, "step": 4940 }, { "epoch": 2.119957158157801, "grad_norm": 0.388671875, "learning_rate": 0.00011075413476774066, "loss": 0.8292, "mean_token_accuracy": 0.7666595439116161, "step": 4950 }, { "epoch": 2.124241342377722, "grad_norm": 0.32421875, "learning_rate": 0.00010976455099790491, "loss": 0.8326, "mean_token_accuracy": 0.7672491510709126, "step": 4960 }, { "epoch": 2.1285255265976435, "grad_norm": 0.32421875, "learning_rate": 0.00010877816297917881, "loss": 0.8007, "mean_token_accuracy": 0.7737640182177226, "step": 4970 }, { "epoch": 2.132809710817565, "grad_norm": 0.3046875, "learning_rate": 0.00010779499318983463, "loss": 0.8092, "mean_token_accuracy": 0.7724389503399531, "step": 4980 }, { "epoch": 2.1370938950374865, "grad_norm": 0.3203125, "learning_rate": 0.00010681506403480617, "loss": 0.8111, "mean_token_accuracy": 0.7739120999972026, "step": 4990 }, { "epoch": 2.141378079257408, "grad_norm": 0.3203125, "learning_rate": 0.00010583839784517812, "loss": 0.82, "mean_token_accuracy": 0.7729650676250458, "step": 5000 }, { "epoch": 2.1456622634773295, "grad_norm": 0.291015625, "learning_rate": 0.00010486501687767719, "loss": 0.792, "mean_token_accuracy": 0.776722161968549, "step": 5010 }, { "epoch": 2.149946447697251, "grad_norm": 0.27734375, "learning_rate": 0.00010389494331416477, "loss": 0.8301, "mean_token_accuracy": 0.7654323528210322, "step": 5020 }, { "epoch": 2.1542306319171725, "grad_norm": 0.291015625, "learning_rate": 0.0001029281992611317, "loss": 0.7938, "mean_token_accuracy": 0.7787847359975179, "step": 5030 }, { "epoch": 2.1585148161370937, "grad_norm": 0.30859375, "learning_rate": 0.00010196480674919443, "loss": 0.8205, "mean_token_accuracy": 0.7693845987319946, "step": 5040 }, { "epoch": 2.1627990003570154, "grad_norm": 0.291015625, "learning_rate": 0.0001010047877325928, "loss": 0.7904, "mean_token_accuracy": 0.7785027374823889, "step": 5050 }, { "epoch": 2.1670831845769367, "grad_norm": 0.3203125, "learning_rate": 0.00010004816408869002, "loss": 0.8308, "mean_token_accuracy": 0.7703908115625382, "step": 5060 }, { "epoch": 2.1713673687968584, "grad_norm": 0.3203125, "learning_rate": 9.909495761747372e-05, "loss": 0.8253, "mean_token_accuracy": 0.7665383875370025, "step": 5070 }, { "epoch": 2.1756515530167797, "grad_norm": 0.31640625, "learning_rate": 9.814519004105946e-05, "loss": 0.8144, "mean_token_accuracy": 0.7718339473009109, "step": 5080 }, { "epoch": 2.179935737236701, "grad_norm": 0.30078125, "learning_rate": 9.719888300319602e-05, "loss": 0.8283, "mean_token_accuracy": 0.7683377832174301, "step": 5090 }, { "epoch": 2.1842199214566227, "grad_norm": 0.302734375, "learning_rate": 9.625605806877122e-05, "loss": 0.8472, "mean_token_accuracy": 0.7613804837067922, "step": 5100 }, { "epoch": 2.188504105676544, "grad_norm": 0.306640625, "learning_rate": 9.531673672332145e-05, "loss": 0.8098, "mean_token_accuracy": 0.7719518701235454, "step": 5110 }, { "epoch": 2.1927882898964657, "grad_norm": 0.314453125, "learning_rate": 9.438094037254172e-05, "loss": 0.7964, "mean_token_accuracy": 0.7749660849571228, "step": 5120 }, { "epoch": 2.197072474116387, "grad_norm": 0.314453125, "learning_rate": 9.344869034179735e-05, "loss": 0.8143, "mean_token_accuracy": 0.7723793685436249, "step": 5130 }, { "epoch": 2.2013566583363087, "grad_norm": 0.32421875, "learning_rate": 9.252000787563919e-05, "loss": 0.8087, "mean_token_accuracy": 0.7712360550959905, "step": 5140 }, { "epoch": 2.20564084255623, "grad_norm": 0.33203125, "learning_rate": 9.159491413731805e-05, "loss": 0.7954, "mean_token_accuracy": 0.7725468198458354, "step": 5150 }, { "epoch": 2.209925026776151, "grad_norm": 0.3125, "learning_rate": 9.067343020830352e-05, "loss": 0.8136, "mean_token_accuracy": 0.7698225736618042, "step": 5160 }, { "epoch": 2.214209210996073, "grad_norm": 0.310546875, "learning_rate": 8.975557708780316e-05, "loss": 0.8181, "mean_token_accuracy": 0.769992024699847, "step": 5170 }, { "epoch": 2.218493395215994, "grad_norm": 0.310546875, "learning_rate": 8.884137569228362e-05, "loss": 0.8098, "mean_token_accuracy": 0.770973147948583, "step": 5180 }, { "epoch": 2.222777579435916, "grad_norm": 0.3359375, "learning_rate": 8.793084685499498e-05, "loss": 0.7998, "mean_token_accuracy": 0.776837948958079, "step": 5190 }, { "epoch": 2.227061763655837, "grad_norm": 0.306640625, "learning_rate": 8.702401132549459e-05, "loss": 0.7891, "mean_token_accuracy": 0.7805549720923106, "step": 5200 }, { "epoch": 2.2313459478757585, "grad_norm": 0.322265625, "learning_rate": 8.612088976917554e-05, "loss": 0.8364, "mean_token_accuracy": 0.7643806467453639, "step": 5210 }, { "epoch": 2.23563013209568, "grad_norm": 0.283203125, "learning_rate": 8.522150276679494e-05, "loss": 0.8184, "mean_token_accuracy": 0.7683983782927195, "step": 5220 }, { "epoch": 2.2399143163156015, "grad_norm": 0.306640625, "learning_rate": 8.432587081400492e-05, "loss": 0.7959, "mean_token_accuracy": 0.778293655316035, "step": 5230 }, { "epoch": 2.244198500535523, "grad_norm": 0.3203125, "learning_rate": 8.343401432088629e-05, "loss": 0.7847, "mean_token_accuracy": 0.7779005120197932, "step": 5240 }, { "epoch": 2.2484826847554444, "grad_norm": 0.296875, "learning_rate": 8.254595361148262e-05, "loss": 0.7926, "mean_token_accuracy": 0.7749503721793493, "step": 5250 }, { "epoch": 2.2527668689753657, "grad_norm": 0.30859375, "learning_rate": 8.166170892333722e-05, "loss": 0.7907, "mean_token_accuracy": 0.7779211064179739, "step": 5260 }, { "epoch": 2.2570510531952874, "grad_norm": 0.318359375, "learning_rate": 8.078130040703238e-05, "loss": 0.7985, "mean_token_accuracy": 0.7758079042037328, "step": 5270 }, { "epoch": 2.2613352374152087, "grad_norm": 0.298828125, "learning_rate": 7.990474812572981e-05, "loss": 0.7876, "mean_token_accuracy": 0.7797471781571707, "step": 5280 }, { "epoch": 2.2656194216351304, "grad_norm": 0.306640625, "learning_rate": 7.903207205471358e-05, "loss": 0.8053, "mean_token_accuracy": 0.7768173178037008, "step": 5290 }, { "epoch": 2.2699036058550517, "grad_norm": 0.298828125, "learning_rate": 7.81632920809349e-05, "loss": 0.7792, "mean_token_accuracy": 0.7814870576063792, "step": 5300 }, { "epoch": 2.2741877900749734, "grad_norm": 0.322265625, "learning_rate": 7.729842800255865e-05, "loss": 0.8226, "mean_token_accuracy": 0.768639792005221, "step": 5310 }, { "epoch": 2.2784719742948947, "grad_norm": 0.298828125, "learning_rate": 7.64374995285127e-05, "loss": 0.7939, "mean_token_accuracy": 0.7751558671394984, "step": 5320 }, { "epoch": 2.282756158514816, "grad_norm": 0.30078125, "learning_rate": 7.558052627803846e-05, "loss": 0.7966, "mean_token_accuracy": 0.7758613834778468, "step": 5330 }, { "epoch": 2.2870403427347377, "grad_norm": 0.328125, "learning_rate": 7.472752778024383e-05, "loss": 0.7812, "mean_token_accuracy": 0.7802222698926926, "step": 5340 }, { "epoch": 2.291324526954659, "grad_norm": 0.287109375, "learning_rate": 7.387852347365829e-05, "loss": 0.7849, "mean_token_accuracy": 0.7787080804506937, "step": 5350 }, { "epoch": 2.2956087111745807, "grad_norm": 0.326171875, "learning_rate": 7.303353270578952e-05, "loss": 0.8034, "mean_token_accuracy": 0.770248160759608, "step": 5360 }, { "epoch": 2.299892895394502, "grad_norm": 0.318359375, "learning_rate": 7.219257473268312e-05, "loss": 0.8094, "mean_token_accuracy": 0.7722504367431005, "step": 5370 }, { "epoch": 2.3041770796144236, "grad_norm": 0.318359375, "learning_rate": 7.135566871848331e-05, "loss": 0.8008, "mean_token_accuracy": 0.7736594120661417, "step": 5380 }, { "epoch": 2.308461263834345, "grad_norm": 0.302734375, "learning_rate": 7.052283373499649e-05, "loss": 0.797, "mean_token_accuracy": 0.7745833595593771, "step": 5390 }, { "epoch": 2.312745448054266, "grad_norm": 0.306640625, "learning_rate": 6.969408876125637e-05, "loss": 0.7898, "mean_token_accuracy": 0.77644149462382, "step": 5400 }, { "epoch": 2.317029632274188, "grad_norm": 0.32421875, "learning_rate": 6.886945268309177e-05, "loss": 0.8053, "mean_token_accuracy": 0.7735361973444621, "step": 5410 }, { "epoch": 2.321313816494109, "grad_norm": 0.302734375, "learning_rate": 6.804894429269582e-05, "loss": 0.7936, "mean_token_accuracy": 0.7798129876454671, "step": 5420 }, { "epoch": 2.325598000714031, "grad_norm": 0.3203125, "learning_rate": 6.723258228819815e-05, "loss": 0.796, "mean_token_accuracy": 0.7762234061956406, "step": 5430 }, { "epoch": 2.329882184933952, "grad_norm": 0.294921875, "learning_rate": 6.64203852732386e-05, "loss": 0.8006, "mean_token_accuracy": 0.7752728084723155, "step": 5440 }, { "epoch": 2.3341663691538734, "grad_norm": 0.30859375, "learning_rate": 6.561237175654325e-05, "loss": 0.7862, "mean_token_accuracy": 0.7770834614833196, "step": 5450 }, { "epoch": 2.338450553373795, "grad_norm": 0.337890625, "learning_rate": 6.480856015150272e-05, "loss": 0.8179, "mean_token_accuracy": 0.7689370075861613, "step": 5460 }, { "epoch": 2.3427347375937164, "grad_norm": 0.291015625, "learning_rate": 6.40089687757523e-05, "loss": 0.7858, "mean_token_accuracy": 0.7789360960324605, "step": 5470 }, { "epoch": 2.347018921813638, "grad_norm": 0.32421875, "learning_rate": 6.32136158507549e-05, "loss": 0.8105, "mean_token_accuracy": 0.7707864145437876, "step": 5480 }, { "epoch": 2.3513031060335594, "grad_norm": 0.306640625, "learning_rate": 6.242251950138564e-05, "loss": 0.7907, "mean_token_accuracy": 0.7774596979220708, "step": 5490 }, { "epoch": 2.3555872902534807, "grad_norm": 0.3203125, "learning_rate": 6.163569775551869e-05, "loss": 0.7875, "mean_token_accuracy": 0.780629759033521, "step": 5500 }, { "epoch": 2.3598714744734024, "grad_norm": 0.298828125, "learning_rate": 6.0853168543616694e-05, "loss": 0.8057, "mean_token_accuracy": 0.7729009737571081, "step": 5510 }, { "epoch": 2.3641556586933237, "grad_norm": 0.322265625, "learning_rate": 6.007494969832181e-05, "loss": 0.7709, "mean_token_accuracy": 0.7820465574661891, "step": 5520 }, { "epoch": 2.3684398429132454, "grad_norm": 0.30859375, "learning_rate": 5.9301058954049664e-05, "loss": 0.7857, "mean_token_accuracy": 0.7776114324728648, "step": 5530 }, { "epoch": 2.3727240271331667, "grad_norm": 0.3125, "learning_rate": 5.853151394658526e-05, "loss": 0.7841, "mean_token_accuracy": 0.777163389325142, "step": 5540 }, { "epoch": 2.3770082113530884, "grad_norm": 0.326171875, "learning_rate": 5.776633221268057e-05, "loss": 0.7933, "mean_token_accuracy": 0.7760427872339885, "step": 5550 }, { "epoch": 2.3812923955730096, "grad_norm": 0.2890625, "learning_rate": 5.7005531189655515e-05, "loss": 0.8003, "mean_token_accuracy": 0.7774786601463953, "step": 5560 }, { "epoch": 2.385576579792931, "grad_norm": 0.31640625, "learning_rate": 5.624912821500025e-05, "loss": 0.789, "mean_token_accuracy": 0.77956602871418, "step": 5570 }, { "epoch": 2.3898607640128526, "grad_norm": 0.322265625, "learning_rate": 5.5497140525979925e-05, "loss": 0.8054, "mean_token_accuracy": 0.7731809784968694, "step": 5580 }, { "epoch": 2.394144948232774, "grad_norm": 0.3359375, "learning_rate": 5.474958525924262e-05, "loss": 0.7969, "mean_token_accuracy": 0.7769083728392919, "step": 5590 }, { "epoch": 2.3984291324526956, "grad_norm": 0.326171875, "learning_rate": 5.4006479450427694e-05, "loss": 0.7873, "mean_token_accuracy": 0.7816483676433563, "step": 5600 }, { "epoch": 2.402713316672617, "grad_norm": 0.3359375, "learning_rate": 5.3267840033778516e-05, "loss": 0.7924, "mean_token_accuracy": 0.7759514023860296, "step": 5610 }, { "epoch": 2.4069975008925386, "grad_norm": 0.29296875, "learning_rate": 5.25336838417563e-05, "loss": 0.7693, "mean_token_accuracy": 0.7819558610518773, "step": 5620 }, { "epoch": 2.41128168511246, "grad_norm": 0.294921875, "learning_rate": 5.1804027604655995e-05, "loss": 0.7901, "mean_token_accuracy": 0.7765521576007207, "step": 5630 }, { "epoch": 2.415565869332381, "grad_norm": 0.30078125, "learning_rate": 5.1078887950226084e-05, "loss": 0.8054, "mean_token_accuracy": 0.7711812168359756, "step": 5640 }, { "epoch": 2.419850053552303, "grad_norm": 0.298828125, "learning_rate": 5.035828140328852e-05, "loss": 0.7997, "mean_token_accuracy": 0.7752634723981221, "step": 5650 }, { "epoch": 2.424134237772224, "grad_norm": 0.310546875, "learning_rate": 4.964222438536295e-05, "loss": 0.7682, "mean_token_accuracy": 0.7835259586572647, "step": 5660 }, { "epoch": 2.428418421992146, "grad_norm": 0.29296875, "learning_rate": 4.8930733214292227e-05, "loss": 0.7798, "mean_token_accuracy": 0.7836492071549098, "step": 5670 }, { "epoch": 2.432702606212067, "grad_norm": 0.33203125, "learning_rate": 4.822382410387027e-05, "loss": 0.7744, "mean_token_accuracy": 0.7835030923287074, "step": 5680 }, { "epoch": 2.4369867904319884, "grad_norm": 0.314453125, "learning_rate": 4.752151316347328e-05, "loss": 0.7777, "mean_token_accuracy": 0.7836812446514766, "step": 5690 }, { "epoch": 2.44127097465191, "grad_norm": 0.328125, "learning_rate": 4.682381639769195e-05, "loss": 0.7843, "mean_token_accuracy": 0.7796516954898834, "step": 5700 }, { "epoch": 2.4455551588718314, "grad_norm": 0.337890625, "learning_rate": 4.6130749705966924e-05, "loss": 0.7856, "mean_token_accuracy": 0.7769708921511967, "step": 5710 }, { "epoch": 2.449839343091753, "grad_norm": 0.3203125, "learning_rate": 4.544232888222674e-05, "loss": 0.7558, "mean_token_accuracy": 0.7886910130580266, "step": 5720 }, { "epoch": 2.4541235273116744, "grad_norm": 0.326171875, "learning_rate": 4.475856961452765e-05, "loss": 0.7777, "mean_token_accuracy": 0.7824478884538014, "step": 5730 }, { "epoch": 2.4584077115315957, "grad_norm": 0.3203125, "learning_rate": 4.407948748469615e-05, "loss": 0.7647, "mean_token_accuracy": 0.7870442191759746, "step": 5740 }, { "epoch": 2.4626918957515174, "grad_norm": 0.34375, "learning_rate": 4.340509796797401e-05, "loss": 0.8013, "mean_token_accuracy": 0.7732500980297724, "step": 5750 }, { "epoch": 2.4669760799714386, "grad_norm": 0.2890625, "learning_rate": 4.273541643266537e-05, "loss": 0.7809, "mean_token_accuracy": 0.7783005088567734, "step": 5760 }, { "epoch": 2.4712602641913604, "grad_norm": 0.30078125, "learning_rate": 4.2070458139786886e-05, "loss": 0.7664, "mean_token_accuracy": 0.7849383483330409, "step": 5770 }, { "epoch": 2.4755444484112816, "grad_norm": 0.3125, "learning_rate": 4.141023824271964e-05, "loss": 0.7862, "mean_token_accuracy": 0.7778257886568706, "step": 5780 }, { "epoch": 2.479828632631203, "grad_norm": 0.3125, "learning_rate": 4.075477178686382e-05, "loss": 0.7558, "mean_token_accuracy": 0.7865474035342535, "step": 5790 }, { "epoch": 2.4841128168511246, "grad_norm": 0.314453125, "learning_rate": 4.010407370929634e-05, "loss": 0.7903, "mean_token_accuracy": 0.776697979370753, "step": 5800 }, { "epoch": 2.488397001071046, "grad_norm": 0.3046875, "learning_rate": 3.94581588384296e-05, "loss": 0.7905, "mean_token_accuracy": 0.7786333004633585, "step": 5810 }, { "epoch": 2.4926811852909676, "grad_norm": 0.298828125, "learning_rate": 3.8817041893674407e-05, "loss": 0.7815, "mean_token_accuracy": 0.7784610877434412, "step": 5820 }, { "epoch": 2.496965369510889, "grad_norm": 0.310546875, "learning_rate": 3.818073748510406e-05, "loss": 0.7908, "mean_token_accuracy": 0.7784650901953379, "step": 5830 }, { "epoch": 2.50124955373081, "grad_norm": 0.3046875, "learning_rate": 3.754926011312137e-05, "loss": 0.7778, "mean_token_accuracy": 0.7789822975794475, "step": 5840 }, { "epoch": 2.505533737950732, "grad_norm": 0.333984375, "learning_rate": 3.692262416812869e-05, "loss": 0.7799, "mean_token_accuracy": 0.7808815310398738, "step": 5850 }, { "epoch": 2.5098179221706536, "grad_norm": 0.291015625, "learning_rate": 3.63008439301995e-05, "loss": 0.7668, "mean_token_accuracy": 0.7802990694840749, "step": 5860 }, { "epoch": 2.514102106390575, "grad_norm": 0.30859375, "learning_rate": 3.568393356875305e-05, "loss": 0.7787, "mean_token_accuracy": 0.7813133666912715, "step": 5870 }, { "epoch": 2.518386290610496, "grad_norm": 0.330078125, "learning_rate": 3.507190714223168e-05, "loss": 0.7769, "mean_token_accuracy": 0.7826229612032573, "step": 5880 }, { "epoch": 2.522670474830418, "grad_norm": 0.3125, "learning_rate": 3.446477859778041e-05, "loss": 0.78, "mean_token_accuracy": 0.7789117127656937, "step": 5890 }, { "epoch": 2.526954659050339, "grad_norm": 0.291015625, "learning_rate": 3.386256177092886e-05, "loss": 0.7882, "mean_token_accuracy": 0.7778934846321742, "step": 5900 }, { "epoch": 2.531238843270261, "grad_norm": 0.318359375, "learning_rate": 3.3265270385276296e-05, "loss": 0.7719, "mean_token_accuracy": 0.7834851761658986, "step": 5910 }, { "epoch": 2.535523027490182, "grad_norm": 0.31640625, "learning_rate": 3.267291805217851e-05, "loss": 0.7863, "mean_token_accuracy": 0.7806941578785579, "step": 5920 }, { "epoch": 2.5398072117101034, "grad_norm": 0.3125, "learning_rate": 3.208551827043804e-05, "loss": 0.7656, "mean_token_accuracy": 0.7854113181432089, "step": 5930 }, { "epoch": 2.544091395930025, "grad_norm": 0.33203125, "learning_rate": 3.150308442599631e-05, "loss": 0.7658, "mean_token_accuracy": 0.783687628308932, "step": 5940 }, { "epoch": 2.5483755801499464, "grad_norm": 0.3125, "learning_rate": 3.092562979162864e-05, "loss": 0.8225, "mean_token_accuracy": 0.7703000376621882, "step": 5950 }, { "epoch": 2.552659764369868, "grad_norm": 0.298828125, "learning_rate": 3.0353167526641745e-05, "loss": 0.7792, "mean_token_accuracy": 0.7798318127791087, "step": 5960 }, { "epoch": 2.5569439485897894, "grad_norm": 0.3046875, "learning_rate": 2.97857106765739e-05, "loss": 0.7706, "mean_token_accuracy": 0.7838211774826049, "step": 5970 }, { "epoch": 2.5612281328097106, "grad_norm": 0.326171875, "learning_rate": 2.9223272172897607e-05, "loss": 0.7815, "mean_token_accuracy": 0.7787001341581344, "step": 5980 }, { "epoch": 2.5655123170296323, "grad_norm": 0.33984375, "learning_rate": 2.866586483272507e-05, "loss": 0.7745, "mean_token_accuracy": 0.780657422542572, "step": 5990 }, { "epoch": 2.5697965012495536, "grad_norm": 0.294921875, "learning_rate": 2.8113501358515813e-05, "loss": 0.7755, "mean_token_accuracy": 0.7821098357439041, "step": 6000 }, { "epoch": 2.5740806854694753, "grad_norm": 0.345703125, "learning_rate": 2.7566194337787507e-05, "loss": 0.8095, "mean_token_accuracy": 0.7710189620653788, "step": 6010 }, { "epoch": 2.5783648696893966, "grad_norm": 0.3203125, "learning_rate": 2.7023956242828968e-05, "loss": 0.7781, "mean_token_accuracy": 0.7805722614129385, "step": 6020 }, { "epoch": 2.582649053909318, "grad_norm": 0.328125, "learning_rate": 2.6486799430415875e-05, "loss": 0.7774, "mean_token_accuracy": 0.7799272914727529, "step": 6030 }, { "epoch": 2.5869332381292396, "grad_norm": 0.333984375, "learning_rate": 2.595473614152932e-05, "loss": 0.7781, "mean_token_accuracy": 0.7814104864994685, "step": 6040 }, { "epoch": 2.591217422349161, "grad_norm": 0.33203125, "learning_rate": 2.5427778501076804e-05, "loss": 0.7885, "mean_token_accuracy": 0.7791467080513637, "step": 6050 }, { "epoch": 2.5955016065690826, "grad_norm": 0.291015625, "learning_rate": 2.490593851761591e-05, "loss": 0.7689, "mean_token_accuracy": 0.7836964478095373, "step": 6060 }, { "epoch": 2.599785790789004, "grad_norm": 0.322265625, "learning_rate": 2.4389228083080722e-05, "loss": 0.8215, "mean_token_accuracy": 0.7677028367916743, "step": 6070 }, { "epoch": 2.604069975008925, "grad_norm": 0.328125, "learning_rate": 2.387765897251057e-05, "loss": 0.7749, "mean_token_accuracy": 0.7838013221820196, "step": 6080 }, { "epoch": 2.608354159228847, "grad_norm": 0.314453125, "learning_rate": 2.3371242843782088e-05, "loss": 0.7725, "mean_token_accuracy": 0.7843282967805862, "step": 6090 }, { "epoch": 2.6126383434487686, "grad_norm": 0.328125, "learning_rate": 2.2869991237343207e-05, "loss": 0.797, "mean_token_accuracy": 0.7766178419192632, "step": 6100 }, { "epoch": 2.61692252766869, "grad_norm": 0.330078125, "learning_rate": 2.237391557595042e-05, "loss": 0.7642, "mean_token_accuracy": 0.7866159160931905, "step": 6110 }, { "epoch": 2.621206711888611, "grad_norm": 0.32421875, "learning_rate": 2.188302716440832e-05, "loss": 0.7788, "mean_token_accuracy": 0.7817911605040232, "step": 6120 }, { "epoch": 2.625490896108533, "grad_norm": 0.31640625, "learning_rate": 2.1397337189311915e-05, "loss": 0.7902, "mean_token_accuracy": 0.7791908890008926, "step": 6130 }, { "epoch": 2.629775080328454, "grad_norm": 0.33203125, "learning_rate": 2.091685671879187e-05, "loss": 0.7905, "mean_token_accuracy": 0.7808248698711395, "step": 6140 }, { "epoch": 2.634059264548376, "grad_norm": 0.314453125, "learning_rate": 2.044159670226245e-05, "loss": 0.7572, "mean_token_accuracy": 0.7873713413874308, "step": 6150 }, { "epoch": 2.638343448768297, "grad_norm": 0.333984375, "learning_rate": 1.9971567970171355e-05, "loss": 0.7987, "mean_token_accuracy": 0.7766677429278691, "step": 6160 }, { "epoch": 2.6426276329882183, "grad_norm": 0.298828125, "learning_rate": 1.95067812337536e-05, "loss": 0.7688, "mean_token_accuracy": 0.7839142779509226, "step": 6170 }, { "epoch": 2.64691181720814, "grad_norm": 0.330078125, "learning_rate": 1.9047247084787112e-05, "loss": 0.7602, "mean_token_accuracy": 0.7852412790060044, "step": 6180 }, { "epoch": 2.6511960014280613, "grad_norm": 0.341796875, "learning_rate": 1.8592975995351257e-05, "loss": 0.762, "mean_token_accuracy": 0.7860971083243687, "step": 6190 }, { "epoch": 2.655480185647983, "grad_norm": 0.306640625, "learning_rate": 1.81439783175886e-05, "loss": 0.7658, "mean_token_accuracy": 0.7843163589636485, "step": 6200 }, { "epoch": 2.6597643698679043, "grad_norm": 0.3125, "learning_rate": 1.7700264283468465e-05, "loss": 0.7793, "mean_token_accuracy": 0.7768728653589885, "step": 6210 }, { "epoch": 2.6640485540878256, "grad_norm": 0.322265625, "learning_rate": 1.7261844004554105e-05, "loss": 0.7862, "mean_token_accuracy": 0.7782338261604309, "step": 6220 }, { "epoch": 2.6683327383077473, "grad_norm": 0.341796875, "learning_rate": 1.6828727471772358e-05, "loss": 0.7873, "mean_token_accuracy": 0.7764876892169317, "step": 6230 }, { "epoch": 2.6726169225276686, "grad_norm": 0.31640625, "learning_rate": 1.6400924555185492e-05, "loss": 0.7774, "mean_token_accuracy": 0.7807328204313914, "step": 6240 }, { "epoch": 2.6769011067475903, "grad_norm": 0.310546875, "learning_rate": 1.5978445003766968e-05, "loss": 0.786, "mean_token_accuracy": 0.7790028562148412, "step": 6250 }, { "epoch": 2.6811852909675116, "grad_norm": 0.30859375, "learning_rate": 1.5561298445178617e-05, "loss": 0.7511, "mean_token_accuracy": 0.7876404513915379, "step": 6260 }, { "epoch": 2.685469475187433, "grad_norm": 0.337890625, "learning_rate": 1.5149494385551688e-05, "loss": 0.781, "mean_token_accuracy": 0.7786963661511739, "step": 6270 }, { "epoch": 2.6897536594073546, "grad_norm": 0.318359375, "learning_rate": 1.474304220927003e-05, "loss": 0.7837, "mean_token_accuracy": 0.780462098121643, "step": 6280 }, { "epoch": 2.694037843627276, "grad_norm": 0.318359375, "learning_rate": 1.4341951178756168e-05, "loss": 0.7817, "mean_token_accuracy": 0.7801312992970149, "step": 6290 }, { "epoch": 2.6983220278471975, "grad_norm": 0.3203125, "learning_rate": 1.3946230434260493e-05, "loss": 0.7837, "mean_token_accuracy": 0.7814756115277608, "step": 6300 }, { "epoch": 2.702606212067119, "grad_norm": 0.31640625, "learning_rate": 1.3555888993652732e-05, "loss": 0.7825, "mean_token_accuracy": 0.7780475705862046, "step": 6310 }, { "epoch": 2.70689039628704, "grad_norm": 0.333984375, "learning_rate": 1.3170935752216423e-05, "loss": 0.7856, "mean_token_accuracy": 0.7791298975547155, "step": 6320 }, { "epoch": 2.711174580506962, "grad_norm": 0.318359375, "learning_rate": 1.2791379482446407e-05, "loss": 0.7796, "mean_token_accuracy": 0.7799790789683659, "step": 6330 }, { "epoch": 2.7154587647268835, "grad_norm": 0.337890625, "learning_rate": 1.2417228833848798e-05, "loss": 0.7748, "mean_token_accuracy": 0.7825185318787893, "step": 6340 }, { "epoch": 2.719742948946805, "grad_norm": 0.326171875, "learning_rate": 1.2048492332743827e-05, "loss": 0.7963, "mean_token_accuracy": 0.7784641563892365, "step": 6350 }, { "epoch": 2.724027133166726, "grad_norm": 0.3203125, "learning_rate": 1.1685178382071698e-05, "loss": 0.7778, "mean_token_accuracy": 0.7813698927561442, "step": 6360 }, { "epoch": 2.728311317386648, "grad_norm": 0.291015625, "learning_rate": 1.1327295261200826e-05, "loss": 0.7605, "mean_token_accuracy": 0.7861085881789526, "step": 6370 }, { "epoch": 2.732595501606569, "grad_norm": 0.3125, "learning_rate": 1.0974851125739483e-05, "loss": 0.7647, "mean_token_accuracy": 0.7843359092871348, "step": 6380 }, { "epoch": 2.7368796858264908, "grad_norm": 0.318359375, "learning_rate": 1.0627854007349725e-05, "loss": 0.7771, "mean_token_accuracy": 0.7815760542949041, "step": 6390 }, { "epoch": 2.741163870046412, "grad_norm": 0.33203125, "learning_rate": 1.0286311813564487e-05, "loss": 0.7819, "mean_token_accuracy": 0.7804714103539785, "step": 6400 }, { "epoch": 2.7454480542663333, "grad_norm": 0.314453125, "learning_rate": 9.950232327607278e-06, "loss": 0.7887, "mean_token_accuracy": 0.7779040902853012, "step": 6410 }, { "epoch": 2.749732238486255, "grad_norm": 0.32421875, "learning_rate": 9.619623208214801e-06, "loss": 0.7795, "mean_token_accuracy": 0.7815526028474172, "step": 6420 }, { "epoch": 2.7540164227061763, "grad_norm": 0.314453125, "learning_rate": 9.29449198946264e-06, "loss": 0.7577, "mean_token_accuracy": 0.7885059833526611, "step": 6430 }, { "epoch": 2.758300606926098, "grad_norm": 0.330078125, "learning_rate": 8.974846080593262e-06, "loss": 0.7688, "mean_token_accuracy": 0.7825045188268026, "step": 6440 }, { "epoch": 2.7625847911460193, "grad_norm": 0.296875, "learning_rate": 8.66069276584741e-06, "loss": 0.7639, "mean_token_accuracy": 0.7843573103348415, "step": 6450 }, { "epoch": 2.7668689753659406, "grad_norm": 0.279296875, "learning_rate": 8.352039204298029e-06, "loss": 0.7457, "mean_token_accuracy": 0.7897063046693802, "step": 6460 }, { "epoch": 2.7711531595858623, "grad_norm": 0.318359375, "learning_rate": 8.048892429687066e-06, "loss": 0.7661, "mean_token_accuracy": 0.7825449128945668, "step": 6470 }, { "epoch": 2.7754373438057836, "grad_norm": 0.306640625, "learning_rate": 7.751259350265216e-06, "loss": 0.7502, "mean_token_accuracy": 0.7883096744616827, "step": 6480 }, { "epoch": 2.7797215280257053, "grad_norm": 0.31640625, "learning_rate": 7.459146748634516e-06, "loss": 0.7681, "mean_token_accuracy": 0.7826910416285197, "step": 6490 }, { "epoch": 2.7840057122456265, "grad_norm": 0.298828125, "learning_rate": 7.172561281593798e-06, "loss": 0.7842, "mean_token_accuracy": 0.7800470501184463, "step": 6500 }, { "epoch": 2.788289896465548, "grad_norm": 0.302734375, "learning_rate": 6.891509479986957e-06, "loss": 0.7857, "mean_token_accuracy": 0.7821651329596837, "step": 6510 }, { "epoch": 2.7925740806854695, "grad_norm": 0.330078125, "learning_rate": 6.615997748554148e-06, "loss": 0.7631, "mean_token_accuracy": 0.7860769232114156, "step": 6520 }, { "epoch": 2.796858264905391, "grad_norm": 0.318359375, "learning_rate": 6.346032365785709e-06, "loss": 0.8009, "mean_token_accuracy": 0.7765180408954621, "step": 6530 }, { "epoch": 2.8011424491253125, "grad_norm": 0.30078125, "learning_rate": 6.081619483779277e-06, "loss": 0.7809, "mean_token_accuracy": 0.7795830368995667, "step": 6540 }, { "epoch": 2.805426633345234, "grad_norm": 0.322265625, "learning_rate": 5.822765128099483e-06, "loss": 0.764, "mean_token_accuracy": 0.7845493823289871, "step": 6550 }, { "epoch": 2.809710817565155, "grad_norm": 0.3125, "learning_rate": 5.569475197640672e-06, "loss": 0.7747, "mean_token_accuracy": 0.780571989218394, "step": 6560 }, { "epoch": 2.8139950017850768, "grad_norm": 0.296875, "learning_rate": 5.321755464492456e-06, "loss": 0.7897, "mean_token_accuracy": 0.7762352069218953, "step": 6570 }, { "epoch": 2.818279186004998, "grad_norm": 0.296875, "learning_rate": 5.079611573808124e-06, "loss": 0.7919, "mean_token_accuracy": 0.7770307193199794, "step": 6580 }, { "epoch": 2.8225633702249198, "grad_norm": 0.314453125, "learning_rate": 4.843049043676079e-06, "loss": 0.7841, "mean_token_accuracy": 0.7784975161155064, "step": 6590 }, { "epoch": 2.826847554444841, "grad_norm": 0.34375, "learning_rate": 4.61207326499416e-06, "loss": 0.8001, "mean_token_accuracy": 0.7755351980527242, "step": 6600 }, { "epoch": 2.8311317386647623, "grad_norm": 0.3125, "learning_rate": 4.386689501346574e-06, "loss": 0.8, "mean_token_accuracy": 0.7723700026671092, "step": 6610 }, { "epoch": 2.835415922884684, "grad_norm": 0.306640625, "learning_rate": 4.166902888884105e-06, "loss": 0.7928, "mean_token_accuracy": 0.7796116421620051, "step": 6620 }, { "epoch": 2.8397001071046057, "grad_norm": 0.33203125, "learning_rate": 3.952718436207065e-06, "loss": 0.7813, "mean_token_accuracy": 0.7800139904022216, "step": 6630 }, { "epoch": 2.843984291324527, "grad_norm": 0.302734375, "learning_rate": 3.7441410242510796e-06, "loss": 0.7706, "mean_token_accuracy": 0.7841657598813375, "step": 6640 }, { "epoch": 2.8482684755444483, "grad_norm": 0.333984375, "learning_rate": 3.5411754061759614e-06, "loss": 0.7877, "mean_token_accuracy": 0.7804912636677425, "step": 6650 }, { "epoch": 2.85255265976437, "grad_norm": 0.306640625, "learning_rate": 3.3438262072572612e-06, "loss": 0.7821, "mean_token_accuracy": 0.7790777862071991, "step": 6660 }, { "epoch": 2.8568368439842913, "grad_norm": 0.4140625, "learning_rate": 3.1520979247810223e-06, "loss": 0.7877, "mean_token_accuracy": 0.7802624036868413, "step": 6670 }, { "epoch": 2.861121028204213, "grad_norm": 0.30859375, "learning_rate": 2.9659949279411404e-06, "loss": 0.7736, "mean_token_accuracy": 0.7815846174955368, "step": 6680 }, { "epoch": 2.8654052124241343, "grad_norm": 0.31640625, "learning_rate": 2.785521457739859e-06, "loss": 0.7598, "mean_token_accuracy": 0.7873433222373326, "step": 6690 }, { "epoch": 2.8696893966440555, "grad_norm": 0.294921875, "learning_rate": 2.610681626891237e-06, "loss": 0.773, "mean_token_accuracy": 0.7824377208948136, "step": 6700 }, { "epoch": 2.8739735808639773, "grad_norm": 0.298828125, "learning_rate": 2.4414794197272217e-06, "loss": 0.8005, "mean_token_accuracy": 0.7779265910387039, "step": 6710 }, { "epoch": 2.8782577650838985, "grad_norm": 0.322265625, "learning_rate": 2.277918692106973e-06, "loss": 0.7652, "mean_token_accuracy": 0.7827598293622334, "step": 6720 }, { "epoch": 2.8825419493038202, "grad_norm": 0.322265625, "learning_rate": 2.120003171328988e-06, "loss": 0.8107, "mean_token_accuracy": 0.7738327503204345, "step": 6730 }, { "epoch": 2.8868261335237415, "grad_norm": 0.3046875, "learning_rate": 1.9677364560460874e-06, "loss": 0.7964, "mean_token_accuracy": 0.7783457924922307, "step": 6740 }, { "epoch": 2.891110317743663, "grad_norm": 0.30859375, "learning_rate": 1.8211220161835629e-06, "loss": 0.7952, "mean_token_accuracy": 0.778786172469457, "step": 6750 }, { "epoch": 2.8953945019635845, "grad_norm": 0.33984375, "learning_rate": 1.6801631928599626e-06, "loss": 0.7956, "mean_token_accuracy": 0.7760281264781952, "step": 6760 }, { "epoch": 2.8996786861835058, "grad_norm": 0.306640625, "learning_rate": 1.5448631983109584e-06, "loss": 0.7765, "mean_token_accuracy": 0.7821375171343485, "step": 6770 }, { "epoch": 2.9039628704034275, "grad_norm": 0.333984375, "learning_rate": 1.41522511581621e-06, "loss": 0.7765, "mean_token_accuracy": 0.7809682806332906, "step": 6780 }, { "epoch": 2.9082470546233488, "grad_norm": 0.318359375, "learning_rate": 1.2912518996290866e-06, "loss": 0.7705, "mean_token_accuracy": 0.786265421907107, "step": 6790 }, { "epoch": 2.91253123884327, "grad_norm": 0.322265625, "learning_rate": 1.1729463749093338e-06, "loss": 0.7982, "mean_token_accuracy": 0.7765524844328563, "step": 6800 }, { "epoch": 2.9168154230631917, "grad_norm": 0.291015625, "learning_rate": 1.060311237658651e-06, "loss": 0.773, "mean_token_accuracy": 0.7809695929288865, "step": 6810 }, { "epoch": 2.921099607283113, "grad_norm": 0.302734375, "learning_rate": 9.533490546593248e-07, "loss": 0.774, "mean_token_accuracy": 0.7783287167549133, "step": 6820 }, { "epoch": 2.9253837915030347, "grad_norm": 0.314453125, "learning_rate": 8.520622634156927e-07, "loss": 0.8114, "mean_token_accuracy": 0.7732517321904501, "step": 6830 }, { "epoch": 2.929667975722956, "grad_norm": 0.30859375, "learning_rate": 7.564531720985756e-07, "loss": 0.7683, "mean_token_accuracy": 0.7860218505064647, "step": 6840 }, { "epoch": 2.9339521599428773, "grad_norm": 0.3125, "learning_rate": 6.665239594927929e-07, "loss": 0.7902, "mean_token_accuracy": 0.780942557255427, "step": 6850 }, { "epoch": 2.938236344162799, "grad_norm": 0.3203125, "learning_rate": 5.82276674947313e-07, "loss": 0.7628, "mean_token_accuracy": 0.7842933177947998, "step": 6860 }, { "epoch": 2.9425205283827207, "grad_norm": 0.302734375, "learning_rate": 5.037132383287624e-07, "loss": 0.7806, "mean_token_accuracy": 0.7809433400630951, "step": 6870 }, { "epoch": 2.946804712602642, "grad_norm": 0.326171875, "learning_rate": 4.308354399775172e-07, "loss": 0.7796, "mean_token_accuracy": 0.7829487164815266, "step": 6880 }, { "epoch": 2.9510888968225633, "grad_norm": 0.3125, "learning_rate": 3.636449406670128e-07, "loss": 0.7785, "mean_token_accuracy": 0.7833139419555664, "step": 6890 }, { "epoch": 2.955373081042485, "grad_norm": 0.3046875, "learning_rate": 3.021432715658023e-07, "loss": 0.776, "mean_token_accuracy": 0.7816599200169245, "step": 6900 }, { "epoch": 2.9596572652624062, "grad_norm": 0.31640625, "learning_rate": 2.4633183420275093e-07, "loss": 0.7895, "mean_token_accuracy": 0.7800428807735443, "step": 6910 }, { "epoch": 2.963941449482328, "grad_norm": 0.310546875, "learning_rate": 1.9621190043506155e-07, "loss": 0.7643, "mean_token_accuracy": 0.7820758432149887, "step": 6920 }, { "epoch": 2.9682256337022492, "grad_norm": 0.328125, "learning_rate": 1.5178461241918684e-07, "loss": 0.7962, "mean_token_accuracy": 0.7780212819576263, "step": 6930 }, { "epoch": 2.9725098179221705, "grad_norm": 0.33203125, "learning_rate": 1.1305098258504454e-07, "loss": 0.7694, "mean_token_accuracy": 0.7845108538866044, "step": 6940 }, { "epoch": 2.976794002142092, "grad_norm": 0.32421875, "learning_rate": 8.001189361273032e-08, "loss": 0.7709, "mean_token_accuracy": 0.7825704693794251, "step": 6950 }, { "epoch": 2.9810781863620135, "grad_norm": 0.33203125, "learning_rate": 5.266809841247833e-08, "loss": 0.7737, "mean_token_accuracy": 0.7819208929936091, "step": 6960 }, { "epoch": 2.985362370581935, "grad_norm": 0.310546875, "learning_rate": 3.1020220107480513e-08, "loss": 0.7593, "mean_token_accuracy": 0.7868289381265641, "step": 6970 }, { "epoch": 2.9896465548018565, "grad_norm": 0.31640625, "learning_rate": 1.506875201975899e-08, "loss": 0.8075, "mean_token_accuracy": 0.7752761413653692, "step": 6980 }, { "epoch": 2.9939307390217778, "grad_norm": 0.30859375, "learning_rate": 4.8140576588140415e-09, "loss": 0.7801, "mean_token_accuracy": 0.7793139110008875, "step": 6990 }, { "epoch": 2.9982149232416995, "grad_norm": 0.326171875, "learning_rate": 2.5637071346396034e-10, "loss": 0.7702, "mean_token_accuracy": 0.7825540274381637, "step": 7000 } ], "logging_steps": 10, "max_steps": 7002, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3733801424314171e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }