{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.7623529411764705, "eval_steps": 500, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03137254901960784, "grad_norm": 2.0088300704956055, "learning_rate": 8.000000000000001e-07, "loss": 3.282, "mean_token_accuracy": 0.4480127369053662, "num_tokens": 34003.0, "step": 10 }, { "epoch": 0.06274509803921569, "grad_norm": 0.46773308515548706, "learning_rate": 1.7000000000000002e-06, "loss": 3.3452, "mean_token_accuracy": 0.4279281569644809, "num_tokens": 66834.0, "step": 20 }, { "epoch": 0.09411764705882353, "grad_norm": 1.1083784103393555, "learning_rate": 2.7000000000000004e-06, "loss": 3.1795, "mean_token_accuracy": 0.4369500808417797, "num_tokens": 102094.0, "step": 30 }, { "epoch": 0.12549019607843137, "grad_norm": 3.110588788986206, "learning_rate": 3.7e-06, "loss": 3.1706, "mean_token_accuracy": 0.43956867372617126, "num_tokens": 136916.0, "step": 40 }, { "epoch": 0.1568627450980392, "grad_norm": 0.6114773750305176, "learning_rate": 4.600000000000001e-06, "loss": 3.2986, "mean_token_accuracy": 0.4236688693985343, "num_tokens": 166339.0, "step": 50 }, { "epoch": 0.18823529411764706, "grad_norm": 1.4991090297698975, "learning_rate": 5.600000000000001e-06, "loss": 3.3758, "mean_token_accuracy": 0.4320780340582132, "num_tokens": 193757.0, "step": 60 }, { "epoch": 0.2196078431372549, "grad_norm": 1.0190929174423218, "learning_rate": 6.600000000000001e-06, "loss": 3.5999, "mean_token_accuracy": 0.4074632978066802, "num_tokens": 227753.0, "step": 70 }, { "epoch": 0.25098039215686274, "grad_norm": 0.5823692679405212, "learning_rate": 7.600000000000001e-06, "loss": 3.242, "mean_token_accuracy": 0.4243007113225758, "num_tokens": 258774.0, "step": 80 }, { "epoch": 0.2823529411764706, "grad_norm": 1.197152018547058, "learning_rate": 8.6e-06, "loss": 3.7351, "mean_token_accuracy": 0.40340174464508893, "num_tokens": 289476.0, "step": 90 }, { "epoch": 0.3137254901960784, "grad_norm": 1.116959810256958, "learning_rate": 9.600000000000001e-06, "loss": 3.4449, "mean_token_accuracy": 0.42097287215292456, "num_tokens": 319562.0, "step": 100 }, { "epoch": 0.34509803921568627, "grad_norm": 2.1092543601989746, "learning_rate": 9.948805460750855e-06, "loss": 3.2034, "mean_token_accuracy": 0.42690765811130404, "num_tokens": 350950.0, "step": 110 }, { "epoch": 0.3764705882352941, "grad_norm": 0.726530909538269, "learning_rate": 9.863481228668942e-06, "loss": 3.1113, "mean_token_accuracy": 0.44094684603624046, "num_tokens": 379819.0, "step": 120 }, { "epoch": 0.40784313725490196, "grad_norm": 1.3136755228042603, "learning_rate": 9.778156996587031e-06, "loss": 3.1945, "mean_token_accuracy": 0.448084157705307, "num_tokens": 412785.0, "step": 130 }, { "epoch": 0.4392156862745098, "grad_norm": 0.9245865941047668, "learning_rate": 9.69283276450512e-06, "loss": 3.0248, "mean_token_accuracy": 0.4554275684058666, "num_tokens": 442964.0, "step": 140 }, { "epoch": 0.47058823529411764, "grad_norm": 4.568413257598877, "learning_rate": 9.607508532423209e-06, "loss": 3.0576, "mean_token_accuracy": 0.45087954150512816, "num_tokens": 473446.0, "step": 150 }, { "epoch": 0.5019607843137255, "grad_norm": 7.357224464416504, "learning_rate": 9.522184300341298e-06, "loss": 3.195, "mean_token_accuracy": 0.4267027805559337, "num_tokens": 503608.0, "step": 160 }, { "epoch": 0.5333333333333333, "grad_norm": 0.9659298658370972, "learning_rate": 9.436860068259387e-06, "loss": 3.1946, "mean_token_accuracy": 0.4488052343018353, "num_tokens": 533341.0, "step": 170 }, { "epoch": 0.5647058823529412, "grad_norm": 1.9798550605773926, "learning_rate": 9.351535836177476e-06, "loss": 3.25, "mean_token_accuracy": 0.4342062085866928, "num_tokens": 563710.0, "step": 180 }, { "epoch": 0.596078431372549, "grad_norm": 2.385053873062134, "learning_rate": 9.266211604095564e-06, "loss": 2.8966, "mean_token_accuracy": 0.4620134405791759, "num_tokens": 592080.0, "step": 190 }, { "epoch": 0.6274509803921569, "grad_norm": 1.955040693283081, "learning_rate": 9.180887372013653e-06, "loss": 3.2465, "mean_token_accuracy": 0.42782977214083073, "num_tokens": 621337.0, "step": 200 }, { "epoch": 0.6588235294117647, "grad_norm": 3.6970317363739014, "learning_rate": 9.09556313993174e-06, "loss": 3.1251, "mean_token_accuracy": 0.44717809772118927, "num_tokens": 646419.0, "step": 210 }, { "epoch": 0.6901960784313725, "grad_norm": 2.0861480236053467, "learning_rate": 9.01023890784983e-06, "loss": 3.1319, "mean_token_accuracy": 0.4380856929346919, "num_tokens": 678845.0, "step": 220 }, { "epoch": 0.7215686274509804, "grad_norm": 1.1843408346176147, "learning_rate": 8.924914675767918e-06, "loss": 3.0282, "mean_token_accuracy": 0.4654800074175, "num_tokens": 708108.0, "step": 230 }, { "epoch": 0.7529411764705882, "grad_norm": 2.084069013595581, "learning_rate": 8.839590443686009e-06, "loss": 3.1245, "mean_token_accuracy": 0.43198747336864474, "num_tokens": 734439.0, "step": 240 }, { "epoch": 0.7843137254901961, "grad_norm": 3.9663286209106445, "learning_rate": 8.754266211604096e-06, "loss": 2.8906, "mean_token_accuracy": 0.45770675158128143, "num_tokens": 763349.0, "step": 250 }, { "epoch": 0.8156862745098039, "grad_norm": 2.0605413913726807, "learning_rate": 8.668941979522185e-06, "loss": 2.9757, "mean_token_accuracy": 0.4534512896090746, "num_tokens": 791592.0, "step": 260 }, { "epoch": 0.8470588235294118, "grad_norm": 3.5317554473876953, "learning_rate": 8.583617747440274e-06, "loss": 2.8376, "mean_token_accuracy": 0.4683062855154276, "num_tokens": 825019.0, "step": 270 }, { "epoch": 0.8784313725490196, "grad_norm": 3.9178497791290283, "learning_rate": 8.498293515358363e-06, "loss": 2.9376, "mean_token_accuracy": 0.45492212250828745, "num_tokens": 854288.0, "step": 280 }, { "epoch": 0.9098039215686274, "grad_norm": 0.9526835680007935, "learning_rate": 8.412969283276451e-06, "loss": 2.8571, "mean_token_accuracy": 0.46086471611633895, "num_tokens": 884793.0, "step": 290 }, { "epoch": 0.9411764705882353, "grad_norm": 3.918769598007202, "learning_rate": 8.327645051194539e-06, "loss": 2.7934, "mean_token_accuracy": 0.4795181108638644, "num_tokens": 915321.0, "step": 300 }, { "epoch": 0.9725490196078431, "grad_norm": 3.45381760597229, "learning_rate": 8.24232081911263e-06, "loss": 2.8085, "mean_token_accuracy": 0.4741422997787595, "num_tokens": 946666.0, "step": 310 }, { "epoch": 1.0031372549019608, "grad_norm": 2.1785495281219482, "learning_rate": 8.156996587030718e-06, "loss": 2.8618, "mean_token_accuracy": 0.4749741800702535, "num_tokens": 974017.0, "step": 320 }, { "epoch": 1.0345098039215685, "grad_norm": 6.006409168243408, "learning_rate": 8.071672354948807e-06, "loss": 2.9078, "mean_token_accuracy": 0.46515854969620707, "num_tokens": 1004744.0, "step": 330 }, { "epoch": 1.0658823529411765, "grad_norm": 1.7984623908996582, "learning_rate": 7.986348122866894e-06, "loss": 2.9124, "mean_token_accuracy": 0.4585884911939502, "num_tokens": 1033652.0, "step": 340 }, { "epoch": 1.0972549019607842, "grad_norm": 2.510467052459717, "learning_rate": 7.901023890784983e-06, "loss": 2.8057, "mean_token_accuracy": 0.4740089667029679, "num_tokens": 1066035.0, "step": 350 }, { "epoch": 1.1286274509803922, "grad_norm": 3.545011520385742, "learning_rate": 7.815699658703072e-06, "loss": 2.8801, "mean_token_accuracy": 0.4632578143849969, "num_tokens": 1092737.0, "step": 360 }, { "epoch": 1.16, "grad_norm": 2.1517884731292725, "learning_rate": 7.73037542662116e-06, "loss": 2.7748, "mean_token_accuracy": 0.47425267212092875, "num_tokens": 1121228.0, "step": 370 }, { "epoch": 1.1913725490196079, "grad_norm": 1.727739691734314, "learning_rate": 7.64505119453925e-06, "loss": 2.7721, "mean_token_accuracy": 0.4736901242285967, "num_tokens": 1152714.0, "step": 380 }, { "epoch": 1.2227450980392156, "grad_norm": 2.197744131088257, "learning_rate": 7.5597269624573385e-06, "loss": 2.7644, "mean_token_accuracy": 0.47409027721732855, "num_tokens": 1184573.0, "step": 390 }, { "epoch": 1.2541176470588236, "grad_norm": 3.178690195083618, "learning_rate": 7.474402730375427e-06, "loss": 2.6941, "mean_token_accuracy": 0.48159148562699555, "num_tokens": 1218513.0, "step": 400 }, { "epoch": 1.2854901960784313, "grad_norm": 1.3430229425430298, "learning_rate": 7.389078498293516e-06, "loss": 2.5874, "mean_token_accuracy": 0.49995266608893874, "num_tokens": 1250333.0, "step": 410 }, { "epoch": 1.3168627450980392, "grad_norm": 3.5784506797790527, "learning_rate": 7.303754266211604e-06, "loss": 2.5586, "mean_token_accuracy": 0.5180117629468441, "num_tokens": 1286668.0, "step": 420 }, { "epoch": 1.348235294117647, "grad_norm": 31.7750186920166, "learning_rate": 7.218430034129693e-06, "loss": 2.6383, "mean_token_accuracy": 0.48776071686297656, "num_tokens": 1315580.0, "step": 430 }, { "epoch": 1.379607843137255, "grad_norm": 2.4759323596954346, "learning_rate": 7.133105802047782e-06, "loss": 2.6451, "mean_token_accuracy": 0.4944142198190093, "num_tokens": 1347539.0, "step": 440 }, { "epoch": 1.4109803921568629, "grad_norm": 1.7809475660324097, "learning_rate": 7.047781569965872e-06, "loss": 2.7221, "mean_token_accuracy": 0.47517210952937605, "num_tokens": 1377083.0, "step": 450 }, { "epoch": 1.4423529411764706, "grad_norm": 1.1610660552978516, "learning_rate": 6.96245733788396e-06, "loss": 2.5579, "mean_token_accuracy": 0.49381575733423233, "num_tokens": 1408914.0, "step": 460 }, { "epoch": 1.4737254901960783, "grad_norm": 4.139962673187256, "learning_rate": 6.877133105802049e-06, "loss": 2.9326, "mean_token_accuracy": 0.45861218236386775, "num_tokens": 1438118.0, "step": 470 }, { "epoch": 1.5050980392156863, "grad_norm": 3.0993845462799072, "learning_rate": 6.7918088737201375e-06, "loss": 2.8458, "mean_token_accuracy": 0.47443244988098743, "num_tokens": 1467640.0, "step": 480 }, { "epoch": 1.5364705882352943, "grad_norm": 1.291991949081421, "learning_rate": 6.7064846416382255e-06, "loss": 2.6781, "mean_token_accuracy": 0.4779525174759328, "num_tokens": 1495733.0, "step": 490 }, { "epoch": 1.567843137254902, "grad_norm": 4.795923709869385, "learning_rate": 6.621160409556314e-06, "loss": 2.9197, "mean_token_accuracy": 0.4680457916110754, "num_tokens": 1525251.0, "step": 500 }, { "epoch": 1.5992156862745097, "grad_norm": 1.3896703720092773, "learning_rate": 6.535836177474402e-06, "loss": 2.6147, "mean_token_accuracy": 0.49835432767868043, "num_tokens": 1554363.0, "step": 510 }, { "epoch": 1.6305882352941177, "grad_norm": 1.1814641952514648, "learning_rate": 6.450511945392492e-06, "loss": 2.6656, "mean_token_accuracy": 0.48573412485420703, "num_tokens": 1581026.0, "step": 520 }, { "epoch": 1.6619607843137256, "grad_norm": 1.8640310764312744, "learning_rate": 6.365187713310581e-06, "loss": 2.5826, "mean_token_accuracy": 0.4969061462208629, "num_tokens": 1611477.0, "step": 530 }, { "epoch": 1.6933333333333334, "grad_norm": 4.471650123596191, "learning_rate": 6.27986348122867e-06, "loss": 2.6517, "mean_token_accuracy": 0.4934783162549138, "num_tokens": 1641681.0, "step": 540 }, { "epoch": 1.724705882352941, "grad_norm": 3.423351526260376, "learning_rate": 6.194539249146758e-06, "loss": 2.6683, "mean_token_accuracy": 0.48104359675198793, "num_tokens": 1670996.0, "step": 550 }, { "epoch": 1.756078431372549, "grad_norm": 1.9675357341766357, "learning_rate": 6.109215017064847e-06, "loss": 2.5381, "mean_token_accuracy": 0.49859709180891515, "num_tokens": 1702169.0, "step": 560 }, { "epoch": 1.787450980392157, "grad_norm": 1.6399911642074585, "learning_rate": 6.023890784982936e-06, "loss": 2.5058, "mean_token_accuracy": 0.5064322877675295, "num_tokens": 1731408.0, "step": 570 }, { "epoch": 1.8188235294117647, "grad_norm": 1.8453171253204346, "learning_rate": 5.938566552901024e-06, "loss": 2.6272, "mean_token_accuracy": 0.4801918284967542, "num_tokens": 1759204.0, "step": 580 }, { "epoch": 1.8501960784313725, "grad_norm": 1.7112871408462524, "learning_rate": 5.853242320819113e-06, "loss": 2.4362, "mean_token_accuracy": 0.512086040340364, "num_tokens": 1789717.0, "step": 590 }, { "epoch": 1.8815686274509804, "grad_norm": 3.174295663833618, "learning_rate": 5.767918088737202e-06, "loss": 2.5042, "mean_token_accuracy": 0.5141274336725473, "num_tokens": 1821803.0, "step": 600 }, { "epoch": 1.9129411764705884, "grad_norm": 3.231480121612549, "learning_rate": 5.682593856655291e-06, "loss": 2.6359, "mean_token_accuracy": 0.49160230327397586, "num_tokens": 1853817.0, "step": 610 }, { "epoch": 1.944313725490196, "grad_norm": 1.1881468296051025, "learning_rate": 5.597269624573379e-06, "loss": 2.4535, "mean_token_accuracy": 0.5213793812319636, "num_tokens": 1885929.0, "step": 620 }, { "epoch": 1.9756862745098038, "grad_norm": 1.3049256801605225, "learning_rate": 5.511945392491468e-06, "loss": 2.5596, "mean_token_accuracy": 0.5133258309215307, "num_tokens": 1918060.0, "step": 630 }, { "epoch": 2.0062745098039216, "grad_norm": 2.1421661376953125, "learning_rate": 5.426621160409556e-06, "loss": 2.4831, "mean_token_accuracy": 0.5165034267000663, "num_tokens": 1948420.0, "step": 640 }, { "epoch": 2.0376470588235294, "grad_norm": 2.0425727367401123, "learning_rate": 5.341296928327645e-06, "loss": 2.3654, "mean_token_accuracy": 0.5259943537414074, "num_tokens": 1977715.0, "step": 650 }, { "epoch": 2.069019607843137, "grad_norm": 4.167781352996826, "learning_rate": 5.255972696245735e-06, "loss": 2.3315, "mean_token_accuracy": 0.5249333314597606, "num_tokens": 2008534.0, "step": 660 }, { "epoch": 2.1003921568627453, "grad_norm": 1.0092592239379883, "learning_rate": 5.1706484641638235e-06, "loss": 2.5238, "mean_token_accuracy": 0.5057306325063109, "num_tokens": 2039030.0, "step": 670 }, { "epoch": 2.131764705882353, "grad_norm": 1.6947963237762451, "learning_rate": 5.0853242320819115e-06, "loss": 2.5809, "mean_token_accuracy": 0.5050426244735717, "num_tokens": 2068912.0, "step": 680 }, { "epoch": 2.1631372549019607, "grad_norm": 1.5759137868881226, "learning_rate": 5e-06, "loss": 2.4439, "mean_token_accuracy": 0.5173273866996169, "num_tokens": 2101461.0, "step": 690 }, { "epoch": 2.1945098039215685, "grad_norm": 1.685102939605713, "learning_rate": 4.914675767918089e-06, "loss": 2.4616, "mean_token_accuracy": 0.5100228149443865, "num_tokens": 2131232.0, "step": 700 }, { "epoch": 2.2258823529411766, "grad_norm": 1.9910387992858887, "learning_rate": 4.829351535836178e-06, "loss": 2.3545, "mean_token_accuracy": 0.5206725034862757, "num_tokens": 2160460.0, "step": 710 }, { "epoch": 2.2572549019607844, "grad_norm": 1.7385118007659912, "learning_rate": 4.744027303754267e-06, "loss": 2.521, "mean_token_accuracy": 0.503148902207613, "num_tokens": 2188175.0, "step": 720 }, { "epoch": 2.288627450980392, "grad_norm": 5.597545623779297, "learning_rate": 4.658703071672355e-06, "loss": 2.467, "mean_token_accuracy": 0.5022781057283282, "num_tokens": 2218714.0, "step": 730 }, { "epoch": 2.32, "grad_norm": 1.7059907913208008, "learning_rate": 4.573378839590444e-06, "loss": 2.4086, "mean_token_accuracy": 0.504382885247469, "num_tokens": 2249170.0, "step": 740 }, { "epoch": 2.351372549019608, "grad_norm": 1.951714277267456, "learning_rate": 4.488054607508533e-06, "loss": 2.3236, "mean_token_accuracy": 0.5256480574607849, "num_tokens": 2280286.0, "step": 750 }, { "epoch": 2.3827450980392157, "grad_norm": 1.0276103019714355, "learning_rate": 4.402730375426622e-06, "loss": 2.3727, "mean_token_accuracy": 0.5266215573996306, "num_tokens": 2311312.0, "step": 760 }, { "epoch": 2.4141176470588235, "grad_norm": 2.829286813735962, "learning_rate": 4.31740614334471e-06, "loss": 2.5146, "mean_token_accuracy": 0.5105616014450789, "num_tokens": 2340935.0, "step": 770 }, { "epoch": 2.445490196078431, "grad_norm": 3.0118846893310547, "learning_rate": 4.232081911262799e-06, "loss": 2.3505, "mean_token_accuracy": 0.5210155340842902, "num_tokens": 2370291.0, "step": 780 }, { "epoch": 2.4768627450980394, "grad_norm": 1.9568514823913574, "learning_rate": 4.1467576791808874e-06, "loss": 2.3832, "mean_token_accuracy": 0.5071445981040597, "num_tokens": 2399843.0, "step": 790 }, { "epoch": 2.508235294117647, "grad_norm": 1.8932603597640991, "learning_rate": 4.061433447098976e-06, "loss": 2.3508, "mean_token_accuracy": 0.5251543965190649, "num_tokens": 2428762.0, "step": 800 }, { "epoch": 2.539607843137255, "grad_norm": 1.755767822265625, "learning_rate": 3.976109215017065e-06, "loss": 2.3532, "mean_token_accuracy": 0.5324380807578564, "num_tokens": 2458475.0, "step": 810 }, { "epoch": 2.5709803921568626, "grad_norm": 2.4889233112335205, "learning_rate": 3.890784982935154e-06, "loss": 2.6067, "mean_token_accuracy": 0.5031498618423939, "num_tokens": 2489770.0, "step": 820 }, { "epoch": 2.6023529411764708, "grad_norm": 4.700379371643066, "learning_rate": 3.8054607508532425e-06, "loss": 2.5566, "mean_token_accuracy": 0.502924164570868, "num_tokens": 2521156.0, "step": 830 }, { "epoch": 2.6337254901960785, "grad_norm": 12.594019889831543, "learning_rate": 3.7201365187713314e-06, "loss": 2.1664, "mean_token_accuracy": 0.5561403293162585, "num_tokens": 2553903.0, "step": 840 }, { "epoch": 2.665098039215686, "grad_norm": 5.380671977996826, "learning_rate": 3.6348122866894202e-06, "loss": 2.3804, "mean_token_accuracy": 0.5276698149740696, "num_tokens": 2583417.0, "step": 850 }, { "epoch": 2.696470588235294, "grad_norm": 6.616447448730469, "learning_rate": 3.5494880546075087e-06, "loss": 2.4498, "mean_token_accuracy": 0.5167227942496538, "num_tokens": 2612099.0, "step": 860 }, { "epoch": 2.7278431372549017, "grad_norm": 1.3597829341888428, "learning_rate": 3.4641638225255976e-06, "loss": 2.173, "mean_token_accuracy": 0.5551321767270565, "num_tokens": 2644692.0, "step": 870 }, { "epoch": 2.75921568627451, "grad_norm": 2.5514867305755615, "learning_rate": 3.378839590443686e-06, "loss": 2.3411, "mean_token_accuracy": 0.534308859705925, "num_tokens": 2680221.0, "step": 880 }, { "epoch": 2.7905882352941176, "grad_norm": 2.470513105392456, "learning_rate": 3.2935153583617753e-06, "loss": 2.3716, "mean_token_accuracy": 0.5275221727788448, "num_tokens": 2715613.0, "step": 890 }, { "epoch": 2.8219607843137258, "grad_norm": 1.194263219833374, "learning_rate": 3.2081911262798638e-06, "loss": 2.3571, "mean_token_accuracy": 0.5199422530829907, "num_tokens": 2745234.0, "step": 900 }, { "epoch": 2.8533333333333335, "grad_norm": Infinity, "learning_rate": 3.122866894197952e-06, "loss": 2.4158, "mean_token_accuracy": 0.5191751107573509, "num_tokens": 2775161.0, "step": 910 }, { "epoch": 2.8847058823529412, "grad_norm": 1.294569492340088, "learning_rate": 3.046075085324232e-06, "loss": 2.3558, "mean_token_accuracy": 0.5214510016143322, "num_tokens": 2805373.0, "step": 920 }, { "epoch": 2.916078431372549, "grad_norm": 4.139784336090088, "learning_rate": 2.9607508532423213e-06, "loss": 2.3869, "mean_token_accuracy": 0.5307831708341837, "num_tokens": 2831957.0, "step": 930 }, { "epoch": 2.9474509803921567, "grad_norm": 1.2397838830947876, "learning_rate": 2.8754266211604098e-06, "loss": 2.3455, "mean_token_accuracy": 0.5367285626009106, "num_tokens": 2862724.0, "step": 940 }, { "epoch": 2.978823529411765, "grad_norm": 1.8458396196365356, "learning_rate": 2.790102389078498e-06, "loss": 2.3212, "mean_token_accuracy": 0.540785015001893, "num_tokens": 2895266.0, "step": 950 }, { "epoch": 3.0094117647058822, "grad_norm": 2.0150907039642334, "learning_rate": 2.7047781569965875e-06, "loss": 2.3589, "mean_token_accuracy": 0.5204295409031403, "num_tokens": 2924126.0, "step": 960 }, { "epoch": 3.0407843137254904, "grad_norm": 10.822606086730957, "learning_rate": 2.619453924914676e-06, "loss": 2.1408, "mean_token_accuracy": 0.5493647336959839, "num_tokens": 2956817.0, "step": 970 }, { "epoch": 3.072156862745098, "grad_norm": 1.3175485134124756, "learning_rate": 2.534129692832765e-06, "loss": 2.3916, "mean_token_accuracy": 0.5206685658544302, "num_tokens": 2986467.0, "step": 980 }, { "epoch": 3.103529411764706, "grad_norm": 1.7138490676879883, "learning_rate": 2.4488054607508537e-06, "loss": 2.3403, "mean_token_accuracy": 0.5319944698363542, "num_tokens": 3018127.0, "step": 990 }, { "epoch": 3.1349019607843136, "grad_norm": 1.6033964157104492, "learning_rate": 2.363481228668942e-06, "loss": 2.2751, "mean_token_accuracy": 0.5398386877030135, "num_tokens": 3047280.0, "step": 1000 }, { "epoch": 3.1662745098039213, "grad_norm": 7.103280544281006, "learning_rate": 2.278156996587031e-06, "loss": 2.3816, "mean_token_accuracy": 0.5190372098237276, "num_tokens": 3077137.0, "step": 1010 }, { "epoch": 3.1976470588235295, "grad_norm": 2.4392924308776855, "learning_rate": 2.1928327645051195e-06, "loss": 2.3052, "mean_token_accuracy": 0.5296947434544563, "num_tokens": 3106067.0, "step": 1020 }, { "epoch": 3.2290196078431372, "grad_norm": 1.4106686115264893, "learning_rate": 2.1075085324232083e-06, "loss": 2.3615, "mean_token_accuracy": 0.525895349867642, "num_tokens": 3136450.0, "step": 1030 }, { "epoch": 3.260392156862745, "grad_norm": 3.269272565841675, "learning_rate": 2.022184300341297e-06, "loss": 2.3037, "mean_token_accuracy": 0.5490067519247532, "num_tokens": 3166808.0, "step": 1040 }, { "epoch": 3.291764705882353, "grad_norm": 1.5100555419921875, "learning_rate": 1.9368600682593857e-06, "loss": 2.3014, "mean_token_accuracy": 0.5390114476904273, "num_tokens": 3197483.0, "step": 1050 }, { "epoch": 3.323137254901961, "grad_norm": 1.4328869581222534, "learning_rate": 1.8515358361774745e-06, "loss": 2.2193, "mean_token_accuracy": 0.5445488292723895, "num_tokens": 3229662.0, "step": 1060 }, { "epoch": 3.3545098039215686, "grad_norm": 0.9292280077934265, "learning_rate": 1.7662116040955632e-06, "loss": 2.1304, "mean_token_accuracy": 0.5581423584371805, "num_tokens": 3262175.0, "step": 1070 }, { "epoch": 3.3858823529411763, "grad_norm": 2.55062198638916, "learning_rate": 1.680887372013652e-06, "loss": 2.4022, "mean_token_accuracy": 0.5283184833824635, "num_tokens": 3291239.0, "step": 1080 }, { "epoch": 3.417254901960784, "grad_norm": 3.2028212547302246, "learning_rate": 1.5955631399317405e-06, "loss": 2.4047, "mean_token_accuracy": 0.530560277402401, "num_tokens": 3321636.0, "step": 1090 }, { "epoch": 3.4486274509803923, "grad_norm": 1.1053611040115356, "learning_rate": 1.5102389078498294e-06, "loss": 2.0193, "mean_token_accuracy": 0.5678496524691582, "num_tokens": 3355839.0, "step": 1100 }, { "epoch": 3.48, "grad_norm": 1.1278761625289917, "learning_rate": 1.4249146757679183e-06, "loss": 2.1899, "mean_token_accuracy": 0.5349464191123843, "num_tokens": 3390743.0, "step": 1110 }, { "epoch": 3.5113725490196077, "grad_norm": 1.3680450916290283, "learning_rate": 1.339590443686007e-06, "loss": 2.3307, "mean_token_accuracy": 0.5308054933324456, "num_tokens": 3422911.0, "step": 1120 }, { "epoch": 3.542745098039216, "grad_norm": 3.9734294414520264, "learning_rate": 1.2542662116040958e-06, "loss": 2.2857, "mean_token_accuracy": 0.5387092420831323, "num_tokens": 3453759.0, "step": 1130 }, { "epoch": 3.5741176470588236, "grad_norm": 2.855978012084961, "learning_rate": 1.1689419795221844e-06, "loss": 2.2933, "mean_token_accuracy": 0.5302057925611734, "num_tokens": 3482976.0, "step": 1140 }, { "epoch": 3.6054901960784314, "grad_norm": 2.837674617767334, "learning_rate": 1.0836177474402731e-06, "loss": 2.3656, "mean_token_accuracy": 0.5338190544396639, "num_tokens": 3512124.0, "step": 1150 }, { "epoch": 3.636862745098039, "grad_norm": 1.6821599006652832, "learning_rate": 9.982935153583618e-07, "loss": 2.3696, "mean_token_accuracy": 0.5232982926070691, "num_tokens": 3539944.0, "step": 1160 }, { "epoch": 3.668235294117647, "grad_norm": 8.743041038513184, "learning_rate": 9.129692832764505e-07, "loss": 2.3186, "mean_token_accuracy": 0.5293452955782414, "num_tokens": 3568686.0, "step": 1170 }, { "epoch": 3.699607843137255, "grad_norm": 3.6034657955169678, "learning_rate": 8.276450511945393e-07, "loss": 2.474, "mean_token_accuracy": 0.518931976519525, "num_tokens": 3596306.0, "step": 1180 }, { "epoch": 3.7309803921568627, "grad_norm": 1.2798527479171753, "learning_rate": 7.42320819112628e-07, "loss": 2.1739, "mean_token_accuracy": 0.5471075214445591, "num_tokens": 3625513.0, "step": 1190 }, { "epoch": 3.7623529411764705, "grad_norm": 1.1355539560317993, "learning_rate": 6.569965870307168e-07, "loss": 2.2781, "mean_token_accuracy": 0.5349656146019697, "num_tokens": 3658136.0, "step": 1200 } ], "logging_steps": 10, "max_steps": 1272, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.324879825159782e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }