{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.7623529411764705,
  "eval_steps": 500,
  "global_step": 1200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03137254901960784,
      "grad_norm": 2.0088300704956055,
      "learning_rate": 8.000000000000001e-07,
      "loss": 3.282,
      "mean_token_accuracy": 0.4480127369053662,
      "num_tokens": 34003.0,
      "step": 10
    },
    {
      "epoch": 0.06274509803921569,
      "grad_norm": 0.46773308515548706,
      "learning_rate": 1.7000000000000002e-06,
      "loss": 3.3452,
      "mean_token_accuracy": 0.4279281569644809,
      "num_tokens": 66834.0,
      "step": 20
    },
    {
      "epoch": 0.09411764705882353,
      "grad_norm": 1.1083784103393555,
      "learning_rate": 2.7000000000000004e-06,
      "loss": 3.1795,
      "mean_token_accuracy": 0.4369500808417797,
      "num_tokens": 102094.0,
      "step": 30
    },
    {
      "epoch": 0.12549019607843137,
      "grad_norm": 3.110588788986206,
      "learning_rate": 3.7e-06,
      "loss": 3.1706,
      "mean_token_accuracy": 0.43956867372617126,
      "num_tokens": 136916.0,
      "step": 40
    },
    {
      "epoch": 0.1568627450980392,
      "grad_norm": 0.6114773750305176,
      "learning_rate": 4.600000000000001e-06,
      "loss": 3.2986,
      "mean_token_accuracy": 0.4236688693985343,
      "num_tokens": 166339.0,
      "step": 50
    },
    {
      "epoch": 0.18823529411764706,
      "grad_norm": 1.4991090297698975,
      "learning_rate": 5.600000000000001e-06,
      "loss": 3.3758,
      "mean_token_accuracy": 0.4320780340582132,
      "num_tokens": 193757.0,
      "step": 60
    },
    {
      "epoch": 0.2196078431372549,
      "grad_norm": 1.0190929174423218,
      "learning_rate": 6.600000000000001e-06,
      "loss": 3.5999,
      "mean_token_accuracy": 0.4074632978066802,
      "num_tokens": 227753.0,
      "step": 70
    },
    {
      "epoch": 0.25098039215686274,
      "grad_norm": 0.5823692679405212,
      "learning_rate": 7.600000000000001e-06,
      "loss": 3.242,
      "mean_token_accuracy": 0.4243007113225758,
      "num_tokens": 258774.0,
      "step": 80
    },
    {
      "epoch": 0.2823529411764706,
      "grad_norm": 1.197152018547058,
      "learning_rate": 8.6e-06,
      "loss": 3.7351,
      "mean_token_accuracy": 0.40340174464508893,
      "num_tokens": 289476.0,
      "step": 90
    },
    {
      "epoch": 0.3137254901960784,
      "grad_norm": 1.116959810256958,
      "learning_rate": 9.600000000000001e-06,
      "loss": 3.4449,
      "mean_token_accuracy": 0.42097287215292456,
      "num_tokens": 319562.0,
      "step": 100
    },
    {
      "epoch": 0.34509803921568627,
      "grad_norm": 2.1092543601989746,
      "learning_rate": 9.948805460750855e-06,
      "loss": 3.2034,
      "mean_token_accuracy": 0.42690765811130404,
      "num_tokens": 350950.0,
      "step": 110
    },
    {
      "epoch": 0.3764705882352941,
      "grad_norm": 0.726530909538269,
      "learning_rate": 9.863481228668942e-06,
      "loss": 3.1113,
      "mean_token_accuracy": 0.44094684603624046,
      "num_tokens": 379819.0,
      "step": 120
    },
    {
      "epoch": 0.40784313725490196,
      "grad_norm": 1.3136755228042603,
      "learning_rate": 9.778156996587031e-06,
      "loss": 3.1945,
      "mean_token_accuracy": 0.448084157705307,
      "num_tokens": 412785.0,
      "step": 130
    },
    {
      "epoch": 0.4392156862745098,
      "grad_norm": 0.9245865941047668,
      "learning_rate": 9.69283276450512e-06,
      "loss": 3.0248,
      "mean_token_accuracy": 0.4554275684058666,
      "num_tokens": 442964.0,
      "step": 140
    },
    {
      "epoch": 0.47058823529411764,
      "grad_norm": 4.568413257598877,
      "learning_rate": 9.607508532423209e-06,
      "loss": 3.0576,
      "mean_token_accuracy": 0.45087954150512816,
      "num_tokens": 473446.0,
      "step": 150
    },
    {
      "epoch": 0.5019607843137255,
      "grad_norm": 7.357224464416504,
      "learning_rate": 9.522184300341298e-06,
      "loss": 3.195,
      "mean_token_accuracy": 0.4267027805559337,
      "num_tokens": 503608.0,
      "step": 160
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 0.9659298658370972,
      "learning_rate": 9.436860068259387e-06,
      "loss": 3.1946,
      "mean_token_accuracy": 0.4488052343018353,
      "num_tokens": 533341.0,
      "step": 170
    },
    {
      "epoch": 0.5647058823529412,
      "grad_norm": 1.9798550605773926,
      "learning_rate": 9.351535836177476e-06,
      "loss": 3.25,
      "mean_token_accuracy": 0.4342062085866928,
      "num_tokens": 563710.0,
      "step": 180
    },
    {
      "epoch": 0.596078431372549,
      "grad_norm": 2.385053873062134,
      "learning_rate": 9.266211604095564e-06,
      "loss": 2.8966,
      "mean_token_accuracy": 0.4620134405791759,
      "num_tokens": 592080.0,
      "step": 190
    },
    {
      "epoch": 0.6274509803921569,
      "grad_norm": 1.955040693283081,
      "learning_rate": 9.180887372013653e-06,
      "loss": 3.2465,
      "mean_token_accuracy": 0.42782977214083073,
      "num_tokens": 621337.0,
      "step": 200
    },
    {
      "epoch": 0.6588235294117647,
      "grad_norm": 3.6970317363739014,
      "learning_rate": 9.09556313993174e-06,
      "loss": 3.1251,
      "mean_token_accuracy": 0.44717809772118927,
      "num_tokens": 646419.0,
      "step": 210
    },
    {
      "epoch": 0.6901960784313725,
      "grad_norm": 2.0861480236053467,
      "learning_rate": 9.01023890784983e-06,
      "loss": 3.1319,
      "mean_token_accuracy": 0.4380856929346919,
      "num_tokens": 678845.0,
      "step": 220
    },
    {
      "epoch": 0.7215686274509804,
      "grad_norm": 1.1843408346176147,
      "learning_rate": 8.924914675767918e-06,
      "loss": 3.0282,
      "mean_token_accuracy": 0.4654800074175,
      "num_tokens": 708108.0,
      "step": 230
    },
    {
      "epoch": 0.7529411764705882,
      "grad_norm": 2.084069013595581,
      "learning_rate": 8.839590443686009e-06,
      "loss": 3.1245,
      "mean_token_accuracy": 0.43198747336864474,
      "num_tokens": 734439.0,
      "step": 240
    },
    {
      "epoch": 0.7843137254901961,
      "grad_norm": 3.9663286209106445,
      "learning_rate": 8.754266211604096e-06,
      "loss": 2.8906,
      "mean_token_accuracy": 0.45770675158128143,
      "num_tokens": 763349.0,
      "step": 250
    },
    {
      "epoch": 0.8156862745098039,
      "grad_norm": 2.0605413913726807,
      "learning_rate": 8.668941979522185e-06,
      "loss": 2.9757,
      "mean_token_accuracy": 0.4534512896090746,
      "num_tokens": 791592.0,
      "step": 260
    },
    {
      "epoch": 0.8470588235294118,
      "grad_norm": 3.5317554473876953,
      "learning_rate": 8.583617747440274e-06,
      "loss": 2.8376,
      "mean_token_accuracy": 0.4683062855154276,
      "num_tokens": 825019.0,
      "step": 270
    },
    {
      "epoch": 0.8784313725490196,
      "grad_norm": 3.9178497791290283,
      "learning_rate": 8.498293515358363e-06,
      "loss": 2.9376,
      "mean_token_accuracy": 0.45492212250828745,
      "num_tokens": 854288.0,
      "step": 280
    },
    {
      "epoch": 0.9098039215686274,
      "grad_norm": 0.9526835680007935,
      "learning_rate": 8.412969283276451e-06,
      "loss": 2.8571,
      "mean_token_accuracy": 0.46086471611633895,
      "num_tokens": 884793.0,
      "step": 290
    },
    {
      "epoch": 0.9411764705882353,
      "grad_norm": 3.918769598007202,
      "learning_rate": 8.327645051194539e-06,
      "loss": 2.7934,
      "mean_token_accuracy": 0.4795181108638644,
      "num_tokens": 915321.0,
      "step": 300
    },
    {
      "epoch": 0.9725490196078431,
      "grad_norm": 3.45381760597229,
      "learning_rate": 8.24232081911263e-06,
      "loss": 2.8085,
      "mean_token_accuracy": 0.4741422997787595,
      "num_tokens": 946666.0,
      "step": 310
    },
    {
      "epoch": 1.0031372549019608,
      "grad_norm": 2.1785495281219482,
      "learning_rate": 8.156996587030718e-06,
      "loss": 2.8618,
      "mean_token_accuracy": 0.4749741800702535,
      "num_tokens": 974017.0,
      "step": 320
    },
    {
      "epoch": 1.0345098039215685,
      "grad_norm": 6.006409168243408,
      "learning_rate": 8.071672354948807e-06,
      "loss": 2.9078,
      "mean_token_accuracy": 0.46515854969620707,
      "num_tokens": 1004744.0,
      "step": 330
    },
    {
      "epoch": 1.0658823529411765,
      "grad_norm": 1.7984623908996582,
      "learning_rate": 7.986348122866894e-06,
      "loss": 2.9124,
      "mean_token_accuracy": 0.4585884911939502,
      "num_tokens": 1033652.0,
      "step": 340
    },
    {
      "epoch": 1.0972549019607842,
      "grad_norm": 2.510467052459717,
      "learning_rate": 7.901023890784983e-06,
      "loss": 2.8057,
      "mean_token_accuracy": 0.4740089667029679,
      "num_tokens": 1066035.0,
      "step": 350
    },
    {
      "epoch": 1.1286274509803922,
      "grad_norm": 3.545011520385742,
      "learning_rate": 7.815699658703072e-06,
      "loss": 2.8801,
      "mean_token_accuracy": 0.4632578143849969,
      "num_tokens": 1092737.0,
      "step": 360
    },
    {
      "epoch": 1.16,
      "grad_norm": 2.1517884731292725,
      "learning_rate": 7.73037542662116e-06,
      "loss": 2.7748,
      "mean_token_accuracy": 0.47425267212092875,
      "num_tokens": 1121228.0,
      "step": 370
    },
    {
      "epoch": 1.1913725490196079,
      "grad_norm": 1.727739691734314,
      "learning_rate": 7.64505119453925e-06,
      "loss": 2.7721,
      "mean_token_accuracy": 0.4736901242285967,
      "num_tokens": 1152714.0,
      "step": 380
    },
    {
      "epoch": 1.2227450980392156,
      "grad_norm": 2.197744131088257,
      "learning_rate": 7.5597269624573385e-06,
      "loss": 2.7644,
      "mean_token_accuracy": 0.47409027721732855,
      "num_tokens": 1184573.0,
      "step": 390
    },
    {
      "epoch": 1.2541176470588236,
      "grad_norm": 3.178690195083618,
      "learning_rate": 7.474402730375427e-06,
      "loss": 2.6941,
      "mean_token_accuracy": 0.48159148562699555,
      "num_tokens": 1218513.0,
      "step": 400
    },
    {
      "epoch": 1.2854901960784313,
      "grad_norm": 1.3430229425430298,
      "learning_rate": 7.389078498293516e-06,
      "loss": 2.5874,
      "mean_token_accuracy": 0.49995266608893874,
      "num_tokens": 1250333.0,
      "step": 410
    },
    {
      "epoch": 1.3168627450980392,
      "grad_norm": 3.5784506797790527,
      "learning_rate": 7.303754266211604e-06,
      "loss": 2.5586,
      "mean_token_accuracy": 0.5180117629468441,
      "num_tokens": 1286668.0,
      "step": 420
    },
    {
      "epoch": 1.348235294117647,
      "grad_norm": 31.7750186920166,
      "learning_rate": 7.218430034129693e-06,
      "loss": 2.6383,
      "mean_token_accuracy": 0.48776071686297656,
      "num_tokens": 1315580.0,
      "step": 430
    },
    {
      "epoch": 1.379607843137255,
      "grad_norm": 2.4759323596954346,
      "learning_rate": 7.133105802047782e-06,
      "loss": 2.6451,
      "mean_token_accuracy": 0.4944142198190093,
      "num_tokens": 1347539.0,
      "step": 440
    },
    {
      "epoch": 1.4109803921568629,
      "grad_norm": 1.7809475660324097,
      "learning_rate": 7.047781569965872e-06,
      "loss": 2.7221,
      "mean_token_accuracy": 0.47517210952937605,
      "num_tokens": 1377083.0,
      "step": 450
    },
    {
      "epoch": 1.4423529411764706,
      "grad_norm": 1.1610660552978516,
      "learning_rate": 6.96245733788396e-06,
      "loss": 2.5579,
      "mean_token_accuracy": 0.49381575733423233,
      "num_tokens": 1408914.0,
      "step": 460
    },
    {
      "epoch": 1.4737254901960783,
      "grad_norm": 4.139962673187256,
      "learning_rate": 6.877133105802049e-06,
      "loss": 2.9326,
      "mean_token_accuracy": 0.45861218236386775,
      "num_tokens": 1438118.0,
      "step": 470
    },
    {
      "epoch": 1.5050980392156863,
      "grad_norm": 3.0993845462799072,
      "learning_rate": 6.7918088737201375e-06,
      "loss": 2.8458,
      "mean_token_accuracy": 0.47443244988098743,
      "num_tokens": 1467640.0,
      "step": 480
    },
    {
      "epoch": 1.5364705882352943,
      "grad_norm": 1.291991949081421,
      "learning_rate": 6.7064846416382255e-06,
      "loss": 2.6781,
      "mean_token_accuracy": 0.4779525174759328,
      "num_tokens": 1495733.0,
      "step": 490
    },
    {
      "epoch": 1.567843137254902,
      "grad_norm": 4.795923709869385,
      "learning_rate": 6.621160409556314e-06,
      "loss": 2.9197,
      "mean_token_accuracy": 0.4680457916110754,
      "num_tokens": 1525251.0,
      "step": 500
    },
    {
      "epoch": 1.5992156862745097,
      "grad_norm": 1.3896703720092773,
      "learning_rate": 6.535836177474402e-06,
      "loss": 2.6147,
      "mean_token_accuracy": 0.49835432767868043,
      "num_tokens": 1554363.0,
      "step": 510
    },
    {
      "epoch": 1.6305882352941177,
      "grad_norm": 1.1814641952514648,
      "learning_rate": 6.450511945392492e-06,
      "loss": 2.6656,
      "mean_token_accuracy": 0.48573412485420703,
      "num_tokens": 1581026.0,
      "step": 520
    },
    {
      "epoch": 1.6619607843137256,
      "grad_norm": 1.8640310764312744,
      "learning_rate": 6.365187713310581e-06,
      "loss": 2.5826,
      "mean_token_accuracy": 0.4969061462208629,
      "num_tokens": 1611477.0,
      "step": 530
    },
    {
      "epoch": 1.6933333333333334,
      "grad_norm": 4.471650123596191,
      "learning_rate": 6.27986348122867e-06,
      "loss": 2.6517,
      "mean_token_accuracy": 0.4934783162549138,
      "num_tokens": 1641681.0,
      "step": 540
    },
    {
      "epoch": 1.724705882352941,
      "grad_norm": 3.423351526260376,
      "learning_rate": 6.194539249146758e-06,
      "loss": 2.6683,
      "mean_token_accuracy": 0.48104359675198793,
      "num_tokens": 1670996.0,
      "step": 550
    },
    {
      "epoch": 1.756078431372549,
      "grad_norm": 1.9675357341766357,
      "learning_rate": 6.109215017064847e-06,
      "loss": 2.5381,
      "mean_token_accuracy": 0.49859709180891515,
      "num_tokens": 1702169.0,
      "step": 560
    },
    {
      "epoch": 1.787450980392157,
      "grad_norm": 1.6399911642074585,
      "learning_rate": 6.023890784982936e-06,
      "loss": 2.5058,
      "mean_token_accuracy": 0.5064322877675295,
      "num_tokens": 1731408.0,
      "step": 570
    },
    {
      "epoch": 1.8188235294117647,
      "grad_norm": 1.8453171253204346,
      "learning_rate": 5.938566552901024e-06,
      "loss": 2.6272,
      "mean_token_accuracy": 0.4801918284967542,
      "num_tokens": 1759204.0,
      "step": 580
    },
    {
      "epoch": 1.8501960784313725,
      "grad_norm": 1.7112871408462524,
      "learning_rate": 5.853242320819113e-06,
      "loss": 2.4362,
      "mean_token_accuracy": 0.512086040340364,
      "num_tokens": 1789717.0,
      "step": 590
    },
    {
      "epoch": 1.8815686274509804,
      "grad_norm": 3.174295663833618,
      "learning_rate": 5.767918088737202e-06,
      "loss": 2.5042,
      "mean_token_accuracy": 0.5141274336725473,
      "num_tokens": 1821803.0,
      "step": 600
    },
    {
      "epoch": 1.9129411764705884,
      "grad_norm": 3.231480121612549,
      "learning_rate": 5.682593856655291e-06,
      "loss": 2.6359,
      "mean_token_accuracy": 0.49160230327397586,
      "num_tokens": 1853817.0,
      "step": 610
    },
    {
      "epoch": 1.944313725490196,
      "grad_norm": 1.1881468296051025,
      "learning_rate": 5.597269624573379e-06,
      "loss": 2.4535,
      "mean_token_accuracy": 0.5213793812319636,
      "num_tokens": 1885929.0,
      "step": 620
    },
    {
      "epoch": 1.9756862745098038,
      "grad_norm": 1.3049256801605225,
      "learning_rate": 5.511945392491468e-06,
      "loss": 2.5596,
      "mean_token_accuracy": 0.5133258309215307,
      "num_tokens": 1918060.0,
      "step": 630
    },
    {
      "epoch": 2.0062745098039216,
      "grad_norm": 2.1421661376953125,
      "learning_rate": 5.426621160409556e-06,
      "loss": 2.4831,
      "mean_token_accuracy": 0.5165034267000663,
      "num_tokens": 1948420.0,
      "step": 640
    },
    {
      "epoch": 2.0376470588235294,
      "grad_norm": 2.0425727367401123,
      "learning_rate": 5.341296928327645e-06,
      "loss": 2.3654,
      "mean_token_accuracy": 0.5259943537414074,
      "num_tokens": 1977715.0,
      "step": 650
    },
    {
      "epoch": 2.069019607843137,
      "grad_norm": 4.167781352996826,
      "learning_rate": 5.255972696245735e-06,
      "loss": 2.3315,
      "mean_token_accuracy": 0.5249333314597606,
      "num_tokens": 2008534.0,
      "step": 660
    },
    {
      "epoch": 2.1003921568627453,
      "grad_norm": 1.0092592239379883,
      "learning_rate": 5.1706484641638235e-06,
      "loss": 2.5238,
      "mean_token_accuracy": 0.5057306325063109,
      "num_tokens": 2039030.0,
      "step": 670
    },
    {
      "epoch": 2.131764705882353,
      "grad_norm": 1.6947963237762451,
      "learning_rate": 5.0853242320819115e-06,
      "loss": 2.5809,
      "mean_token_accuracy": 0.5050426244735717,
      "num_tokens": 2068912.0,
      "step": 680
    },
    {
      "epoch": 2.1631372549019607,
      "grad_norm": 1.5759137868881226,
      "learning_rate": 5e-06,
      "loss": 2.4439,
      "mean_token_accuracy": 0.5173273866996169,
      "num_tokens": 2101461.0,
      "step": 690
    },
    {
      "epoch": 2.1945098039215685,
      "grad_norm": 1.685102939605713,
      "learning_rate": 4.914675767918089e-06,
      "loss": 2.4616,
      "mean_token_accuracy": 0.5100228149443865,
      "num_tokens": 2131232.0,
      "step": 700
    },
    {
      "epoch": 2.2258823529411766,
      "grad_norm": 1.9910387992858887,
      "learning_rate": 4.829351535836178e-06,
      "loss": 2.3545,
      "mean_token_accuracy": 0.5206725034862757,
      "num_tokens": 2160460.0,
      "step": 710
    },
    {
      "epoch": 2.2572549019607844,
      "grad_norm": 1.7385118007659912,
      "learning_rate": 4.744027303754267e-06,
      "loss": 2.521,
      "mean_token_accuracy": 0.503148902207613,
      "num_tokens": 2188175.0,
      "step": 720
    },
    {
      "epoch": 2.288627450980392,
      "grad_norm": 5.597545623779297,
      "learning_rate": 4.658703071672355e-06,
      "loss": 2.467,
      "mean_token_accuracy": 0.5022781057283282,
      "num_tokens": 2218714.0,
      "step": 730
    },
    {
      "epoch": 2.32,
      "grad_norm": 1.7059907913208008,
      "learning_rate": 4.573378839590444e-06,
      "loss": 2.4086,
      "mean_token_accuracy": 0.504382885247469,
      "num_tokens": 2249170.0,
      "step": 740
    },
    {
      "epoch": 2.351372549019608,
      "grad_norm": 1.951714277267456,
      "learning_rate": 4.488054607508533e-06,
      "loss": 2.3236,
      "mean_token_accuracy": 0.5256480574607849,
      "num_tokens": 2280286.0,
      "step": 750
    },
    {
      "epoch": 2.3827450980392157,
      "grad_norm": 1.0276103019714355,
      "learning_rate": 4.402730375426622e-06,
      "loss": 2.3727,
      "mean_token_accuracy": 0.5266215573996306,
      "num_tokens": 2311312.0,
      "step": 760
    },
    {
      "epoch": 2.4141176470588235,
      "grad_norm": 2.829286813735962,
      "learning_rate": 4.31740614334471e-06,
      "loss": 2.5146,
      "mean_token_accuracy": 0.5105616014450789,
      "num_tokens": 2340935.0,
      "step": 770
    },
    {
      "epoch": 2.445490196078431,
      "grad_norm": 3.0118846893310547,
      "learning_rate": 4.232081911262799e-06,
      "loss": 2.3505,
      "mean_token_accuracy": 0.5210155340842902,
      "num_tokens": 2370291.0,
      "step": 780
    },
    {
      "epoch": 2.4768627450980394,
      "grad_norm": 1.9568514823913574,
      "learning_rate": 4.1467576791808874e-06,
      "loss": 2.3832,
      "mean_token_accuracy": 0.5071445981040597,
      "num_tokens": 2399843.0,
      "step": 790
    },
    {
      "epoch": 2.508235294117647,
      "grad_norm": 1.8932603597640991,
      "learning_rate": 4.061433447098976e-06,
      "loss": 2.3508,
      "mean_token_accuracy": 0.5251543965190649,
      "num_tokens": 2428762.0,
      "step": 800
    },
    {
      "epoch": 2.539607843137255,
      "grad_norm": 1.755767822265625,
      "learning_rate": 3.976109215017065e-06,
      "loss": 2.3532,
      "mean_token_accuracy": 0.5324380807578564,
      "num_tokens": 2458475.0,
      "step": 810
    },
    {
      "epoch": 2.5709803921568626,
      "grad_norm": 2.4889233112335205,
      "learning_rate": 3.890784982935154e-06,
      "loss": 2.6067,
      "mean_token_accuracy": 0.5031498618423939,
      "num_tokens": 2489770.0,
      "step": 820
    },
    {
      "epoch": 2.6023529411764708,
      "grad_norm": 4.700379371643066,
      "learning_rate": 3.8054607508532425e-06,
      "loss": 2.5566,
      "mean_token_accuracy": 0.502924164570868,
      "num_tokens": 2521156.0,
      "step": 830
    },
    {
      "epoch": 2.6337254901960785,
      "grad_norm": 12.594019889831543,
      "learning_rate": 3.7201365187713314e-06,
      "loss": 2.1664,
      "mean_token_accuracy": 0.5561403293162585,
      "num_tokens": 2553903.0,
      "step": 840
    },
    {
      "epoch": 2.665098039215686,
      "grad_norm": 5.380671977996826,
      "learning_rate": 3.6348122866894202e-06,
      "loss": 2.3804,
      "mean_token_accuracy": 0.5276698149740696,
      "num_tokens": 2583417.0,
      "step": 850
    },
    {
      "epoch": 2.696470588235294,
      "grad_norm": 6.616447448730469,
      "learning_rate": 3.5494880546075087e-06,
      "loss": 2.4498,
      "mean_token_accuracy": 0.5167227942496538,
      "num_tokens": 2612099.0,
      "step": 860
    },
    {
      "epoch": 2.7278431372549017,
      "grad_norm": 1.3597829341888428,
      "learning_rate": 3.4641638225255976e-06,
      "loss": 2.173,
      "mean_token_accuracy": 0.5551321767270565,
      "num_tokens": 2644692.0,
      "step": 870
    },
    {
      "epoch": 2.75921568627451,
      "grad_norm": 2.5514867305755615,
      "learning_rate": 3.378839590443686e-06,
      "loss": 2.3411,
      "mean_token_accuracy": 0.534308859705925,
      "num_tokens": 2680221.0,
      "step": 880
    },
    {
      "epoch": 2.7905882352941176,
      "grad_norm": 2.470513105392456,
      "learning_rate": 3.2935153583617753e-06,
      "loss": 2.3716,
      "mean_token_accuracy": 0.5275221727788448,
      "num_tokens": 2715613.0,
      "step": 890
    },
    {
      "epoch": 2.8219607843137258,
      "grad_norm": 1.194263219833374,
      "learning_rate": 3.2081911262798638e-06,
      "loss": 2.3571,
      "mean_token_accuracy": 0.5199422530829907,
      "num_tokens": 2745234.0,
      "step": 900
    },
    {
      "epoch": 2.8533333333333335,
      "grad_norm": Infinity,
      "learning_rate": 3.122866894197952e-06,
      "loss": 2.4158,
      "mean_token_accuracy": 0.5191751107573509,
      "num_tokens": 2775161.0,
      "step": 910
    },
    {
      "epoch": 2.8847058823529412,
      "grad_norm": 1.294569492340088,
      "learning_rate": 3.046075085324232e-06,
      "loss": 2.3558,
      "mean_token_accuracy": 0.5214510016143322,
      "num_tokens": 2805373.0,
      "step": 920
    },
    {
      "epoch": 2.916078431372549,
      "grad_norm": 4.139784336090088,
      "learning_rate": 2.9607508532423213e-06,
      "loss": 2.3869,
      "mean_token_accuracy": 0.5307831708341837,
      "num_tokens": 2831957.0,
      "step": 930
    },
    {
      "epoch": 2.9474509803921567,
      "grad_norm": 1.2397838830947876,
      "learning_rate": 2.8754266211604098e-06,
      "loss": 2.3455,
      "mean_token_accuracy": 0.5367285626009106,
      "num_tokens": 2862724.0,
      "step": 940
    },
    {
      "epoch": 2.978823529411765,
      "grad_norm": 1.8458396196365356,
      "learning_rate": 2.790102389078498e-06,
      "loss": 2.3212,
      "mean_token_accuracy": 0.540785015001893,
      "num_tokens": 2895266.0,
      "step": 950
    },
    {
      "epoch": 3.0094117647058822,
      "grad_norm": 2.0150907039642334,
      "learning_rate": 2.7047781569965875e-06,
      "loss": 2.3589,
      "mean_token_accuracy": 0.5204295409031403,
      "num_tokens": 2924126.0,
      "step": 960
    },
    {
      "epoch": 3.0407843137254904,
      "grad_norm": 10.822606086730957,
      "learning_rate": 2.619453924914676e-06,
      "loss": 2.1408,
      "mean_token_accuracy": 0.5493647336959839,
      "num_tokens": 2956817.0,
      "step": 970
    },
    {
      "epoch": 3.072156862745098,
      "grad_norm": 1.3175485134124756,
      "learning_rate": 2.534129692832765e-06,
      "loss": 2.3916,
      "mean_token_accuracy": 0.5206685658544302,
      "num_tokens": 2986467.0,
      "step": 980
    },
    {
      "epoch": 3.103529411764706,
      "grad_norm": 1.7138490676879883,
      "learning_rate": 2.4488054607508537e-06,
      "loss": 2.3403,
      "mean_token_accuracy": 0.5319944698363542,
      "num_tokens": 3018127.0,
      "step": 990
    },
    {
      "epoch": 3.1349019607843136,
      "grad_norm": 1.6033964157104492,
      "learning_rate": 2.363481228668942e-06,
      "loss": 2.2751,
      "mean_token_accuracy": 0.5398386877030135,
      "num_tokens": 3047280.0,
      "step": 1000
    },
    {
      "epoch": 3.1662745098039213,
      "grad_norm": 7.103280544281006,
      "learning_rate": 2.278156996587031e-06,
      "loss": 2.3816,
      "mean_token_accuracy": 0.5190372098237276,
      "num_tokens": 3077137.0,
      "step": 1010
    },
    {
      "epoch": 3.1976470588235295,
      "grad_norm": 2.4392924308776855,
      "learning_rate": 2.1928327645051195e-06,
      "loss": 2.3052,
      "mean_token_accuracy": 0.5296947434544563,
      "num_tokens": 3106067.0,
      "step": 1020
    },
    {
      "epoch": 3.2290196078431372,
      "grad_norm": 1.4106686115264893,
      "learning_rate": 2.1075085324232083e-06,
      "loss": 2.3615,
      "mean_token_accuracy": 0.525895349867642,
      "num_tokens": 3136450.0,
      "step": 1030
    },
    {
      "epoch": 3.260392156862745,
      "grad_norm": 3.269272565841675,
      "learning_rate": 2.022184300341297e-06,
      "loss": 2.3037,
      "mean_token_accuracy": 0.5490067519247532,
      "num_tokens": 3166808.0,
      "step": 1040
    },
    {
      "epoch": 3.291764705882353,
      "grad_norm": 1.5100555419921875,
      "learning_rate": 1.9368600682593857e-06,
      "loss": 2.3014,
      "mean_token_accuracy": 0.5390114476904273,
      "num_tokens": 3197483.0,
      "step": 1050
    },
    {
      "epoch": 3.323137254901961,
      "grad_norm": 1.4328869581222534,
      "learning_rate": 1.8515358361774745e-06,
      "loss": 2.2193,
      "mean_token_accuracy": 0.5445488292723895,
      "num_tokens": 3229662.0,
      "step": 1060
    },
    {
      "epoch": 3.3545098039215686,
      "grad_norm": 0.9292280077934265,
      "learning_rate": 1.7662116040955632e-06,
      "loss": 2.1304,
      "mean_token_accuracy": 0.5581423584371805,
      "num_tokens": 3262175.0,
      "step": 1070
    },
    {
      "epoch": 3.3858823529411763,
      "grad_norm": 2.55062198638916,
      "learning_rate": 1.680887372013652e-06,
      "loss": 2.4022,
      "mean_token_accuracy": 0.5283184833824635,
      "num_tokens": 3291239.0,
      "step": 1080
    },
    {
      "epoch": 3.417254901960784,
      "grad_norm": 3.2028212547302246,
      "learning_rate": 1.5955631399317405e-06,
      "loss": 2.4047,
      "mean_token_accuracy": 0.530560277402401,
      "num_tokens": 3321636.0,
      "step": 1090
    },
    {
      "epoch": 3.4486274509803923,
      "grad_norm": 1.1053611040115356,
      "learning_rate": 1.5102389078498294e-06,
      "loss": 2.0193,
      "mean_token_accuracy": 0.5678496524691582,
      "num_tokens": 3355839.0,
      "step": 1100
    },
    {
      "epoch": 3.48,
      "grad_norm": 1.1278761625289917,
      "learning_rate": 1.4249146757679183e-06,
      "loss": 2.1899,
      "mean_token_accuracy": 0.5349464191123843,
      "num_tokens": 3390743.0,
      "step": 1110
    },
    {
      "epoch": 3.5113725490196077,
      "grad_norm": 1.3680450916290283,
      "learning_rate": 1.339590443686007e-06,
      "loss": 2.3307,
      "mean_token_accuracy": 0.5308054933324456,
      "num_tokens": 3422911.0,
      "step": 1120
    },
    {
      "epoch": 3.542745098039216,
      "grad_norm": 3.9734294414520264,
      "learning_rate": 1.2542662116040958e-06,
      "loss": 2.2857,
      "mean_token_accuracy": 0.5387092420831323,
      "num_tokens": 3453759.0,
      "step": 1130
    },
    {
      "epoch": 3.5741176470588236,
      "grad_norm": 2.855978012084961,
      "learning_rate": 1.1689419795221844e-06,
      "loss": 2.2933,
      "mean_token_accuracy": 0.5302057925611734,
      "num_tokens": 3482976.0,
      "step": 1140
    },
    {
      "epoch": 3.6054901960784314,
      "grad_norm": 2.837674617767334,
      "learning_rate": 1.0836177474402731e-06,
      "loss": 2.3656,
      "mean_token_accuracy": 0.5338190544396639,
      "num_tokens": 3512124.0,
      "step": 1150
    },
    {
      "epoch": 3.636862745098039,
      "grad_norm": 1.6821599006652832,
      "learning_rate": 9.982935153583618e-07,
      "loss": 2.3696,
      "mean_token_accuracy": 0.5232982926070691,
      "num_tokens": 3539944.0,
      "step": 1160
    },
    {
      "epoch": 3.668235294117647,
      "grad_norm": 8.743041038513184,
      "learning_rate": 9.129692832764505e-07,
      "loss": 2.3186,
      "mean_token_accuracy": 0.5293452955782414,
      "num_tokens": 3568686.0,
      "step": 1170
    },
    {
      "epoch": 3.699607843137255,
      "grad_norm": 3.6034657955169678,
      "learning_rate": 8.276450511945393e-07,
      "loss": 2.474,
      "mean_token_accuracy": 0.518931976519525,
      "num_tokens": 3596306.0,
      "step": 1180
    },
    {
      "epoch": 3.7309803921568627,
      "grad_norm": 1.2798527479171753,
      "learning_rate": 7.42320819112628e-07,
      "loss": 2.1739,
      "mean_token_accuracy": 0.5471075214445591,
      "num_tokens": 3625513.0,
      "step": 1190
    },
    {
      "epoch": 3.7623529411764705,
      "grad_norm": 1.1355539560317993,
      "learning_rate": 6.569965870307168e-07,
      "loss": 2.2781,
      "mean_token_accuracy": 0.5349656146019697,
      "num_tokens": 3658136.0,
      "step": 1200
    }
  ],
  "logging_steps": 10,
  "max_steps": 1272,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.324879825159782e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
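
Note that this file is not strict JSON: the `grad_norm` logged at step 910 is the bare token `Infinity`, which Python's `json` module emits and accepts by default but strict parsers (for example JavaScript's `JSON.parse`) reject with an "Unexpected token 'I'" error. Below is a minimal sketch of how one might load the file and inspect the logged metrics in Python; the checkpoint path is an assumption, so adjust it to wherever your Trainer wrote the state.

```python
import json
import math

# Minimal sketch: load the checkpoint's trainer_state.json and summarize
# the logged training curve. Python's json module parses the non-standard
# "Infinity" token as float("inf") by default, so no special handling is
# needed here, unlike in strict JSON parsers.
# Assumed path; point this at your own checkpoint directory.
with open("checkpoint-1200/trainer_state.json") as f:
    state = json.load(f)

logs = state["log_history"]
losses = [(entry["step"], entry["loss"]) for entry in logs if "loss" in entry]
# Flag any steps whose gradient norm is inf or NaN (here: step 910).
bad_grad_steps = [
    entry["step"] for entry in logs
    if "grad_norm" in entry and not math.isfinite(entry["grad_norm"])
]

print(f"logged steps: {len(losses)}")
print(f"first loss: {losses[0][1]}, last loss: {losses[-1][1]}")
print(f"steps with non-finite grad_norm: {bad_grad_steps}")
```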