finetuned_starcoder2_3b_V1.1 / trainer_state.json
Willowclem's picture
checkpoint complet pour reprise
c348f2f verified
Invalid JSON: Unexpected token 'I', ..."ad_norm": Infinity, "... is not valid JSON
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.7623529411764705,
"eval_steps": 500,
"global_step": 1200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03137254901960784,
"grad_norm": 2.0088300704956055,
"learning_rate": 8.000000000000001e-07,
"loss": 3.282,
"mean_token_accuracy": 0.4480127369053662,
"num_tokens": 34003.0,
"step": 10
},
{
"epoch": 0.06274509803921569,
"grad_norm": 0.46773308515548706,
"learning_rate": 1.7000000000000002e-06,
"loss": 3.3452,
"mean_token_accuracy": 0.4279281569644809,
"num_tokens": 66834.0,
"step": 20
},
{
"epoch": 0.09411764705882353,
"grad_norm": 1.1083784103393555,
"learning_rate": 2.7000000000000004e-06,
"loss": 3.1795,
"mean_token_accuracy": 0.4369500808417797,
"num_tokens": 102094.0,
"step": 30
},
{
"epoch": 0.12549019607843137,
"grad_norm": 3.110588788986206,
"learning_rate": 3.7e-06,
"loss": 3.1706,
"mean_token_accuracy": 0.43956867372617126,
"num_tokens": 136916.0,
"step": 40
},
{
"epoch": 0.1568627450980392,
"grad_norm": 0.6114773750305176,
"learning_rate": 4.600000000000001e-06,
"loss": 3.2986,
"mean_token_accuracy": 0.4236688693985343,
"num_tokens": 166339.0,
"step": 50
},
{
"epoch": 0.18823529411764706,
"grad_norm": 1.4991090297698975,
"learning_rate": 5.600000000000001e-06,
"loss": 3.3758,
"mean_token_accuracy": 0.4320780340582132,
"num_tokens": 193757.0,
"step": 60
},
{
"epoch": 0.2196078431372549,
"grad_norm": 1.0190929174423218,
"learning_rate": 6.600000000000001e-06,
"loss": 3.5999,
"mean_token_accuracy": 0.4074632978066802,
"num_tokens": 227753.0,
"step": 70
},
{
"epoch": 0.25098039215686274,
"grad_norm": 0.5823692679405212,
"learning_rate": 7.600000000000001e-06,
"loss": 3.242,
"mean_token_accuracy": 0.4243007113225758,
"num_tokens": 258774.0,
"step": 80
},
{
"epoch": 0.2823529411764706,
"grad_norm": 1.197152018547058,
"learning_rate": 8.6e-06,
"loss": 3.7351,
"mean_token_accuracy": 0.40340174464508893,
"num_tokens": 289476.0,
"step": 90
},
{
"epoch": 0.3137254901960784,
"grad_norm": 1.116959810256958,
"learning_rate": 9.600000000000001e-06,
"loss": 3.4449,
"mean_token_accuracy": 0.42097287215292456,
"num_tokens": 319562.0,
"step": 100
},
{
"epoch": 0.34509803921568627,
"grad_norm": 2.1092543601989746,
"learning_rate": 9.948805460750855e-06,
"loss": 3.2034,
"mean_token_accuracy": 0.42690765811130404,
"num_tokens": 350950.0,
"step": 110
},
{
"epoch": 0.3764705882352941,
"grad_norm": 0.726530909538269,
"learning_rate": 9.863481228668942e-06,
"loss": 3.1113,
"mean_token_accuracy": 0.44094684603624046,
"num_tokens": 379819.0,
"step": 120
},
{
"epoch": 0.40784313725490196,
"grad_norm": 1.3136755228042603,
"learning_rate": 9.778156996587031e-06,
"loss": 3.1945,
"mean_token_accuracy": 0.448084157705307,
"num_tokens": 412785.0,
"step": 130
},
{
"epoch": 0.4392156862745098,
"grad_norm": 0.9245865941047668,
"learning_rate": 9.69283276450512e-06,
"loss": 3.0248,
"mean_token_accuracy": 0.4554275684058666,
"num_tokens": 442964.0,
"step": 140
},
{
"epoch": 0.47058823529411764,
"grad_norm": 4.568413257598877,
"learning_rate": 9.607508532423209e-06,
"loss": 3.0576,
"mean_token_accuracy": 0.45087954150512816,
"num_tokens": 473446.0,
"step": 150
},
{
"epoch": 0.5019607843137255,
"grad_norm": 7.357224464416504,
"learning_rate": 9.522184300341298e-06,
"loss": 3.195,
"mean_token_accuracy": 0.4267027805559337,
"num_tokens": 503608.0,
"step": 160
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.9659298658370972,
"learning_rate": 9.436860068259387e-06,
"loss": 3.1946,
"mean_token_accuracy": 0.4488052343018353,
"num_tokens": 533341.0,
"step": 170
},
{
"epoch": 0.5647058823529412,
"grad_norm": 1.9798550605773926,
"learning_rate": 9.351535836177476e-06,
"loss": 3.25,
"mean_token_accuracy": 0.4342062085866928,
"num_tokens": 563710.0,
"step": 180
},
{
"epoch": 0.596078431372549,
"grad_norm": 2.385053873062134,
"learning_rate": 9.266211604095564e-06,
"loss": 2.8966,
"mean_token_accuracy": 0.4620134405791759,
"num_tokens": 592080.0,
"step": 190
},
{
"epoch": 0.6274509803921569,
"grad_norm": 1.955040693283081,
"learning_rate": 9.180887372013653e-06,
"loss": 3.2465,
"mean_token_accuracy": 0.42782977214083073,
"num_tokens": 621337.0,
"step": 200
},
{
"epoch": 0.6588235294117647,
"grad_norm": 3.6970317363739014,
"learning_rate": 9.09556313993174e-06,
"loss": 3.1251,
"mean_token_accuracy": 0.44717809772118927,
"num_tokens": 646419.0,
"step": 210
},
{
"epoch": 0.6901960784313725,
"grad_norm": 2.0861480236053467,
"learning_rate": 9.01023890784983e-06,
"loss": 3.1319,
"mean_token_accuracy": 0.4380856929346919,
"num_tokens": 678845.0,
"step": 220
},
{
"epoch": 0.7215686274509804,
"grad_norm": 1.1843408346176147,
"learning_rate": 8.924914675767918e-06,
"loss": 3.0282,
"mean_token_accuracy": 0.4654800074175,
"num_tokens": 708108.0,
"step": 230
},
{
"epoch": 0.7529411764705882,
"grad_norm": 2.084069013595581,
"learning_rate": 8.839590443686009e-06,
"loss": 3.1245,
"mean_token_accuracy": 0.43198747336864474,
"num_tokens": 734439.0,
"step": 240
},
{
"epoch": 0.7843137254901961,
"grad_norm": 3.9663286209106445,
"learning_rate": 8.754266211604096e-06,
"loss": 2.8906,
"mean_token_accuracy": 0.45770675158128143,
"num_tokens": 763349.0,
"step": 250
},
{
"epoch": 0.8156862745098039,
"grad_norm": 2.0605413913726807,
"learning_rate": 8.668941979522185e-06,
"loss": 2.9757,
"mean_token_accuracy": 0.4534512896090746,
"num_tokens": 791592.0,
"step": 260
},
{
"epoch": 0.8470588235294118,
"grad_norm": 3.5317554473876953,
"learning_rate": 8.583617747440274e-06,
"loss": 2.8376,
"mean_token_accuracy": 0.4683062855154276,
"num_tokens": 825019.0,
"step": 270
},
{
"epoch": 0.8784313725490196,
"grad_norm": 3.9178497791290283,
"learning_rate": 8.498293515358363e-06,
"loss": 2.9376,
"mean_token_accuracy": 0.45492212250828745,
"num_tokens": 854288.0,
"step": 280
},
{
"epoch": 0.9098039215686274,
"grad_norm": 0.9526835680007935,
"learning_rate": 8.412969283276451e-06,
"loss": 2.8571,
"mean_token_accuracy": 0.46086471611633895,
"num_tokens": 884793.0,
"step": 290
},
{
"epoch": 0.9411764705882353,
"grad_norm": 3.918769598007202,
"learning_rate": 8.327645051194539e-06,
"loss": 2.7934,
"mean_token_accuracy": 0.4795181108638644,
"num_tokens": 915321.0,
"step": 300
},
{
"epoch": 0.9725490196078431,
"grad_norm": 3.45381760597229,
"learning_rate": 8.24232081911263e-06,
"loss": 2.8085,
"mean_token_accuracy": 0.4741422997787595,
"num_tokens": 946666.0,
"step": 310
},
{
"epoch": 1.0031372549019608,
"grad_norm": 2.1785495281219482,
"learning_rate": 8.156996587030718e-06,
"loss": 2.8618,
"mean_token_accuracy": 0.4749741800702535,
"num_tokens": 974017.0,
"step": 320
},
{
"epoch": 1.0345098039215685,
"grad_norm": 6.006409168243408,
"learning_rate": 8.071672354948807e-06,
"loss": 2.9078,
"mean_token_accuracy": 0.46515854969620707,
"num_tokens": 1004744.0,
"step": 330
},
{
"epoch": 1.0658823529411765,
"grad_norm": 1.7984623908996582,
"learning_rate": 7.986348122866894e-06,
"loss": 2.9124,
"mean_token_accuracy": 0.4585884911939502,
"num_tokens": 1033652.0,
"step": 340
},
{
"epoch": 1.0972549019607842,
"grad_norm": 2.510467052459717,
"learning_rate": 7.901023890784983e-06,
"loss": 2.8057,
"mean_token_accuracy": 0.4740089667029679,
"num_tokens": 1066035.0,
"step": 350
},
{
"epoch": 1.1286274509803922,
"grad_norm": 3.545011520385742,
"learning_rate": 7.815699658703072e-06,
"loss": 2.8801,
"mean_token_accuracy": 0.4632578143849969,
"num_tokens": 1092737.0,
"step": 360
},
{
"epoch": 1.16,
"grad_norm": 2.1517884731292725,
"learning_rate": 7.73037542662116e-06,
"loss": 2.7748,
"mean_token_accuracy": 0.47425267212092875,
"num_tokens": 1121228.0,
"step": 370
},
{
"epoch": 1.1913725490196079,
"grad_norm": 1.727739691734314,
"learning_rate": 7.64505119453925e-06,
"loss": 2.7721,
"mean_token_accuracy": 0.4736901242285967,
"num_tokens": 1152714.0,
"step": 380
},
{
"epoch": 1.2227450980392156,
"grad_norm": 2.197744131088257,
"learning_rate": 7.5597269624573385e-06,
"loss": 2.7644,
"mean_token_accuracy": 0.47409027721732855,
"num_tokens": 1184573.0,
"step": 390
},
{
"epoch": 1.2541176470588236,
"grad_norm": 3.178690195083618,
"learning_rate": 7.474402730375427e-06,
"loss": 2.6941,
"mean_token_accuracy": 0.48159148562699555,
"num_tokens": 1218513.0,
"step": 400
},
{
"epoch": 1.2854901960784313,
"grad_norm": 1.3430229425430298,
"learning_rate": 7.389078498293516e-06,
"loss": 2.5874,
"mean_token_accuracy": 0.49995266608893874,
"num_tokens": 1250333.0,
"step": 410
},
{
"epoch": 1.3168627450980392,
"grad_norm": 3.5784506797790527,
"learning_rate": 7.303754266211604e-06,
"loss": 2.5586,
"mean_token_accuracy": 0.5180117629468441,
"num_tokens": 1286668.0,
"step": 420
},
{
"epoch": 1.348235294117647,
"grad_norm": 31.7750186920166,
"learning_rate": 7.218430034129693e-06,
"loss": 2.6383,
"mean_token_accuracy": 0.48776071686297656,
"num_tokens": 1315580.0,
"step": 430
},
{
"epoch": 1.379607843137255,
"grad_norm": 2.4759323596954346,
"learning_rate": 7.133105802047782e-06,
"loss": 2.6451,
"mean_token_accuracy": 0.4944142198190093,
"num_tokens": 1347539.0,
"step": 440
},
{
"epoch": 1.4109803921568629,
"grad_norm": 1.7809475660324097,
"learning_rate": 7.047781569965872e-06,
"loss": 2.7221,
"mean_token_accuracy": 0.47517210952937605,
"num_tokens": 1377083.0,
"step": 450
},
{
"epoch": 1.4423529411764706,
"grad_norm": 1.1610660552978516,
"learning_rate": 6.96245733788396e-06,
"loss": 2.5579,
"mean_token_accuracy": 0.49381575733423233,
"num_tokens": 1408914.0,
"step": 460
},
{
"epoch": 1.4737254901960783,
"grad_norm": 4.139962673187256,
"learning_rate": 6.877133105802049e-06,
"loss": 2.9326,
"mean_token_accuracy": 0.45861218236386775,
"num_tokens": 1438118.0,
"step": 470
},
{
"epoch": 1.5050980392156863,
"grad_norm": 3.0993845462799072,
"learning_rate": 6.7918088737201375e-06,
"loss": 2.8458,
"mean_token_accuracy": 0.47443244988098743,
"num_tokens": 1467640.0,
"step": 480
},
{
"epoch": 1.5364705882352943,
"grad_norm": 1.291991949081421,
"learning_rate": 6.7064846416382255e-06,
"loss": 2.6781,
"mean_token_accuracy": 0.4779525174759328,
"num_tokens": 1495733.0,
"step": 490
},
{
"epoch": 1.567843137254902,
"grad_norm": 4.795923709869385,
"learning_rate": 6.621160409556314e-06,
"loss": 2.9197,
"mean_token_accuracy": 0.4680457916110754,
"num_tokens": 1525251.0,
"step": 500
},
{
"epoch": 1.5992156862745097,
"grad_norm": 1.3896703720092773,
"learning_rate": 6.535836177474402e-06,
"loss": 2.6147,
"mean_token_accuracy": 0.49835432767868043,
"num_tokens": 1554363.0,
"step": 510
},
{
"epoch": 1.6305882352941177,
"grad_norm": 1.1814641952514648,
"learning_rate": 6.450511945392492e-06,
"loss": 2.6656,
"mean_token_accuracy": 0.48573412485420703,
"num_tokens": 1581026.0,
"step": 520
},
{
"epoch": 1.6619607843137256,
"grad_norm": 1.8640310764312744,
"learning_rate": 6.365187713310581e-06,
"loss": 2.5826,
"mean_token_accuracy": 0.4969061462208629,
"num_tokens": 1611477.0,
"step": 530
},
{
"epoch": 1.6933333333333334,
"grad_norm": 4.471650123596191,
"learning_rate": 6.27986348122867e-06,
"loss": 2.6517,
"mean_token_accuracy": 0.4934783162549138,
"num_tokens": 1641681.0,
"step": 540
},
{
"epoch": 1.724705882352941,
"grad_norm": 3.423351526260376,
"learning_rate": 6.194539249146758e-06,
"loss": 2.6683,
"mean_token_accuracy": 0.48104359675198793,
"num_tokens": 1670996.0,
"step": 550
},
{
"epoch": 1.756078431372549,
"grad_norm": 1.9675357341766357,
"learning_rate": 6.109215017064847e-06,
"loss": 2.5381,
"mean_token_accuracy": 0.49859709180891515,
"num_tokens": 1702169.0,
"step": 560
},
{
"epoch": 1.787450980392157,
"grad_norm": 1.6399911642074585,
"learning_rate": 6.023890784982936e-06,
"loss": 2.5058,
"mean_token_accuracy": 0.5064322877675295,
"num_tokens": 1731408.0,
"step": 570
},
{
"epoch": 1.8188235294117647,
"grad_norm": 1.8453171253204346,
"learning_rate": 5.938566552901024e-06,
"loss": 2.6272,
"mean_token_accuracy": 0.4801918284967542,
"num_tokens": 1759204.0,
"step": 580
},
{
"epoch": 1.8501960784313725,
"grad_norm": 1.7112871408462524,
"learning_rate": 5.853242320819113e-06,
"loss": 2.4362,
"mean_token_accuracy": 0.512086040340364,
"num_tokens": 1789717.0,
"step": 590
},
{
"epoch": 1.8815686274509804,
"grad_norm": 3.174295663833618,
"learning_rate": 5.767918088737202e-06,
"loss": 2.5042,
"mean_token_accuracy": 0.5141274336725473,
"num_tokens": 1821803.0,
"step": 600
},
{
"epoch": 1.9129411764705884,
"grad_norm": 3.231480121612549,
"learning_rate": 5.682593856655291e-06,
"loss": 2.6359,
"mean_token_accuracy": 0.49160230327397586,
"num_tokens": 1853817.0,
"step": 610
},
{
"epoch": 1.944313725490196,
"grad_norm": 1.1881468296051025,
"learning_rate": 5.597269624573379e-06,
"loss": 2.4535,
"mean_token_accuracy": 0.5213793812319636,
"num_tokens": 1885929.0,
"step": 620
},
{
"epoch": 1.9756862745098038,
"grad_norm": 1.3049256801605225,
"learning_rate": 5.511945392491468e-06,
"loss": 2.5596,
"mean_token_accuracy": 0.5133258309215307,
"num_tokens": 1918060.0,
"step": 630
},
{
"epoch": 2.0062745098039216,
"grad_norm": 2.1421661376953125,
"learning_rate": 5.426621160409556e-06,
"loss": 2.4831,
"mean_token_accuracy": 0.5165034267000663,
"num_tokens": 1948420.0,
"step": 640
},
{
"epoch": 2.0376470588235294,
"grad_norm": 2.0425727367401123,
"learning_rate": 5.341296928327645e-06,
"loss": 2.3654,
"mean_token_accuracy": 0.5259943537414074,
"num_tokens": 1977715.0,
"step": 650
},
{
"epoch": 2.069019607843137,
"grad_norm": 4.167781352996826,
"learning_rate": 5.255972696245735e-06,
"loss": 2.3315,
"mean_token_accuracy": 0.5249333314597606,
"num_tokens": 2008534.0,
"step": 660
},
{
"epoch": 2.1003921568627453,
"grad_norm": 1.0092592239379883,
"learning_rate": 5.1706484641638235e-06,
"loss": 2.5238,
"mean_token_accuracy": 0.5057306325063109,
"num_tokens": 2039030.0,
"step": 670
},
{
"epoch": 2.131764705882353,
"grad_norm": 1.6947963237762451,
"learning_rate": 5.0853242320819115e-06,
"loss": 2.5809,
"mean_token_accuracy": 0.5050426244735717,
"num_tokens": 2068912.0,
"step": 680
},
{
"epoch": 2.1631372549019607,
"grad_norm": 1.5759137868881226,
"learning_rate": 5e-06,
"loss": 2.4439,
"mean_token_accuracy": 0.5173273866996169,
"num_tokens": 2101461.0,
"step": 690
},
{
"epoch": 2.1945098039215685,
"grad_norm": 1.685102939605713,
"learning_rate": 4.914675767918089e-06,
"loss": 2.4616,
"mean_token_accuracy": 0.5100228149443865,
"num_tokens": 2131232.0,
"step": 700
},
{
"epoch": 2.2258823529411766,
"grad_norm": 1.9910387992858887,
"learning_rate": 4.829351535836178e-06,
"loss": 2.3545,
"mean_token_accuracy": 0.5206725034862757,
"num_tokens": 2160460.0,
"step": 710
},
{
"epoch": 2.2572549019607844,
"grad_norm": 1.7385118007659912,
"learning_rate": 4.744027303754267e-06,
"loss": 2.521,
"mean_token_accuracy": 0.503148902207613,
"num_tokens": 2188175.0,
"step": 720
},
{
"epoch": 2.288627450980392,
"grad_norm": 5.597545623779297,
"learning_rate": 4.658703071672355e-06,
"loss": 2.467,
"mean_token_accuracy": 0.5022781057283282,
"num_tokens": 2218714.0,
"step": 730
},
{
"epoch": 2.32,
"grad_norm": 1.7059907913208008,
"learning_rate": 4.573378839590444e-06,
"loss": 2.4086,
"mean_token_accuracy": 0.504382885247469,
"num_tokens": 2249170.0,
"step": 740
},
{
"epoch": 2.351372549019608,
"grad_norm": 1.951714277267456,
"learning_rate": 4.488054607508533e-06,
"loss": 2.3236,
"mean_token_accuracy": 0.5256480574607849,
"num_tokens": 2280286.0,
"step": 750
},
{
"epoch": 2.3827450980392157,
"grad_norm": 1.0276103019714355,
"learning_rate": 4.402730375426622e-06,
"loss": 2.3727,
"mean_token_accuracy": 0.5266215573996306,
"num_tokens": 2311312.0,
"step": 760
},
{
"epoch": 2.4141176470588235,
"grad_norm": 2.829286813735962,
"learning_rate": 4.31740614334471e-06,
"loss": 2.5146,
"mean_token_accuracy": 0.5105616014450789,
"num_tokens": 2340935.0,
"step": 770
},
{
"epoch": 2.445490196078431,
"grad_norm": 3.0118846893310547,
"learning_rate": 4.232081911262799e-06,
"loss": 2.3505,
"mean_token_accuracy": 0.5210155340842902,
"num_tokens": 2370291.0,
"step": 780
},
{
"epoch": 2.4768627450980394,
"grad_norm": 1.9568514823913574,
"learning_rate": 4.1467576791808874e-06,
"loss": 2.3832,
"mean_token_accuracy": 0.5071445981040597,
"num_tokens": 2399843.0,
"step": 790
},
{
"epoch": 2.508235294117647,
"grad_norm": 1.8932603597640991,
"learning_rate": 4.061433447098976e-06,
"loss": 2.3508,
"mean_token_accuracy": 0.5251543965190649,
"num_tokens": 2428762.0,
"step": 800
},
{
"epoch": 2.539607843137255,
"grad_norm": 1.755767822265625,
"learning_rate": 3.976109215017065e-06,
"loss": 2.3532,
"mean_token_accuracy": 0.5324380807578564,
"num_tokens": 2458475.0,
"step": 810
},
{
"epoch": 2.5709803921568626,
"grad_norm": 2.4889233112335205,
"learning_rate": 3.890784982935154e-06,
"loss": 2.6067,
"mean_token_accuracy": 0.5031498618423939,
"num_tokens": 2489770.0,
"step": 820
},
{
"epoch": 2.6023529411764708,
"grad_norm": 4.700379371643066,
"learning_rate": 3.8054607508532425e-06,
"loss": 2.5566,
"mean_token_accuracy": 0.502924164570868,
"num_tokens": 2521156.0,
"step": 830
},
{
"epoch": 2.6337254901960785,
"grad_norm": 12.594019889831543,
"learning_rate": 3.7201365187713314e-06,
"loss": 2.1664,
"mean_token_accuracy": 0.5561403293162585,
"num_tokens": 2553903.0,
"step": 840
},
{
"epoch": 2.665098039215686,
"grad_norm": 5.380671977996826,
"learning_rate": 3.6348122866894202e-06,
"loss": 2.3804,
"mean_token_accuracy": 0.5276698149740696,
"num_tokens": 2583417.0,
"step": 850
},
{
"epoch": 2.696470588235294,
"grad_norm": 6.616447448730469,
"learning_rate": 3.5494880546075087e-06,
"loss": 2.4498,
"mean_token_accuracy": 0.5167227942496538,
"num_tokens": 2612099.0,
"step": 860
},
{
"epoch": 2.7278431372549017,
"grad_norm": 1.3597829341888428,
"learning_rate": 3.4641638225255976e-06,
"loss": 2.173,
"mean_token_accuracy": 0.5551321767270565,
"num_tokens": 2644692.0,
"step": 870
},
{
"epoch": 2.75921568627451,
"grad_norm": 2.5514867305755615,
"learning_rate": 3.378839590443686e-06,
"loss": 2.3411,
"mean_token_accuracy": 0.534308859705925,
"num_tokens": 2680221.0,
"step": 880
},
{
"epoch": 2.7905882352941176,
"grad_norm": 2.470513105392456,
"learning_rate": 3.2935153583617753e-06,
"loss": 2.3716,
"mean_token_accuracy": 0.5275221727788448,
"num_tokens": 2715613.0,
"step": 890
},
{
"epoch": 2.8219607843137258,
"grad_norm": 1.194263219833374,
"learning_rate": 3.2081911262798638e-06,
"loss": 2.3571,
"mean_token_accuracy": 0.5199422530829907,
"num_tokens": 2745234.0,
"step": 900
},
{
"epoch": 2.8533333333333335,
"grad_norm": null,
"learning_rate": 3.122866894197952e-06,
"loss": 2.4158,
"mean_token_accuracy": 0.5191751107573509,
"num_tokens": 2775161.0,
"step": 910
},
{
"epoch": 2.8847058823529412,
"grad_norm": 1.294569492340088,
"learning_rate": 3.046075085324232e-06,
"loss": 2.3558,
"mean_token_accuracy": 0.5214510016143322,
"num_tokens": 2805373.0,
"step": 920
},
{
"epoch": 2.916078431372549,
"grad_norm": 4.139784336090088,
"learning_rate": 2.9607508532423213e-06,
"loss": 2.3869,
"mean_token_accuracy": 0.5307831708341837,
"num_tokens": 2831957.0,
"step": 930
},
{
"epoch": 2.9474509803921567,
"grad_norm": 1.2397838830947876,
"learning_rate": 2.8754266211604098e-06,
"loss": 2.3455,
"mean_token_accuracy": 0.5367285626009106,
"num_tokens": 2862724.0,
"step": 940
},
{
"epoch": 2.978823529411765,
"grad_norm": 1.8458396196365356,
"learning_rate": 2.790102389078498e-06,
"loss": 2.3212,
"mean_token_accuracy": 0.540785015001893,
"num_tokens": 2895266.0,
"step": 950
},
{
"epoch": 3.0094117647058822,
"grad_norm": 2.0150907039642334,
"learning_rate": 2.7047781569965875e-06,
"loss": 2.3589,
"mean_token_accuracy": 0.5204295409031403,
"num_tokens": 2924126.0,
"step": 960
},
{
"epoch": 3.0407843137254904,
"grad_norm": 10.822606086730957,
"learning_rate": 2.619453924914676e-06,
"loss": 2.1408,
"mean_token_accuracy": 0.5493647336959839,
"num_tokens": 2956817.0,
"step": 970
},
{
"epoch": 3.072156862745098,
"grad_norm": 1.3175485134124756,
"learning_rate": 2.534129692832765e-06,
"loss": 2.3916,
"mean_token_accuracy": 0.5206685658544302,
"num_tokens": 2986467.0,
"step": 980
},
{
"epoch": 3.103529411764706,
"grad_norm": 1.7138490676879883,
"learning_rate": 2.4488054607508537e-06,
"loss": 2.3403,
"mean_token_accuracy": 0.5319944698363542,
"num_tokens": 3018127.0,
"step": 990
},
{
"epoch": 3.1349019607843136,
"grad_norm": 1.6033964157104492,
"learning_rate": 2.363481228668942e-06,
"loss": 2.2751,
"mean_token_accuracy": 0.5398386877030135,
"num_tokens": 3047280.0,
"step": 1000
},
{
"epoch": 3.1662745098039213,
"grad_norm": 7.103280544281006,
"learning_rate": 2.278156996587031e-06,
"loss": 2.3816,
"mean_token_accuracy": 0.5190372098237276,
"num_tokens": 3077137.0,
"step": 1010
},
{
"epoch": 3.1976470588235295,
"grad_norm": 2.4392924308776855,
"learning_rate": 2.1928327645051195e-06,
"loss": 2.3052,
"mean_token_accuracy": 0.5296947434544563,
"num_tokens": 3106067.0,
"step": 1020
},
{
"epoch": 3.2290196078431372,
"grad_norm": 1.4106686115264893,
"learning_rate": 2.1075085324232083e-06,
"loss": 2.3615,
"mean_token_accuracy": 0.525895349867642,
"num_tokens": 3136450.0,
"step": 1030
},
{
"epoch": 3.260392156862745,
"grad_norm": 3.269272565841675,
"learning_rate": 2.022184300341297e-06,
"loss": 2.3037,
"mean_token_accuracy": 0.5490067519247532,
"num_tokens": 3166808.0,
"step": 1040
},
{
"epoch": 3.291764705882353,
"grad_norm": 1.5100555419921875,
"learning_rate": 1.9368600682593857e-06,
"loss": 2.3014,
"mean_token_accuracy": 0.5390114476904273,
"num_tokens": 3197483.0,
"step": 1050
},
{
"epoch": 3.323137254901961,
"grad_norm": 1.4328869581222534,
"learning_rate": 1.8515358361774745e-06,
"loss": 2.2193,
"mean_token_accuracy": 0.5445488292723895,
"num_tokens": 3229662.0,
"step": 1060
},
{
"epoch": 3.3545098039215686,
"grad_norm": 0.9292280077934265,
"learning_rate": 1.7662116040955632e-06,
"loss": 2.1304,
"mean_token_accuracy": 0.5581423584371805,
"num_tokens": 3262175.0,
"step": 1070
},
{
"epoch": 3.3858823529411763,
"grad_norm": 2.55062198638916,
"learning_rate": 1.680887372013652e-06,
"loss": 2.4022,
"mean_token_accuracy": 0.5283184833824635,
"num_tokens": 3291239.0,
"step": 1080
},
{
"epoch": 3.417254901960784,
"grad_norm": 3.2028212547302246,
"learning_rate": 1.5955631399317405e-06,
"loss": 2.4047,
"mean_token_accuracy": 0.530560277402401,
"num_tokens": 3321636.0,
"step": 1090
},
{
"epoch": 3.4486274509803923,
"grad_norm": 1.1053611040115356,
"learning_rate": 1.5102389078498294e-06,
"loss": 2.0193,
"mean_token_accuracy": 0.5678496524691582,
"num_tokens": 3355839.0,
"step": 1100
},
{
"epoch": 3.48,
"grad_norm": 1.1278761625289917,
"learning_rate": 1.4249146757679183e-06,
"loss": 2.1899,
"mean_token_accuracy": 0.5349464191123843,
"num_tokens": 3390743.0,
"step": 1110
},
{
"epoch": 3.5113725490196077,
"grad_norm": 1.3680450916290283,
"learning_rate": 1.339590443686007e-06,
"loss": 2.3307,
"mean_token_accuracy": 0.5308054933324456,
"num_tokens": 3422911.0,
"step": 1120
},
{
"epoch": 3.542745098039216,
"grad_norm": 3.9734294414520264,
"learning_rate": 1.2542662116040958e-06,
"loss": 2.2857,
"mean_token_accuracy": 0.5387092420831323,
"num_tokens": 3453759.0,
"step": 1130
},
{
"epoch": 3.5741176470588236,
"grad_norm": 2.855978012084961,
"learning_rate": 1.1689419795221844e-06,
"loss": 2.2933,
"mean_token_accuracy": 0.5302057925611734,
"num_tokens": 3482976.0,
"step": 1140
},
{
"epoch": 3.6054901960784314,
"grad_norm": 2.837674617767334,
"learning_rate": 1.0836177474402731e-06,
"loss": 2.3656,
"mean_token_accuracy": 0.5338190544396639,
"num_tokens": 3512124.0,
"step": 1150
},
{
"epoch": 3.636862745098039,
"grad_norm": 1.6821599006652832,
"learning_rate": 9.982935153583618e-07,
"loss": 2.3696,
"mean_token_accuracy": 0.5232982926070691,
"num_tokens": 3539944.0,
"step": 1160
},
{
"epoch": 3.668235294117647,
"grad_norm": 8.743041038513184,
"learning_rate": 9.129692832764505e-07,
"loss": 2.3186,
"mean_token_accuracy": 0.5293452955782414,
"num_tokens": 3568686.0,
"step": 1170
},
{
"epoch": 3.699607843137255,
"grad_norm": 3.6034657955169678,
"learning_rate": 8.276450511945393e-07,
"loss": 2.474,
"mean_token_accuracy": 0.518931976519525,
"num_tokens": 3596306.0,
"step": 1180
},
{
"epoch": 3.7309803921568627,
"grad_norm": 1.2798527479171753,
"learning_rate": 7.42320819112628e-07,
"loss": 2.1739,
"mean_token_accuracy": 0.5471075214445591,
"num_tokens": 3625513.0,
"step": 1190
},
{
"epoch": 3.7623529411764705,
"grad_norm": 1.1355539560317993,
"learning_rate": 6.569965870307168e-07,
"loss": 2.2781,
"mean_token_accuracy": 0.5349656146019697,
"num_tokens": 3658136.0,
"step": 1200
}
],
"logging_steps": 10,
"max_steps": 1272,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.324879825159782e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}