lr2.0e-06_data-mix / trainer_state.json
Gabe-Thomp's picture
Model save
3a75c98 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 486,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.030959752321981424,
"grad_norm": 37.88094364827914,
"learning_rate": 1.6326530612244896e-07,
"loss": 2.012,
"mean_token_accuracy": 0.6721480131149292,
"num_tokens": 356347.0,
"step": 5
},
{
"epoch": 0.06191950464396285,
"grad_norm": 33.96281709708476,
"learning_rate": 3.673469387755102e-07,
"loss": 1.9687,
"mean_token_accuracy": 0.6716048419475555,
"num_tokens": 710036.0,
"step": 10
},
{
"epoch": 0.09287925696594428,
"grad_norm": 22.65005612491039,
"learning_rate": 5.714285714285714e-07,
"loss": 1.7907,
"mean_token_accuracy": 0.68106742699941,
"num_tokens": 1066084.0,
"step": 15
},
{
"epoch": 0.1238390092879257,
"grad_norm": 12.825394787025013,
"learning_rate": 7.755102040816326e-07,
"loss": 1.3465,
"mean_token_accuracy": 0.7263136367003123,
"num_tokens": 1418691.0,
"step": 20
},
{
"epoch": 0.15479876160990713,
"grad_norm": 4.82390687349429,
"learning_rate": 9.795918367346939e-07,
"loss": 1.105,
"mean_token_accuracy": 0.7517302592595418,
"num_tokens": 1773596.0,
"step": 25
},
{
"epoch": 0.18575851393188855,
"grad_norm": 2.138743079455233,
"learning_rate": 1.183673469387755e-06,
"loss": 0.9184,
"mean_token_accuracy": 0.7795046945412953,
"num_tokens": 2129005.0,
"step": 30
},
{
"epoch": 0.21671826625386997,
"grad_norm": 1.56843353818363,
"learning_rate": 1.3877551020408162e-06,
"loss": 0.8507,
"mean_token_accuracy": 0.7924608170986176,
"num_tokens": 2484075.0,
"step": 35
},
{
"epoch": 0.2476780185758514,
"grad_norm": 1.3036043605324696,
"learning_rate": 1.5918367346938775e-06,
"loss": 0.7988,
"mean_token_accuracy": 0.802043096224467,
"num_tokens": 2838384.0,
"step": 40
},
{
"epoch": 0.2786377708978328,
"grad_norm": 1.1700385198630212,
"learning_rate": 1.7959183673469386e-06,
"loss": 0.7936,
"mean_token_accuracy": 0.7999271392822266,
"num_tokens": 3192492.0,
"step": 45
},
{
"epoch": 0.30959752321981426,
"grad_norm": 1.1487890193406949,
"learning_rate": 2e-06,
"loss": 0.7533,
"mean_token_accuracy": 0.8103362222512563,
"num_tokens": 3545097.0,
"step": 50
},
{
"epoch": 0.34055727554179566,
"grad_norm": 1.162010071560409,
"learning_rate": 1.9993540481842407e-06,
"loss": 0.8066,
"mean_token_accuracy": 0.7923434535662334,
"num_tokens": 3903788.0,
"step": 55
},
{
"epoch": 0.3715170278637771,
"grad_norm": 1.2349108637093285,
"learning_rate": 1.9974170272444602e-06,
"loss": 0.7724,
"mean_token_accuracy": 0.8026398738225301,
"num_tokens": 4259188.0,
"step": 60
},
{
"epoch": 0.4024767801857585,
"grad_norm": 1.1955598435745196,
"learning_rate": 1.9941914396250445e-06,
"loss": 0.7695,
"mean_token_accuracy": 0.8047589619954427,
"num_tokens": 4613362.0,
"step": 65
},
{
"epoch": 0.43343653250773995,
"grad_norm": 1.0813062584413962,
"learning_rate": 1.9896814524743527e-06,
"loss": 0.7418,
"mean_token_accuracy": 0.8074904640515645,
"num_tokens": 4968926.0,
"step": 70
},
{
"epoch": 0.46439628482972134,
"grad_norm": 1.2490538133285465,
"learning_rate": 1.983892892261163e-06,
"loss": 0.782,
"mean_token_accuracy": 0.7980295320351919,
"num_tokens": 5326662.0,
"step": 75
},
{
"epoch": 0.4953560371517028,
"grad_norm": 1.1648094821501256,
"learning_rate": 1.9768332372474366e-06,
"loss": 0.781,
"mean_token_accuracy": 0.7981564621130626,
"num_tokens": 5683326.0,
"step": 80
},
{
"epoch": 0.5263157894736842,
"grad_norm": 1.153658574148309,
"learning_rate": 1.9685116078271223e-06,
"loss": 0.7208,
"mean_token_accuracy": 0.8101318061351777,
"num_tokens": 6037360.0,
"step": 85
},
{
"epoch": 0.5572755417956656,
"grad_norm": 1.0499709465738478,
"learning_rate": 1.958938754743489e-06,
"loss": 0.7171,
"mean_token_accuracy": 0.8120081921418508,
"num_tokens": 6391493.0,
"step": 90
},
{
"epoch": 0.5882352941176471,
"grad_norm": 1.1336114333093723,
"learning_rate": 1.9481270452001986e-06,
"loss": 0.7145,
"mean_token_accuracy": 0.8118303279081981,
"num_tokens": 6744880.0,
"step": 95
},
{
"epoch": 0.6191950464396285,
"grad_norm": 1.1623292661065792,
"learning_rate": 1.9360904468840735e-06,
"loss": 0.7779,
"mean_token_accuracy": 0.7967695931593577,
"num_tokens": 7104675.0,
"step": 100
},
{
"epoch": 0.6501547987616099,
"grad_norm": 1.1258828367181777,
"learning_rate": 1.92284450992019e-06,
"loss": 0.6596,
"mean_token_accuracy": 0.8252765933672587,
"num_tokens": 7456185.0,
"step": 105
},
{
"epoch": 0.6811145510835913,
"grad_norm": 1.0529210280076258,
"learning_rate": 1.9084063467826133e-06,
"loss": 0.7169,
"mean_token_accuracy": 0.807880413532257,
"num_tokens": 7810532.0,
"step": 110
},
{
"epoch": 0.7120743034055728,
"grad_norm": 0.9981313212919131,
"learning_rate": 1.8927946101867344e-06,
"loss": 0.7207,
"mean_token_accuracy": 0.8063897867997487,
"num_tokens": 8166048.0,
"step": 115
},
{
"epoch": 0.7430340557275542,
"grad_norm": 1.1009837013665933,
"learning_rate": 1.8760294689917554e-06,
"loss": 0.7037,
"mean_token_accuracy": 0.8074031293392181,
"num_tokens": 8523864.0,
"step": 120
},
{
"epoch": 0.7739938080495357,
"grad_norm": 1.0150983601738315,
"learning_rate": 1.858132582144469e-06,
"loss": 0.6768,
"mean_token_accuracy": 0.8155300041039785,
"num_tokens": 8875663.0,
"step": 125
},
{
"epoch": 0.804953560371517,
"grad_norm": 0.9926457956250347,
"learning_rate": 1.8391270706979861e-06,
"loss": 0.6913,
"mean_token_accuracy": 0.8103034933408101,
"num_tokens": 9230905.0,
"step": 130
},
{
"epoch": 0.8359133126934984,
"grad_norm": 1.0928972897431848,
"learning_rate": 1.819037487941563e-06,
"loss": 0.7202,
"mean_token_accuracy": 0.8045949776967366,
"num_tokens": 9587132.0,
"step": 135
},
{
"epoch": 0.8668730650154799,
"grad_norm": 1.0669887753572231,
"learning_rate": 1.7978897876801188e-06,
"loss": 0.7363,
"mean_token_accuracy": 0.8004952649275462,
"num_tokens": 9943019.0,
"step": 140
},
{
"epoch": 0.8978328173374613,
"grad_norm": 0.9764132937473873,
"learning_rate": 1.7757112907044198e-06,
"loss": 0.7099,
"mean_token_accuracy": 0.8056580940882365,
"num_tokens": 10299363.0,
"step": 145
},
{
"epoch": 0.9287925696594427,
"grad_norm": 0.9521500960876352,
"learning_rate": 1.7525306494952496e-06,
"loss": 0.6723,
"mean_token_accuracy": 0.8147999107837677,
"num_tokens": 10653205.0,
"step": 150
},
{
"epoch": 0.9597523219814241,
"grad_norm": 0.9014288610770413,
"learning_rate": 1.728377811207168e-06,
"loss": 0.6682,
"mean_token_accuracy": 0.8167306999365489,
"num_tokens": 11006084.0,
"step": 155
},
{
"epoch": 0.9907120743034056,
"grad_norm": 0.9887395976242098,
"learning_rate": 1.7032839789796709e-06,
"loss": 0.6729,
"mean_token_accuracy": 0.812424510717392,
"num_tokens": 11362461.0,
"step": 160
},
{
"epoch": 1.0185758513931888,
"grad_norm": 1.0123568191837427,
"learning_rate": 1.6772815716257411e-06,
"loss": 0.6889,
"mean_token_accuracy": 0.812437218648416,
"num_tokens": 11684418.0,
"step": 165
},
{
"epoch": 1.0495356037151702,
"grad_norm": 1.2247818299294482,
"learning_rate": 1.6504041817498676e-06,
"loss": 0.6003,
"mean_token_accuracy": 0.8325801193714142,
"num_tokens": 12038916.0,
"step": 170
},
{
"epoch": 1.0804953560371517,
"grad_norm": 1.1309870189119007,
"learning_rate": 1.622686532349637e-06,
"loss": 0.595,
"mean_token_accuracy": 0.8306204895178477,
"num_tokens": 12392491.0,
"step": 175
},
{
"epoch": 1.111455108359133,
"grad_norm": 1.0261193684060383,
"learning_rate": 1.5941644319569663e-06,
"loss": 0.5745,
"mean_token_accuracy": 0.8364119688669841,
"num_tokens": 12746211.0,
"step": 180
},
{
"epoch": 1.1424148606811146,
"grad_norm": 1.065208049531987,
"learning_rate": 1.5648747283769316e-06,
"loss": 0.6384,
"mean_token_accuracy": 0.821829471985499,
"num_tokens": 13105142.0,
"step": 185
},
{
"epoch": 1.173374613003096,
"grad_norm": 1.049060212665513,
"learning_rate": 1.5348552610839538e-06,
"loss": 0.583,
"mean_token_accuracy": 0.8339940627415975,
"num_tokens": 13459616.0,
"step": 190
},
{
"epoch": 1.2043343653250773,
"grad_norm": 1.0646992389936152,
"learning_rate": 1.5041448123368452e-06,
"loss": 0.585,
"mean_token_accuracy": 0.8339759588241578,
"num_tokens": 13813912.0,
"step": 195
},
{
"epoch": 1.2352941176470589,
"grad_norm": 1.0502983258770635,
"learning_rate": 1.4727830570758676e-06,
"loss": 0.6229,
"mean_token_accuracy": 0.8257229665915171,
"num_tokens": 14170920.0,
"step": 200
},
{
"epoch": 1.2662538699690402,
"grad_norm": 1.0418470874061867,
"learning_rate": 1.4408105116665333e-06,
"loss": 0.5806,
"mean_token_accuracy": 0.836287780602773,
"num_tokens": 14524500.0,
"step": 205
},
{
"epoch": 1.2972136222910216,
"grad_norm": 1.087294144738772,
"learning_rate": 1.4082684815563658e-06,
"loss": 0.5956,
"mean_token_accuracy": 0.8319136381149292,
"num_tokens": 14880512.0,
"step": 210
},
{
"epoch": 1.328173374613003,
"grad_norm": 1.101276572209567,
"learning_rate": 1.375199007912241e-06,
"loss": 0.6054,
"mean_token_accuracy": 0.8299936970074971,
"num_tokens": 15237315.0,
"step": 215
},
{
"epoch": 1.3591331269349844,
"grad_norm": 1.010589916838959,
"learning_rate": 1.3416448133072523e-06,
"loss": 0.5881,
"mean_token_accuracy": 0.8364007751146952,
"num_tokens": 15589797.0,
"step": 220
},
{
"epoch": 1.390092879256966,
"grad_norm": 1.0622699241655902,
"learning_rate": 1.307649246527263e-06,
"loss": 0.5995,
"mean_token_accuracy": 0.8305212179819743,
"num_tokens": 15946015.0,
"step": 225
},
{
"epoch": 1.4210526315789473,
"grad_norm": 1.1594145716237165,
"learning_rate": 1.273256226568451e-06,
"loss": 0.6167,
"mean_token_accuracy": 0.8282975077629089,
"num_tokens": 16300916.0,
"step": 230
},
{
"epoch": 1.4520123839009287,
"grad_norm": 1.051260396120812,
"learning_rate": 1.2385101858982004e-06,
"loss": 0.6137,
"mean_token_accuracy": 0.8268493433793386,
"num_tokens": 16658826.0,
"step": 235
},
{
"epoch": 1.4829721362229102,
"grad_norm": 1.1511715900211696,
"learning_rate": 1.203456013052634e-06,
"loss": 0.6359,
"mean_token_accuracy": 0.8252548217773438,
"num_tokens": 17016876.0,
"step": 240
},
{
"epoch": 1.5139318885448918,
"grad_norm": 1.0811656373656746,
"learning_rate": 1.1681389946449502e-06,
"loss": 0.5956,
"mean_token_accuracy": 0.8341775079568227,
"num_tokens": 17371949.0,
"step": 245
},
{
"epoch": 1.5448916408668731,
"grad_norm": 1.0456087268969014,
"learning_rate": 1.132604756859485e-06,
"loss": 0.5821,
"mean_token_accuracy": 0.836975779136022,
"num_tokens": 17724910.0,
"step": 250
},
{
"epoch": 1.5758513931888545,
"grad_norm": 1.1464877768864348,
"learning_rate": 1.0968992065070768e-06,
"loss": 0.6304,
"mean_token_accuracy": 0.8253893832365672,
"num_tokens": 18082483.0,
"step": 255
},
{
"epoch": 1.6068111455108358,
"grad_norm": 1.0740762703373092,
"learning_rate": 1.0610684717178905e-06,
"loss": 0.6069,
"mean_token_accuracy": 0.8298774818579356,
"num_tokens": 18438484.0,
"step": 260
},
{
"epoch": 1.6377708978328174,
"grad_norm": 1.058918476787192,
"learning_rate": 1.0251588423483204e-06,
"loss": 0.5919,
"mean_token_accuracy": 0.831935566663742,
"num_tokens": 18794196.0,
"step": 265
},
{
"epoch": 1.6687306501547987,
"grad_norm": 1.0594158390719453,
"learning_rate": 9.892167101789563e-07,
"loss": 0.5799,
"mean_token_accuracy": 0.8371186554431915,
"num_tokens": 19147172.0,
"step": 270
},
{
"epoch": 1.6996904024767803,
"grad_norm": 1.0350217488731246,
"learning_rate": 9.532885089808712e-07,
"loss": 0.5707,
"mean_token_accuracy": 0.8371800223986308,
"num_tokens": 19500395.0,
"step": 275
},
{
"epoch": 1.7306501547987616,
"grad_norm": 1.1043252167652924,
"learning_rate": 9.174206545276677e-07,
"loss": 0.5919,
"mean_token_accuracy": 0.833638709783554,
"num_tokens": 19854642.0,
"step": 280
},
{
"epoch": 1.761609907120743,
"grad_norm": 1.0594957855862892,
"learning_rate": 8.81659484630768e-07,
"loss": 0.6144,
"mean_token_accuracy": 0.8295779307683309,
"num_tokens": 20210201.0,
"step": 285
},
{
"epoch": 1.7925696594427245,
"grad_norm": 1.0655593034762212,
"learning_rate": 8.460511992754299e-07,
"loss": 0.6008,
"mean_token_accuracy": 0.8292633573214213,
"num_tokens": 20564970.0,
"step": 290
},
{
"epoch": 1.8235294117647058,
"grad_norm": 1.0925951219291423,
"learning_rate": 8.106418009348156e-07,
"loss": 0.5495,
"mean_token_accuracy": 0.8422843952973683,
"num_tokens": 20916569.0,
"step": 295
},
{
"epoch": 1.8544891640866874,
"grad_norm": 1.066475477830169,
"learning_rate": 7.75477035139231e-07,
"loss": 0.592,
"mean_token_accuracy": 0.8322900295257568,
"num_tokens": 21270517.0,
"step": 300
},
{
"epoch": 1.8854489164086687,
"grad_norm": 1.0726398714625474,
"learning_rate": 7.406023313773097e-07,
"loss": 0.5846,
"mean_token_accuracy": 0.833445531129837,
"num_tokens": 21626435.0,
"step": 305
},
{
"epoch": 1.91640866873065,
"grad_norm": 1.0081168980943895,
"learning_rate": 7.060627444054893e-07,
"loss": 0.599,
"mean_token_accuracy": 0.8309976756572723,
"num_tokens": 21981684.0,
"step": 310
},
{
"epoch": 1.9473684210526314,
"grad_norm": 1.1440591625457015,
"learning_rate": 6.719028960416098e-07,
"loss": 0.5952,
"mean_token_accuracy": 0.8320066591103872,
"num_tokens": 22335994.0,
"step": 315
},
{
"epoch": 1.978328173374613,
"grad_norm": 1.0647370303589478,
"learning_rate": 6.381669175178248e-07,
"loss": 0.5769,
"mean_token_accuracy": 0.8349888563156128,
"num_tokens": 22690758.0,
"step": 320
},
{
"epoch": 2.0061919504643964,
"grad_norm": 1.4685059022343072,
"learning_rate": 6.048983924673022e-07,
"loss": 0.6154,
"mean_token_accuracy": 0.8296286706571225,
"num_tokens": 23012996.0,
"step": 325
},
{
"epoch": 2.0371517027863777,
"grad_norm": 1.0756981234282057,
"learning_rate": 5.72140300618369e-07,
"loss": 0.548,
"mean_token_accuracy": 0.845786041021347,
"num_tokens": 23369489.0,
"step": 330
},
{
"epoch": 2.068111455108359,
"grad_norm": 1.22987487372924,
"learning_rate": 5.399349622688478e-07,
"loss": 0.5536,
"mean_token_accuracy": 0.8429702619711558,
"num_tokens": 23727840.0,
"step": 335
},
{
"epoch": 2.0990712074303404,
"grad_norm": 1.0621757442951574,
"learning_rate": 5.083239836123059e-07,
"loss": 0.5258,
"mean_token_accuracy": 0.8519696414470672,
"num_tokens": 24082355.0,
"step": 340
},
{
"epoch": 2.130030959752322,
"grad_norm": 1.1425718017062751,
"learning_rate": 4.773482029868656e-07,
"loss": 0.5293,
"mean_token_accuracy": 0.8492769340674082,
"num_tokens": 24438493.0,
"step": 345
},
{
"epoch": 2.1609907120743035,
"grad_norm": 1.1312756633475332,
"learning_rate": 4.4704763811600643e-07,
"loss": 0.5308,
"mean_token_accuracy": 0.8488172392050425,
"num_tokens": 24792322.0,
"step": 350
},
{
"epoch": 2.191950464396285,
"grad_norm": 1.2842356289182306,
"learning_rate": 4.174614344095213e-07,
"loss": 0.5655,
"mean_token_accuracy": 0.8417594293753307,
"num_tokens": 25151009.0,
"step": 355
},
{
"epoch": 2.222910216718266,
"grad_norm": 1.1702297440144753,
"learning_rate": 3.886278143914219e-07,
"loss": 0.5364,
"mean_token_accuracy": 0.8470952173074087,
"num_tokens": 25507520.0,
"step": 360
},
{
"epoch": 2.2538699690402475,
"grad_norm": 1.3759095012811366,
"learning_rate": 3.605840283201195e-07,
"loss": 0.5599,
"mean_token_accuracy": 0.8408537685871125,
"num_tokens": 25864215.0,
"step": 365
},
{
"epoch": 2.2848297213622293,
"grad_norm": 1.1193461931741189,
"learning_rate": 3.333663060646813e-07,
"loss": 0.49,
"mean_token_accuracy": 0.859304424126943,
"num_tokens": 26216877.0,
"step": 370
},
{
"epoch": 2.3157894736842106,
"grad_norm": 1.1648036361258658,
"learning_rate": 3.0700981029933016e-07,
"loss": 0.4931,
"mean_token_accuracy": 0.8578304747740427,
"num_tokens": 26573016.0,
"step": 375
},
{
"epoch": 2.346749226006192,
"grad_norm": 1.1405575574665878,
"learning_rate": 2.8154859107665987e-07,
"loss": 0.4917,
"mean_token_accuracy": 0.8590823928515117,
"num_tokens": 26925170.0,
"step": 380
},
{
"epoch": 2.3777089783281733,
"grad_norm": 1.1724816632075952,
"learning_rate": 2.5701554183824724e-07,
"loss": 0.5237,
"mean_token_accuracy": 0.8520345091819763,
"num_tokens": 27279422.0,
"step": 385
},
{
"epoch": 2.4086687306501546,
"grad_norm": 1.1357266729474,
"learning_rate": 2.3344235691949476e-07,
"loss": 0.4672,
"mean_token_accuracy": 0.8642761449019114,
"num_tokens": 27630575.0,
"step": 390
},
{
"epoch": 2.4396284829721364,
"grad_norm": 1.2409266497871647,
"learning_rate": 2.1085949060360653e-07,
"loss": 0.5369,
"mean_token_accuracy": 0.849452143907547,
"num_tokens": 27984763.0,
"step": 395
},
{
"epoch": 2.4705882352941178,
"grad_norm": 1.188481006905719,
"learning_rate": 1.8929611777758525e-07,
"loss": 0.5212,
"mean_token_accuracy": 0.8504625717798869,
"num_tokens": 28339838.0,
"step": 400
},
{
"epoch": 2.501547987616099,
"grad_norm": 1.1055493889153472,
"learning_rate": 1.6878009624109312e-07,
"loss": 0.5062,
"mean_token_accuracy": 0.8539404590924581,
"num_tokens": 28694134.0,
"step": 405
},
{
"epoch": 2.5325077399380804,
"grad_norm": 1.0702929024634484,
"learning_rate": 1.493379307168573e-07,
"loss": 0.5392,
"mean_token_accuracy": 0.8472303132216136,
"num_tokens": 29049115.0,
"step": 410
},
{
"epoch": 2.5634674922600618,
"grad_norm": 1.2498891293489098,
"learning_rate": 1.3099473860912325e-07,
"loss": 0.5218,
"mean_token_accuracy": 0.8514606674512227,
"num_tokens": 29404233.0,
"step": 415
},
{
"epoch": 2.594427244582043,
"grad_norm": 1.135112532376098,
"learning_rate": 1.1377421755438832e-07,
"loss": 0.5072,
"mean_token_accuracy": 0.8529640992482503,
"num_tokens": 29759031.0,
"step": 420
},
{
"epoch": 2.625386996904025,
"grad_norm": 1.228152423413386,
"learning_rate": 9.769861480633979e-08,
"loss": 0.5377,
"mean_token_accuracy": 0.8478512247403462,
"num_tokens": 30116017.0,
"step": 425
},
{
"epoch": 2.656346749226006,
"grad_norm": 1.1448683622452198,
"learning_rate": 8.278869849454717e-08,
"loss": 0.5106,
"mean_token_accuracy": 0.8535682797431946,
"num_tokens": 30469452.0,
"step": 430
},
{
"epoch": 2.6873065015479876,
"grad_norm": 1.1448594360250504,
"learning_rate": 6.906373079403849e-08,
"loss": 0.4842,
"mean_token_accuracy": 0.8605853617191315,
"num_tokens": 30822195.0,
"step": 435
},
{
"epoch": 2.718266253869969,
"grad_norm": 1.1485381285009257,
"learning_rate": 5.6541443040429295e-08,
"loss": 0.5117,
"mean_token_accuracy": 0.8541167537371318,
"num_tokens": 31177700.0,
"step": 440
},
{
"epoch": 2.7492260061919502,
"grad_norm": 1.1004929101776235,
"learning_rate": 4.523801282274286e-08,
"loss": 0.5198,
"mean_token_accuracy": 0.8509711424509684,
"num_tokens": 31534499.0,
"step": 445
},
{
"epoch": 2.780185758513932,
"grad_norm": 1.235343825874549,
"learning_rate": 3.5168043083526274e-08,
"loss": 0.5151,
"mean_token_accuracy": 0.8518358329931895,
"num_tokens": 31890705.0,
"step": 450
},
{
"epoch": 2.8111455108359134,
"grad_norm": 1.2048990404018094,
"learning_rate": 2.634454325325497e-08,
"loss": 0.5085,
"mean_token_accuracy": 0.8524168650309245,
"num_tokens": 32246784.0,
"step": 455
},
{
"epoch": 2.8421052631578947,
"grad_norm": 1.1330314224549871,
"learning_rate": 1.877891244340224e-08,
"loss": 0.5141,
"mean_token_accuracy": 0.853009025255839,
"num_tokens": 32601413.0,
"step": 460
},
{
"epoch": 2.873065015479876,
"grad_norm": 1.1251032096069928,
"learning_rate": 1.2480924719885932e-08,
"loss": 0.4948,
"mean_token_accuracy": 0.8585106293360393,
"num_tokens": 32955057.0,
"step": 465
},
{
"epoch": 2.9040247678018574,
"grad_norm": 1.3402807034322533,
"learning_rate": 7.45871647591756e-09,
"loss": 0.5186,
"mean_token_accuracy": 0.8510903239250183,
"num_tokens": 33310606.0,
"step": 470
},
{
"epoch": 2.934984520123839,
"grad_norm": 1.1610151530941533,
"learning_rate": 3.7187759205656864e-09,
"loss": 0.5312,
"mean_token_accuracy": 0.8494451999664306,
"num_tokens": 33664584.0,
"step": 475
},
{
"epoch": 2.9659442724458205,
"grad_norm": 1.2159889934118617,
"learning_rate": 1.2659346966152895e-09,
"loss": 0.5202,
"mean_token_accuracy": 0.8501146256923675,
"num_tokens": 34019473.0,
"step": 480
},
{
"epoch": 2.996904024767802,
"grad_norm": 1.1268577396776651,
"learning_rate": 1.0336163855129143e-10,
"loss": 0.524,
"mean_token_accuracy": 0.8500737905502319,
"num_tokens": 34375925.0,
"step": 485
},
{
"epoch": 3.0,
"mean_token_accuracy": 0.8452582756678263,
"num_tokens": 34411845.0,
"step": 486,
"total_flos": 131252961812480.0,
"train_loss": 0.6672706213998206,
"train_runtime": 11583.057,
"train_samples_per_second": 4.014,
"train_steps_per_second": 0.042
}
],
"logging_steps": 5,
"max_steps": 486,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 131252961812480.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}