DataMind-Analysis-Qwen2.5-14B / trainer_state.json
Yukirsh's picture
Upload folder using huggingface_hub
8090119 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9745042492917846,
"eval_steps": 500,
"global_step": 264,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0113314447592068,
"grad_norm": 23.53424926746809,
"learning_rate": 3.7037037037037036e-07,
"loss": 1.4032,
"step": 1
},
{
"epoch": 0.0226628895184136,
"grad_norm": 23.298712049647957,
"learning_rate": 7.407407407407407e-07,
"loss": 1.4235,
"step": 2
},
{
"epoch": 0.0339943342776204,
"grad_norm": 25.15418991437702,
"learning_rate": 1.111111111111111e-06,
"loss": 1.3918,
"step": 3
},
{
"epoch": 0.0453257790368272,
"grad_norm": 23.513705846381587,
"learning_rate": 1.4814814814814815e-06,
"loss": 1.3583,
"step": 4
},
{
"epoch": 0.056657223796033995,
"grad_norm": 18.732779423937743,
"learning_rate": 1.8518518518518519e-06,
"loss": 1.2546,
"step": 5
},
{
"epoch": 0.0679886685552408,
"grad_norm": 19.557080948103327,
"learning_rate": 2.222222222222222e-06,
"loss": 1.2998,
"step": 6
},
{
"epoch": 0.07932011331444759,
"grad_norm": 10.724009151468064,
"learning_rate": 2.5925925925925925e-06,
"loss": 1.2226,
"step": 7
},
{
"epoch": 0.0906515580736544,
"grad_norm": 4.775800267490731,
"learning_rate": 2.962962962962963e-06,
"loss": 1.0559,
"step": 8
},
{
"epoch": 0.10198300283286119,
"grad_norm": 3.94648433253879,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.0475,
"step": 9
},
{
"epoch": 0.11331444759206799,
"grad_norm": 3.2410802138408448,
"learning_rate": 3.7037037037037037e-06,
"loss": 0.9999,
"step": 10
},
{
"epoch": 0.12464589235127478,
"grad_norm": 3.409262196892178,
"learning_rate": 4.074074074074074e-06,
"loss": 0.9393,
"step": 11
},
{
"epoch": 0.1359773371104816,
"grad_norm": 2.7175044653926625,
"learning_rate": 4.444444444444444e-06,
"loss": 0.9487,
"step": 12
},
{
"epoch": 0.14730878186968838,
"grad_norm": 2.3048951821321078,
"learning_rate": 4.814814814814815e-06,
"loss": 0.9302,
"step": 13
},
{
"epoch": 0.15864022662889518,
"grad_norm": 2.172665075682734,
"learning_rate": 5.185185185185185e-06,
"loss": 0.9152,
"step": 14
},
{
"epoch": 0.16997167138810199,
"grad_norm": 2.1574147876066445,
"learning_rate": 5.555555555555557e-06,
"loss": 0.828,
"step": 15
},
{
"epoch": 0.1813031161473088,
"grad_norm": 2.097885575557383,
"learning_rate": 5.925925925925926e-06,
"loss": 0.8426,
"step": 16
},
{
"epoch": 0.19263456090651557,
"grad_norm": 1.7637510926108797,
"learning_rate": 6.296296296296297e-06,
"loss": 0.8096,
"step": 17
},
{
"epoch": 0.20396600566572237,
"grad_norm": 1.6562239272452715,
"learning_rate": 6.666666666666667e-06,
"loss": 0.838,
"step": 18
},
{
"epoch": 0.21529745042492918,
"grad_norm": 1.4205229302221682,
"learning_rate": 7.0370370370370375e-06,
"loss": 0.7763,
"step": 19
},
{
"epoch": 0.22662889518413598,
"grad_norm": 1.4262379616902559,
"learning_rate": 7.4074074074074075e-06,
"loss": 0.7698,
"step": 20
},
{
"epoch": 0.23796033994334279,
"grad_norm": 1.6494892959766825,
"learning_rate": 7.77777777777778e-06,
"loss": 0.7665,
"step": 21
},
{
"epoch": 0.24929178470254956,
"grad_norm": 1.4334685604983732,
"learning_rate": 8.148148148148148e-06,
"loss": 0.7822,
"step": 22
},
{
"epoch": 0.26062322946175637,
"grad_norm": 1.3849818905239097,
"learning_rate": 8.518518518518519e-06,
"loss": 0.7283,
"step": 23
},
{
"epoch": 0.2719546742209632,
"grad_norm": 1.341658865495544,
"learning_rate": 8.888888888888888e-06,
"loss": 0.6999,
"step": 24
},
{
"epoch": 0.28328611898017,
"grad_norm": 1.30493584130229,
"learning_rate": 9.25925925925926e-06,
"loss": 0.7367,
"step": 25
},
{
"epoch": 0.29461756373937675,
"grad_norm": 1.400982166207809,
"learning_rate": 9.62962962962963e-06,
"loss": 0.7212,
"step": 26
},
{
"epoch": 0.3059490084985836,
"grad_norm": 1.2376220091039114,
"learning_rate": 1e-05,
"loss": 0.7211,
"step": 27
},
{
"epoch": 0.31728045325779036,
"grad_norm": 1.3155466485701666,
"learning_rate": 9.999560724782173e-06,
"loss": 0.7194,
"step": 28
},
{
"epoch": 0.3286118980169972,
"grad_norm": 1.1452371029975463,
"learning_rate": 9.998242976313777e-06,
"loss": 0.7205,
"step": 29
},
{
"epoch": 0.33994334277620397,
"grad_norm": 1.166971440865611,
"learning_rate": 9.99604698613651e-06,
"loss": 0.7097,
"step": 30
},
{
"epoch": 0.35127478753541075,
"grad_norm": 1.3261901880488491,
"learning_rate": 9.992973140107998e-06,
"loss": 0.6974,
"step": 31
},
{
"epoch": 0.3626062322946176,
"grad_norm": 1.2464894375774034,
"learning_rate": 9.989021978333996e-06,
"loss": 0.7082,
"step": 32
},
{
"epoch": 0.37393767705382436,
"grad_norm": 1.1355726846388239,
"learning_rate": 9.98419419507348e-06,
"loss": 0.6734,
"step": 33
},
{
"epoch": 0.38526912181303113,
"grad_norm": 1.0490944265922426,
"learning_rate": 9.978490638616671e-06,
"loss": 0.6853,
"step": 34
},
{
"epoch": 0.39660056657223797,
"grad_norm": 1.2600398341735712,
"learning_rate": 9.971912311135967e-06,
"loss": 0.6703,
"step": 35
},
{
"epoch": 0.40793201133144474,
"grad_norm": 1.1344614916090783,
"learning_rate": 9.964460368509868e-06,
"loss": 0.6841,
"step": 36
},
{
"epoch": 0.4192634560906516,
"grad_norm": 1.1274773706270436,
"learning_rate": 9.956136120119858e-06,
"loss": 0.6817,
"step": 37
},
{
"epoch": 0.43059490084985835,
"grad_norm": 1.2113014190849545,
"learning_rate": 9.946941028620349e-06,
"loss": 0.6837,
"step": 38
},
{
"epoch": 0.44192634560906513,
"grad_norm": 1.172097274900492,
"learning_rate": 9.936876709681668e-06,
"loss": 0.6678,
"step": 39
},
{
"epoch": 0.45325779036827196,
"grad_norm": 1.2490078546533159,
"learning_rate": 9.925944931706174e-06,
"loss": 0.7413,
"step": 40
},
{
"epoch": 0.46458923512747874,
"grad_norm": 1.140086857717275,
"learning_rate": 9.914147615517527e-06,
"loss": 0.6778,
"step": 41
},
{
"epoch": 0.47592067988668557,
"grad_norm": 1.2223423143241732,
"learning_rate": 9.901486834023182e-06,
"loss": 0.7388,
"step": 42
},
{
"epoch": 0.48725212464589235,
"grad_norm": 1.2452223599483243,
"learning_rate": 9.887964811850159e-06,
"loss": 0.691,
"step": 43
},
{
"epoch": 0.4985835694050991,
"grad_norm": 1.1350515055455908,
"learning_rate": 9.873583924954152e-06,
"loss": 0.6593,
"step": 44
},
{
"epoch": 0.509915014164306,
"grad_norm": 1.0405719380983063,
"learning_rate": 9.85834670020205e-06,
"loss": 0.6351,
"step": 45
},
{
"epoch": 0.5212464589235127,
"grad_norm": 1.3303109757890985,
"learning_rate": 9.842255814927945e-06,
"loss": 0.6404,
"step": 46
},
{
"epoch": 0.5325779036827195,
"grad_norm": 1.1787017310861478,
"learning_rate": 9.825314096462686e-06,
"loss": 0.6858,
"step": 47
},
{
"epoch": 0.5439093484419264,
"grad_norm": 1.1028621034116284,
"learning_rate": 9.807524521637103e-06,
"loss": 0.6554,
"step": 48
},
{
"epoch": 0.5552407932011332,
"grad_norm": 1.0192876247663198,
"learning_rate": 9.78889021625894e-06,
"loss": 0.6581,
"step": 49
},
{
"epoch": 0.56657223796034,
"grad_norm": 1.0981773991994468,
"learning_rate": 9.769414454563614e-06,
"loss": 0.6873,
"step": 50
},
{
"epoch": 0.5779036827195467,
"grad_norm": 1.080964680948062,
"learning_rate": 9.749100658638914e-06,
"loss": 0.6313,
"step": 51
},
{
"epoch": 0.5892351274787535,
"grad_norm": 1.060635241593271,
"learning_rate": 9.72795239782369e-06,
"loss": 0.657,
"step": 52
},
{
"epoch": 0.6005665722379604,
"grad_norm": 1.1436681010237095,
"learning_rate": 9.705973388080694e-06,
"loss": 0.6521,
"step": 53
},
{
"epoch": 0.6118980169971672,
"grad_norm": 1.0838029458150678,
"learning_rate": 9.68316749134364e-06,
"loss": 0.6712,
"step": 54
},
{
"epoch": 0.623229461756374,
"grad_norm": 1.0579456798759823,
"learning_rate": 9.659538714838635e-06,
"loss": 0.6439,
"step": 55
},
{
"epoch": 0.6345609065155807,
"grad_norm": 1.000408593357701,
"learning_rate": 9.635091210380052e-06,
"loss": 0.6164,
"step": 56
},
{
"epoch": 0.6458923512747875,
"grad_norm": 1.0871122101771147,
"learning_rate": 9.609829273641034e-06,
"loss": 0.6561,
"step": 57
},
{
"epoch": 0.6572237960339944,
"grad_norm": 1.0392258903623652,
"learning_rate": 9.583757343398685e-06,
"loss": 0.6353,
"step": 58
},
{
"epoch": 0.6685552407932012,
"grad_norm": 1.0694855168162771,
"learning_rate": 9.55688000075414e-06,
"loss": 0.672,
"step": 59
},
{
"epoch": 0.6798866855524079,
"grad_norm": 1.0818048041242603,
"learning_rate": 9.529201968327618e-06,
"loss": 0.6649,
"step": 60
},
{
"epoch": 0.6912181303116147,
"grad_norm": 1.122154267801109,
"learning_rate": 9.500728109428603e-06,
"loss": 0.6338,
"step": 61
},
{
"epoch": 0.7025495750708215,
"grad_norm": 1.0115716268572774,
"learning_rate": 9.47146342720133e-06,
"loss": 0.6404,
"step": 62
},
{
"epoch": 0.7138810198300283,
"grad_norm": 1.060628179091387,
"learning_rate": 9.44141306374566e-06,
"loss": 0.6491,
"step": 63
},
{
"epoch": 0.7252124645892352,
"grad_norm": 1.0433876035374046,
"learning_rate": 9.410582299213574e-06,
"loss": 0.6131,
"step": 64
},
{
"epoch": 0.7365439093484419,
"grad_norm": 1.0724446453489962,
"learning_rate": 9.378976550881393e-06,
"loss": 0.645,
"step": 65
},
{
"epoch": 0.7478753541076487,
"grad_norm": 16.698318216158572,
"learning_rate": 9.346601372197914e-06,
"loss": 0.628,
"step": 66
},
{
"epoch": 0.7592067988668555,
"grad_norm": 1.088611623094774,
"learning_rate": 9.3134624518086e-06,
"loss": 0.651,
"step": 67
},
{
"epoch": 0.7705382436260623,
"grad_norm": 1.08573159288467,
"learning_rate": 9.279565612556043e-06,
"loss": 0.6913,
"step": 68
},
{
"epoch": 0.7818696883852692,
"grad_norm": 1.287771998076043,
"learning_rate": 9.244916810456822e-06,
"loss": 0.6167,
"step": 69
},
{
"epoch": 0.7932011331444759,
"grad_norm": 1.0734450115631073,
"learning_rate": 9.20952213365497e-06,
"loss": 0.6048,
"step": 70
},
{
"epoch": 0.8045325779036827,
"grad_norm": 1.041169203868327,
"learning_rate": 9.173387801352232e-06,
"loss": 0.622,
"step": 71
},
{
"epoch": 0.8158640226628895,
"grad_norm": 1.0441941562582049,
"learning_rate": 9.136520162715288e-06,
"loss": 0.636,
"step": 72
},
{
"epoch": 0.8271954674220963,
"grad_norm": 1.0266494367822185,
"learning_rate": 9.098925695760132e-06,
"loss": 0.641,
"step": 73
},
{
"epoch": 0.8385269121813032,
"grad_norm": 1.0525228370033899,
"learning_rate": 9.060611006213833e-06,
"loss": 0.605,
"step": 74
},
{
"epoch": 0.8498583569405099,
"grad_norm": 1.0169561500024211,
"learning_rate": 9.021582826353825e-06,
"loss": 0.6691,
"step": 75
},
{
"epoch": 0.8611898016997167,
"grad_norm": 1.0482739302531685,
"learning_rate": 8.981848013824995e-06,
"loss": 0.6658,
"step": 76
},
{
"epoch": 0.8725212464589235,
"grad_norm": 1.0794377750181379,
"learning_rate": 8.94141355043471e-06,
"loss": 0.6578,
"step": 77
},
{
"epoch": 0.8838526912181303,
"grad_norm": 1.0439742131558416,
"learning_rate": 8.900286540926062e-06,
"loss": 0.6138,
"step": 78
},
{
"epoch": 0.8951841359773371,
"grad_norm": 1.072198566934302,
"learning_rate": 8.85847421172947e-06,
"loss": 0.6313,
"step": 79
},
{
"epoch": 0.9065155807365439,
"grad_norm": 1.0570789500714661,
"learning_rate": 8.815983909692941e-06,
"loss": 0.611,
"step": 80
},
{
"epoch": 0.9178470254957507,
"grad_norm": 0.9747424186741095,
"learning_rate": 8.772823100791152e-06,
"loss": 0.6235,
"step": 81
},
{
"epoch": 0.9291784702549575,
"grad_norm": 0.9650403389286071,
"learning_rate": 8.728999368813591e-06,
"loss": 0.6289,
"step": 82
},
{
"epoch": 0.9405099150141643,
"grad_norm": 1.0608225953186365,
"learning_rate": 8.684520414032023e-06,
"loss": 0.6534,
"step": 83
},
{
"epoch": 0.9518413597733711,
"grad_norm": 1.0400599060401146,
"learning_rate": 8.639394051847472e-06,
"loss": 0.6351,
"step": 84
},
{
"epoch": 0.9631728045325779,
"grad_norm": 1.029029843151287,
"learning_rate": 8.593628211416964e-06,
"loss": 0.637,
"step": 85
},
{
"epoch": 0.9745042492917847,
"grad_norm": 0.9884213872615792,
"learning_rate": 8.547230934260313e-06,
"loss": 0.6414,
"step": 86
},
{
"epoch": 0.9858356940509915,
"grad_norm": 1.0448881569178157,
"learning_rate": 8.500210372847128e-06,
"loss": 0.6234,
"step": 87
},
{
"epoch": 0.9971671388101983,
"grad_norm": 1.0141851489732272,
"learning_rate": 8.452574789164352e-06,
"loss": 0.636,
"step": 88
},
{
"epoch": 1.0,
"grad_norm": 1.0141851489732272,
"learning_rate": 8.404332553264548e-06,
"loss": 0.6351,
"step": 89
},
{
"epoch": 1.0113314447592068,
"grad_norm": 2.0125074643954024,
"learning_rate": 8.355492141795185e-06,
"loss": 0.5146,
"step": 90
},
{
"epoch": 1.0226628895184136,
"grad_norm": 1.1306298266109818,
"learning_rate": 8.30606213650922e-06,
"loss": 0.497,
"step": 91
},
{
"epoch": 1.0339943342776203,
"grad_norm": 1.0714951468489908,
"learning_rate": 8.256051222757188e-06,
"loss": 0.4921,
"step": 92
},
{
"epoch": 1.045325779036827,
"grad_norm": 0.9830972246185706,
"learning_rate": 8.2054681879611e-06,
"loss": 0.4906,
"step": 93
},
{
"epoch": 1.056657223796034,
"grad_norm": 0.9632332800113752,
"learning_rate": 8.154321920070415e-06,
"loss": 0.4657,
"step": 94
},
{
"epoch": 1.0679886685552409,
"grad_norm": 1.193395200214797,
"learning_rate": 8.10262140600031e-06,
"loss": 0.4861,
"step": 95
},
{
"epoch": 1.0793201133144477,
"grad_norm": 1.28865060369019,
"learning_rate": 8.050375730052622e-06,
"loss": 0.5093,
"step": 96
},
{
"epoch": 1.0906515580736544,
"grad_norm": 1.247161113611643,
"learning_rate": 7.997594072319625e-06,
"loss": 0.504,
"step": 97
},
{
"epoch": 1.1019830028328612,
"grad_norm": 1.1321908951225559,
"learning_rate": 7.944285707070999e-06,
"loss": 0.514,
"step": 98
},
{
"epoch": 1.113314447592068,
"grad_norm": 1.097294675331813,
"learning_rate": 7.890460001124242e-06,
"loss": 0.5074,
"step": 99
},
{
"epoch": 1.1246458923512748,
"grad_norm": 1.1106766243842143,
"learning_rate": 7.836126412198842e-06,
"loss": 0.495,
"step": 100
},
{
"epoch": 1.1359773371104815,
"grad_norm": 1.0781028414115594,
"learning_rate": 7.781294487254436e-06,
"loss": 0.4917,
"step": 101
},
{
"epoch": 1.1473087818696883,
"grad_norm": 1.0597834799331805,
"learning_rate": 7.725973860813338e-06,
"loss": 0.4953,
"step": 102
},
{
"epoch": 1.158640226628895,
"grad_norm": 1.075317244066298,
"learning_rate": 7.67017425326764e-06,
"loss": 0.4985,
"step": 103
},
{
"epoch": 1.1699716713881019,
"grad_norm": 1.126814415152867,
"learning_rate": 7.613905469171247e-06,
"loss": 0.4869,
"step": 104
},
{
"epoch": 1.1813031161473089,
"grad_norm": 1.0228965180222989,
"learning_rate": 7.5571773955171124e-06,
"loss": 0.4956,
"step": 105
},
{
"epoch": 1.1926345609065157,
"grad_norm": 1.0496260656765666,
"learning_rate": 7.500000000000001e-06,
"loss": 0.4804,
"step": 106
},
{
"epoch": 1.2039660056657224,
"grad_norm": 1.057021462285616,
"learning_rate": 7.442383329265063e-06,
"loss": 0.4802,
"step": 107
},
{
"epoch": 1.2152974504249292,
"grad_norm": 1.0586760394529304,
"learning_rate": 7.3843375071425315e-06,
"loss": 0.4755,
"step": 108
},
{
"epoch": 1.226628895184136,
"grad_norm": 1.098164645835599,
"learning_rate": 7.32587273286887e-06,
"loss": 0.4806,
"step": 109
},
{
"epoch": 1.2379603399433428,
"grad_norm": 1.0425540537419706,
"learning_rate": 7.2669992792946595e-06,
"loss": 0.4976,
"step": 110
},
{
"epoch": 1.2492917847025495,
"grad_norm": 1.0081778159600596,
"learning_rate": 7.2077274910795605e-06,
"loss": 0.4775,
"step": 111
},
{
"epoch": 1.2606232294617563,
"grad_norm": 1.0426051895523285,
"learning_rate": 7.14806778287464e-06,
"loss": 0.4948,
"step": 112
},
{
"epoch": 1.271954674220963,
"grad_norm": 1.0491543765702032,
"learning_rate": 7.088030637492429e-06,
"loss": 0.5198,
"step": 113
},
{
"epoch": 1.28328611898017,
"grad_norm": 1.0120042186636362,
"learning_rate": 7.02762660406497e-06,
"loss": 0.5032,
"step": 114
},
{
"epoch": 1.2946175637393766,
"grad_norm": 1.0538656654185354,
"learning_rate": 6.966866296190243e-06,
"loss": 0.4835,
"step": 115
},
{
"epoch": 1.3059490084985836,
"grad_norm": 0.983675200448248,
"learning_rate": 6.9057603900672355e-06,
"loss": 0.4469,
"step": 116
},
{
"epoch": 1.3172804532577904,
"grad_norm": 1.1103412550285476,
"learning_rate": 6.844319622620039e-06,
"loss": 0.5124,
"step": 117
},
{
"epoch": 1.3286118980169972,
"grad_norm": 1.064307096238654,
"learning_rate": 6.782554789611256e-06,
"loss": 0.4943,
"step": 118
},
{
"epoch": 1.339943342776204,
"grad_norm": 1.0325281954877101,
"learning_rate": 6.7204767437450725e-06,
"loss": 0.4703,
"step": 119
},
{
"epoch": 1.3512747875354107,
"grad_norm": 1.0365628583864628,
"learning_rate": 6.65809639276034e-06,
"loss": 0.494,
"step": 120
},
{
"epoch": 1.3626062322946175,
"grad_norm": 1.0482388627399757,
"learning_rate": 6.595424697513963e-06,
"loss": 0.4502,
"step": 121
},
{
"epoch": 1.3739376770538243,
"grad_norm": 1.0272064142818405,
"learning_rate": 6.532472670054975e-06,
"loss": 0.492,
"step": 122
},
{
"epoch": 1.385269121813031,
"grad_norm": 1.0810272879082132,
"learning_rate": 6.469251371689606e-06,
"loss": 0.4847,
"step": 123
},
{
"epoch": 1.3966005665722379,
"grad_norm": 1.0366197600921454,
"learning_rate": 6.405771911037698e-06,
"loss": 0.4999,
"step": 124
},
{
"epoch": 1.4079320113314449,
"grad_norm": 1.0295069364200777,
"learning_rate": 6.342045442080818e-06,
"loss": 0.4783,
"step": 125
},
{
"epoch": 1.4192634560906516,
"grad_norm": 1.0528763013327969,
"learning_rate": 6.278083162202374e-06,
"loss": 0.4846,
"step": 126
},
{
"epoch": 1.4305949008498584,
"grad_norm": 1.0734593139015471,
"learning_rate": 6.21389631022014e-06,
"loss": 0.5134,
"step": 127
},
{
"epoch": 1.4419263456090652,
"grad_norm": 1.0207282551843653,
"learning_rate": 6.1494961644114685e-06,
"loss": 0.4855,
"step": 128
},
{
"epoch": 1.453257790368272,
"grad_norm": 0.9713494112903828,
"learning_rate": 6.084894040531591e-06,
"loss": 0.4667,
"step": 129
},
{
"epoch": 1.4645892351274787,
"grad_norm": 1.1036048289558185,
"learning_rate": 6.0201012898253244e-06,
"loss": 0.4905,
"step": 130
},
{
"epoch": 1.4759206798866855,
"grad_norm": 0.996202225854195,
"learning_rate": 5.9551292970325394e-06,
"loss": 0.4746,
"step": 131
},
{
"epoch": 1.4872521246458923,
"grad_norm": 1.0919133151119662,
"learning_rate": 5.8899894783877536e-06,
"loss": 0.5201,
"step": 132
},
{
"epoch": 1.498583569405099,
"grad_norm": 1.11280141387768,
"learning_rate": 5.824693279614171e-06,
"loss": 0.4953,
"step": 133
},
{
"epoch": 1.509915014164306,
"grad_norm": 1.1163217052046956,
"learning_rate": 5.759252173912573e-06,
"loss": 0.481,
"step": 134
},
{
"epoch": 1.5212464589235126,
"grad_norm": 1.0688323988028812,
"learning_rate": 5.693677659945343e-06,
"loss": 0.4711,
"step": 135
},
{
"epoch": 1.5325779036827196,
"grad_norm": 0.9512892167994508,
"learning_rate": 5.627981259816041e-06,
"loss": 0.4697,
"step": 136
},
{
"epoch": 1.5439093484419264,
"grad_norm": 1.0157798339830766,
"learning_rate": 5.562174517044862e-06,
"loss": 0.4728,
"step": 137
},
{
"epoch": 1.5552407932011332,
"grad_norm": 0.9982778169224142,
"learning_rate": 5.496268994540309e-06,
"loss": 0.453,
"step": 138
},
{
"epoch": 1.56657223796034,
"grad_norm": 1.1297738773397445,
"learning_rate": 5.430276272567485e-06,
"loss": 0.495,
"step": 139
},
{
"epoch": 1.5779036827195467,
"grad_norm": 1.0139903899310507,
"learning_rate": 5.364207946713318e-06,
"loss": 0.4844,
"step": 140
},
{
"epoch": 1.5892351274787535,
"grad_norm": 0.9490126458491319,
"learning_rate": 5.2980756258491e-06,
"loss": 0.4632,
"step": 141
},
{
"epoch": 1.6005665722379603,
"grad_norm": 0.9789924111916612,
"learning_rate": 5.231890930090692e-06,
"loss": 0.4641,
"step": 142
},
{
"epoch": 1.6118980169971673,
"grad_norm": 1.0001930502458516,
"learning_rate": 5.165665488756755e-06,
"loss": 0.4511,
"step": 143
},
{
"epoch": 1.6232294617563738,
"grad_norm": 1.0412278168604834,
"learning_rate": 5.099410938325351e-06,
"loss": 0.4813,
"step": 144
},
{
"epoch": 1.6345609065155808,
"grad_norm": 2.990945647537025,
"learning_rate": 5.033138920389313e-06,
"loss": 0.4949,
"step": 145
},
{
"epoch": 1.6458923512747874,
"grad_norm": 0.9622418163601026,
"learning_rate": 4.966861079610688e-06,
"loss": 0.4855,
"step": 146
},
{
"epoch": 1.6572237960339944,
"grad_norm": 1.0030167678640822,
"learning_rate": 4.900589061674649e-06,
"loss": 0.4589,
"step": 147
},
{
"epoch": 1.6685552407932012,
"grad_norm": 1.0109166766299091,
"learning_rate": 4.8343345112432475e-06,
"loss": 0.4778,
"step": 148
},
{
"epoch": 1.679886685552408,
"grad_norm": 1.0402771028968805,
"learning_rate": 4.7681090699093076e-06,
"loss": 0.4874,
"step": 149
},
{
"epoch": 1.6912181303116147,
"grad_norm": 1.0333160217244122,
"learning_rate": 4.701924374150901e-06,
"loss": 0.469,
"step": 150
},
{
"epoch": 1.7025495750708215,
"grad_norm": 1.0264878278726923,
"learning_rate": 4.635792053286682e-06,
"loss": 0.477,
"step": 151
},
{
"epoch": 1.7138810198300283,
"grad_norm": 0.9806277129349131,
"learning_rate": 4.569723727432517e-06,
"loss": 0.4609,
"step": 152
},
{
"epoch": 1.725212464589235,
"grad_norm": 1.0430109649067774,
"learning_rate": 4.5037310054596936e-06,
"loss": 0.4852,
"step": 153
},
{
"epoch": 1.736543909348442,
"grad_norm": 1.0177412955604808,
"learning_rate": 4.43782548295514e-06,
"loss": 0.4538,
"step": 154
},
{
"epoch": 1.7478753541076486,
"grad_norm": 1.0742221754801993,
"learning_rate": 4.372018740183961e-06,
"loss": 0.502,
"step": 155
},
{
"epoch": 1.7592067988668556,
"grad_norm": 1.2114594760413002,
"learning_rate": 4.30632234005466e-06,
"loss": 0.4626,
"step": 156
},
{
"epoch": 1.7705382436260622,
"grad_norm": 1.0105219104936058,
"learning_rate": 4.2407478260874294e-06,
"loss": 0.4443,
"step": 157
},
{
"epoch": 1.7818696883852692,
"grad_norm": 1.0676939421912321,
"learning_rate": 4.175306720385831e-06,
"loss": 0.461,
"step": 158
},
{
"epoch": 1.793201133144476,
"grad_norm": 1.0843360976121068,
"learning_rate": 4.11001052161225e-06,
"loss": 0.4562,
"step": 159
},
{
"epoch": 1.8045325779036827,
"grad_norm": 1.0182445190909426,
"learning_rate": 4.044870702967461e-06,
"loss": 0.4597,
"step": 160
},
{
"epoch": 1.8158640226628895,
"grad_norm": 1.0266398146802735,
"learning_rate": 3.979898710174678e-06,
"loss": 0.4737,
"step": 161
},
{
"epoch": 1.8271954674220963,
"grad_norm": 1.0375307407230006,
"learning_rate": 3.91510595946841e-06,
"loss": 0.476,
"step": 162
},
{
"epoch": 1.8385269121813033,
"grad_norm": 1.0510195116895713,
"learning_rate": 3.850503835588533e-06,
"loss": 0.4572,
"step": 163
},
{
"epoch": 1.8498583569405098,
"grad_norm": 1.0707576258473916,
"learning_rate": 3.786103689779861e-06,
"loss": 0.4855,
"step": 164
},
{
"epoch": 1.8611898016997168,
"grad_norm": 1.109879197789788,
"learning_rate": 3.721916837797627e-06,
"loss": 0.4744,
"step": 165
},
{
"epoch": 1.8725212464589234,
"grad_norm": 0.9430434127126872,
"learning_rate": 3.6579545579191834e-06,
"loss": 0.5036,
"step": 166
},
{
"epoch": 1.8838526912181304,
"grad_norm": 1.0454617136926816,
"learning_rate": 3.5942280889623028e-06,
"loss": 0.4757,
"step": 167
},
{
"epoch": 1.8951841359773371,
"grad_norm": 0.9669993221473043,
"learning_rate": 3.5307486283103966e-06,
"loss": 0.4939,
"step": 168
},
{
"epoch": 1.906515580736544,
"grad_norm": 1.1489596332179548,
"learning_rate": 3.4675273299450264e-06,
"loss": 0.4875,
"step": 169
},
{
"epoch": 1.9178470254957507,
"grad_norm": 1.236638321882873,
"learning_rate": 3.4045753024860393e-06,
"loss": 0.4899,
"step": 170
},
{
"epoch": 1.9291784702549575,
"grad_norm": 1.0015067232304347,
"learning_rate": 3.3419036072396614e-06,
"loss": 0.4367,
"step": 171
},
{
"epoch": 1.9405099150141643,
"grad_norm": 0.991139662986458,
"learning_rate": 3.2795232562549296e-06,
"loss": 0.4593,
"step": 172
},
{
"epoch": 1.951841359773371,
"grad_norm": 1.0171228373147831,
"learning_rate": 3.2174452103887455e-06,
"loss": 0.4864,
"step": 173
},
{
"epoch": 1.963172804532578,
"grad_norm": 1.0183503025841374,
"learning_rate": 3.1556803773799616e-06,
"loss": 0.4775,
"step": 174
},
{
"epoch": 1.9745042492917846,
"grad_norm": 0.9658158834425475,
"learning_rate": 3.0942396099327645e-06,
"loss": 0.4628,
"step": 175
},
{
"epoch": 1.9858356940509916,
"grad_norm": 1.0046391704473616,
"learning_rate": 3.03313370380976e-06,
"loss": 0.4945,
"step": 176
},
{
"epoch": 1.9971671388101981,
"grad_norm": 0.9746868290860945,
"learning_rate": 2.972373395935031e-06,
"loss": 0.4384,
"step": 177
},
{
"epoch": 2.0,
"grad_norm": 0.9746868290860945,
"learning_rate": 2.911969362507574e-06,
"loss": 0.4562,
"step": 178
},
{
"epoch": 2.011331444759207,
"grad_norm": 2.1634786845934584,
"learning_rate": 2.8519322171253605e-06,
"loss": 0.3576,
"step": 179
},
{
"epoch": 2.0226628895184136,
"grad_norm": 1.259167809122465,
"learning_rate": 2.792272508920443e-06,
"loss": 0.3306,
"step": 180
},
{
"epoch": 2.0339943342776206,
"grad_norm": 1.3388873888110011,
"learning_rate": 2.7330007207053413e-06,
"loss": 0.353,
"step": 181
},
{
"epoch": 2.045325779036827,
"grad_norm": 1.1581849151502048,
"learning_rate": 2.674127267131131e-06,
"loss": 0.3317,
"step": 182
},
{
"epoch": 2.056657223796034,
"grad_norm": 1.0160032268336192,
"learning_rate": 2.615662492857471e-06,
"loss": 0.3581,
"step": 183
},
{
"epoch": 2.0679886685552407,
"grad_norm": 1.0233678646861728,
"learning_rate": 2.5576166707349387e-06,
"loss": 0.3359,
"step": 184
},
{
"epoch": 2.0793201133144477,
"grad_norm": 1.0874679159300038,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.3219,
"step": 185
},
{
"epoch": 2.090651558073654,
"grad_norm": 1.2469998353736902,
"learning_rate": 2.4428226044828896e-06,
"loss": 0.3271,
"step": 186
},
{
"epoch": 2.101983002832861,
"grad_norm": 1.1847806975199535,
"learning_rate": 2.3860945308287554e-06,
"loss": 0.3429,
"step": 187
},
{
"epoch": 2.113314447592068,
"grad_norm": 1.3829661881977866,
"learning_rate": 2.3298257467323605e-06,
"loss": 0.3492,
"step": 188
},
{
"epoch": 2.1246458923512748,
"grad_norm": 1.1118666347289263,
"learning_rate": 2.2740261391866634e-06,
"loss": 0.3343,
"step": 189
},
{
"epoch": 2.1359773371104818,
"grad_norm": 1.1295786044065697,
"learning_rate": 2.2187055127455653e-06,
"loss": 0.3306,
"step": 190
},
{
"epoch": 2.1473087818696883,
"grad_norm": 1.3950194361496737,
"learning_rate": 2.1638735878011603e-06,
"loss": 0.3515,
"step": 191
},
{
"epoch": 2.1586402266288953,
"grad_norm": 1.1337210762125438,
"learning_rate": 2.1095399988757574e-06,
"loss": 0.3201,
"step": 192
},
{
"epoch": 2.169971671388102,
"grad_norm": 1.059433716116878,
"learning_rate": 2.0557142929290027e-06,
"loss": 0.3526,
"step": 193
},
{
"epoch": 2.181303116147309,
"grad_norm": 1.0876920114742847,
"learning_rate": 2.0024059276803742e-06,
"loss": 0.3275,
"step": 194
},
{
"epoch": 2.1926345609065154,
"grad_norm": 1.136528776323311,
"learning_rate": 1.949624269947378e-06,
"loss": 0.3499,
"step": 195
},
{
"epoch": 2.2039660056657224,
"grad_norm": 1.1195654060494844,
"learning_rate": 1.897378593999693e-06,
"loss": 0.3105,
"step": 196
},
{
"epoch": 2.215297450424929,
"grad_norm": 1.0686107201673802,
"learning_rate": 1.8456780799295888e-06,
"loss": 0.3409,
"step": 197
},
{
"epoch": 2.226628895184136,
"grad_norm": 1.1176135978118285,
"learning_rate": 1.794531812038901e-06,
"loss": 0.3242,
"step": 198
},
{
"epoch": 2.237960339943343,
"grad_norm": 1.1225522593354427,
"learning_rate": 1.7439487772428142e-06,
"loss": 0.3331,
"step": 199
},
{
"epoch": 2.2492917847025495,
"grad_norm": 1.0504526826797216,
"learning_rate": 1.6939378634907815e-06,
"loss": 0.3223,
"step": 200
},
{
"epoch": 2.2606232294617565,
"grad_norm": 1.0112717450368687,
"learning_rate": 1.6445078582048158e-06,
"loss": 0.3328,
"step": 201
},
{
"epoch": 2.271954674220963,
"grad_norm": 1.0056815807805697,
"learning_rate": 1.5956674467354538e-06,
"loss": 0.3349,
"step": 202
},
{
"epoch": 2.28328611898017,
"grad_norm": 1.0324761445382153,
"learning_rate": 1.5474252108356475e-06,
"loss": 0.3147,
"step": 203
},
{
"epoch": 2.2946175637393766,
"grad_norm": 3.0756191856725437,
"learning_rate": 1.499789627152874e-06,
"loss": 0.3148,
"step": 204
},
{
"epoch": 2.3059490084985836,
"grad_norm": 1.14933836933374,
"learning_rate": 1.452769065739688e-06,
"loss": 0.3487,
"step": 205
},
{
"epoch": 2.31728045325779,
"grad_norm": 0.9691075451255097,
"learning_rate": 1.4063717885830375e-06,
"loss": 0.3216,
"step": 206
},
{
"epoch": 2.328611898016997,
"grad_norm": 1.2745727227347767,
"learning_rate": 1.3606059481525296e-06,
"loss": 0.3585,
"step": 207
},
{
"epoch": 2.3399433427762037,
"grad_norm": 0.9868916262509804,
"learning_rate": 1.3154795859679781e-06,
"loss": 0.3416,
"step": 208
},
{
"epoch": 2.3512747875354107,
"grad_norm": 1.029329657926381,
"learning_rate": 1.2710006311864104e-06,
"loss": 0.3438,
"step": 209
},
{
"epoch": 2.3626062322946177,
"grad_norm": 1.2301937202365874,
"learning_rate": 1.227176899208849e-06,
"loss": 0.3232,
"step": 210
},
{
"epoch": 2.3739376770538243,
"grad_norm": 1.1079694734813215,
"learning_rate": 1.1840160903070591e-06,
"loss": 0.3533,
"step": 211
},
{
"epoch": 2.3852691218130313,
"grad_norm": 1.0355467829487406,
"learning_rate": 1.141525788270531e-06,
"loss": 0.3455,
"step": 212
},
{
"epoch": 2.396600566572238,
"grad_norm": 1.0285862271263877,
"learning_rate": 1.09971345907394e-06,
"loss": 0.2994,
"step": 213
},
{
"epoch": 2.407932011331445,
"grad_norm": 1.0633893519411577,
"learning_rate": 1.0585864495652899e-06,
"loss": 0.3386,
"step": 214
},
{
"epoch": 2.4192634560906514,
"grad_norm": 0.985097331489618,
"learning_rate": 1.0181519861750078e-06,
"loss": 0.3181,
"step": 215
},
{
"epoch": 2.4305949008498584,
"grad_norm": 0.951413780263271,
"learning_rate": 9.784171736461762e-07,
"loss": 0.3105,
"step": 216
},
{
"epoch": 2.441926345609065,
"grad_norm": 1.0282273427987358,
"learning_rate": 9.393889937861694e-07,
"loss": 0.3179,
"step": 217
},
{
"epoch": 2.453257790368272,
"grad_norm": 1.026529941608791,
"learning_rate": 9.010743042398684e-07,
"loss": 0.3234,
"step": 218
},
{
"epoch": 2.4645892351274785,
"grad_norm": 1.0299442438320148,
"learning_rate": 8.634798372847148e-07,
"loss": 0.335,
"step": 219
},
{
"epoch": 2.4759206798866855,
"grad_norm": 0.9309231031132973,
"learning_rate": 8.266121986477699e-07,
"loss": 0.318,
"step": 220
},
{
"epoch": 2.4872521246458925,
"grad_norm": 1.0062159661580126,
"learning_rate": 7.904778663450325e-07,
"loss": 0.3292,
"step": 221
},
{
"epoch": 2.498583569405099,
"grad_norm": 1.0354919361102888,
"learning_rate": 7.550831895431799e-07,
"loss": 0.3266,
"step": 222
},
{
"epoch": 2.509915014164306,
"grad_norm": 0.9693415538045153,
"learning_rate": 7.204343874439578e-07,
"loss": 0.3282,
"step": 223
},
{
"epoch": 2.5212464589235126,
"grad_norm": 1.0178821797615285,
"learning_rate": 6.865375481914017e-07,
"loss": 0.3561,
"step": 224
},
{
"epoch": 2.5325779036827196,
"grad_norm": 1.0271091771586642,
"learning_rate": 6.533986278020876e-07,
"loss": 0.3064,
"step": 225
},
{
"epoch": 2.543909348441926,
"grad_norm": 0.9930205073186488,
"learning_rate": 6.210234491186079e-07,
"loss": 0.318,
"step": 226
},
{
"epoch": 2.555240793201133,
"grad_norm": 1.015466323155115,
"learning_rate": 5.894177007864272e-07,
"loss": 0.3408,
"step": 227
},
{
"epoch": 2.56657223796034,
"grad_norm": 1.065785552873228,
"learning_rate": 5.585869362543416e-07,
"loss": 0.3414,
"step": 228
},
{
"epoch": 2.5779036827195467,
"grad_norm": 1.0524927446179813,
"learning_rate": 5.285365727986708e-07,
"loss": 0.3422,
"step": 229
},
{
"epoch": 2.5892351274787533,
"grad_norm": 1.0219196548167786,
"learning_rate": 4.992718905713967e-07,
"loss": 0.3388,
"step": 230
},
{
"epoch": 2.6005665722379603,
"grad_norm": 0.9679912813387603,
"learning_rate": 4.707980316723837e-07,
"loss": 0.3165,
"step": 231
},
{
"epoch": 2.6118980169971673,
"grad_norm": 0.9893500327460035,
"learning_rate": 4.431199992458607e-07,
"loss": 0.3238,
"step": 232
},
{
"epoch": 2.623229461756374,
"grad_norm": 0.9876579686339385,
"learning_rate": 4.16242656601315e-07,
"loss": 0.308,
"step": 233
},
{
"epoch": 2.634560906515581,
"grad_norm": 1.01213916356771,
"learning_rate": 3.9017072635896716e-07,
"loss": 0.331,
"step": 234
},
{
"epoch": 2.6458923512747874,
"grad_norm": 1.0151577294613559,
"learning_rate": 3.649087896199488e-07,
"loss": 0.3098,
"step": 235
},
{
"epoch": 2.6572237960339944,
"grad_norm": 0.9854787297770221,
"learning_rate": 3.404612851613676e-07,
"loss": 0.3202,
"step": 236
},
{
"epoch": 2.668555240793201,
"grad_norm": 2.5197939583747866,
"learning_rate": 3.168325086563612e-07,
"loss": 0.3302,
"step": 237
},
{
"epoch": 2.679886685552408,
"grad_norm": 0.9681009670329549,
"learning_rate": 2.9402661191930803e-07,
"loss": 0.3221,
"step": 238
},
{
"epoch": 2.691218130311615,
"grad_norm": 1.0155833734622453,
"learning_rate": 2.7204760217631074e-07,
"loss": 0.324,
"step": 239
},
{
"epoch": 2.7025495750708215,
"grad_norm": 1.1931982505904983,
"learning_rate": 2.5089934136108665e-07,
"loss": 0.3327,
"step": 240
},
{
"epoch": 2.713881019830028,
"grad_norm": 0.9735860788683143,
"learning_rate": 2.30585545436387e-07,
"loss": 0.3483,
"step": 241
},
{
"epoch": 2.725212464589235,
"grad_norm": 0.9628214952166717,
"learning_rate": 2.1110978374106195e-07,
"loss": 0.3455,
"step": 242
},
{
"epoch": 2.736543909348442,
"grad_norm": 1.4367674984114238,
"learning_rate": 1.9247547836289792e-07,
"loss": 0.3565,
"step": 243
},
{
"epoch": 2.7478753541076486,
"grad_norm": 1.0738794442822241,
"learning_rate": 1.7468590353731495e-07,
"loss": 0.3577,
"step": 244
},
{
"epoch": 2.7592067988668556,
"grad_norm": 1.0163993435166494,
"learning_rate": 1.577441850720568e-07,
"loss": 0.3346,
"step": 245
},
{
"epoch": 2.770538243626062,
"grad_norm": 1.1268283470669345,
"learning_rate": 1.4165329979794972e-07,
"loss": 0.3204,
"step": 246
},
{
"epoch": 2.781869688385269,
"grad_norm": 1.00412302366148,
"learning_rate": 1.264160750458493e-07,
"loss": 0.3091,
"step": 247
},
{
"epoch": 2.7932011331444757,
"grad_norm": 1.0878323463224275,
"learning_rate": 1.1203518814984216e-07,
"loss": 0.3219,
"step": 248
},
{
"epoch": 2.8045325779036827,
"grad_norm": 1.0326844241286977,
"learning_rate": 9.851316597681959e-08,
"loss": 0.3407,
"step": 249
},
{
"epoch": 2.8158640226628897,
"grad_norm": 1.0488660487318535,
"learning_rate": 8.585238448247434e-08,
"loss": 0.3066,
"step": 250
},
{
"epoch": 2.8271954674220963,
"grad_norm": 0.9440222402450956,
"learning_rate": 7.405506829382736e-08,
"loss": 0.3007,
"step": 251
},
{
"epoch": 2.8385269121813033,
"grad_norm": 0.9992965787158642,
"learning_rate": 6.31232903183332e-08,
"loss": 0.3211,
"step": 252
},
{
"epoch": 2.84985835694051,
"grad_norm": 1.0525889142182898,
"learning_rate": 5.305897137965199e-08,
"loss": 0.3339,
"step": 253
},
{
"epoch": 2.861189801699717,
"grad_norm": 1.0406232501867803,
"learning_rate": 4.3863879880142737e-08,
"loss": 0.3188,
"step": 254
},
{
"epoch": 2.8725212464589234,
"grad_norm": 1.0108504238438418,
"learning_rate": 3.553963149013295e-08,
"loss": 0.3426,
"step": 255
},
{
"epoch": 2.8838526912181304,
"grad_norm": 1.040975846702501,
"learning_rate": 2.8087688864033014e-08,
"loss": 0.3365,
"step": 256
},
{
"epoch": 2.8951841359773374,
"grad_norm": 1.0279134406587973,
"learning_rate": 2.1509361383330597e-08,
"loss": 0.3167,
"step": 257
},
{
"epoch": 2.906515580736544,
"grad_norm": 1.0127896976081647,
"learning_rate": 1.580580492652084e-08,
"loss": 0.3589,
"step": 258
},
{
"epoch": 2.9178470254957505,
"grad_norm": 1.002944001928922,
"learning_rate": 1.0978021666005479e-08,
"loss": 0.3382,
"step": 259
},
{
"epoch": 2.9291784702549575,
"grad_norm": 0.9936405641782646,
"learning_rate": 7.02685989200258e-09,
"loss": 0.3373,
"step": 260
},
{
"epoch": 2.9405099150141645,
"grad_norm": 1.0916598818224916,
"learning_rate": 3.953013863490784e-09,
"loss": 0.3124,
"step": 261
},
{
"epoch": 2.951841359773371,
"grad_norm": 0.9881063904383428,
"learning_rate": 1.757023686224102e-09,
"loss": 0.3401,
"step": 262
},
{
"epoch": 2.963172804532578,
"grad_norm": 0.9953124861905701,
"learning_rate": 4.392752178278281e-10,
"loss": 0.3202,
"step": 263
},
{
"epoch": 2.9745042492917846,
"grad_norm": 0.9957928075882635,
"learning_rate": 0.0,
"loss": 0.299,
"step": 264
}
],
"logging_steps": 1,
"max_steps": 264,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 72196646453248.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}