DataMind-Analysis-Qwen2.5-7B / trainer_state.json
Yukirsh's picture
Upload folder using huggingface_hub
892f583 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9745042492917846,
"eval_steps": 500,
"global_step": 264,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0113314447592068,
"grad_norm": 24.700892966997085,
"learning_rate": 1.8518518518518518e-07,
"loss": 1.6228,
"step": 1
},
{
"epoch": 0.0226628895184136,
"grad_norm": 25.319873190341358,
"learning_rate": 3.7037037037037036e-07,
"loss": 1.6327,
"step": 2
},
{
"epoch": 0.0339943342776204,
"grad_norm": 24.785410230478583,
"learning_rate": 5.555555555555555e-07,
"loss": 1.6085,
"step": 3
},
{
"epoch": 0.0453257790368272,
"grad_norm": 24.530949684513566,
"learning_rate": 7.407407407407407e-07,
"loss": 1.5921,
"step": 4
},
{
"epoch": 0.056657223796033995,
"grad_norm": 21.437475590002645,
"learning_rate": 9.259259259259259e-07,
"loss": 1.4876,
"step": 5
},
{
"epoch": 0.0679886685552408,
"grad_norm": 23.598738552751918,
"learning_rate": 1.111111111111111e-06,
"loss": 1.5618,
"step": 6
},
{
"epoch": 0.07932011331444759,
"grad_norm": 24.355821351281282,
"learning_rate": 1.2962962962962962e-06,
"loss": 1.5728,
"step": 7
},
{
"epoch": 0.0906515580736544,
"grad_norm": 17.949944552531893,
"learning_rate": 1.4814814814814815e-06,
"loss": 1.3796,
"step": 8
},
{
"epoch": 0.10198300283286119,
"grad_norm": 15.608410231061514,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.3663,
"step": 9
},
{
"epoch": 0.11331444759206799,
"grad_norm": 12.967898302916174,
"learning_rate": 1.8518518518518519e-06,
"loss": 1.2852,
"step": 10
},
{
"epoch": 0.12464589235127478,
"grad_norm": 5.5745605303959005,
"learning_rate": 2.037037037037037e-06,
"loss": 1.1179,
"step": 11
},
{
"epoch": 0.1359773371104816,
"grad_norm": 5.003567410730206,
"learning_rate": 2.222222222222222e-06,
"loss": 1.1428,
"step": 12
},
{
"epoch": 0.14730878186968838,
"grad_norm": 4.580788434319785,
"learning_rate": 2.4074074074074075e-06,
"loss": 1.1386,
"step": 13
},
{
"epoch": 0.15864022662889518,
"grad_norm": 4.208213518050668,
"learning_rate": 2.5925925925925925e-06,
"loss": 1.1171,
"step": 14
},
{
"epoch": 0.16997167138810199,
"grad_norm": 3.6924260397238076,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.9945,
"step": 15
},
{
"epoch": 0.1813031161473088,
"grad_norm": 4.282401825239783,
"learning_rate": 2.962962962962963e-06,
"loss": 1.0298,
"step": 16
},
{
"epoch": 0.19263456090651557,
"grad_norm": 3.998142705803431,
"learning_rate": 3.1481481481481483e-06,
"loss": 0.998,
"step": 17
},
{
"epoch": 0.20396600566572237,
"grad_norm": 3.652954638326853,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.0252,
"step": 18
},
{
"epoch": 0.21529745042492918,
"grad_norm": 3.1106976788005833,
"learning_rate": 3.5185185185185187e-06,
"loss": 0.948,
"step": 19
},
{
"epoch": 0.22662889518413598,
"grad_norm": 2.3583842636458874,
"learning_rate": 3.7037037037037037e-06,
"loss": 0.9141,
"step": 20
},
{
"epoch": 0.23796033994334279,
"grad_norm": 2.331383902586234,
"learning_rate": 3.88888888888889e-06,
"loss": 0.91,
"step": 21
},
{
"epoch": 0.24929178470254956,
"grad_norm": 2.3290851154332155,
"learning_rate": 4.074074074074074e-06,
"loss": 0.92,
"step": 22
},
{
"epoch": 0.26062322946175637,
"grad_norm": 2.088852025013323,
"learning_rate": 4.2592592592592596e-06,
"loss": 0.8521,
"step": 23
},
{
"epoch": 0.2719546742209632,
"grad_norm": 1.8360508671725202,
"learning_rate": 4.444444444444444e-06,
"loss": 0.8143,
"step": 24
},
{
"epoch": 0.28328611898017,
"grad_norm": 1.7885693902522186,
"learning_rate": 4.62962962962963e-06,
"loss": 0.8486,
"step": 25
},
{
"epoch": 0.29461756373937675,
"grad_norm": 1.9031889309290633,
"learning_rate": 4.814814814814815e-06,
"loss": 0.834,
"step": 26
},
{
"epoch": 0.3059490084985836,
"grad_norm": 1.6679534815550068,
"learning_rate": 5e-06,
"loss": 0.8224,
"step": 27
},
{
"epoch": 0.31728045325779036,
"grad_norm": 1.6886754763902796,
"learning_rate": 4.999780362391087e-06,
"loss": 0.8133,
"step": 28
},
{
"epoch": 0.3286118980169972,
"grad_norm": 1.8004804802995344,
"learning_rate": 4.9991214881568884e-06,
"loss": 0.8145,
"step": 29
},
{
"epoch": 0.33994334277620397,
"grad_norm": 1.7444546855121879,
"learning_rate": 4.998023493068255e-06,
"loss": 0.8028,
"step": 30
},
{
"epoch": 0.35127478753541075,
"grad_norm": 1.6882783810092117,
"learning_rate": 4.996486570053999e-06,
"loss": 0.784,
"step": 31
},
{
"epoch": 0.3626062322946176,
"grad_norm": 1.7424306321156553,
"learning_rate": 4.994510989166998e-06,
"loss": 0.802,
"step": 32
},
{
"epoch": 0.37393767705382436,
"grad_norm": 1.5326860366155606,
"learning_rate": 4.99209709753674e-06,
"loss": 0.7578,
"step": 33
},
{
"epoch": 0.38526912181303113,
"grad_norm": 1.645466819050736,
"learning_rate": 4.9892453193083354e-06,
"loss": 0.7715,
"step": 34
},
{
"epoch": 0.39660056657223797,
"grad_norm": 1.8351805495790603,
"learning_rate": 4.9859561555679835e-06,
"loss": 0.7516,
"step": 35
},
{
"epoch": 0.40793201133144474,
"grad_norm": 1.5892721538246122,
"learning_rate": 4.982230184254934e-06,
"loss": 0.7658,
"step": 36
},
{
"epoch": 0.4192634560906516,
"grad_norm": 1.493020596244931,
"learning_rate": 4.978068060059929e-06,
"loss": 0.7676,
"step": 37
},
{
"epoch": 0.43059490084985835,
"grad_norm": 1.502680841198379,
"learning_rate": 4.9734705143101744e-06,
"loss": 0.7674,
"step": 38
},
{
"epoch": 0.44192634560906513,
"grad_norm": 1.526315536964418,
"learning_rate": 4.968438354840834e-06,
"loss": 0.747,
"step": 39
},
{
"epoch": 0.45325779036827196,
"grad_norm": 1.771193783955554,
"learning_rate": 4.962972465853087e-06,
"loss": 0.8251,
"step": 40
},
{
"epoch": 0.46458923512747874,
"grad_norm": 1.4751654383068609,
"learning_rate": 4.9570738077587635e-06,
"loss": 0.7587,
"step": 41
},
{
"epoch": 0.47592067988668557,
"grad_norm": 1.498540884039298,
"learning_rate": 4.950743417011591e-06,
"loss": 0.8311,
"step": 42
},
{
"epoch": 0.48725212464589235,
"grad_norm": 1.4059209239064798,
"learning_rate": 4.9439824059250794e-06,
"loss": 0.7655,
"step": 43
},
{
"epoch": 0.4985835694050991,
"grad_norm": 1.4871399277100446,
"learning_rate": 4.936791962477076e-06,
"loss": 0.7358,
"step": 44
},
{
"epoch": 0.509915014164306,
"grad_norm": 1.4257000290167645,
"learning_rate": 4.929173350101025e-06,
"loss": 0.7163,
"step": 45
},
{
"epoch": 0.5212464589235127,
"grad_norm": 1.4659192682033193,
"learning_rate": 4.921127907463972e-06,
"loss": 0.7061,
"step": 46
},
{
"epoch": 0.5325779036827195,
"grad_norm": 1.484618377703729,
"learning_rate": 4.912657048231343e-06,
"loss": 0.7651,
"step": 47
},
{
"epoch": 0.5439093484419264,
"grad_norm": 1.4947896835576073,
"learning_rate": 4.903762260818552e-06,
"loss": 0.7311,
"step": 48
},
{
"epoch": 0.5552407932011332,
"grad_norm": 1.3805335802092924,
"learning_rate": 4.89444510812947e-06,
"loss": 0.7327,
"step": 49
},
{
"epoch": 0.56657223796034,
"grad_norm": 1.5190953752467466,
"learning_rate": 4.884707227281807e-06,
"loss": 0.772,
"step": 50
},
{
"epoch": 0.5779036827195467,
"grad_norm": 1.5965731476318468,
"learning_rate": 4.874550329319457e-06,
"loss": 0.698,
"step": 51
},
{
"epoch": 0.5892351274787535,
"grad_norm": 1.4556721950898377,
"learning_rate": 4.863976198911845e-06,
"loss": 0.7267,
"step": 52
},
{
"epoch": 0.6005665722379604,
"grad_norm": 1.548206103166572,
"learning_rate": 4.852986694040347e-06,
"loss": 0.7188,
"step": 53
},
{
"epoch": 0.6118980169971672,
"grad_norm": 1.5029231129419343,
"learning_rate": 4.84158374567182e-06,
"loss": 0.7452,
"step": 54
},
{
"epoch": 0.623229461756374,
"grad_norm": 1.3962918545527763,
"learning_rate": 4.829769357419317e-06,
"loss": 0.7117,
"step": 55
},
{
"epoch": 0.6345609065155807,
"grad_norm": 1.3990946918305756,
"learning_rate": 4.817545605190026e-06,
"loss": 0.6797,
"step": 56
},
{
"epoch": 0.6458923512747875,
"grad_norm": 1.4406458676366833,
"learning_rate": 4.804914636820517e-06,
"loss": 0.7229,
"step": 57
},
{
"epoch": 0.6572237960339944,
"grad_norm": 1.4774647700554635,
"learning_rate": 4.791878671699343e-06,
"loss": 0.7117,
"step": 58
},
{
"epoch": 0.6685552407932012,
"grad_norm": 1.5167039884678175,
"learning_rate": 4.77844000037707e-06,
"loss": 0.7401,
"step": 59
},
{
"epoch": 0.6798866855524079,
"grad_norm": 1.427117682796941,
"learning_rate": 4.764600984163809e-06,
"loss": 0.7299,
"step": 60
},
{
"epoch": 0.6912181303116147,
"grad_norm": 1.4111273048659723,
"learning_rate": 4.750364054714302e-06,
"loss": 0.6947,
"step": 61
},
{
"epoch": 0.7025495750708215,
"grad_norm": 1.4031005513050043,
"learning_rate": 4.735731713600665e-06,
"loss": 0.7104,
"step": 62
},
{
"epoch": 0.7138810198300283,
"grad_norm": 1.3908217678623624,
"learning_rate": 4.72070653187283e-06,
"loss": 0.7215,
"step": 63
},
{
"epoch": 0.7252124645892352,
"grad_norm": 1.4016377075250317,
"learning_rate": 4.705291149606787e-06,
"loss": 0.6801,
"step": 64
},
{
"epoch": 0.7365439093484419,
"grad_norm": 1.373368498937934,
"learning_rate": 4.6894882754406965e-06,
"loss": 0.7115,
"step": 65
},
{
"epoch": 0.7478753541076487,
"grad_norm": 1.3083514867883594,
"learning_rate": 4.673300686098957e-06,
"loss": 0.6944,
"step": 66
},
{
"epoch": 0.7592067988668555,
"grad_norm": 1.408923716819617,
"learning_rate": 4.6567312259043e-06,
"loss": 0.7166,
"step": 67
},
{
"epoch": 0.7705382436260623,
"grad_norm": 1.4446149604718292,
"learning_rate": 4.639782806278021e-06,
"loss": 0.7643,
"step": 68
},
{
"epoch": 0.7818696883852692,
"grad_norm": 1.472987719405972,
"learning_rate": 4.622458405228411e-06,
"loss": 0.6748,
"step": 69
},
{
"epoch": 0.7932011331444759,
"grad_norm": 1.3910737638781718,
"learning_rate": 4.604761066827485e-06,
"loss": 0.6599,
"step": 70
},
{
"epoch": 0.8045325779036827,
"grad_norm": 1.4486804488805007,
"learning_rate": 4.586693900676116e-06,
"loss": 0.6844,
"step": 71
},
{
"epoch": 0.8158640226628895,
"grad_norm": 1.4339732853429255,
"learning_rate": 4.568260081357644e-06,
"loss": 0.6934,
"step": 72
},
{
"epoch": 0.8271954674220963,
"grad_norm": 1.4071241403434436,
"learning_rate": 4.549462847880066e-06,
"loss": 0.7042,
"step": 73
},
{
"epoch": 0.8385269121813032,
"grad_norm": 1.381067168557849,
"learning_rate": 4.5303055031069165e-06,
"loss": 0.6594,
"step": 74
},
{
"epoch": 0.8498583569405099,
"grad_norm": 1.370636257051823,
"learning_rate": 4.510791413176912e-06,
"loss": 0.7339,
"step": 75
},
{
"epoch": 0.8611898016997167,
"grad_norm": 1.3933507828709049,
"learning_rate": 4.490924006912497e-06,
"loss": 0.7319,
"step": 76
},
{
"epoch": 0.8725212464589235,
"grad_norm": 1.4434701376432475,
"learning_rate": 4.470706775217355e-06,
"loss": 0.7235,
"step": 77
},
{
"epoch": 0.8838526912181303,
"grad_norm": 1.3230684123602419,
"learning_rate": 4.450143270463031e-06,
"loss": 0.6653,
"step": 78
},
{
"epoch": 0.8951841359773371,
"grad_norm": 1.377425868827391,
"learning_rate": 4.429237105864735e-06,
"loss": 0.6929,
"step": 79
},
{
"epoch": 0.9065155807365439,
"grad_norm": 1.4118910710643473,
"learning_rate": 4.407991954846471e-06,
"loss": 0.6713,
"step": 80
},
{
"epoch": 0.9178470254957507,
"grad_norm": 1.2911698771129907,
"learning_rate": 4.386411550395576e-06,
"loss": 0.6828,
"step": 81
},
{
"epoch": 0.9291784702549575,
"grad_norm": 1.3062416329678588,
"learning_rate": 4.364499684406796e-06,
"loss": 0.6902,
"step": 82
},
{
"epoch": 0.9405099150141643,
"grad_norm": 1.4212108714610014,
"learning_rate": 4.3422602070160116e-06,
"loss": 0.7139,
"step": 83
},
{
"epoch": 0.9518413597733711,
"grad_norm": 1.4022201237779546,
"learning_rate": 4.319697025923736e-06,
"loss": 0.696,
"step": 84
},
{
"epoch": 0.9631728045325779,
"grad_norm": 1.4000079484179047,
"learning_rate": 4.296814105708482e-06,
"loss": 0.6978,
"step": 85
},
{
"epoch": 0.9745042492917847,
"grad_norm": 1.3135869027950182,
"learning_rate": 4.273615467130156e-06,
"loss": 0.7094,
"step": 86
},
{
"epoch": 0.9858356940509915,
"grad_norm": 1.5103342340296206,
"learning_rate": 4.250105186423564e-06,
"loss": 0.6864,
"step": 87
},
{
"epoch": 0.9971671388101983,
"grad_norm": 1.5395763391294515,
"learning_rate": 4.226287394582176e-06,
"loss": 0.6997,
"step": 88
},
{
"epoch": 1.0,
"grad_norm": 1.5395763391294515,
"learning_rate": 4.202166276632274e-06,
"loss": 0.7015,
"step": 89
},
{
"epoch": 1.0113314447592068,
"grad_norm": 2.9632436977275973,
"learning_rate": 4.177746070897593e-06,
"loss": 0.6146,
"step": 90
},
{
"epoch": 1.0226628895184136,
"grad_norm": 1.3701820521458077,
"learning_rate": 4.15303106825461e-06,
"loss": 0.5952,
"step": 91
},
{
"epoch": 1.0339943342776203,
"grad_norm": 1.4232251740156472,
"learning_rate": 4.128025611378594e-06,
"loss": 0.6013,
"step": 92
},
{
"epoch": 1.045325779036827,
"grad_norm": 1.4243450179098962,
"learning_rate": 4.10273409398055e-06,
"loss": 0.5838,
"step": 93
},
{
"epoch": 1.056657223796034,
"grad_norm": 1.3024426134525557,
"learning_rate": 4.077160960035207e-06,
"loss": 0.5719,
"step": 94
},
{
"epoch": 1.0679886685552409,
"grad_norm": 1.346350514168427,
"learning_rate": 4.051310703000155e-06,
"loss": 0.5969,
"step": 95
},
{
"epoch": 1.0793201133144477,
"grad_norm": 1.370884384362504,
"learning_rate": 4.025187865026311e-06,
"loss": 0.6079,
"step": 96
},
{
"epoch": 1.0906515580736544,
"grad_norm": 1.462001933003737,
"learning_rate": 3.998797036159813e-06,
"loss": 0.6286,
"step": 97
},
{
"epoch": 1.1019830028328612,
"grad_norm": 1.332600598608553,
"learning_rate": 3.972142853535499e-06,
"loss": 0.606,
"step": 98
},
{
"epoch": 1.113314447592068,
"grad_norm": 1.426061501851815,
"learning_rate": 3.945230000562121e-06,
"loss": 0.6109,
"step": 99
},
{
"epoch": 1.1246458923512748,
"grad_norm": 1.4004325571146066,
"learning_rate": 3.918063206099421e-06,
"loss": 0.62,
"step": 100
},
{
"epoch": 1.1359773371104815,
"grad_norm": 1.3396518626669338,
"learning_rate": 3.890647243627218e-06,
"loss": 0.5934,
"step": 101
},
{
"epoch": 1.1473087818696883,
"grad_norm": 1.380208956021839,
"learning_rate": 3.862986930406669e-06,
"loss": 0.5968,
"step": 102
},
{
"epoch": 1.158640226628895,
"grad_norm": 1.3961154205163853,
"learning_rate": 3.83508712663382e-06,
"loss": 0.6032,
"step": 103
},
{
"epoch": 1.1699716713881019,
"grad_norm": 1.3448453940648393,
"learning_rate": 3.8069527345856233e-06,
"loss": 0.5915,
"step": 104
},
{
"epoch": 1.1813031161473089,
"grad_norm": 1.3902990971135147,
"learning_rate": 3.7785886977585562e-06,
"loss": 0.5918,
"step": 105
},
{
"epoch": 1.1926345609065157,
"grad_norm": 1.3820749771088052,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.5969,
"step": 106
},
{
"epoch": 1.2039660056657224,
"grad_norm": 1.4716970228552981,
"learning_rate": 3.7211916646325315e-06,
"loss": 0.5941,
"step": 107
},
{
"epoch": 1.2152974504249292,
"grad_norm": 1.34235642577707,
"learning_rate": 3.6921687535712657e-06,
"loss": 0.5803,
"step": 108
},
{
"epoch": 1.226628895184136,
"grad_norm": 1.360958822094796,
"learning_rate": 3.662936366434435e-06,
"loss": 0.5882,
"step": 109
},
{
"epoch": 1.2379603399433428,
"grad_norm": 1.3865643140417971,
"learning_rate": 3.6334996396473298e-06,
"loss": 0.6127,
"step": 110
},
{
"epoch": 1.2492917847025495,
"grad_norm": 1.3096840589094985,
"learning_rate": 3.6038637455397802e-06,
"loss": 0.5703,
"step": 111
},
{
"epoch": 1.2606232294617563,
"grad_norm": 1.4346753864435622,
"learning_rate": 3.57403389143732e-06,
"loss": 0.5997,
"step": 112
},
{
"epoch": 1.271954674220963,
"grad_norm": 1.4377459031956494,
"learning_rate": 3.5440153187462146e-06,
"loss": 0.6251,
"step": 113
},
{
"epoch": 1.28328611898017,
"grad_norm": 1.4225200803648703,
"learning_rate": 3.513813302032485e-06,
"loss": 0.6202,
"step": 114
},
{
"epoch": 1.2946175637393766,
"grad_norm": 1.3836498692303638,
"learning_rate": 3.4834331480951213e-06,
"loss": 0.5944,
"step": 115
},
{
"epoch": 1.3059490084985836,
"grad_norm": 1.281401506633823,
"learning_rate": 3.4528801950336177e-06,
"loss": 0.551,
"step": 116
},
{
"epoch": 1.3172804532577904,
"grad_norm": 1.4771037032630314,
"learning_rate": 3.4221598113100196e-06,
"loss": 0.6072,
"step": 117
},
{
"epoch": 1.3286118980169972,
"grad_norm": 1.4652916781647747,
"learning_rate": 3.391277394805628e-06,
"loss": 0.6166,
"step": 118
},
{
"epoch": 1.339943342776204,
"grad_norm": 1.3590014582336747,
"learning_rate": 3.3602383718725363e-06,
"loss": 0.5753,
"step": 119
},
{
"epoch": 1.3512747875354107,
"grad_norm": 1.3644484003880029,
"learning_rate": 3.32904819638017e-06,
"loss": 0.5892,
"step": 120
},
{
"epoch": 1.3626062322946175,
"grad_norm": 1.3191831153419338,
"learning_rate": 3.2977123487569816e-06,
"loss": 0.5624,
"step": 121
},
{
"epoch": 1.3739376770538243,
"grad_norm": 1.3319312452343077,
"learning_rate": 3.2662363350274874e-06,
"loss": 0.5851,
"step": 122
},
{
"epoch": 1.385269121813031,
"grad_norm": 1.4257384239422966,
"learning_rate": 3.234625685844803e-06,
"loss": 0.5893,
"step": 123
},
{
"epoch": 1.3966005665722379,
"grad_norm": 1.3953828272396132,
"learning_rate": 3.202885955518849e-06,
"loss": 0.5973,
"step": 124
},
{
"epoch": 1.4079320113314449,
"grad_norm": 1.395900408790709,
"learning_rate": 3.171022721040409e-06,
"loss": 0.588,
"step": 125
},
{
"epoch": 1.4192634560906516,
"grad_norm": 1.4082770595189713,
"learning_rate": 3.139041581101187e-06,
"loss": 0.5955,
"step": 126
},
{
"epoch": 1.4305949008498584,
"grad_norm": 1.4111017631505742,
"learning_rate": 3.10694815511007e-06,
"loss": 0.6304,
"step": 127
},
{
"epoch": 1.4419263456090652,
"grad_norm": 1.3684496285444403,
"learning_rate": 3.0747480822057342e-06,
"loss": 0.5895,
"step": 128
},
{
"epoch": 1.453257790368272,
"grad_norm": 1.3150808865823653,
"learning_rate": 3.0424470202657953e-06,
"loss": 0.577,
"step": 129
},
{
"epoch": 1.4645892351274787,
"grad_norm": 1.4075517143230738,
"learning_rate": 3.0100506449126622e-06,
"loss": 0.5939,
"step": 130
},
{
"epoch": 1.4759206798866855,
"grad_norm": 1.3153882543446243,
"learning_rate": 2.9775646485162697e-06,
"loss": 0.5735,
"step": 131
},
{
"epoch": 1.4872521246458923,
"grad_norm": 1.3348680664842318,
"learning_rate": 2.9449947391938768e-06,
"loss": 0.625,
"step": 132
},
{
"epoch": 1.498583569405099,
"grad_norm": 1.3489224682673129,
"learning_rate": 2.9123466398070855e-06,
"loss": 0.5981,
"step": 133
},
{
"epoch": 1.509915014164306,
"grad_norm": 1.3429701772744527,
"learning_rate": 2.8796260869562865e-06,
"loss": 0.5887,
"step": 134
},
{
"epoch": 1.5212464589235126,
"grad_norm": 1.4722400862474931,
"learning_rate": 2.8468388299726714e-06,
"loss": 0.5831,
"step": 135
},
{
"epoch": 1.5325779036827196,
"grad_norm": 1.2672960818674879,
"learning_rate": 2.8139906299080205e-06,
"loss": 0.5825,
"step": 136
},
{
"epoch": 1.5439093484419264,
"grad_norm": 1.3627343997731545,
"learning_rate": 2.781087258522431e-06,
"loss": 0.5832,
"step": 137
},
{
"epoch": 1.5552407932011332,
"grad_norm": 1.2876772239163903,
"learning_rate": 2.7481344972701545e-06,
"loss": 0.5531,
"step": 138
},
{
"epoch": 1.56657223796034,
"grad_norm": 1.3472454325008387,
"learning_rate": 2.7151381362837424e-06,
"loss": 0.5842,
"step": 139
},
{
"epoch": 1.5779036827195467,
"grad_norm": 1.3762098724750873,
"learning_rate": 2.682103973356659e-06,
"loss": 0.5712,
"step": 140
},
{
"epoch": 1.5892351274787535,
"grad_norm": 1.3492795625450165,
"learning_rate": 2.64903781292455e-06,
"loss": 0.5782,
"step": 141
},
{
"epoch": 1.6005665722379603,
"grad_norm": 1.3499587695758297,
"learning_rate": 2.615945465045346e-06,
"loss": 0.5669,
"step": 142
},
{
"epoch": 1.6118980169971673,
"grad_norm": 1.3511311195079772,
"learning_rate": 2.5828327443783775e-06,
"loss": 0.551,
"step": 143
},
{
"epoch": 1.6232294617563738,
"grad_norm": 1.33090787217115,
"learning_rate": 2.5497054691626754e-06,
"loss": 0.579,
"step": 144
},
{
"epoch": 1.6345609065155808,
"grad_norm": 1.3670590438796546,
"learning_rate": 2.5165694601946566e-06,
"loss": 0.5959,
"step": 145
},
{
"epoch": 1.6458923512747874,
"grad_norm": 1.3299516191015563,
"learning_rate": 2.483430539805344e-06,
"loss": 0.5979,
"step": 146
},
{
"epoch": 1.6572237960339944,
"grad_norm": 1.4129085459444395,
"learning_rate": 2.4502945308373246e-06,
"loss": 0.585,
"step": 147
},
{
"epoch": 1.6685552407932012,
"grad_norm": 1.4275247309590513,
"learning_rate": 2.4171672556216237e-06,
"loss": 0.576,
"step": 148
},
{
"epoch": 1.679886685552408,
"grad_norm": 1.3619200458990444,
"learning_rate": 2.3840545349546538e-06,
"loss": 0.5841,
"step": 149
},
{
"epoch": 1.6912181303116147,
"grad_norm": 1.3819435365787816,
"learning_rate": 2.3509621870754505e-06,
"loss": 0.5685,
"step": 150
},
{
"epoch": 1.7025495750708215,
"grad_norm": 1.3294738919196907,
"learning_rate": 2.317896026643341e-06,
"loss": 0.5871,
"step": 151
},
{
"epoch": 1.7138810198300283,
"grad_norm": 1.2730007734790987,
"learning_rate": 2.2848618637162584e-06,
"loss": 0.5592,
"step": 152
},
{
"epoch": 1.725212464589235,
"grad_norm": 1.3785771168476042,
"learning_rate": 2.2518655027298468e-06,
"loss": 0.577,
"step": 153
},
{
"epoch": 1.736543909348442,
"grad_norm": 1.3530333929384266,
"learning_rate": 2.21891274147757e-06,
"loss": 0.5503,
"step": 154
},
{
"epoch": 1.7478753541076486,
"grad_norm": 1.4267187662110872,
"learning_rate": 2.1860093700919804e-06,
"loss": 0.6071,
"step": 155
},
{
"epoch": 1.7592067988668556,
"grad_norm": 1.3951110903420234,
"learning_rate": 2.15316117002733e-06,
"loss": 0.5629,
"step": 156
},
{
"epoch": 1.7705382436260622,
"grad_norm": 1.3033237896515593,
"learning_rate": 2.1203739130437147e-06,
"loss": 0.5452,
"step": 157
},
{
"epoch": 1.7818696883852692,
"grad_norm": 1.380166287245991,
"learning_rate": 2.0876533601929153e-06,
"loss": 0.5811,
"step": 158
},
{
"epoch": 1.793201133144476,
"grad_norm": 1.2839541967552655,
"learning_rate": 2.055005260806125e-06,
"loss": 0.5672,
"step": 159
},
{
"epoch": 1.8045325779036827,
"grad_norm": 1.3067862009338267,
"learning_rate": 2.0224353514837307e-06,
"loss": 0.5683,
"step": 160
},
{
"epoch": 1.8158640226628895,
"grad_norm": 1.3243283509277737,
"learning_rate": 1.989949355087339e-06,
"loss": 0.5689,
"step": 161
},
{
"epoch": 1.8271954674220963,
"grad_norm": 1.3356790286830134,
"learning_rate": 1.957552979734205e-06,
"loss": 0.5802,
"step": 162
},
{
"epoch": 1.8385269121813033,
"grad_norm": 1.2892603884545701,
"learning_rate": 1.9252519177942666e-06,
"loss": 0.5692,
"step": 163
},
{
"epoch": 1.8498583569405098,
"grad_norm": 1.4440754700865919,
"learning_rate": 1.8930518448899304e-06,
"loss": 0.5965,
"step": 164
},
{
"epoch": 1.8611898016997168,
"grad_norm": 1.387136746836695,
"learning_rate": 1.8609584188988135e-06,
"loss": 0.5736,
"step": 165
},
{
"epoch": 1.8725212464589234,
"grad_norm": 1.1917095159893407,
"learning_rate": 1.8289772789595917e-06,
"loss": 0.6144,
"step": 166
},
{
"epoch": 1.8838526912181304,
"grad_norm": 1.375742481693197,
"learning_rate": 1.7971140444811514e-06,
"loss": 0.5763,
"step": 167
},
{
"epoch": 1.8951841359773371,
"grad_norm": 1.2404940123916632,
"learning_rate": 1.7653743141551983e-06,
"loss": 0.6063,
"step": 168
},
{
"epoch": 1.906515580736544,
"grad_norm": 1.3989557605487426,
"learning_rate": 1.7337636649725132e-06,
"loss": 0.5892,
"step": 169
},
{
"epoch": 1.9178470254957507,
"grad_norm": 1.2710248336513368,
"learning_rate": 1.7022876512430197e-06,
"loss": 0.5813,
"step": 170
},
{
"epoch": 1.9291784702549575,
"grad_norm": 1.2716638071587159,
"learning_rate": 1.6709518036198307e-06,
"loss": 0.5565,
"step": 171
},
{
"epoch": 1.9405099150141643,
"grad_norm": 1.277667152903096,
"learning_rate": 1.6397616281274648e-06,
"loss": 0.5727,
"step": 172
},
{
"epoch": 1.951841359773371,
"grad_norm": 1.296382463166991,
"learning_rate": 1.6087226051943728e-06,
"loss": 0.593,
"step": 173
},
{
"epoch": 1.963172804532578,
"grad_norm": 1.2844540181786357,
"learning_rate": 1.5778401886899808e-06,
"loss": 0.5841,
"step": 174
},
{
"epoch": 1.9745042492917846,
"grad_norm": 1.3053433701687789,
"learning_rate": 1.5471198049663822e-06,
"loss": 0.575,
"step": 175
},
{
"epoch": 1.9858356940509916,
"grad_norm": 1.268811845033466,
"learning_rate": 1.51656685190488e-06,
"loss": 0.588,
"step": 176
},
{
"epoch": 1.9971671388101981,
"grad_norm": 1.247792131322831,
"learning_rate": 1.4861866979675155e-06,
"loss": 0.5534,
"step": 177
},
{
"epoch": 2.0,
"grad_norm": 1.247792131322831,
"learning_rate": 1.455984681253787e-06,
"loss": 0.5438,
"step": 178
},
{
"epoch": 2.011331444759207,
"grad_norm": 2.7694057916801307,
"learning_rate": 1.4259661085626802e-06,
"loss": 0.5062,
"step": 179
},
{
"epoch": 2.0226628895184136,
"grad_norm": 1.2786953899807016,
"learning_rate": 1.3961362544602215e-06,
"loss": 0.4878,
"step": 180
},
{
"epoch": 2.0339943342776206,
"grad_norm": 1.3338942724237564,
"learning_rate": 1.3665003603526706e-06,
"loss": 0.5131,
"step": 181
},
{
"epoch": 2.045325779036827,
"grad_norm": 1.3381680372557467,
"learning_rate": 1.3370636335655656e-06,
"loss": 0.4976,
"step": 182
},
{
"epoch": 2.056657223796034,
"grad_norm": 1.2513390840039769,
"learning_rate": 1.3078312464287355e-06,
"loss": 0.5211,
"step": 183
},
{
"epoch": 2.0679886685552407,
"grad_norm": 1.2770532289534442,
"learning_rate": 1.2788083353674694e-06,
"loss": 0.5007,
"step": 184
},
{
"epoch": 2.0793201133144477,
"grad_norm": 1.2927230496024624,
"learning_rate": 1.2500000000000007e-06,
"loss": 0.4622,
"step": 185
},
{
"epoch": 2.090651558073654,
"grad_norm": 1.2508343832786988,
"learning_rate": 1.2214113022414448e-06,
"loss": 0.4844,
"step": 186
},
{
"epoch": 2.101983002832861,
"grad_norm": 1.207057675657945,
"learning_rate": 1.1930472654143777e-06,
"loss": 0.4948,
"step": 187
},
{
"epoch": 2.113314447592068,
"grad_norm": 1.2515484350526327,
"learning_rate": 1.1649128733661802e-06,
"loss": 0.4975,
"step": 188
},
{
"epoch": 2.1246458923512748,
"grad_norm": 1.2820103087615313,
"learning_rate": 1.1370130695933317e-06,
"loss": 0.5033,
"step": 189
},
{
"epoch": 2.1359773371104818,
"grad_norm": 1.3883649461446737,
"learning_rate": 1.1093527563727827e-06,
"loss": 0.4959,
"step": 190
},
{
"epoch": 2.1473087818696883,
"grad_norm": 1.2985788114536188,
"learning_rate": 1.0819367939005802e-06,
"loss": 0.5109,
"step": 191
},
{
"epoch": 2.1586402266288953,
"grad_norm": 1.3965326476598117,
"learning_rate": 1.0547699994378787e-06,
"loss": 0.4812,
"step": 192
},
{
"epoch": 2.169971671388102,
"grad_norm": 1.3349013266585352,
"learning_rate": 1.0278571464645013e-06,
"loss": 0.4926,
"step": 193
},
{
"epoch": 2.181303116147309,
"grad_norm": 1.3212505511291743,
"learning_rate": 1.0012029638401871e-06,
"loss": 0.4882,
"step": 194
},
{
"epoch": 2.1926345609065154,
"grad_norm": 1.30500171720855,
"learning_rate": 9.74812134973689e-07,
"loss": 0.5173,
"step": 195
},
{
"epoch": 2.2039660056657224,
"grad_norm": 1.2488171277934157,
"learning_rate": 9.486892969998465e-07,
"loss": 0.482,
"step": 196
},
{
"epoch": 2.215297450424929,
"grad_norm": 1.2854653346406788,
"learning_rate": 9.228390399647944e-07,
"loss": 0.5015,
"step": 197
},
{
"epoch": 2.226628895184136,
"grad_norm": 1.323794953427552,
"learning_rate": 8.972659060194505e-07,
"loss": 0.4735,
"step": 198
},
{
"epoch": 2.237960339943343,
"grad_norm": 1.3376265040553337,
"learning_rate": 8.719743886214071e-07,
"loss": 0.4875,
"step": 199
},
{
"epoch": 2.2492917847025495,
"grad_norm": 1.3285002033736215,
"learning_rate": 8.469689317453907e-07,
"loss": 0.4962,
"step": 200
},
{
"epoch": 2.2606232294617565,
"grad_norm": 1.2555976735704084,
"learning_rate": 8.222539291024079e-07,
"loss": 0.5005,
"step": 201
},
{
"epoch": 2.271954674220963,
"grad_norm": 1.2410255920694668,
"learning_rate": 7.978337233677269e-07,
"loss": 0.4882,
"step": 202
},
{
"epoch": 2.28328611898017,
"grad_norm": 1.2713376284380098,
"learning_rate": 7.737126054178238e-07,
"loss": 0.4739,
"step": 203
},
{
"epoch": 2.2946175637393766,
"grad_norm": 1.2772097228177208,
"learning_rate": 7.49894813576437e-07,
"loss": 0.4652,
"step": 204
},
{
"epoch": 2.3059490084985836,
"grad_norm": 1.3985331094997697,
"learning_rate": 7.26384532869844e-07,
"loss": 0.4983,
"step": 205
},
{
"epoch": 2.31728045325779,
"grad_norm": 1.2385757104958386,
"learning_rate": 7.031858942915187e-07,
"loss": 0.4848,
"step": 206
},
{
"epoch": 2.328611898016997,
"grad_norm": 1.298621814561205,
"learning_rate": 6.803029740762648e-07,
"loss": 0.499,
"step": 207
},
{
"epoch": 2.3399433427762037,
"grad_norm": 1.1982592548193622,
"learning_rate": 6.577397929839891e-07,
"loss": 0.5074,
"step": 208
},
{
"epoch": 2.3512747875354107,
"grad_norm": 1.3230369234558008,
"learning_rate": 6.355003155932052e-07,
"loss": 0.5082,
"step": 209
},
{
"epoch": 2.3626062322946177,
"grad_norm": 1.2353352817463088,
"learning_rate": 6.135884496044245e-07,
"loss": 0.5024,
"step": 210
},
{
"epoch": 2.3739376770538243,
"grad_norm": 1.2866413763490911,
"learning_rate": 5.920080451535296e-07,
"loss": 0.5158,
"step": 211
},
{
"epoch": 2.3852691218130313,
"grad_norm": 1.366484849292798,
"learning_rate": 5.707628941352655e-07,
"loss": 0.5068,
"step": 212
},
{
"epoch": 2.396600566572238,
"grad_norm": 1.287762702589202,
"learning_rate": 5.4985672953697e-07,
"loss": 0.4563,
"step": 213
},
{
"epoch": 2.407932011331445,
"grad_norm": 1.2310856033033788,
"learning_rate": 5.292932247826449e-07,
"loss": 0.5104,
"step": 214
},
{
"epoch": 2.4192634560906514,
"grad_norm": 1.2722457695575675,
"learning_rate": 5.090759930875039e-07,
"loss": 0.4745,
"step": 215
},
{
"epoch": 2.4305949008498584,
"grad_norm": 1.1881380406354796,
"learning_rate": 4.892085868230881e-07,
"loss": 0.4684,
"step": 216
},
{
"epoch": 2.441926345609065,
"grad_norm": 1.2279337924506066,
"learning_rate": 4.696944968930847e-07,
"loss": 0.4766,
"step": 217
},
{
"epoch": 2.453257790368272,
"grad_norm": 1.2965879507443776,
"learning_rate": 4.505371521199342e-07,
"loss": 0.4887,
"step": 218
},
{
"epoch": 2.4645892351274785,
"grad_norm": 1.2726278849407309,
"learning_rate": 4.317399186423574e-07,
"loss": 0.49,
"step": 219
},
{
"epoch": 2.4759206798866855,
"grad_norm": 1.2083763664947262,
"learning_rate": 4.1330609932388493e-07,
"loss": 0.4714,
"step": 220
},
{
"epoch": 2.4872521246458925,
"grad_norm": 1.2790819302434515,
"learning_rate": 3.9523893317251624e-07,
"loss": 0.4924,
"step": 221
},
{
"epoch": 2.498583569405099,
"grad_norm": 1.2824144839819291,
"learning_rate": 3.7754159477158994e-07,
"loss": 0.4969,
"step": 222
},
{
"epoch": 2.509915014164306,
"grad_norm": 1.2322829650550764,
"learning_rate": 3.602171937219789e-07,
"loss": 0.4922,
"step": 223
},
{
"epoch": 2.5212464589235126,
"grad_norm": 1.2932932363502108,
"learning_rate": 3.4326877409570083e-07,
"loss": 0.5135,
"step": 224
},
{
"epoch": 2.5325779036827196,
"grad_norm": 1.3265821035298209,
"learning_rate": 3.266993139010438e-07,
"loss": 0.4824,
"step": 225
},
{
"epoch": 2.543909348441926,
"grad_norm": 1.2634627344428153,
"learning_rate": 3.1051172455930395e-07,
"loss": 0.4756,
"step": 226
},
{
"epoch": 2.555240793201133,
"grad_norm": 1.2955666908690064,
"learning_rate": 2.947088503932136e-07,
"loss": 0.49,
"step": 227
},
{
"epoch": 2.56657223796034,
"grad_norm": 1.2966827797243252,
"learning_rate": 2.792934681271708e-07,
"loss": 0.5022,
"step": 228
},
{
"epoch": 2.5779036827195467,
"grad_norm": 1.294488037545825,
"learning_rate": 2.642682863993354e-07,
"loss": 0.4995,
"step": 229
},
{
"epoch": 2.5892351274787533,
"grad_norm": 1.2899319442218364,
"learning_rate": 2.4963594528569835e-07,
"loss": 0.5022,
"step": 230
},
{
"epoch": 2.6005665722379603,
"grad_norm": 1.2411992299049828,
"learning_rate": 2.3539901583619186e-07,
"loss": 0.4815,
"step": 231
},
{
"epoch": 2.6118980169971673,
"grad_norm": 1.290057331367847,
"learning_rate": 2.2155999962293035e-07,
"loss": 0.4777,
"step": 232
},
{
"epoch": 2.623229461756374,
"grad_norm": 1.336160271366626,
"learning_rate": 2.081213283006575e-07,
"loss": 0.4814,
"step": 233
},
{
"epoch": 2.634560906515581,
"grad_norm": 1.2513839029803793,
"learning_rate": 1.9508536317948358e-07,
"loss": 0.4871,
"step": 234
},
{
"epoch": 2.6458923512747874,
"grad_norm": 1.212015408532025,
"learning_rate": 1.824543948099744e-07,
"loss": 0.4726,
"step": 235
},
{
"epoch": 2.6572237960339944,
"grad_norm": 1.2427046825799253,
"learning_rate": 1.702306425806838e-07,
"loss": 0.4807,
"step": 236
},
{
"epoch": 2.668555240793201,
"grad_norm": 1.2778407829783978,
"learning_rate": 1.584162543281806e-07,
"loss": 0.4957,
"step": 237
},
{
"epoch": 2.679886685552408,
"grad_norm": 1.226886722658913,
"learning_rate": 1.4701330595965401e-07,
"loss": 0.4898,
"step": 238
},
{
"epoch": 2.691218130311615,
"grad_norm": 1.300360670595622,
"learning_rate": 1.3602380108815537e-07,
"loss": 0.4841,
"step": 239
},
{
"epoch": 2.7025495750708215,
"grad_norm": 1.313490684903286,
"learning_rate": 1.2544967068054332e-07,
"loss": 0.4954,
"step": 240
},
{
"epoch": 2.713881019830028,
"grad_norm": 1.2920263932419636,
"learning_rate": 1.152927727181935e-07,
"loss": 0.5249,
"step": 241
},
{
"epoch": 2.725212464589235,
"grad_norm": 1.3447396451038305,
"learning_rate": 1.0555489187053097e-07,
"loss": 0.5207,
"step": 242
},
{
"epoch": 2.736543909348442,
"grad_norm": 1.3527682966828949,
"learning_rate": 9.623773918144896e-08,
"loss": 0.5077,
"step": 243
},
{
"epoch": 2.7478753541076486,
"grad_norm": 1.368471140884032,
"learning_rate": 8.734295176865748e-08,
"loss": 0.5081,
"step": 244
},
{
"epoch": 2.7592067988668556,
"grad_norm": 1.278186747537926,
"learning_rate": 7.88720925360284e-08,
"loss": 0.4944,
"step": 245
},
{
"epoch": 2.770538243626062,
"grad_norm": 1.337416906476136,
"learning_rate": 7.082664989897486e-08,
"loss": 0.4764,
"step": 246
},
{
"epoch": 2.781869688385269,
"grad_norm": 1.280766158552745,
"learning_rate": 6.320803752292465e-08,
"loss": 0.4567,
"step": 247
},
{
"epoch": 2.7932011331444757,
"grad_norm": 1.1973951885363563,
"learning_rate": 5.601759407492108e-08,
"loss": 0.4896,
"step": 248
},
{
"epoch": 2.8045325779036827,
"grad_norm": 1.3298617699728477,
"learning_rate": 4.9256582988409795e-08,
"loss": 0.5015,
"step": 249
},
{
"epoch": 2.8158640226628897,
"grad_norm": 1.2840493283766243,
"learning_rate": 4.292619224123717e-08,
"loss": 0.4702,
"step": 250
},
{
"epoch": 2.8271954674220963,
"grad_norm": 1.2753413052551887,
"learning_rate": 3.702753414691368e-08,
"loss": 0.4677,
"step": 251
},
{
"epoch": 2.8385269121813033,
"grad_norm": 1.2669874412423119,
"learning_rate": 3.15616451591666e-08,
"loss": 0.485,
"step": 252
},
{
"epoch": 2.84985835694051,
"grad_norm": 1.2670916015981173,
"learning_rate": 2.6529485689825996e-08,
"loss": 0.4979,
"step": 253
},
{
"epoch": 2.861189801699717,
"grad_norm": 1.3102429493654797,
"learning_rate": 2.1931939940071368e-08,
"loss": 0.4719,
"step": 254
},
{
"epoch": 2.8725212464589234,
"grad_norm": 1.2716849784011235,
"learning_rate": 1.7769815745066476e-08,
"loss": 0.5018,
"step": 255
},
{
"epoch": 2.8838526912181304,
"grad_norm": 1.2711203429937163,
"learning_rate": 1.4043844432016507e-08,
"loss": 0.5098,
"step": 256
},
{
"epoch": 2.8951841359773374,
"grad_norm": 1.3331489651068749,
"learning_rate": 1.0754680691665299e-08,
"loss": 0.4731,
"step": 257
},
{
"epoch": 2.906515580736544,
"grad_norm": 1.3288134823029176,
"learning_rate": 7.90290246326042e-09,
"loss": 0.5416,
"step": 258
},
{
"epoch": 2.9178470254957505,
"grad_norm": 1.2953779393999099,
"learning_rate": 5.489010833002739e-09,
"loss": 0.4851,
"step": 259
},
{
"epoch": 2.9291784702549575,
"grad_norm": 1.3104504966732855,
"learning_rate": 3.51342994600129e-09,
"loss": 0.5082,
"step": 260
},
{
"epoch": 2.9405099150141645,
"grad_norm": 1.2404192298354901,
"learning_rate": 1.976506931745392e-09,
"loss": 0.4797,
"step": 261
},
{
"epoch": 2.951841359773371,
"grad_norm": 1.25894417590066,
"learning_rate": 8.78511843112051e-10,
"loss": 0.5091,
"step": 262
},
{
"epoch": 2.963172804532578,
"grad_norm": 1.3062783600910168,
"learning_rate": 2.1963760891391406e-10,
"loss": 0.495,
"step": 263
},
{
"epoch": 2.9745042492917846,
"grad_norm": 1.2953858272939929,
"learning_rate": 0.0,
"loss": 0.4696,
"step": 264
}
],
"logging_steps": 1,
"max_steps": 264,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 28634663264256.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}