{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.5877022653721684,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012944983818770227,
"grad_norm": 2.9600605964660645,
"learning_rate": 9e-06,
"loss": 2.4498,
"step": 10
},
{
"epoch": 0.025889967637540454,
"grad_norm": 2.9047722816467285,
"learning_rate": 1.9e-05,
"loss": 2.307,
"step": 20
},
{
"epoch": 0.038834951456310676,
"grad_norm": 1.4037628173828125,
"learning_rate": 2.9e-05,
"loss": 1.9692,
"step": 30
},
{
"epoch": 0.05177993527508091,
"grad_norm": 0.9827209711074829,
"learning_rate": 3.9000000000000006e-05,
"loss": 1.5169,
"step": 40
},
{
"epoch": 0.06472491909385113,
"grad_norm": 1.69009530544281,
"learning_rate": 4.9e-05,
"loss": 1.1746,
"step": 50
},
{
"epoch": 0.07766990291262135,
"grad_norm": 0.6970093250274658,
"learning_rate": 5.9e-05,
"loss": 1.1462,
"step": 60
},
{
"epoch": 0.09061488673139159,
"grad_norm": 1.1838749647140503,
"learning_rate": 6.9e-05,
"loss": 1.0314,
"step": 70
},
{
"epoch": 0.10355987055016182,
"grad_norm": 1.2029207944869995,
"learning_rate": 7.900000000000001e-05,
"loss": 1.015,
"step": 80
},
{
"epoch": 0.11650485436893204,
"grad_norm": 0.7995015978813171,
"learning_rate": 8.900000000000001e-05,
"loss": 0.9881,
"step": 90
},
{
"epoch": 0.12944983818770225,
"grad_norm": 0.7544731497764587,
"learning_rate": 9.900000000000001e-05,
"loss": 0.9749,
"step": 100
},
{
"epoch": 0.1423948220064725,
"grad_norm": 0.9032944440841675,
"learning_rate": 9.959441189725102e-05,
"loss": 0.9717,
"step": 110
},
{
"epoch": 0.1553398058252427,
"grad_norm": 0.7858815789222717,
"learning_rate": 9.914375844975215e-05,
"loss": 0.9365,
"step": 120
},
{
"epoch": 0.16828478964401294,
"grad_norm": 0.6408785581588745,
"learning_rate": 9.869310500225327e-05,
"loss": 1.0061,
"step": 130
},
{
"epoch": 0.18122977346278318,
"grad_norm": 0.7295084595680237,
"learning_rate": 9.82424515547544e-05,
"loss": 1.0838,
"step": 140
},
{
"epoch": 0.1941747572815534,
"grad_norm": 1.299147129058838,
"learning_rate": 9.779179810725552e-05,
"loss": 0.8957,
"step": 150
},
{
"epoch": 0.20711974110032363,
"grad_norm": 0.6116259694099426,
"learning_rate": 9.734114465975666e-05,
"loss": 0.9007,
"step": 160
},
{
"epoch": 0.22006472491909385,
"grad_norm": 1.0018341541290283,
"learning_rate": 9.689049121225779e-05,
"loss": 0.9559,
"step": 170
},
{
"epoch": 0.23300970873786409,
"grad_norm": 0.6822437047958374,
"learning_rate": 9.64398377647589e-05,
"loss": 0.8807,
"step": 180
},
{
"epoch": 0.2459546925566343,
"grad_norm": 0.6875489950180054,
"learning_rate": 9.598918431726003e-05,
"loss": 0.892,
"step": 190
},
{
"epoch": 0.2588996763754045,
"grad_norm": 0.9155406355857849,
"learning_rate": 9.553853086976116e-05,
"loss": 0.8846,
"step": 200
},
{
"epoch": 0.27184466019417475,
"grad_norm": 0.7715094685554504,
"learning_rate": 9.508787742226228e-05,
"loss": 0.9194,
"step": 210
},
{
"epoch": 0.284789644012945,
"grad_norm": 0.9338945150375366,
"learning_rate": 9.463722397476341e-05,
"loss": 0.843,
"step": 220
},
{
"epoch": 0.2977346278317152,
"grad_norm": 0.5477790236473083,
"learning_rate": 9.418657052726453e-05,
"loss": 0.9608,
"step": 230
},
{
"epoch": 0.3106796116504854,
"grad_norm": 0.7385268211364746,
"learning_rate": 9.373591707976567e-05,
"loss": 0.8578,
"step": 240
},
{
"epoch": 0.32362459546925565,
"grad_norm": 0.7461796402931213,
"learning_rate": 9.32852636322668e-05,
"loss": 0.8357,
"step": 250
},
{
"epoch": 0.3365695792880259,
"grad_norm": 0.7043349146842957,
"learning_rate": 9.283461018476792e-05,
"loss": 0.9199,
"step": 260
},
{
"epoch": 0.34951456310679613,
"grad_norm": 0.6864545941352844,
"learning_rate": 9.238395673726904e-05,
"loss": 0.8716,
"step": 270
},
{
"epoch": 0.36245954692556637,
"grad_norm": 0.8263736963272095,
"learning_rate": 9.193330328977017e-05,
"loss": 0.9072,
"step": 280
},
{
"epoch": 0.37540453074433655,
"grad_norm": 0.7197927832603455,
"learning_rate": 9.148264984227129e-05,
"loss": 0.9276,
"step": 290
},
{
"epoch": 0.3883495145631068,
"grad_norm": 0.769057035446167,
"learning_rate": 9.103199639477243e-05,
"loss": 0.9371,
"step": 300
},
{
"epoch": 0.40129449838187703,
"grad_norm": 0.6112708449363708,
"learning_rate": 9.058134294727354e-05,
"loss": 0.841,
"step": 310
},
{
"epoch": 0.41423948220064727,
"grad_norm": 0.6785593628883362,
"learning_rate": 9.013068949977468e-05,
"loss": 0.8026,
"step": 320
},
{
"epoch": 0.42718446601941745,
"grad_norm": 0.75263911485672,
"learning_rate": 8.968003605227581e-05,
"loss": 0.8829,
"step": 330
},
{
"epoch": 0.4401294498381877,
"grad_norm": 0.6636873483657837,
"learning_rate": 8.922938260477693e-05,
"loss": 0.877,
"step": 340
},
{
"epoch": 0.45307443365695793,
"grad_norm": 0.7837623357772827,
"learning_rate": 8.877872915727806e-05,
"loss": 0.8355,
"step": 350
},
{
"epoch": 0.46601941747572817,
"grad_norm": 0.7330045104026794,
"learning_rate": 8.832807570977918e-05,
"loss": 0.8869,
"step": 360
},
{
"epoch": 0.47896440129449835,
"grad_norm": 0.7116459012031555,
"learning_rate": 8.78774222622803e-05,
"loss": 0.8861,
"step": 370
},
{
"epoch": 0.4919093851132686,
"grad_norm": 0.7309095859527588,
"learning_rate": 8.742676881478144e-05,
"loss": 0.8943,
"step": 380
},
{
"epoch": 0.5048543689320388,
"grad_norm": 0.9951479434967041,
"learning_rate": 8.697611536728256e-05,
"loss": 0.8906,
"step": 390
},
{
"epoch": 0.517799352750809,
"grad_norm": 0.8258851766586304,
"learning_rate": 8.652546191978369e-05,
"loss": 0.7817,
"step": 400
},
{
"epoch": 0.5307443365695793,
"grad_norm": 0.8005662560462952,
"learning_rate": 8.607480847228482e-05,
"loss": 0.8965,
"step": 410
},
{
"epoch": 0.5436893203883495,
"grad_norm": 0.956330418586731,
"learning_rate": 8.562415502478594e-05,
"loss": 0.8186,
"step": 420
},
{
"epoch": 0.5566343042071198,
"grad_norm": 0.7853320240974426,
"learning_rate": 8.517350157728708e-05,
"loss": 0.7989,
"step": 430
},
{
"epoch": 0.56957928802589,
"grad_norm": 0.8193638920783997,
"learning_rate": 8.47228481297882e-05,
"loss": 0.833,
"step": 440
},
{
"epoch": 0.5825242718446602,
"grad_norm": 0.6770658493041992,
"learning_rate": 8.427219468228931e-05,
"loss": 0.754,
"step": 450
},
{
"epoch": 0.5954692556634305,
"grad_norm": 0.7300212979316711,
"learning_rate": 8.382154123479045e-05,
"loss": 0.8801,
"step": 460
},
{
"epoch": 0.6084142394822006,
"grad_norm": 0.9311557412147522,
"learning_rate": 8.337088778729157e-05,
"loss": 0.9497,
"step": 470
},
{
"epoch": 0.6213592233009708,
"grad_norm": 0.8132520914077759,
"learning_rate": 8.29202343397927e-05,
"loss": 0.7912,
"step": 480
},
{
"epoch": 0.6343042071197411,
"grad_norm": 0.6899361610412598,
"learning_rate": 8.246958089229383e-05,
"loss": 0.8118,
"step": 490
},
{
"epoch": 0.6472491909385113,
"grad_norm": 0.8400319218635559,
"learning_rate": 8.201892744479495e-05,
"loss": 0.8223,
"step": 500
},
{
"epoch": 0.6601941747572816,
"grad_norm": 0.6506232023239136,
"learning_rate": 8.156827399729609e-05,
"loss": 0.8067,
"step": 510
},
{
"epoch": 0.6731391585760518,
"grad_norm": 0.7419421672821045,
"learning_rate": 8.111762054979722e-05,
"loss": 0.8338,
"step": 520
},
{
"epoch": 0.686084142394822,
"grad_norm": 0.8188750743865967,
"learning_rate": 8.066696710229834e-05,
"loss": 0.8502,
"step": 530
},
{
"epoch": 0.6990291262135923,
"grad_norm": 0.7666177153587341,
"learning_rate": 8.021631365479946e-05,
"loss": 0.9033,
"step": 540
},
{
"epoch": 0.7119741100323624,
"grad_norm": 0.7812498211860657,
"learning_rate": 7.976566020730059e-05,
"loss": 0.8686,
"step": 550
},
{
"epoch": 0.7249190938511327,
"grad_norm": 0.7209528684616089,
"learning_rate": 7.931500675980171e-05,
"loss": 0.9243,
"step": 560
},
{
"epoch": 0.7378640776699029,
"grad_norm": 0.8110234141349792,
"learning_rate": 7.886435331230284e-05,
"loss": 0.8535,
"step": 570
},
{
"epoch": 0.7508090614886731,
"grad_norm": 0.9169409871101379,
"learning_rate": 7.841369986480396e-05,
"loss": 0.7916,
"step": 580
},
{
"epoch": 0.7637540453074434,
"grad_norm": 0.9186325073242188,
"learning_rate": 7.79630464173051e-05,
"loss": 0.8712,
"step": 590
},
{
"epoch": 0.7766990291262136,
"grad_norm": 0.7394826412200928,
"learning_rate": 7.751239296980623e-05,
"loss": 0.8319,
"step": 600
},
{
"epoch": 0.7896440129449838,
"grad_norm": 0.9118407964706421,
"learning_rate": 7.706173952230735e-05,
"loss": 0.8615,
"step": 610
},
{
"epoch": 0.8025889967637541,
"grad_norm": 0.8932146430015564,
"learning_rate": 7.661108607480848e-05,
"loss": 0.8729,
"step": 620
},
{
"epoch": 0.8155339805825242,
"grad_norm": 0.8682609796524048,
"learning_rate": 7.61604326273096e-05,
"loss": 0.8196,
"step": 630
},
{
"epoch": 0.8284789644012945,
"grad_norm": 0.7901037931442261,
"learning_rate": 7.570977917981072e-05,
"loss": 0.8124,
"step": 640
},
{
"epoch": 0.8414239482200647,
"grad_norm": 0.787814199924469,
"learning_rate": 7.525912573231186e-05,
"loss": 0.7854,
"step": 650
},
{
"epoch": 0.8543689320388349,
"grad_norm": 0.9356242418289185,
"learning_rate": 7.480847228481298e-05,
"loss": 0.7527,
"step": 660
},
{
"epoch": 0.8673139158576052,
"grad_norm": 0.7236626744270325,
"learning_rate": 7.435781883731411e-05,
"loss": 0.7956,
"step": 670
},
{
"epoch": 0.8802588996763754,
"grad_norm": 1.0258172750473022,
"learning_rate": 7.390716538981524e-05,
"loss": 0.7896,
"step": 680
},
{
"epoch": 0.8932038834951457,
"grad_norm": 0.8183866739273071,
"learning_rate": 7.345651194231636e-05,
"loss": 0.7084,
"step": 690
},
{
"epoch": 0.9061488673139159,
"grad_norm": 1.1878470182418823,
"learning_rate": 7.30058584948175e-05,
"loss": 0.7582,
"step": 700
},
{
"epoch": 0.919093851132686,
"grad_norm": 0.933775007724762,
"learning_rate": 7.255520504731861e-05,
"loss": 0.7988,
"step": 710
},
{
"epoch": 0.9320388349514563,
"grad_norm": 0.8619435429573059,
"learning_rate": 7.210455159981975e-05,
"loss": 0.8472,
"step": 720
},
{
"epoch": 0.9449838187702265,
"grad_norm": 0.8439723253250122,
"learning_rate": 7.165389815232087e-05,
"loss": 0.7647,
"step": 730
},
{
"epoch": 0.9579288025889967,
"grad_norm": 0.8391640782356262,
"learning_rate": 7.120324470482199e-05,
"loss": 0.811,
"step": 740
},
{
"epoch": 0.970873786407767,
"grad_norm": 0.8031836748123169,
"learning_rate": 7.075259125732312e-05,
"loss": 0.7377,
"step": 750
},
{
"epoch": 0.9838187702265372,
"grad_norm": 0.9124151468276978,
"learning_rate": 7.030193780982425e-05,
"loss": 0.7091,
"step": 760
},
{
"epoch": 0.9967637540453075,
"grad_norm": 1.0887125730514526,
"learning_rate": 6.985128436232537e-05,
"loss": 0.8262,
"step": 770
},
{
"epoch": 1.009061488673139,
"grad_norm": 0.9555509090423584,
"learning_rate": 6.94006309148265e-05,
"loss": 0.7195,
"step": 780
},
{
"epoch": 1.0220064724919093,
"grad_norm": 1.1681190729141235,
"learning_rate": 6.894997746732763e-05,
"loss": 0.6532,
"step": 790
},
{
"epoch": 1.0349514563106796,
"grad_norm": 1.120592713356018,
"learning_rate": 6.849932401982876e-05,
"loss": 0.6175,
"step": 800
},
{
"epoch": 1.04789644012945,
"grad_norm": 0.9625107645988464,
"learning_rate": 6.804867057232989e-05,
"loss": 0.6541,
"step": 810
},
{
"epoch": 1.06084142394822,
"grad_norm": 0.9709998965263367,
"learning_rate": 6.7598017124831e-05,
"loss": 0.6446,
"step": 820
},
{
"epoch": 1.0737864077669903,
"grad_norm": 0.9543795585632324,
"learning_rate": 6.714736367733213e-05,
"loss": 0.5685,
"step": 830
},
{
"epoch": 1.0867313915857606,
"grad_norm": 1.2367397546768188,
"learning_rate": 6.669671022983326e-05,
"loss": 0.6747,
"step": 840
},
{
"epoch": 1.0996763754045307,
"grad_norm": 1.0891395807266235,
"learning_rate": 6.624605678233438e-05,
"loss": 0.6536,
"step": 850
},
{
"epoch": 1.112621359223301,
"grad_norm": 1.1543422937393188,
"learning_rate": 6.579540333483552e-05,
"loss": 0.7627,
"step": 860
},
{
"epoch": 1.1255663430420713,
"grad_norm": 1.363010048866272,
"learning_rate": 6.534474988733664e-05,
"loss": 0.7699,
"step": 870
},
{
"epoch": 1.1385113268608413,
"grad_norm": 1.171339511871338,
"learning_rate": 6.489409643983777e-05,
"loss": 0.6775,
"step": 880
},
{
"epoch": 1.1514563106796116,
"grad_norm": 1.1836340427398682,
"learning_rate": 6.44434429923389e-05,
"loss": 0.6744,
"step": 890
},
{
"epoch": 1.164401294498382,
"grad_norm": 1.1864937543869019,
"learning_rate": 6.399278954484002e-05,
"loss": 0.61,
"step": 900
},
{
"epoch": 1.177346278317152,
"grad_norm": 1.3767447471618652,
"learning_rate": 6.354213609734114e-05,
"loss": 0.7274,
"step": 910
},
{
"epoch": 1.1902912621359223,
"grad_norm": 1.1915229558944702,
"learning_rate": 6.309148264984228e-05,
"loss": 0.6787,
"step": 920
},
{
"epoch": 1.2032362459546926,
"grad_norm": 1.416157603263855,
"learning_rate": 6.26408292023434e-05,
"loss": 0.5878,
"step": 930
},
{
"epoch": 1.2161812297734629,
"grad_norm": 1.179671049118042,
"learning_rate": 6.219017575484453e-05,
"loss": 0.6416,
"step": 940
},
{
"epoch": 1.229126213592233,
"grad_norm": 0.9643092751502991,
"learning_rate": 6.173952230734565e-05,
"loss": 0.6718,
"step": 950
},
{
"epoch": 1.2420711974110032,
"grad_norm": 1.417594075202942,
"learning_rate": 6.128886885984678e-05,
"loss": 0.6965,
"step": 960
},
{
"epoch": 1.2550161812297735,
"grad_norm": 1.740006685256958,
"learning_rate": 6.083821541234791e-05,
"loss": 0.6674,
"step": 970
},
{
"epoch": 1.2679611650485436,
"grad_norm": 1.2844997644424438,
"learning_rate": 6.0387561964849034e-05,
"loss": 0.7144,
"step": 980
},
{
"epoch": 1.280906148867314,
"grad_norm": 1.012427568435669,
"learning_rate": 5.993690851735017e-05,
"loss": 0.6602,
"step": 990
},
{
"epoch": 1.2938511326860842,
"grad_norm": 1.6793211698532104,
"learning_rate": 5.948625506985128e-05,
"loss": 0.6379,
"step": 1000
},
{
"epoch": 1.3067961165048545,
"grad_norm": 1.1831872463226318,
"learning_rate": 5.903560162235241e-05,
"loss": 0.6309,
"step": 1010
},
{
"epoch": 1.3197411003236246,
"grad_norm": 1.1994572877883911,
"learning_rate": 5.858494817485354e-05,
"loss": 0.6099,
"step": 1020
},
{
"epoch": 1.3326860841423949,
"grad_norm": 1.3904818296432495,
"learning_rate": 5.8134294727354666e-05,
"loss": 0.6408,
"step": 1030
},
{
"epoch": 1.3456310679611652,
"grad_norm": 1.3394759893417358,
"learning_rate": 5.768364127985579e-05,
"loss": 0.6148,
"step": 1040
},
{
"epoch": 1.3585760517799352,
"grad_norm": 1.0991851091384888,
"learning_rate": 5.723298783235692e-05,
"loss": 0.6353,
"step": 1050
},
{
"epoch": 1.3715210355987055,
"grad_norm": 2.004030227661133,
"learning_rate": 5.678233438485805e-05,
"loss": 0.6547,
"step": 1060
},
{
"epoch": 1.3844660194174758,
"grad_norm": 1.5414537191390991,
"learning_rate": 5.633168093735918e-05,
"loss": 0.6684,
"step": 1070
},
{
"epoch": 1.397411003236246,
"grad_norm": 1.4660983085632324,
"learning_rate": 5.5881027489860305e-05,
"loss": 0.7235,
"step": 1080
},
{
"epoch": 1.4103559870550162,
"grad_norm": 0.6685907244682312,
"learning_rate": 5.5430374042361425e-05,
"loss": 0.6797,
"step": 1090
},
{
"epoch": 1.4233009708737865,
"grad_norm": 1.2811195850372314,
"learning_rate": 5.497972059486255e-05,
"loss": 0.6325,
"step": 1100
},
{
"epoch": 1.4362459546925566,
"grad_norm": 1.3929194211959839,
"learning_rate": 5.452906714736368e-05,
"loss": 0.6982,
"step": 1110
},
{
"epoch": 1.4491909385113269,
"grad_norm": 1.1184651851654053,
"learning_rate": 5.4078413699864804e-05,
"loss": 0.561,
"step": 1120
},
{
"epoch": 1.4621359223300971,
"grad_norm": 1.3536425828933716,
"learning_rate": 5.362776025236593e-05,
"loss": 0.606,
"step": 1130
},
{
"epoch": 1.4750809061488672,
"grad_norm": 1.4096359014511108,
"learning_rate": 5.3177106804867064e-05,
"loss": 0.7083,
"step": 1140
},
{
"epoch": 1.4880258899676375,
"grad_norm": 1.5671049356460571,
"learning_rate": 5.272645335736819e-05,
"loss": 0.6337,
"step": 1150
},
{
"epoch": 1.5009708737864078,
"grad_norm": 1.3812810182571411,
"learning_rate": 5.2275799909869316e-05,
"loss": 0.646,
"step": 1160
},
{
"epoch": 1.5139158576051779,
"grad_norm": 1.2574256658554077,
"learning_rate": 5.182514646237044e-05,
"loss": 0.5943,
"step": 1170
},
{
"epoch": 1.5268608414239482,
"grad_norm": 1.0817134380340576,
"learning_rate": 5.137449301487156e-05,
"loss": 0.6729,
"step": 1180
},
{
"epoch": 1.5398058252427185,
"grad_norm": 2.198194980621338,
"learning_rate": 5.092383956737269e-05,
"loss": 0.6537,
"step": 1190
},
{
"epoch": 1.5527508090614885,
"grad_norm": 1.8652335405349731,
"learning_rate": 5.0473186119873815e-05,
"loss": 0.6486,
"step": 1200
},
{
"epoch": 1.565695792880259,
"grad_norm": 1.6180390119552612,
"learning_rate": 5.002253267237494e-05,
"loss": 0.567,
"step": 1210
},
{
"epoch": 1.5786407766990291,
"grad_norm": 1.9595000743865967,
"learning_rate": 4.9571879224876075e-05,
"loss": 0.6248,
"step": 1220
},
{
"epoch": 1.5915857605177992,
"grad_norm": 0.9194151759147644,
"learning_rate": 4.91212257773772e-05,
"loss": 0.6251,
"step": 1230
},
{
"epoch": 1.6045307443365697,
"grad_norm": 1.026760458946228,
"learning_rate": 4.867057232987833e-05,
"loss": 0.7088,
"step": 1240
},
{
"epoch": 1.6174757281553398,
"grad_norm": 1.3673216104507446,
"learning_rate": 4.821991888237945e-05,
"loss": 0.6626,
"step": 1250
},
{
"epoch": 1.6304207119741099,
"grad_norm": 1.1107579469680786,
"learning_rate": 4.776926543488058e-05,
"loss": 0.7106,
"step": 1260
},
{
"epoch": 1.6433656957928804,
"grad_norm": 0.9585609436035156,
"learning_rate": 4.731861198738171e-05,
"loss": 0.7389,
"step": 1270
},
{
"epoch": 1.6563106796116505,
"grad_norm": 1.2537648677825928,
"learning_rate": 4.686795853988283e-05,
"loss": 0.6415,
"step": 1280
},
{
"epoch": 1.6692556634304208,
"grad_norm": 1.3973714113235474,
"learning_rate": 4.641730509238396e-05,
"loss": 0.5704,
"step": 1290
},
{
"epoch": 1.682200647249191,
"grad_norm": 1.69650399684906,
"learning_rate": 4.5966651644885086e-05,
"loss": 0.6029,
"step": 1300
},
{
"epoch": 1.6951456310679611,
"grad_norm": 1.6716722249984741,
"learning_rate": 4.551599819738621e-05,
"loss": 0.582,
"step": 1310
},
{
"epoch": 1.7080906148867314,
"grad_norm": 1.383424997329712,
"learning_rate": 4.506534474988734e-05,
"loss": 0.6176,
"step": 1320
},
{
"epoch": 1.7210355987055017,
"grad_norm": 1.0892506837844849,
"learning_rate": 4.4614691302388465e-05,
"loss": 0.5937,
"step": 1330
},
{
"epoch": 1.7339805825242718,
"grad_norm": 1.4017115831375122,
"learning_rate": 4.416403785488959e-05,
"loss": 0.5447,
"step": 1340
},
{
"epoch": 1.746925566343042,
"grad_norm": 1.332664132118225,
"learning_rate": 4.371338440739072e-05,
"loss": 0.65,
"step": 1350
},
{
"epoch": 1.7598705501618124,
"grad_norm": 1.4425687789916992,
"learning_rate": 4.3262730959891845e-05,
"loss": 0.6095,
"step": 1360
},
{
"epoch": 1.7728155339805824,
"grad_norm": 1.7485853433609009,
"learning_rate": 4.281207751239297e-05,
"loss": 0.6011,
"step": 1370
},
{
"epoch": 1.7857605177993527,
"grad_norm": 1.2251473665237427,
"learning_rate": 4.23614240648941e-05,
"loss": 0.5049,
"step": 1380
},
{
"epoch": 1.798705501618123,
"grad_norm": 1.543966293334961,
"learning_rate": 4.1910770617395224e-05,
"loss": 0.5768,
"step": 1390
},
{
"epoch": 1.811650485436893,
"grad_norm": 1.153024435043335,
"learning_rate": 4.146011716989635e-05,
"loss": 0.606,
"step": 1400
},
{
"epoch": 1.8245954692556634,
"grad_norm": 1.4503074884414673,
"learning_rate": 4.1009463722397477e-05,
"loss": 0.6068,
"step": 1410
},
{
"epoch": 1.8375404530744337,
"grad_norm": 1.5761051177978516,
"learning_rate": 4.055881027489861e-05,
"loss": 0.5736,
"step": 1420
},
{
"epoch": 1.8504854368932038,
"grad_norm": 1.4788157939910889,
"learning_rate": 4.010815682739973e-05,
"loss": 0.6098,
"step": 1430
},
{
"epoch": 1.863430420711974,
"grad_norm": 1.3545244932174683,
"learning_rate": 3.9657503379900856e-05,
"loss": 0.5781,
"step": 1440
},
{
"epoch": 1.8763754045307444,
"grad_norm": 1.4322385787963867,
"learning_rate": 3.920684993240198e-05,
"loss": 0.5779,
"step": 1450
},
{
"epoch": 1.8893203883495144,
"grad_norm": 1.4146760702133179,
"learning_rate": 3.8756196484903115e-05,
"loss": 0.6097,
"step": 1460
},
{
"epoch": 1.902265372168285,
"grad_norm": 1.7581102848052979,
"learning_rate": 3.830554303740424e-05,
"loss": 0.5672,
"step": 1470
},
{
"epoch": 1.915210355987055,
"grad_norm": 1.5007004737854004,
"learning_rate": 3.785488958990536e-05,
"loss": 0.5951,
"step": 1480
},
{
"epoch": 1.928155339805825,
"grad_norm": 1.2334699630737305,
"learning_rate": 3.740423614240649e-05,
"loss": 0.549,
"step": 1490
},
{
"epoch": 1.9411003236245956,
"grad_norm": 0.9463567733764648,
"learning_rate": 3.695358269490762e-05,
"loss": 0.6448,
"step": 1500
},
{
"epoch": 1.9540453074433657,
"grad_norm": 1.8025217056274414,
"learning_rate": 3.650292924740875e-05,
"loss": 0.7363,
"step": 1510
},
{
"epoch": 1.9669902912621358,
"grad_norm": 2.9467597007751465,
"learning_rate": 3.6052275799909874e-05,
"loss": 0.5586,
"step": 1520
},
{
"epoch": 1.9799352750809063,
"grad_norm": 1.8900437355041504,
"learning_rate": 3.5601622352410993e-05,
"loss": 0.623,
"step": 1530
},
{
"epoch": 1.9928802588996763,
"grad_norm": 1.5157594680786133,
"learning_rate": 3.515096890491213e-05,
"loss": 0.606,
"step": 1540
},
{
"epoch": 2.005177993527508,
"grad_norm": 1.370686411857605,
"learning_rate": 3.470031545741325e-05,
"loss": 0.5081,
"step": 1550
},
{
"epoch": 2.018122977346278,
"grad_norm": 1.3025308847427368,
"learning_rate": 3.424966200991438e-05,
"loss": 0.4448,
"step": 1560
},
{
"epoch": 2.0310679611650486,
"grad_norm": 0.7799197435379028,
"learning_rate": 3.37990085624155e-05,
"loss": 0.4341,
"step": 1570
},
{
"epoch": 2.0440129449838187,
"grad_norm": 1.375433087348938,
"learning_rate": 3.334835511491663e-05,
"loss": 0.5702,
"step": 1580
},
{
"epoch": 2.056957928802589,
"grad_norm": 1.4452621936798096,
"learning_rate": 3.289770166741776e-05,
"loss": 0.455,
"step": 1590
},
{
"epoch": 2.0699029126213593,
"grad_norm": 1.0591763257980347,
"learning_rate": 3.2447048219918885e-05,
"loss": 0.5472,
"step": 1600
},
{
"epoch": 2.0828478964401294,
"grad_norm": 1.366735816001892,
"learning_rate": 3.199639477242001e-05,
"loss": 0.5011,
"step": 1610
},
{
"epoch": 2.0957928802589,
"grad_norm": 2.3307456970214844,
"learning_rate": 3.154574132492114e-05,
"loss": 0.5293,
"step": 1620
},
{
"epoch": 2.10873786407767,
"grad_norm": 1.5701334476470947,
"learning_rate": 3.1095087877422264e-05,
"loss": 0.5571,
"step": 1630
},
{
"epoch": 2.12168284789644,
"grad_norm": 1.585015892982483,
"learning_rate": 3.064443442992339e-05,
"loss": 0.4914,
"step": 1640
},
{
"epoch": 2.1346278317152105,
"grad_norm": 1.5167232751846313,
"learning_rate": 3.0193780982424517e-05,
"loss": 0.4462,
"step": 1650
},
{
"epoch": 2.1475728155339806,
"grad_norm": 1.8519450426101685,
"learning_rate": 2.974312753492564e-05,
"loss": 0.5367,
"step": 1660
},
{
"epoch": 2.1605177993527507,
"grad_norm": 1.1500009298324585,
"learning_rate": 2.929247408742677e-05,
"loss": 0.4091,
"step": 1670
},
{
"epoch": 2.173462783171521,
"grad_norm": 1.9004416465759277,
"learning_rate": 2.8841820639927896e-05,
"loss": 0.4752,
"step": 1680
},
{
"epoch": 2.1864077669902913,
"grad_norm": 1.588977575302124,
"learning_rate": 2.8391167192429026e-05,
"loss": 0.4229,
"step": 1690
},
{
"epoch": 2.1993527508090613,
"grad_norm": 2.033543825149536,
"learning_rate": 2.7940513744930153e-05,
"loss": 0.5575,
"step": 1700
},
{
"epoch": 2.212297734627832,
"grad_norm": 1.1972986459732056,
"learning_rate": 2.7489860297431276e-05,
"loss": 0.4302,
"step": 1710
},
{
"epoch": 2.225242718446602,
"grad_norm": 0.8637037873268127,
"learning_rate": 2.7039206849932402e-05,
"loss": 0.4924,
"step": 1720
},
{
"epoch": 2.238187702265372,
"grad_norm": 1.6572158336639404,
"learning_rate": 2.6588553402433532e-05,
"loss": 0.5,
"step": 1730
},
{
"epoch": 2.2511326860841425,
"grad_norm": 1.8110625743865967,
"learning_rate": 2.6137899954934658e-05,
"loss": 0.4361,
"step": 1740
},
{
"epoch": 2.2640776699029126,
"grad_norm": 1.676248550415039,
"learning_rate": 2.568724650743578e-05,
"loss": 0.4407,
"step": 1750
},
{
"epoch": 2.2770226537216827,
"grad_norm": 1.149107813835144,
"learning_rate": 2.5236593059936908e-05,
"loss": 0.5472,
"step": 1760
},
{
"epoch": 2.289967637540453,
"grad_norm": 1.7543455362319946,
"learning_rate": 2.4785939612438037e-05,
"loss": 0.5396,
"step": 1770
},
{
"epoch": 2.3029126213592233,
"grad_norm": 2.7097597122192383,
"learning_rate": 2.4335286164939164e-05,
"loss": 0.5013,
"step": 1780
},
{
"epoch": 2.3158576051779933,
"grad_norm": 1.9181076288223267,
"learning_rate": 2.388463271744029e-05,
"loss": 0.4476,
"step": 1790
},
{
"epoch": 2.328802588996764,
"grad_norm": 1.1099668741226196,
"learning_rate": 2.3433979269941417e-05,
"loss": 0.4113,
"step": 1800
},
{
"epoch": 2.341747572815534,
"grad_norm": 1.5468546152114868,
"learning_rate": 2.2983325822442543e-05,
"loss": 0.4629,
"step": 1810
},
{
"epoch": 2.354692556634304,
"grad_norm": 1.149834394454956,
"learning_rate": 2.253267237494367e-05,
"loss": 0.434,
"step": 1820
},
{
"epoch": 2.3676375404530745,
"grad_norm": 1.4918863773345947,
"learning_rate": 2.2082018927444796e-05,
"loss": 0.4554,
"step": 1830
},
{
"epoch": 2.3805825242718446,
"grad_norm": 1.610051155090332,
"learning_rate": 2.1631365479945922e-05,
"loss": 0.4081,
"step": 1840
},
{
"epoch": 2.3935275080906147,
"grad_norm": 1.377886176109314,
"learning_rate": 2.118071203244705e-05,
"loss": 0.4531,
"step": 1850
},
{
"epoch": 2.406472491909385,
"grad_norm": 1.3633161783218384,
"learning_rate": 2.0730058584948175e-05,
"loss": 0.4767,
"step": 1860
},
{
"epoch": 2.4194174757281552,
"grad_norm": 2.3759074211120605,
"learning_rate": 2.0279405137449305e-05,
"loss": 0.5155,
"step": 1870
},
{
"epoch": 2.4323624595469258,
"grad_norm": 1.634912133216858,
"learning_rate": 1.9828751689950428e-05,
"loss": 0.4279,
"step": 1880
},
{
"epoch": 2.445307443365696,
"grad_norm": 1.3101922273635864,
"learning_rate": 1.9378098242451558e-05,
"loss": 0.4284,
"step": 1890
},
{
"epoch": 2.458252427184466,
"grad_norm": 1.2841248512268066,
"learning_rate": 1.892744479495268e-05,
"loss": 0.5135,
"step": 1900
},
{
"epoch": 2.4711974110032364,
"grad_norm": 2.4284558296203613,
"learning_rate": 1.847679134745381e-05,
"loss": 0.4455,
"step": 1910
},
{
"epoch": 2.4841423948220065,
"grad_norm": 1.589353084564209,
"learning_rate": 1.8026137899954937e-05,
"loss": 0.484,
"step": 1920
},
{
"epoch": 2.4970873786407766,
"grad_norm": 1.588586688041687,
"learning_rate": 1.7575484452456063e-05,
"loss": 0.5047,
"step": 1930
},
{
"epoch": 2.510032362459547,
"grad_norm": 1.8486393690109253,
"learning_rate": 1.712483100495719e-05,
"loss": 0.4384,
"step": 1940
},
{
"epoch": 2.522977346278317,
"grad_norm": 1.9914798736572266,
"learning_rate": 1.6674177557458316e-05,
"loss": 0.4324,
"step": 1950
},
{
"epoch": 2.5359223300970872,
"grad_norm": 1.3951258659362793,
"learning_rate": 1.6223524109959443e-05,
"loss": 0.5316,
"step": 1960
},
{
"epoch": 2.5488673139158577,
"grad_norm": 1.471160888671875,
"learning_rate": 1.577287066246057e-05,
"loss": 0.4943,
"step": 1970
},
{
"epoch": 2.561812297734628,
"grad_norm": 1.5667455196380615,
"learning_rate": 1.5322217214961695e-05,
"loss": 0.5354,
"step": 1980
},
{
"epoch": 2.574757281553398,
"grad_norm": 1.5954725742340088,
"learning_rate": 1.487156376746282e-05,
"loss": 0.4484,
"step": 1990
},
{
"epoch": 2.5877022653721684,
"grad_norm": 1.3914281129837036,
"learning_rate": 1.4420910319963948e-05,
"loss": 0.4617,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 2319,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.084236146533417e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}