{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9928263988522237,
"eval_steps": 500,
"global_step": 1044,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.028694404591104734,
"grad_norm": 6.9535064697265625,
"learning_rate": 2.9993209101500904e-05,
"loss": 0.8011,
"num_input_tokens_seen": 26464,
"step": 10
},
{
"epoch": 0.05738880918220947,
"grad_norm": 3.7668867111206055,
"learning_rate": 2.997284255484393e-05,
"loss": 0.1799,
"num_input_tokens_seen": 53568,
"step": 20
},
{
"epoch": 0.08608321377331421,
"grad_norm": 5.130955219268799,
"learning_rate": 2.9938918800982563e-05,
"loss": 0.1418,
"num_input_tokens_seen": 79840,
"step": 30
},
{
"epoch": 0.11477761836441894,
"grad_norm": 5.7834391593933105,
"learning_rate": 2.9891468556286034e-05,
"loss": 0.1428,
"num_input_tokens_seen": 106848,
"step": 40
},
{
"epoch": 0.14347202295552366,
"grad_norm": 3.7895705699920654,
"learning_rate": 2.983053478472707e-05,
"loss": 0.1297,
"num_input_tokens_seen": 133600,
"step": 50
},
{
"epoch": 0.17216642754662842,
"grad_norm": 2.2129366397857666,
"learning_rate": 2.975617265898004e-05,
"loss": 0.1392,
"num_input_tokens_seen": 159872,
"step": 60
},
{
"epoch": 0.20086083213773315,
"grad_norm": 5.727760314941406,
"learning_rate": 2.9668449510464707e-05,
"loss": 0.1418,
"num_input_tokens_seen": 186912,
"step": 70
},
{
"epoch": 0.22955523672883787,
"grad_norm": 7.682604789733887,
"learning_rate": 2.9567444768380745e-05,
"loss": 0.1336,
"num_input_tokens_seen": 213696,
"step": 80
},
{
"epoch": 0.2582496413199426,
"grad_norm": 5.216188430786133,
"learning_rate": 2.9453249887788343e-05,
"loss": 0.0996,
"num_input_tokens_seen": 240736,
"step": 90
},
{
"epoch": 0.28694404591104733,
"grad_norm": 4.260756015777588,
"learning_rate": 2.9325968266799934e-05,
"loss": 0.1102,
"num_input_tokens_seen": 266816,
"step": 100
},
{
"epoch": 0.31563845050215206,
"grad_norm": 1.7236586809158325,
"learning_rate": 2.918571515295803e-05,
"loss": 0.1443,
"num_input_tokens_seen": 293504,
"step": 110
},
{
"epoch": 0.34433285509325684,
"grad_norm": 3.35003399848938,
"learning_rate": 2.9032617538884018e-05,
"loss": 0.1224,
"num_input_tokens_seen": 319872,
"step": 120
},
{
"epoch": 0.37302725968436157,
"grad_norm": 3.347268581390381,
"learning_rate": 2.8866814047292232e-05,
"loss": 0.1678,
"num_input_tokens_seen": 346816,
"step": 130
},
{
"epoch": 0.4017216642754663,
"grad_norm": 3.474947929382324,
"learning_rate": 2.8688454805473647e-05,
"loss": 0.1154,
"num_input_tokens_seen": 374304,
"step": 140
},
{
"epoch": 0.430416068866571,
"grad_norm": 6.668752670288086,
"learning_rate": 2.84977013093626e-05,
"loss": 0.1285,
"num_input_tokens_seen": 401568,
"step": 150
},
{
"epoch": 0.45911047345767575,
"grad_norm": 4.752432346343994,
"learning_rate": 2.8294726277309815e-05,
"loss": 0.1144,
"num_input_tokens_seen": 428384,
"step": 160
},
{
"epoch": 0.4878048780487805,
"grad_norm": 3.637117624282837,
"learning_rate": 2.8079713493694024e-05,
"loss": 0.0875,
"num_input_tokens_seen": 454880,
"step": 170
},
{
"epoch": 0.5164992826398852,
"grad_norm": 3.157396078109741,
"learning_rate": 2.7852857642513838e-05,
"loss": 0.151,
"num_input_tokens_seen": 481856,
"step": 180
},
{
"epoch": 0.5451936872309899,
"grad_norm": 2.444923162460327,
"learning_rate": 2.7614364131110498e-05,
"loss": 0.1153,
"num_input_tokens_seen": 508192,
"step": 190
},
{
"epoch": 0.5738880918220947,
"grad_norm": 3.9938931465148926,
"learning_rate": 2.7364448904181152e-05,
"loss": 0.0971,
"num_input_tokens_seen": 534240,
"step": 200
},
{
"epoch": 0.6025824964131994,
"grad_norm": 2.390864133834839,
"learning_rate": 2.7103338248251055e-05,
"loss": 0.1035,
"num_input_tokens_seen": 561664,
"step": 210
},
{
"epoch": 0.6312769010043041,
"grad_norm": 4.270871639251709,
"learning_rate": 2.6831268586781746e-05,
"loss": 0.0908,
"num_input_tokens_seen": 588128,
"step": 220
},
{
"epoch": 0.6599713055954088,
"grad_norm": 2.367370367050171,
"learning_rate": 2.6548486266100645e-05,
"loss": 0.1216,
"num_input_tokens_seen": 614432,
"step": 230
},
{
"epoch": 0.6886657101865137,
"grad_norm": 2.0463204383850098,
"learning_rate": 2.6255247332346036e-05,
"loss": 0.0952,
"num_input_tokens_seen": 640832,
"step": 240
},
{
"epoch": 0.7173601147776184,
"grad_norm": 3.1408205032348633,
"learning_rate": 2.5951817299629266e-05,
"loss": 0.1073,
"num_input_tokens_seen": 668448,
"step": 250
},
{
"epoch": 0.7460545193687231,
"grad_norm": 3.5036535263061523,
"learning_rate": 2.5638470909624166e-05,
"loss": 0.1048,
"num_input_tokens_seen": 695008,
"step": 260
},
{
"epoch": 0.7747489239598279,
"grad_norm": 2.891324996948242,
"learning_rate": 2.531549188280135e-05,
"loss": 0.1015,
"num_input_tokens_seen": 722208,
"step": 270
},
{
"epoch": 0.8034433285509326,
"grad_norm": 1.6332054138183594,
"learning_rate": 2.498317266153262e-05,
"loss": 0.0815,
"num_input_tokens_seen": 749216,
"step": 280
},
{
"epoch": 0.8321377331420373,
"grad_norm": 4.443371295928955,
"learning_rate": 2.464181414529809e-05,
"loss": 0.0843,
"num_input_tokens_seen": 776000,
"step": 290
},
{
"epoch": 0.860832137733142,
"grad_norm": 3.0447335243225098,
"learning_rate": 2.4291725418235848e-05,
"loss": 0.1321,
"num_input_tokens_seen": 802560,
"step": 300
},
{
"epoch": 0.8895265423242468,
"grad_norm": 3.728531837463379,
"learning_rate": 2.3933223469280704e-05,
"loss": 0.0868,
"num_input_tokens_seen": 828704,
"step": 310
},
{
"epoch": 0.9182209469153515,
"grad_norm": 3.703484296798706,
"learning_rate": 2.3566632905145604e-05,
"loss": 0.1012,
"num_input_tokens_seen": 855616,
"step": 320
},
{
"epoch": 0.9469153515064562,
"grad_norm": 3.728098154067993,
"learning_rate": 2.3192285656405456e-05,
"loss": 0.1139,
"num_input_tokens_seen": 882400,
"step": 330
},
{
"epoch": 0.975609756097561,
"grad_norm": 1.163082480430603,
"learning_rate": 2.2810520676949537e-05,
"loss": 0.1004,
"num_input_tokens_seen": 909760,
"step": 340
},
{
"epoch": 1.0028694404591105,
"grad_norm": 3.1779849529266357,
"learning_rate": 2.2421683637074648e-05,
"loss": 0.1131,
"num_input_tokens_seen": 934112,
"step": 350
},
{
"epoch": 1.0315638450502151,
"grad_norm": 1.9103726148605347,
"learning_rate": 2.2026126610496852e-05,
"loss": 0.0896,
"num_input_tokens_seen": 960320,
"step": 360
},
{
"epoch": 1.06025824964132,
"grad_norm": 2.4402358531951904,
"learning_rate": 2.1624207755565232e-05,
"loss": 0.0629,
"num_input_tokens_seen": 986688,
"step": 370
},
{
"epoch": 1.0889526542324246,
"grad_norm": 3.0046820640563965,
"learning_rate": 2.121629099096628e-05,
"loss": 0.0658,
"num_input_tokens_seen": 1013088,
"step": 380
},
{
"epoch": 1.1176470588235294,
"grad_norm": 2.693220376968384,
"learning_rate": 2.0802745666212592e-05,
"loss": 0.0777,
"num_input_tokens_seen": 1039808,
"step": 390
},
{
"epoch": 1.146341463414634,
"grad_norm": 2.451016664505005,
"learning_rate": 2.0383946227214188e-05,
"loss": 0.0908,
"num_input_tokens_seen": 1066144,
"step": 400
},
{
"epoch": 1.175035868005739,
"grad_norm": 0.8696689605712891,
"learning_rate": 1.9960271877235306e-05,
"loss": 0.0731,
"num_input_tokens_seen": 1092576,
"step": 410
},
{
"epoch": 1.2037302725968435,
"grad_norm": 2.7256174087524414,
"learning_rate": 1.953210623354359e-05,
"loss": 0.0661,
"num_input_tokens_seen": 1119104,
"step": 420
},
{
"epoch": 1.2324246771879483,
"grad_norm": 2.751459836959839,
"learning_rate": 1.909983698006266e-05,
"loss": 0.0391,
"num_input_tokens_seen": 1145376,
"step": 430
},
{
"epoch": 1.2611190817790532,
"grad_norm": 3.420569658279419,
"learning_rate": 1.8663855516342468e-05,
"loss": 0.0594,
"num_input_tokens_seen": 1172736,
"step": 440
},
{
"epoch": 1.2898134863701578,
"grad_norm": 3.2319304943084717,
"learning_rate": 1.8224556603165363e-05,
"loss": 0.0492,
"num_input_tokens_seen": 1199296,
"step": 450
},
{
"epoch": 1.3185078909612624,
"grad_norm": 6.221270561218262,
"learning_rate": 1.7782338005108694e-05,
"loss": 0.0465,
"num_input_tokens_seen": 1226368,
"step": 460
},
{
"epoch": 1.3472022955523673,
"grad_norm": 1.9661167860031128,
"learning_rate": 1.733760013038765e-05,
"loss": 0.1173,
"num_input_tokens_seen": 1252352,
"step": 470
},
{
"epoch": 1.375896700143472,
"grad_norm": 2.8439626693725586,
"learning_rate": 1.689074566830434e-05,
"loss": 0.0721,
"num_input_tokens_seen": 1278912,
"step": 480
},
{
"epoch": 1.4045911047345767,
"grad_norm": 2.0552496910095215,
"learning_rate": 1.6442179224631558e-05,
"loss": 0.061,
"num_input_tokens_seen": 1306176,
"step": 490
},
{
"epoch": 1.4332855093256816,
"grad_norm": 2.505808115005493,
"learning_rate": 1.5992306955261175e-05,
"loss": 0.0599,
"num_input_tokens_seen": 1332608,
"step": 500
},
{
"epoch": 1.4619799139167862,
"grad_norm": 1.2224159240722656,
"learning_rate": 1.5541536198449044e-05,
"loss": 0.0721,
"num_input_tokens_seen": 1359520,
"step": 510
},
{
"epoch": 1.490674318507891,
"grad_norm": 1.0898300409317017,
"learning_rate": 1.5090275105989284e-05,
"loss": 0.0595,
"num_input_tokens_seen": 1386368,
"step": 520
},
{
"epoch": 1.5193687230989958,
"grad_norm": 1.6561291217803955,
"learning_rate": 1.463893227365195e-05,
"loss": 0.0519,
"num_input_tokens_seen": 1412992,
"step": 530
},
{
"epoch": 1.5480631276901005,
"grad_norm": 3.09395694732666,
"learning_rate": 1.4187916371218739e-05,
"loss": 0.0752,
"num_input_tokens_seen": 1439616,
"step": 540
},
{
"epoch": 1.576757532281205,
"grad_norm": 3.0223190784454346,
"learning_rate": 1.3737635772451642e-05,
"loss": 0.0895,
"num_input_tokens_seen": 1466464,
"step": 550
},
{
"epoch": 1.60545193687231,
"grad_norm": 2.7507283687591553,
"learning_rate": 1.328849818532963e-05,
"loss": 0.1118,
"num_input_tokens_seen": 1493440,
"step": 560
},
{
"epoch": 1.6341463414634148,
"grad_norm": 1.4776769876480103,
"learning_rate": 1.2840910282888211e-05,
"loss": 0.0917,
"num_input_tokens_seen": 1520256,
"step": 570
},
{
"epoch": 1.6628407460545194,
"grad_norm": 0.9096710681915283,
"learning_rate": 1.2395277334996045e-05,
"loss": 0.0797,
"num_input_tokens_seen": 1547712,
"step": 580
},
{
"epoch": 1.691535150645624,
"grad_norm": 3.431025266647339,
"learning_rate": 1.1952002841402057e-05,
"loss": 0.0609,
"num_input_tokens_seen": 1574528,
"step": 590
},
{
"epoch": 1.7202295552367288,
"grad_norm": 5.7561726570129395,
"learning_rate": 1.1511488166385349e-05,
"loss": 0.0602,
"num_input_tokens_seen": 1600768,
"step": 600
},
{
"epoch": 1.7489239598278337,
"grad_norm": 2.8678972721099854,
"learning_rate": 1.107413217533863e-05,
"loss": 0.0714,
"num_input_tokens_seen": 1627744,
"step": 610
},
{
"epoch": 1.7776183644189383,
"grad_norm": 3.1199018955230713,
"learning_rate": 1.0640330873614336e-05,
"loss": 0.0559,
"num_input_tokens_seen": 1654784,
"step": 620
},
{
"epoch": 1.806312769010043,
"grad_norm": 4.480892181396484,
"learning_rate": 1.0210477047960303e-05,
"loss": 0.1277,
"num_input_tokens_seen": 1680928,
"step": 630
},
{
"epoch": 1.8350071736011477,
"grad_norm": 3.975166082382202,
"learning_rate": 9.78495991086979e-06,
"loss": 0.0817,
"num_input_tokens_seen": 1707232,
"step": 640
},
{
"epoch": 1.8637015781922526,
"grad_norm": 1.2731448411941528,
"learning_rate": 9.364164748167806e-06,
"loss": 0.073,
"num_input_tokens_seen": 1733568,
"step": 650
},
{
"epoch": 1.8923959827833574,
"grad_norm": 2.2199535369873047,
"learning_rate": 8.948472570152874e-06,
"loss": 0.0637,
"num_input_tokens_seen": 1760608,
"step": 660
},
{
"epoch": 1.921090387374462,
"grad_norm": 2.6249306201934814,
"learning_rate": 8.538259766610019e-06,
"loss": 0.0512,
"num_input_tokens_seen": 1788064,
"step": 670
},
{
"epoch": 1.9497847919655666,
"grad_norm": 4.65223503112793,
"learning_rate": 8.133897766007499e-06,
"loss": 0.0608,
"num_input_tokens_seen": 1814688,
"step": 680
},
{
"epoch": 1.9784791965566715,
"grad_norm": 6.477709770202637,
"learning_rate": 7.735752699185711e-06,
"loss": 0.0596,
"num_input_tokens_seen": 1841056,
"step": 690
},
{
"epoch": 2.005738880918221,
"grad_norm": 1.0149333477020264,
"learning_rate": 7.344185067842878e-06,
"loss": 0.059,
"num_input_tokens_seen": 1865472,
"step": 700
},
{
"epoch": 2.034433285509326,
"grad_norm": 1.824137806892395,
"learning_rate": 6.959549418117669e-06,
"loss": 0.0411,
"num_input_tokens_seen": 1891968,
"step": 710
},
{
"epoch": 2.0631276901004303,
"grad_norm": 1.539084553718567,
"learning_rate": 6.582194019564266e-06,
"loss": 0.0296,
"num_input_tokens_seen": 1918912,
"step": 720
},
{
"epoch": 2.091822094691535,
"grad_norm": 3.0269973278045654,
"learning_rate": 6.2124605498106336e-06,
"loss": 0.0433,
"num_input_tokens_seen": 1945312,
"step": 730
},
{
"epoch": 2.12051649928264,
"grad_norm": 1.6702455282211304,
"learning_rate": 5.850683785185409e-06,
"loss": 0.0283,
"num_input_tokens_seen": 1971520,
"step": 740
},
{
"epoch": 2.149210903873745,
"grad_norm": 2.5078916549682617,
"learning_rate": 5.497191297593647e-06,
"loss": 0.0419,
"num_input_tokens_seen": 1998528,
"step": 750
},
{
"epoch": 2.177905308464849,
"grad_norm": 0.1790330410003662,
"learning_rate": 5.1523031579157994e-06,
"loss": 0.0601,
"num_input_tokens_seen": 2025088,
"step": 760
},
{
"epoch": 2.206599713055954,
"grad_norm": 2.8430678844451904,
"learning_rate": 4.816331646198556e-06,
"loss": 0.0368,
"num_input_tokens_seen": 2052032,
"step": 770
},
{
"epoch": 2.235294117647059,
"grad_norm": 2.523798704147339,
"learning_rate": 4.4895809688998655e-06,
"loss": 0.0474,
"num_input_tokens_seen": 2079104,
"step": 780
},
{
"epoch": 2.2639885222381637,
"grad_norm": 4.507711410522461,
"learning_rate": 4.172346983444269e-06,
"loss": 0.0498,
"num_input_tokens_seen": 2106496,
"step": 790
},
{
"epoch": 2.292682926829268,
"grad_norm": 5.649550914764404,
"learning_rate": 3.864916930337852e-06,
"loss": 0.0389,
"num_input_tokens_seen": 2133248,
"step": 800
},
{
"epoch": 2.321377331420373,
"grad_norm": 2.364619255065918,
"learning_rate": 3.567569173085455e-06,
"loss": 0.0275,
"num_input_tokens_seen": 2159840,
"step": 810
},
{
"epoch": 2.350071736011478,
"grad_norm": 2.0380613803863525,
"learning_rate": 3.2805729461455307e-06,
"loss": 0.0269,
"num_input_tokens_seen": 2186528,
"step": 820
},
{
"epoch": 2.3787661406025826,
"grad_norm": 3.2729814052581787,
"learning_rate": 3.0041881111509783e-06,
"loss": 0.0586,
"num_input_tokens_seen": 2213344,
"step": 830
},
{
"epoch": 2.407460545193687,
"grad_norm": 4.258326053619385,
"learning_rate": 2.7386649216166233e-06,
"loss": 0.046,
"num_input_tokens_seen": 2239648,
"step": 840
},
{
"epoch": 2.436154949784792,
"grad_norm": 3.044335126876831,
"learning_rate": 2.484243796346367e-06,
"loss": 0.0414,
"num_input_tokens_seen": 2267008,
"step": 850
},
{
"epoch": 2.4648493543758967,
"grad_norm": 1.9635220766067505,
"learning_rate": 2.241155101745242e-06,
"loss": 0.0375,
"num_input_tokens_seen": 2293664,
"step": 860
},
{
"epoch": 2.4935437589670015,
"grad_norm": 0.2657749652862549,
"learning_rate": 2.0096189432334194e-06,
"loss": 0.0541,
"num_input_tokens_seen": 2320032,
"step": 870
},
{
"epoch": 2.5222381635581064,
"grad_norm": 2.3013086318969727,
"learning_rate": 1.7898449659510841e-06,
"loss": 0.0324,
"num_input_tokens_seen": 2346848,
"step": 880
},
{
"epoch": 2.5509325681492108,
"grad_norm": 3.4938905239105225,
"learning_rate": 1.5820321649345582e-06,
"loss": 0.0704,
"num_input_tokens_seen": 2373408,
"step": 890
},
{
"epoch": 2.5796269727403156,
"grad_norm": 1.1432446241378784,
"learning_rate": 1.3863687049356465e-06,
"loss": 0.0376,
"num_input_tokens_seen": 2400192,
"step": 900
},
{
"epoch": 2.6083213773314204,
"grad_norm": 3.4845967292785645,
"learning_rate": 1.2030317500472572e-06,
"loss": 0.0377,
"num_input_tokens_seen": 2426720,
"step": 910
},
{
"epoch": 2.637015781922525,
"grad_norm": 0.9145282506942749,
"learning_rate": 1.0321873032896328e-06,
"loss": 0.0459,
"num_input_tokens_seen": 2453504,
"step": 920
},
{
"epoch": 2.6657101865136297,
"grad_norm": 6.571470260620117,
"learning_rate": 8.7399005630238e-07,
"loss": 0.0352,
"num_input_tokens_seen": 2480064,
"step": 930
},
{
"epoch": 2.6944045911047345,
"grad_norm": 1.0509833097457886,
"learning_rate": 7.285832492784456e-07,
"loss": 0.0308,
"num_input_tokens_seen": 2507904,
"step": 940
},
{
"epoch": 2.7230989956958394,
"grad_norm": 1.271236777305603,
"learning_rate": 5.960985412668457e-07,
"loss": 0.0359,
"num_input_tokens_seen": 2534624,
"step": 950
},
{
"epoch": 2.751793400286944,
"grad_norm": 4.02854061126709,
"learning_rate": 4.766558909615504e-07,
"loss": 0.0251,
"num_input_tokens_seen": 2560384,
"step": 960
},
{
"epoch": 2.7804878048780486,
"grad_norm": 2.239231824874878,
"learning_rate": 3.703634480845175e-07,
"loss": 0.0278,
"num_input_tokens_seen": 2586976,
"step": 970
},
{
"epoch": 2.8091822094691534,
"grad_norm": 1.5159451961517334,
"learning_rate": 2.7731745546118295e-07,
"loss": 0.0337,
"num_input_tokens_seen": 2613568,
"step": 980
},
{
"epoch": 2.8378766140602583,
"grad_norm": 2.9302163124084473,
"learning_rate": 1.9760216187710788e-07,
"loss": 0.0418,
"num_input_tokens_seen": 2640128,
"step": 990
},
{
"epoch": 2.866571018651363,
"grad_norm": 3.014836311340332,
"learning_rate": 1.3128974579462771e-07,
"loss": 0.042,
"num_input_tokens_seen": 2666880,
"step": 1000
},
{
"epoch": 2.895265423242468,
"grad_norm": 4.583406448364258,
"learning_rate": 7.844024999865806e-08,
"loss": 0.043,
"num_input_tokens_seen": 2694272,
"step": 1010
},
{
"epoch": 2.9239598278335723,
"grad_norm": 0.8867899179458618,
"learning_rate": 3.910152723075322e-08,
"loss": 0.0282,
"num_input_tokens_seen": 2721024,
"step": 1020
},
{
"epoch": 2.952654232424677,
"grad_norm": 2.2085297107696533,
"learning_rate": 1.3309196860701867e-08,
"loss": 0.049,
"num_input_tokens_seen": 2746976,
"step": 1030
},
{
"epoch": 2.981348637015782,
"grad_norm": 1.866648554801941,
"learning_rate": 1.0866126348507699e-09,
"loss": 0.0376,
"num_input_tokens_seen": 2773152,
"step": 1040
},
{
"epoch": 2.9928263988522237,
"num_input_tokens_seen": 2783872,
"step": 1044,
"total_flos": 1.191738412451758e+17,
"train_loss": 0.08308002222828938,
"train_runtime": 963.6682,
"train_samples_per_second": 8.673,
"train_steps_per_second": 1.083
}
],
"logging_steps": 10,
"max_steps": 1044,
"num_input_tokens_seen": 2783872,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.191738412451758e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}