{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.989247311827957,
"eval_steps": 500,
"global_step": 696,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.043010752688172046,
"grad_norm": 3.3071749210357666,
"learning_rate": 2.9984721919587606e-05,
"loss": 0.9267,
"num_input_tokens_seen": 26208,
"step": 10
},
{
"epoch": 0.08602150537634409,
"grad_norm": 1.9867345094680786,
"learning_rate": 2.9938918800982563e-05,
"loss": 0.1561,
"num_input_tokens_seen": 53152,
"step": 20
},
{
"epoch": 0.12903225806451613,
"grad_norm": 10.124670028686523,
"learning_rate": 2.9862683948682103e-05,
"loss": 0.1764,
"num_input_tokens_seen": 79776,
"step": 30
},
{
"epoch": 0.17204301075268819,
"grad_norm": 2.0668740272521973,
"learning_rate": 2.975617265898004e-05,
"loss": 0.1505,
"num_input_tokens_seen": 106496,
"step": 40
},
{
"epoch": 0.21505376344086022,
"grad_norm": 2.0054426193237305,
"learning_rate": 2.961960190361624e-05,
"loss": 0.1615,
"num_input_tokens_seen": 133728,
"step": 50
},
{
"epoch": 0.25806451612903225,
"grad_norm": 2.5156946182250977,
"learning_rate": 2.9453249887788343e-05,
"loss": 0.1369,
"num_input_tokens_seen": 159936,
"step": 60
},
{
"epoch": 0.3010752688172043,
"grad_norm": 1.1180003881454468,
"learning_rate": 2.925745548342631e-05,
"loss": 0.1253,
"num_input_tokens_seen": 186240,
"step": 70
},
{
"epoch": 0.34408602150537637,
"grad_norm": 13.929261207580566,
"learning_rate": 2.9032617538884018e-05,
"loss": 0.1479,
"num_input_tokens_seen": 212832,
"step": 80
},
{
"epoch": 0.3870967741935484,
"grad_norm": 2.071075439453125,
"learning_rate": 2.877919406645433e-05,
"loss": 0.1102,
"num_input_tokens_seen": 239648,
"step": 90
},
{
"epoch": 0.43010752688172044,
"grad_norm": 2.170992374420166,
"learning_rate": 2.84977013093626e-05,
"loss": 0.099,
"num_input_tokens_seen": 265888,
"step": 100
},
{
"epoch": 0.4731182795698925,
"grad_norm": 5.617093086242676,
"learning_rate": 2.818871269013928e-05,
"loss": 0.107,
"num_input_tokens_seen": 292480,
"step": 110
},
{
"epoch": 0.5161290322580645,
"grad_norm": 6.3603010177612305,
"learning_rate": 2.7852857642513838e-05,
"loss": 0.1183,
"num_input_tokens_seen": 318784,
"step": 120
},
{
"epoch": 0.5591397849462365,
"grad_norm": 3.8436505794525146,
"learning_rate": 2.7490820329209546e-05,
"loss": 0.1097,
"num_input_tokens_seen": 346016,
"step": 130
},
{
"epoch": 0.6021505376344086,
"grad_norm": 1.736432671546936,
"learning_rate": 2.7103338248251055e-05,
"loss": 0.0946,
"num_input_tokens_seen": 372384,
"step": 140
},
{
"epoch": 0.6451612903225806,
"grad_norm": 8.300951957702637,
"learning_rate": 2.6691200730623874e-05,
"loss": 0.1251,
"num_input_tokens_seen": 399328,
"step": 150
},
{
"epoch": 0.6881720430107527,
"grad_norm": 2.540724277496338,
"learning_rate": 2.6255247332346036e-05,
"loss": 0.1069,
"num_input_tokens_seen": 426048,
"step": 160
},
{
"epoch": 0.7311827956989247,
"grad_norm": 1.967483639717102,
"learning_rate": 2.5796366124227532e-05,
"loss": 0.0904,
"num_input_tokens_seen": 452640,
"step": 170
},
{
"epoch": 0.7741935483870968,
"grad_norm": 5.206757545471191,
"learning_rate": 2.531549188280135e-05,
"loss": 0.1273,
"num_input_tokens_seen": 479808,
"step": 180
},
{
"epoch": 0.8172043010752689,
"grad_norm": 3.0387344360351562,
"learning_rate": 2.481360418611132e-05,
"loss": 0.1206,
"num_input_tokens_seen": 506176,
"step": 190
},
{
"epoch": 0.8602150537634409,
"grad_norm": 2.0281670093536377,
"learning_rate": 2.4291725418235848e-05,
"loss": 0.103,
"num_input_tokens_seen": 533216,
"step": 200
},
{
"epoch": 0.9032258064516129,
"grad_norm": 2.5650763511657715,
"learning_rate": 2.3750918686612414e-05,
"loss": 0.0696,
"num_input_tokens_seen": 561056,
"step": 210
},
{
"epoch": 0.946236559139785,
"grad_norm": 8.955713272094727,
"learning_rate": 2.3192285656405456e-05,
"loss": 0.0822,
"num_input_tokens_seen": 588160,
"step": 220
},
{
"epoch": 0.989247311827957,
"grad_norm": 3.034013032913208,
"learning_rate": 2.2616964306329183e-05,
"loss": 0.0913,
"num_input_tokens_seen": 615168,
"step": 230
},
{
"epoch": 1.0301075268817204,
"grad_norm": 2.60020112991333,
"learning_rate": 2.2026126610496852e-05,
"loss": 0.0735,
"num_input_tokens_seen": 639864,
"step": 240
},
{
"epoch": 1.0731182795698926,
"grad_norm": 4.891764163970947,
"learning_rate": 2.1420976151018813e-05,
"loss": 0.0752,
"num_input_tokens_seen": 667224,
"step": 250
},
{
"epoch": 1.1161290322580646,
"grad_norm": 1.1149002313613892,
"learning_rate": 2.0802745666212592e-05,
"loss": 0.0588,
"num_input_tokens_seen": 693848,
"step": 260
},
{
"epoch": 1.1591397849462366,
"grad_norm": 3.1601271629333496,
"learning_rate": 2.0172694539419557e-05,
"loss": 0.0924,
"num_input_tokens_seen": 720568,
"step": 270
},
{
"epoch": 1.2021505376344086,
"grad_norm": 3.555192470550537,
"learning_rate": 1.953210623354359e-05,
"loss": 0.062,
"num_input_tokens_seen": 746872,
"step": 280
},
{
"epoch": 1.2451612903225806,
"grad_norm": 2.235698699951172,
"learning_rate": 1.888228567653781e-05,
"loss": 0.0621,
"num_input_tokens_seen": 773720,
"step": 290
},
{
"epoch": 1.2881720430107526,
"grad_norm": 2.9058539867401123,
"learning_rate": 1.8224556603165363e-05,
"loss": 0.075,
"num_input_tokens_seen": 801464,
"step": 300
},
{
"epoch": 1.3311827956989246,
"grad_norm": 3.204787015914917,
"learning_rate": 1.7560258858449248e-05,
"loss": 0.0858,
"num_input_tokens_seen": 829144,
"step": 310
},
{
"epoch": 1.3741935483870968,
"grad_norm": 1.0885004997253418,
"learning_rate": 1.689074566830434e-05,
"loss": 0.0697,
"num_input_tokens_seen": 855672,
"step": 320
},
{
"epoch": 1.4172043010752688,
"grad_norm": 3.0750925540924072,
"learning_rate": 1.621738088291147e-05,
"loss": 0.0827,
"num_input_tokens_seen": 882424,
"step": 330
},
{
"epoch": 1.4602150537634409,
"grad_norm": 2.689297914505005,
"learning_rate": 1.5541536198449044e-05,
"loss": 0.0651,
"num_input_tokens_seen": 908792,
"step": 340
},
{
"epoch": 1.5032258064516129,
"grad_norm": 2.297851324081421,
"learning_rate": 1.4864588362841808e-05,
"loss": 0.0607,
"num_input_tokens_seen": 935672,
"step": 350
},
{
"epoch": 1.546236559139785,
"grad_norm": 2.712674140930176,
"learning_rate": 1.4187916371218739e-05,
"loss": 0.056,
"num_input_tokens_seen": 961848,
"step": 360
},
{
"epoch": 1.589247311827957,
"grad_norm": 0.8086225986480713,
"learning_rate": 1.3512898656793283e-05,
"loss": 0.0823,
"num_input_tokens_seen": 988600,
"step": 370
},
{
"epoch": 1.632258064516129,
"grad_norm": 2.166210174560547,
"learning_rate": 1.2840910282888211e-05,
"loss": 0.058,
"num_input_tokens_seen": 1014840,
"step": 380
},
{
"epoch": 1.675268817204301,
"grad_norm": 5.169621467590332,
"learning_rate": 1.2173320141825232e-05,
"loss": 0.0705,
"num_input_tokens_seen": 1040856,
"step": 390
},
{
"epoch": 1.718279569892473,
"grad_norm": 1.8176069259643555,
"learning_rate": 1.1511488166385349e-05,
"loss": 0.0514,
"num_input_tokens_seen": 1067544,
"step": 400
},
{
"epoch": 1.761290322580645,
"grad_norm": 3.424694776535034,
"learning_rate": 1.0856762559520605e-05,
"loss": 0.0834,
"num_input_tokens_seen": 1094584,
"step": 410
},
{
"epoch": 1.8043010752688171,
"grad_norm": 1.8838876485824585,
"learning_rate": 1.0210477047960303e-05,
"loss": 0.0583,
"num_input_tokens_seen": 1120760,
"step": 420
},
{
"epoch": 1.8473118279569891,
"grad_norm": 3.7757434844970703,
"learning_rate": 9.573948165306438e-06,
"loss": 0.0922,
"num_input_tokens_seen": 1146776,
"step": 430
},
{
"epoch": 1.8903225806451613,
"grad_norm": 3.0619328022003174,
"learning_rate": 8.948472570152874e-06,
"loss": 0.0633,
"num_input_tokens_seen": 1174424,
"step": 440
},
{
"epoch": 1.9333333333333333,
"grad_norm": 2.5175821781158447,
"learning_rate": 8.33532440469145e-06,
"loss": 0.0597,
"num_input_tokens_seen": 1201048,
"step": 450
},
{
"epoch": 1.9763440860215054,
"grad_norm": 3.6232197284698486,
"learning_rate": 7.735752699185711e-06,
"loss": 0.0491,
"num_input_tokens_seen": 1227576,
"step": 460
},
{
"epoch": 2.0172043010752687,
"grad_norm": 2.8846399784088135,
"learning_rate": 7.150978827599619e-06,
"loss": 0.0341,
"num_input_tokens_seen": 1252160,
"step": 470
},
{
"epoch": 2.0602150537634407,
"grad_norm": 2.196216106414795,
"learning_rate": 6.582194019564266e-06,
"loss": 0.0373,
"num_input_tokens_seen": 1279328,
"step": 480
},
{
"epoch": 2.1032258064516127,
"grad_norm": 1.4616115093231201,
"learning_rate": 6.0305569337509225e-06,
"loss": 0.0281,
"num_input_tokens_seen": 1306304,
"step": 490
},
{
"epoch": 2.146236559139785,
"grad_norm": 0.17581823468208313,
"learning_rate": 5.497191297593647e-06,
"loss": 0.0183,
"num_input_tokens_seen": 1333184,
"step": 500
},
{
"epoch": 2.189247311827957,
"grad_norm": 3.8919403553009033,
"learning_rate": 4.98318361816957e-06,
"loss": 0.0334,
"num_input_tokens_seen": 1359872,
"step": 510
},
{
"epoch": 2.232258064516129,
"grad_norm": 1.3041765689849854,
"learning_rate": 4.4895809688998655e-06,
"loss": 0.0282,
"num_input_tokens_seen": 1387328,
"step": 520
},
{
"epoch": 2.275268817204301,
"grad_norm": 1.669753074645996,
"learning_rate": 4.017388856580178e-06,
"loss": 0.0562,
"num_input_tokens_seen": 1414816,
"step": 530
},
{
"epoch": 2.318279569892473,
"grad_norm": 0.28061679005622864,
"learning_rate": 3.567569173085455e-06,
"loss": 0.0243,
"num_input_tokens_seen": 1441504,
"step": 540
},
{
"epoch": 2.361290322580645,
"grad_norm": 2.324270009994507,
"learning_rate": 3.1410382359217645e-06,
"loss": 0.044,
"num_input_tokens_seen": 1467680,
"step": 550
},
{
"epoch": 2.404301075268817,
"grad_norm": 2.708113670349121,
"learning_rate": 2.7386649216166233e-06,
"loss": 0.0551,
"num_input_tokens_seen": 1494176,
"step": 560
},
{
"epoch": 2.447311827956989,
"grad_norm": 3.16683030128479,
"learning_rate": 2.361268895750264e-06,
"loss": 0.0258,
"num_input_tokens_seen": 1520544,
"step": 570
},
{
"epoch": 2.490322580645161,
"grad_norm": 6.040332794189453,
"learning_rate": 2.0096189432334194e-06,
"loss": 0.0415,
"num_input_tokens_seen": 1547264,
"step": 580
},
{
"epoch": 2.533333333333333,
"grad_norm": 5.078160285949707,
"learning_rate": 1.6844314022329676e-06,
"loss": 0.0375,
"num_input_tokens_seen": 1573920,
"step": 590
},
{
"epoch": 2.576344086021505,
"grad_norm": 4.950022220611572,
"learning_rate": 1.3863687049356465e-06,
"loss": 0.0235,
"num_input_tokens_seen": 1600640,
"step": 600
},
{
"epoch": 2.6193548387096772,
"grad_norm": 1.7687643766403198,
"learning_rate": 1.116038028122413e-06,
"loss": 0.0354,
"num_input_tokens_seen": 1626848,
"step": 610
},
{
"epoch": 2.6623655913978492,
"grad_norm": 3.893580913543701,
"learning_rate": 8.7399005630238e-07,
"loss": 0.0357,
"num_input_tokens_seen": 1653408,
"step": 620
},
{
"epoch": 2.7053763440860212,
"grad_norm": 2.830453395843506,
"learning_rate": 6.607178599258268e-07,
"loss": 0.0512,
"num_input_tokens_seen": 1679968,
"step": 630
},
{
"epoch": 2.7483870967741937,
"grad_norm": 3.638772487640381,
"learning_rate": 4.766558909615504e-07,
"loss": 0.0243,
"num_input_tokens_seen": 1706944,
"step": 640
},
{
"epoch": 2.7913978494623657,
"grad_norm": 1.229244589805603,
"learning_rate": 3.22179097884579e-07,
"loss": 0.0367,
"num_input_tokens_seen": 1733888,
"step": 650
},
{
"epoch": 2.8344086021505377,
"grad_norm": 2.9207515716552734,
"learning_rate": 1.9760216187710788e-07,
"loss": 0.0377,
"num_input_tokens_seen": 1760448,
"step": 660
},
{
"epoch": 2.8774193548387097,
"grad_norm": 3.560971975326538,
"learning_rate": 1.0317885579858522e-07,
"loss": 0.0471,
"num_input_tokens_seen": 1787072,
"step": 670
},
{
"epoch": 2.9204301075268817,
"grad_norm": 0.2031625360250473,
"learning_rate": 3.910152723075322e-08,
"loss": 0.0222,
"num_input_tokens_seen": 1813632,
"step": 680
},
{
"epoch": 2.9634408602150537,
"grad_norm": 4.158380508422852,
"learning_rate": 5.50070665074065e-09,
"loss": 0.0454,
"num_input_tokens_seen": 1840384,
"step": 690
},
{
"epoch": 2.989247311827957,
"num_input_tokens_seen": 1855776,
"step": 696,
"total_flos": 7.944329136203366e+16,
"train_loss": 0.08561765917459097,
"train_runtime": 638.811,
"train_samples_per_second": 8.721,
"train_steps_per_second": 1.09
}
],
"logging_steps": 10,
"max_steps": 696,
"num_input_tokens_seen": 1855776,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.944329136203366e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}