Qwen2-Math-1.5B-MetaMathQA / trainer_state.json
Lorry0727's picture
Upload folder using huggingface_hub
9dfecfd verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 772,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012961762799740765,
"grad_norm": 3.271511827979835,
"learning_rate": 5.76923076923077e-06,
"loss": 0.5184,
"step": 10
},
{
"epoch": 0.02592352559948153,
"grad_norm": 0.831092338338801,
"learning_rate": 1.217948717948718e-05,
"loss": 0.3124,
"step": 20
},
{
"epoch": 0.03888528839922229,
"grad_norm": 0.8169740852721643,
"learning_rate": 1.858974358974359e-05,
"loss": 0.244,
"step": 30
},
{
"epoch": 0.05184705119896306,
"grad_norm": 0.8022441079176706,
"learning_rate": 2.5e-05,
"loss": 0.2194,
"step": 40
},
{
"epoch": 0.06480881399870382,
"grad_norm": 0.746730565456582,
"learning_rate": 3.141025641025641e-05,
"loss": 0.1932,
"step": 50
},
{
"epoch": 0.07777057679844458,
"grad_norm": 0.45890023499187765,
"learning_rate": 3.782051282051282e-05,
"loss": 0.175,
"step": 60
},
{
"epoch": 0.09073233959818536,
"grad_norm": 0.22457234130494264,
"learning_rate": 4.423076923076923e-05,
"loss": 0.1627,
"step": 70
},
{
"epoch": 0.10369410239792612,
"grad_norm": 0.20882952808150415,
"learning_rate": 4.999974385252693e-05,
"loss": 0.1604,
"step": 80
},
{
"epoch": 0.11665586519766688,
"grad_norm": 0.20503246990709634,
"learning_rate": 4.996901250644663e-05,
"loss": 0.1563,
"step": 90
},
{
"epoch": 0.12961762799740764,
"grad_norm": 0.18969366345140587,
"learning_rate": 4.9887123814116815e-05,
"loss": 0.158,
"step": 100
},
{
"epoch": 0.1425793907971484,
"grad_norm": 0.18632569690899564,
"learning_rate": 4.9754245551823644e-05,
"loss": 0.1564,
"step": 110
},
{
"epoch": 0.15554115359688916,
"grad_norm": 0.18117655622812814,
"learning_rate": 4.957064996498616e-05,
"loss": 0.154,
"step": 120
},
{
"epoch": 0.16850291639662995,
"grad_norm": 0.16588546566690052,
"learning_rate": 4.9336713210370824e-05,
"loss": 0.1523,
"step": 130
},
{
"epoch": 0.18146467919637072,
"grad_norm": 0.18912029718121498,
"learning_rate": 4.905291458540893e-05,
"loss": 0.1511,
"step": 140
},
{
"epoch": 0.19442644199611148,
"grad_norm": 0.17373091061587545,
"learning_rate": 4.871983554619603e-05,
"loss": 0.1503,
"step": 150
},
{
"epoch": 0.20738820479585224,
"grad_norm": 0.17290195615203574,
"learning_rate": 4.833815851618534e-05,
"loss": 0.1514,
"step": 160
},
{
"epoch": 0.220349967595593,
"grad_norm": 0.17919206762895043,
"learning_rate": 4.7908665488015724e-05,
"loss": 0.15,
"step": 170
},
{
"epoch": 0.23331173039533376,
"grad_norm": 0.16987143452586112,
"learning_rate": 4.7432236421339085e-05,
"loss": 0.1481,
"step": 180
},
{
"epoch": 0.24627349319507452,
"grad_norm": 0.18456595982528615,
"learning_rate": 4.690984743992968e-05,
"loss": 0.1479,
"step": 190
},
{
"epoch": 0.2592352559948153,
"grad_norm": 0.16935904456944317,
"learning_rate": 4.6342568831769154e-05,
"loss": 0.1476,
"step": 200
},
{
"epoch": 0.27219701879455604,
"grad_norm": 0.1686721280050141,
"learning_rate": 4.5731562856204766e-05,
"loss": 0.1456,
"step": 210
},
{
"epoch": 0.2851587815942968,
"grad_norm": 0.17009760438578003,
"learning_rate": 4.507808136267367e-05,
"loss": 0.1464,
"step": 220
},
{
"epoch": 0.29812054439403757,
"grad_norm": 0.16186269370223397,
"learning_rate": 4.4383463225872e-05,
"loss": 0.1454,
"step": 230
},
{
"epoch": 0.31108230719377833,
"grad_norm": 0.1567841142418867,
"learning_rate": 4.3649131602623684e-05,
"loss": 0.1461,
"step": 240
},
{
"epoch": 0.32404406999351915,
"grad_norm": 0.15669189958261343,
"learning_rate": 4.2876591016069276e-05,
"loss": 0.1425,
"step": 250
},
{
"epoch": 0.3370058327932599,
"grad_norm": 0.15932597917665003,
"learning_rate": 4.206742427314869e-05,
"loss": 0.142,
"step": 260
},
{
"epoch": 0.34996759559300067,
"grad_norm": 0.1619448915136352,
"learning_rate": 4.122328922169354e-05,
"loss": 0.1439,
"step": 270
},
{
"epoch": 0.36292935839274143,
"grad_norm": 0.15820361686037115,
"learning_rate": 4.034591535377315e-05,
"loss": 0.1431,
"step": 280
},
{
"epoch": 0.3758911211924822,
"grad_norm": 0.15375142675453407,
"learning_rate": 3.9437100262253444e-05,
"loss": 0.1439,
"step": 290
},
{
"epoch": 0.38885288399222295,
"grad_norm": 0.15083396004850522,
"learning_rate": 3.849870595782879e-05,
"loss": 0.1394,
"step": 300
},
{
"epoch": 0.4018146467919637,
"grad_norm": 0.15309832243642346,
"learning_rate": 3.7532655054072175e-05,
"loss": 0.1402,
"step": 310
},
{
"epoch": 0.4147764095917045,
"grad_norm": 0.15212599623263634,
"learning_rate": 3.65409268283205e-05,
"loss": 0.1394,
"step": 320
},
{
"epoch": 0.42773817239144524,
"grad_norm": 0.16388840607478222,
"learning_rate": 3.5525553166464995e-05,
"loss": 0.1393,
"step": 330
},
{
"epoch": 0.440699935191186,
"grad_norm": 0.14965585253954922,
"learning_rate": 3.4488614399955655e-05,
"loss": 0.1401,
"step": 340
},
{
"epoch": 0.45366169799092676,
"grad_norm": 0.14839886982769007,
"learning_rate": 3.343223504354868e-05,
"loss": 0.1388,
"step": 350
},
{
"epoch": 0.4666234607906675,
"grad_norm": 0.15069527899368187,
"learning_rate": 3.2358579442529756e-05,
"loss": 0.1397,
"step": 360
},
{
"epoch": 0.4795852235904083,
"grad_norm": 0.16024683724642488,
"learning_rate": 3.1269847338331195e-05,
"loss": 0.1393,
"step": 370
},
{
"epoch": 0.49254698639014904,
"grad_norm": 0.15567268677689633,
"learning_rate": 3.016826936162822e-05,
"loss": 0.1372,
"step": 380
},
{
"epoch": 0.5055087491898899,
"grad_norm": 0.152409091505418,
"learning_rate": 2.905610246214846e-05,
"loss": 0.1379,
"step": 390
},
{
"epoch": 0.5184705119896306,
"grad_norm": 0.15469090492982207,
"learning_rate": 2.7935625284557933e-05,
"loss": 0.1363,
"step": 400
},
{
"epoch": 0.5314322747893714,
"grad_norm": 0.15396385907985133,
"learning_rate": 2.6809133499897853e-05,
"loss": 0.1355,
"step": 410
},
{
"epoch": 0.5443940375891121,
"grad_norm": 0.1509398233656185,
"learning_rate": 2.567893510213716e-05,
"loss": 0.1357,
"step": 420
},
{
"epoch": 0.5573558003888529,
"grad_norm": 0.1517027742189059,
"learning_rate": 2.4547345679477424e-05,
"loss": 0.1356,
"step": 430
},
{
"epoch": 0.5703175631885936,
"grad_norm": 0.1445296123427847,
"learning_rate": 2.3416683670098457e-05,
"loss": 0.1335,
"step": 440
},
{
"epoch": 0.5832793259883344,
"grad_norm": 0.14623295764555178,
"learning_rate": 2.22892656120648e-05,
"loss": 0.134,
"step": 450
},
{
"epoch": 0.5962410887880751,
"grad_norm": 0.1438322715440882,
"learning_rate": 2.1167401397125193e-05,
"loss": 0.1311,
"step": 460
},
{
"epoch": 0.609202851587816,
"grad_norm": 0.13535718612249212,
"learning_rate": 2.0053389538129257e-05,
"loss": 0.1324,
"step": 470
},
{
"epoch": 0.6221646143875567,
"grad_norm": 0.14161349156500758,
"learning_rate": 1.8949512459757668e-05,
"loss": 0.1326,
"step": 480
},
{
"epoch": 0.6351263771872975,
"grad_norm": 0.15638548124890905,
"learning_rate": 1.7858031822214284e-05,
"loss": 0.1316,
"step": 490
},
{
"epoch": 0.6480881399870383,
"grad_norm": 0.13753009005968514,
"learning_rate": 1.678118388746118e-05,
"loss": 0.1318,
"step": 500
},
{
"epoch": 0.661049902786779,
"grad_norm": 0.13951872960880884,
"learning_rate": 1.5721174937490584e-05,
"loss": 0.128,
"step": 510
},
{
"epoch": 0.6740116655865198,
"grad_norm": 0.15062860291892982,
"learning_rate": 1.4680176754020627e-05,
"loss": 0.1329,
"step": 520
},
{
"epoch": 0.6869734283862605,
"grad_norm": 0.1373383252389213,
"learning_rate": 1.3660322168876483e-05,
"loss": 0.1314,
"step": 530
},
{
"epoch": 0.6999351911860013,
"grad_norm": 0.1450933429238937,
"learning_rate": 1.2663700694173325e-05,
"loss": 0.1307,
"step": 540
},
{
"epoch": 0.712896953985742,
"grad_norm": 0.1392322353140692,
"learning_rate": 1.1692354241254183e-05,
"loss": 0.1305,
"step": 550
},
{
"epoch": 0.7258587167854829,
"grad_norm": 0.14054841459970466,
"learning_rate": 1.0748272937153824e-05,
"loss": 0.1312,
"step": 560
},
{
"epoch": 0.7388204795852236,
"grad_norm": 0.14860896958457903,
"learning_rate": 9.83339104716002e-06,
"loss": 0.1292,
"step": 570
},
{
"epoch": 0.7517822423849644,
"grad_norm": 0.13529195006367872,
"learning_rate": 8.949583011826313e-06,
"loss": 0.1289,
"step": 580
},
{
"epoch": 0.7647440051847051,
"grad_norm": 0.1324125611532461,
"learning_rate": 8.098659606555617e-06,
"loss": 0.1281,
"step": 590
},
{
"epoch": 0.7777057679844459,
"grad_norm": 0.13800340855022483,
"learning_rate": 7.282364231623137e-06,
"loss": 0.1276,
"step": 600
},
{
"epoch": 0.7906675307841866,
"grad_norm": 0.13855130020381573,
"learning_rate": 6.502369340239678e-06,
"loss": 0.1275,
"step": 610
},
{
"epoch": 0.8036292935839274,
"grad_norm": 0.1357606689573841,
"learning_rate": 5.76027301197371e-06,
"loss": 0.1296,
"step": 620
},
{
"epoch": 0.8165910563836681,
"grad_norm": 0.137168244596666,
"learning_rate": 5.057595678552596e-06,
"loss": 0.1255,
"step": 630
},
{
"epoch": 0.829552819183409,
"grad_norm": 0.13260125486622454,
"learning_rate": 4.395777008751317e-06,
"loss": 0.1277,
"step": 640
},
{
"epoch": 0.8425145819831497,
"grad_norm": 0.1322898739724209,
"learning_rate": 3.776172958751012e-06,
"loss": 0.1276,
"step": 650
},
{
"epoch": 0.8554763447828905,
"grad_norm": 0.14011701850297203,
"learning_rate": 3.2000529940107353e-06,
"loss": 0.1289,
"step": 660
},
{
"epoch": 0.8684381075826313,
"grad_norm": 0.14213930494271382,
"learning_rate": 2.668597488344232e-06,
"loss": 0.1282,
"step": 670
},
{
"epoch": 0.881399870382372,
"grad_norm": 0.14207443413888324,
"learning_rate": 2.1828953055306468e-06,
"loss": 0.126,
"step": 680
},
{
"epoch": 0.8943616331821128,
"grad_norm": 0.14029242432499878,
"learning_rate": 1.7439415684141063e-06,
"loss": 0.1283,
"step": 690
},
{
"epoch": 0.9073233959818535,
"grad_norm": 0.13420810652236514,
"learning_rate": 1.3526356200628005e-06,
"loss": 0.1275,
"step": 700
},
{
"epoch": 0.9202851587815943,
"grad_norm": 0.13435026638625558,
"learning_rate": 1.009779181164891e-06,
"loss": 0.1287,
"step": 710
},
{
"epoch": 0.933246921581335,
"grad_norm": 0.1366326068760381,
"learning_rate": 7.160747074363927e-07,
"loss": 0.1276,
"step": 720
},
{
"epoch": 0.9462086843810759,
"grad_norm": 0.1369464774984695,
"learning_rate": 4.7212395040647783e-07,
"loss": 0.1274,
"step": 730
},
{
"epoch": 0.9591704471808166,
"grad_norm": 0.13491429289666754,
"learning_rate": 2.784267245288408e-07,
"loss": 0.128,
"step": 740
},
{
"epoch": 0.9721322099805574,
"grad_norm": 0.14545746024987635,
"learning_rate": 1.3537988314516748e-07,
"loss": 0.128,
"step": 750
},
{
"epoch": 0.9850939727802981,
"grad_norm": 0.14001699440802526,
"learning_rate": 4.3276505398764935e-08,
"loss": 0.1261,
"step": 760
},
{
"epoch": 0.9980557355800389,
"grad_norm": 0.1401206033433944,
"learning_rate": 2.3052957642238915e-09,
"loss": 0.1234,
"step": 770
},
{
"epoch": 1.0,
"step": 772,
"total_flos": 190452488863744.0,
"train_loss": 0.1484613320138788,
"train_runtime": 5530.0522,
"train_samples_per_second": 71.428,
"train_steps_per_second": 0.14
}
],
"logging_steps": 10,
"max_steps": 772,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 190452488863744.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}