{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 838,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02386634844868735,
"grad_norm": 22.198865776871017,
"learning_rate": 1.0714285714285714e-06,
"loss": 1.5985,
"step": 10
},
{
"epoch": 0.0477326968973747,
"grad_norm": 3.2541165998224404,
"learning_rate": 2.261904761904762e-06,
"loss": 1.1786,
"step": 20
},
{
"epoch": 0.07159904534606205,
"grad_norm": 2.5041591504453575,
"learning_rate": 3.4523809523809528e-06,
"loss": 0.9456,
"step": 30
},
{
"epoch": 0.0954653937947494,
"grad_norm": 2.080495232890991,
"learning_rate": 4.642857142857144e-06,
"loss": 0.8296,
"step": 40
},
{
"epoch": 0.11933174224343675,
"grad_norm": 1.9634409779528466,
"learning_rate": 5.833333333333334e-06,
"loss": 0.7578,
"step": 50
},
{
"epoch": 0.1431980906921241,
"grad_norm": 1.697242706017287,
"learning_rate": 7.023809523809524e-06,
"loss": 0.7161,
"step": 60
},
{
"epoch": 0.16706443914081145,
"grad_norm": 1.8172294891473797,
"learning_rate": 8.214285714285714e-06,
"loss": 0.7254,
"step": 70
},
{
"epoch": 0.1909307875894988,
"grad_norm": 1.5681357090324912,
"learning_rate": 9.404761904761905e-06,
"loss": 0.6908,
"step": 80
},
{
"epoch": 0.21479713603818615,
"grad_norm": 1.652119506861618,
"learning_rate": 9.998915020921847e-06,
"loss": 0.6295,
"step": 90
},
{
"epoch": 0.2386634844868735,
"grad_norm": 1.8575385662659074,
"learning_rate": 9.990238013323298e-06,
"loss": 0.7128,
"step": 100
},
{
"epoch": 0.26252983293556087,
"grad_norm": 1.845005460336626,
"learning_rate": 9.972899059486629e-06,
"loss": 0.6907,
"step": 110
},
{
"epoch": 0.2863961813842482,
"grad_norm": 1.3478623294347822,
"learning_rate": 9.946928255989507e-06,
"loss": 0.6847,
"step": 120
},
{
"epoch": 0.31026252983293556,
"grad_norm": 1.4018840185317787,
"learning_rate": 9.912370682385866e-06,
"loss": 0.6866,
"step": 130
},
{
"epoch": 0.3341288782816229,
"grad_norm": 1.8134575466661624,
"learning_rate": 9.86928632295779e-06,
"loss": 0.6826,
"step": 140
},
{
"epoch": 0.35799522673031026,
"grad_norm": 1.7336473448072804,
"learning_rate": 9.817749962596115e-06,
"loss": 0.6763,
"step": 150
},
{
"epoch": 0.3818615751789976,
"grad_norm": 1.5075876283514944,
"learning_rate": 9.757851056990446e-06,
"loss": 0.7218,
"step": 160
},
{
"epoch": 0.40572792362768495,
"grad_norm": 1.5160111965341203,
"learning_rate": 9.689693577353917e-06,
"loss": 0.6813,
"step": 170
},
{
"epoch": 0.4295942720763723,
"grad_norm": 1.5232856099293663,
"learning_rate": 9.613395829952233e-06,
"loss": 0.7066,
"step": 180
},
{
"epoch": 0.45346062052505964,
"grad_norm": 1.7920470499490964,
"learning_rate": 9.529090250750234e-06,
"loss": 0.6921,
"step": 190
},
{
"epoch": 0.477326968973747,
"grad_norm": 1.2375823623473665,
"learning_rate": 9.436923175532442e-06,
"loss": 0.6796,
"step": 200
},
{
"epoch": 0.5011933174224343,
"grad_norm": 1.319341491851884,
"learning_rate": 9.337054585896596e-06,
"loss": 0.7005,
"step": 210
},
{
"epoch": 0.5250596658711217,
"grad_norm": 1.527221302956167,
"learning_rate": 9.229657831561082e-06,
"loss": 0.6867,
"step": 220
},
{
"epoch": 0.548926014319809,
"grad_norm": 1.2837983849777186,
"learning_rate": 9.114919329468283e-06,
"loss": 0.6256,
"step": 230
},
{
"epoch": 0.5727923627684964,
"grad_norm": 1.4385178716193245,
"learning_rate": 8.993038240206114e-06,
"loss": 0.6633,
"step": 240
},
{
"epoch": 0.5966587112171837,
"grad_norm": 1.3575850712285624,
"learning_rate": 8.864226122309423e-06,
"loss": 0.649,
"step": 250
},
{
"epoch": 0.6205250596658711,
"grad_norm": 1.3696512745412937,
"learning_rate": 8.728706565041296e-06,
"loss": 0.6946,
"step": 260
},
{
"epoch": 0.6443914081145584,
"grad_norm": 1.3839657939910561,
"learning_rate": 8.586714800291704e-06,
"loss": 0.6845,
"step": 270
},
{
"epoch": 0.6682577565632458,
"grad_norm": 1.2964537533820486,
"learning_rate": 8.438497294267117e-06,
"loss": 0.6991,
"step": 280
},
{
"epoch": 0.6921241050119332,
"grad_norm": 1.4244871422623,
"learning_rate": 8.28431131967984e-06,
"loss": 0.6726,
"step": 290
},
{
"epoch": 0.7159904534606205,
"grad_norm": 1.5421494025118538,
"learning_rate": 8.124424509179648e-06,
"loss": 0.6935,
"step": 300
},
{
"epoch": 0.7398568019093079,
"grad_norm": 1.223951035339654,
"learning_rate": 7.959114390802894e-06,
"loss": 0.6645,
"step": 310
},
{
"epoch": 0.7637231503579952,
"grad_norm": 1.2107381990131096,
"learning_rate": 7.78866790624538e-06,
"loss": 0.6774,
"step": 320
},
{
"epoch": 0.7875894988066826,
"grad_norm": 1.468623638321339,
"learning_rate": 7.613380912795225e-06,
"loss": 0.6811,
"step": 330
},
{
"epoch": 0.8114558472553699,
"grad_norm": 1.1793519797847982,
"learning_rate": 7.4335576697902546e-06,
"loss": 0.6705,
"step": 340
},
{
"epoch": 0.8353221957040573,
"grad_norm": 1.199432894712693,
"learning_rate": 7.249510310491268e-06,
"loss": 0.6694,
"step": 350
},
{
"epoch": 0.8591885441527446,
"grad_norm": 1.5006522362419077,
"learning_rate": 7.0615583002879465e-06,
"loss": 0.6541,
"step": 360
},
{
"epoch": 0.883054892601432,
"grad_norm": 1.3307392430470173,
"learning_rate": 6.870027882177791e-06,
"loss": 0.6804,
"step": 370
},
{
"epoch": 0.9069212410501193,
"grad_norm": 1.5324129053990416,
"learning_rate": 6.675251510480662e-06,
"loss": 0.6797,
"step": 380
},
{
"epoch": 0.9307875894988067,
"grad_norm": 1.6200646040789068,
"learning_rate": 6.477567273771807e-06,
"loss": 0.6413,
"step": 390
},
{
"epoch": 0.954653937947494,
"grad_norm": 1.469059513496822,
"learning_rate": 6.277318308035109e-06,
"loss": 0.6785,
"step": 400
},
{
"epoch": 0.9785202863961814,
"grad_norm": 1.2149644700216862,
"learning_rate": 6.074852201055121e-06,
"loss": 0.6639,
"step": 410
},
{
"epoch": 1.0023866348448687,
"grad_norm": 1.0025525829016835,
"learning_rate": 5.870520389081782e-06,
"loss": 0.6222,
"step": 420
},
{
"epoch": 1.026252983293556,
"grad_norm": 1.295913557732094,
"learning_rate": 5.664677546815043e-06,
"loss": 0.5493,
"step": 430
},
{
"epoch": 1.0501193317422435,
"grad_norm": 1.1487163460854986,
"learning_rate": 5.457680971768258e-06,
"loss": 0.5344,
"step": 440
},
{
"epoch": 1.0739856801909309,
"grad_norm": 1.3219574700118175,
"learning_rate": 5.249889964078965e-06,
"loss": 0.5275,
"step": 450
},
{
"epoch": 1.097852028639618,
"grad_norm": 1.3770329891016075,
"learning_rate": 5.041665202843543e-06,
"loss": 0.5603,
"step": 460
},
{
"epoch": 1.1217183770883055,
"grad_norm": 1.331643194868804,
"learning_rate": 4.833368120058317e-06,
"loss": 0.5159,
"step": 470
},
{
"epoch": 1.1455847255369929,
"grad_norm": 1.2515049886681966,
"learning_rate": 4.6253602732537685e-06,
"loss": 0.5565,
"step": 480
},
{
"epoch": 1.1694510739856803,
"grad_norm": 1.4687022123622233,
"learning_rate": 4.418002717910887e-06,
"loss": 0.5413,
"step": 490
},
{
"epoch": 1.1933174224343674,
"grad_norm": 1.353625693587157,
"learning_rate": 4.2116553807489255e-06,
"loss": 0.5313,
"step": 500
},
{
"epoch": 1.2171837708830548,
"grad_norm": 1.3516251877378511,
"learning_rate": 4.006676434972474e-06,
"loss": 0.5359,
"step": 510
},
{
"epoch": 1.2410501193317423,
"grad_norm": 1.4559533629414523,
"learning_rate": 3.803421678562213e-06,
"loss": 0.5283,
"step": 520
},
{
"epoch": 1.2649164677804297,
"grad_norm": 1.3830404444432236,
"learning_rate": 3.602243916688548e-06,
"loss": 0.5256,
"step": 530
},
{
"epoch": 1.288782816229117,
"grad_norm": 1.1347395367280129,
"learning_rate": 3.403492349320101e-06,
"loss": 0.5065,
"step": 540
},
{
"epoch": 1.3126491646778042,
"grad_norm": 1.369934655860597,
"learning_rate": 3.2075119650900166e-06,
"loss": 0.5185,
"step": 550
},
{
"epoch": 1.3365155131264916,
"grad_norm": 1.1407666119413948,
"learning_rate": 3.0146429424722277e-06,
"loss": 0.5269,
"step": 560
},
{
"epoch": 1.360381861575179,
"grad_norm": 1.194998910872438,
"learning_rate": 2.82522005930708e-06,
"loss": 0.5579,
"step": 570
},
{
"epoch": 1.3842482100238662,
"grad_norm": 1.4048755491651306,
"learning_rate": 2.6395721117012648e-06,
"loss": 0.5326,
"step": 580
},
{
"epoch": 1.4081145584725536,
"grad_norm": 1.2124990054259197,
"learning_rate": 2.458021343310713e-06,
"loss": 0.5767,
"step": 590
},
{
"epoch": 1.431980906921241,
"grad_norm": 1.2008255986572944,
"learning_rate": 2.2808828859970905e-06,
"loss": 0.5421,
"step": 600
},
{
"epoch": 1.4558472553699284,
"grad_norm": 1.5274746306867921,
"learning_rate": 2.108464212828786e-06,
"loss": 0.537,
"step": 610
},
{
"epoch": 1.4797136038186158,
"grad_norm": 1.2483775974884175,
"learning_rate": 1.9410646043758737e-06,
"loss": 0.5055,
"step": 620
},
{
"epoch": 1.503579952267303,
"grad_norm": 1.2765814338927353,
"learning_rate": 1.7789746292254313e-06,
"loss": 0.5158,
"step": 630
},
{
"epoch": 1.5274463007159904,
"grad_norm": 1.3914988398920136,
"learning_rate": 1.6224756396189216e-06,
"loss": 0.5338,
"step": 640
},
{
"epoch": 1.5513126491646778,
"grad_norm": 1.3639543604430029,
"learning_rate": 1.4718392830871192e-06,
"loss": 0.5501,
"step": 650
},
{
"epoch": 1.575178997613365,
"grad_norm": 1.4507981866997166,
"learning_rate": 1.32732703093025e-06,
"loss": 0.525,
"step": 660
},
{
"epoch": 1.5990453460620526,
"grad_norm": 1.137016746163519,
"learning_rate": 1.1891897243618184e-06,
"loss": 0.5624,
"step": 670
},
{
"epoch": 1.6229116945107398,
"grad_norm": 1.2443489508801573,
"learning_rate": 1.0576671391038996e-06,
"loss": 0.5304,
"step": 680
},
{
"epoch": 1.6467780429594272,
"grad_norm": 1.6061206833052404,
"learning_rate": 9.32987569189675e-07,
"loss": 0.5236,
"step": 690
},
{
"epoch": 1.6706443914081146,
"grad_norm": 1.3532986340010356,
"learning_rate": 8.15367430695636e-07,
"loss": 0.5209,
"step": 700
},
{
"epoch": 1.6945107398568018,
"grad_norm": 1.4515766409844337,
"learning_rate": 7.050108860912752e-07,
"loss": 0.5111,
"step": 710
},
{
"epoch": 1.7183770883054894,
"grad_norm": 1.3284962952790051,
"learning_rate": 6.021094898583269e-07,
"loss": 0.5213,
"step": 720
},
{
"epoch": 1.7422434367541766,
"grad_norm": 1.4635925791592053,
"learning_rate": 5.068418559946864e-07,
"loss": 0.5311,
"step": 730
},
{
"epoch": 1.766109785202864,
"grad_norm": 1.351280688220892,
"learning_rate": 4.193733479801232e-07,
"loss": 0.5125,
"step": 740
},
{
"epoch": 1.7899761336515514,
"grad_norm": 1.3180660321477928,
"learning_rate": 3.398557917419626e-07,
"loss": 0.5108,
"step": 750
},
{
"epoch": 1.8138424821002386,
"grad_norm": 1.4121861566519063,
"learning_rate": 2.6842721211895516e-07,
"loss": 0.5208,
"step": 760
},
{
"epoch": 1.837708830548926,
"grad_norm": 1.3881888133783364,
"learning_rate": 2.0521159328077856e-07,
"loss": 0.5137,
"step": 770
},
{
"epoch": 1.8615751789976134,
"grad_norm": 1.5243893621253777,
"learning_rate": 1.5031866351901182e-07,
"loss": 0.5452,
"step": 780
},
{
"epoch": 1.8854415274463006,
"grad_norm": 1.3779735632404062,
"learning_rate": 1.0384370478316919e-07,
"loss": 0.5489,
"step": 790
},
{
"epoch": 1.9093078758949882,
"grad_norm": 1.3140536094113324,
"learning_rate": 6.58673872923693e-08,
"loss": 0.4922,
"step": 800
},
{
"epoch": 1.9331742243436754,
"grad_norm": 1.2195541397252074,
"learning_rate": 3.645562950973014e-08,
"loss": 0.5346,
"step": 810
},
{
"epoch": 1.9570405727923628,
"grad_norm": 1.2374268027315825,
"learning_rate": 1.5659483722537117e-08,
"loss": 0.475,
"step": 820
},
{
"epoch": 1.9809069212410502,
"grad_norm": 1.4012637588425856,
"learning_rate": 3.5150474267992007e-09,
"loss": 0.5564,
"step": 830
}
],
"logging_steps": 10,
"max_steps": 838,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 25783465213952.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}