both_capaug_pack_22528 / trainer_state.json
LHL3341's picture
upload checkpoint
1762150 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 867,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03466204506065858,
"grad_norm": 1.614887252818893,
"learning_rate": 2.0689655172413796e-06,
"loss": 0.6065,
"step": 10
},
{
"epoch": 0.06932409012131716,
"grad_norm": 1.2876954446946685,
"learning_rate": 4.367816091954023e-06,
"loss": 0.5347,
"step": 20
},
{
"epoch": 0.10398613518197573,
"grad_norm": 0.4989329651688657,
"learning_rate": 6.666666666666667e-06,
"loss": 0.4714,
"step": 30
},
{
"epoch": 0.1386481802426343,
"grad_norm": 0.36809008403990967,
"learning_rate": 8.965517241379312e-06,
"loss": 0.4476,
"step": 40
},
{
"epoch": 0.1733102253032929,
"grad_norm": 0.3015275688562798,
"learning_rate": 1.1264367816091955e-05,
"loss": 0.4157,
"step": 50
},
{
"epoch": 0.20797227036395147,
"grad_norm": 0.23325709345466544,
"learning_rate": 1.3563218390804598e-05,
"loss": 0.4014,
"step": 60
},
{
"epoch": 0.24263431542461006,
"grad_norm": 0.2599261399956402,
"learning_rate": 1.586206896551724e-05,
"loss": 0.3918,
"step": 70
},
{
"epoch": 0.2772963604852686,
"grad_norm": 0.23676671921370496,
"learning_rate": 1.8160919540229885e-05,
"loss": 0.391,
"step": 80
},
{
"epoch": 0.3119584055459272,
"grad_norm": 0.24237959936856435,
"learning_rate": 1.9999675557165282e-05,
"loss": 0.3859,
"step": 90
},
{
"epoch": 0.3466204506065858,
"grad_norm": 0.4446903057333395,
"learning_rate": 1.998832226832327e-05,
"loss": 0.3805,
"step": 100
},
{
"epoch": 0.38128249566724437,
"grad_norm": 0.3211792221924684,
"learning_rate": 1.9960767884236132e-05,
"loss": 0.3806,
"step": 110
},
{
"epoch": 0.41594454072790293,
"grad_norm": 0.27403637242653317,
"learning_rate": 1.9917057098215624e-05,
"loss": 0.3805,
"step": 120
},
{
"epoch": 0.4506065857885615,
"grad_norm": 0.2508084333447016,
"learning_rate": 1.985726080931651e-05,
"loss": 0.3741,
"step": 130
},
{
"epoch": 0.4852686308492201,
"grad_norm": 0.28374356186453464,
"learning_rate": 1.9781476007338058e-05,
"loss": 0.3739,
"step": 140
},
{
"epoch": 0.5199306759098787,
"grad_norm": 0.35539697186524694,
"learning_rate": 1.968982561550621e-05,
"loss": 0.3678,
"step": 150
},
{
"epoch": 0.5545927209705372,
"grad_norm": 0.2613425020538643,
"learning_rate": 1.9582458291091664e-05,
"loss": 0.3647,
"step": 160
},
{
"epoch": 0.5892547660311959,
"grad_norm": 0.3109549671591506,
"learning_rate": 1.9459548184287254e-05,
"loss": 0.3638,
"step": 170
},
{
"epoch": 0.6239168110918544,
"grad_norm": 0.3368502474991166,
"learning_rate": 1.932129465573568e-05,
"loss": 0.3626,
"step": 180
},
{
"epoch": 0.658578856152513,
"grad_norm": 0.2670073589604095,
"learning_rate": 1.9167921953165827e-05,
"loss": 0.3635,
"step": 190
},
{
"epoch": 0.6932409012131716,
"grad_norm": 0.2916079955306212,
"learning_rate": 1.8999678847662124e-05,
"loss": 0.3597,
"step": 200
},
{
"epoch": 0.7279029462738301,
"grad_norm": 0.30166318114659063,
"learning_rate": 1.881683823015694e-05,
"loss": 0.3555,
"step": 210
},
{
"epoch": 0.7625649913344887,
"grad_norm": 0.2713616440874116,
"learning_rate": 1.8619696668800494e-05,
"loss": 0.3586,
"step": 220
},
{
"epoch": 0.7972270363951474,
"grad_norm": 0.26515561290090994,
"learning_rate": 1.8408573927926225e-05,
"loss": 0.3617,
"step": 230
},
{
"epoch": 0.8318890814558059,
"grad_norm": 0.2813509663127385,
"learning_rate": 1.818381244939187e-05,
"loss": 0.3556,
"step": 240
},
{
"epoch": 0.8665511265164645,
"grad_norm": 0.346388313968459,
"learning_rate": 1.7945776797137544e-05,
"loss": 0.3517,
"step": 250
},
{
"epoch": 0.901213171577123,
"grad_norm": 0.25559948474750327,
"learning_rate": 1.769485306586166e-05,
"loss": 0.3531,
"step": 260
},
{
"epoch": 0.9358752166377816,
"grad_norm": 0.23567303867404224,
"learning_rate": 1.7431448254773943e-05,
"loss": 0.354,
"step": 270
},
{
"epoch": 0.9705372616984402,
"grad_norm": 0.2507120585300497,
"learning_rate": 1.715598960744121e-05,
"loss": 0.353,
"step": 280
},
{
"epoch": 1.0034662045060658,
"grad_norm": 0.3235414509087519,
"learning_rate": 1.6868923918796753e-05,
"loss": 0.3479,
"step": 290
},
{
"epoch": 1.0381282495667243,
"grad_norm": 0.29249432292924354,
"learning_rate": 1.657071681043731e-05,
"loss": 0.3297,
"step": 300
},
{
"epoch": 1.072790294627383,
"grad_norm": 0.28107913308047194,
"learning_rate": 1.626185197538314e-05,
"loss": 0.3216,
"step": 310
},
{
"epoch": 1.1074523396880416,
"grad_norm": 0.2388850688621256,
"learning_rate": 1.5942830393526176e-05,
"loss": 0.3313,
"step": 320
},
{
"epoch": 1.1421143847487,
"grad_norm": 0.23828446739527873,
"learning_rate": 1.561416951903881e-05,
"loss": 0.3274,
"step": 330
},
{
"epoch": 1.1767764298093588,
"grad_norm": 0.24738586978364616,
"learning_rate": 1.527640244106133e-05,
"loss": 0.3261,
"step": 340
},
{
"epoch": 1.2114384748700173,
"grad_norm": 0.24976026828074924,
"learning_rate": 1.4930077019029376e-05,
"loss": 0.3277,
"step": 350
},
{
"epoch": 1.2461005199306758,
"grad_norm": 0.31665221410517264,
"learning_rate": 1.4575754994043956e-05,
"loss": 0.3242,
"step": 360
},
{
"epoch": 1.2807625649913346,
"grad_norm": 0.25301472762117483,
"learning_rate": 1.4214011077725293e-05,
"loss": 0.3296,
"step": 370
},
{
"epoch": 1.315424610051993,
"grad_norm": 0.3032496960590633,
"learning_rate": 1.3845432020028511e-05,
"loss": 0.3243,
"step": 380
},
{
"epoch": 1.3500866551126516,
"grad_norm": 0.2144539306745234,
"learning_rate": 1.347061565753303e-05,
"loss": 0.3223,
"step": 390
},
{
"epoch": 1.38474870017331,
"grad_norm": 0.25316588193158246,
"learning_rate": 1.3090169943749475e-05,
"loss": 0.3269,
"step": 400
},
{
"epoch": 1.4194107452339688,
"grad_norm": 0.23335401936650935,
"learning_rate": 1.270471196301684e-05,
"loss": 0.3233,
"step": 410
},
{
"epoch": 1.4540727902946273,
"grad_norm": 0.25740474483694903,
"learning_rate": 1.2314866929589434e-05,
"loss": 0.3257,
"step": 420
},
{
"epoch": 1.4887348353552858,
"grad_norm": 0.22583757323894366,
"learning_rate": 1.1921267173537085e-05,
"loss": 0.3258,
"step": 430
},
{
"epoch": 1.5233968804159446,
"grad_norm": 0.21072089179588632,
"learning_rate": 1.1524551115103455e-05,
"loss": 0.3198,
"step": 440
},
{
"epoch": 1.558058925476603,
"grad_norm": 0.2075217279484371,
"learning_rate": 1.1125362229186056e-05,
"loss": 0.3213,
"step": 450
},
{
"epoch": 1.5927209705372616,
"grad_norm": 0.20733188488510618,
"learning_rate": 1.0724348001617626e-05,
"loss": 0.3189,
"step": 460
},
{
"epoch": 1.6273830155979203,
"grad_norm": 0.23700844887424702,
"learning_rate": 1.0322158878941733e-05,
"loss": 0.3238,
"step": 470
},
{
"epoch": 1.6620450606585788,
"grad_norm": 0.2303410797551013,
"learning_rate": 9.919447213386103e-06,
"loss": 0.3188,
"step": 480
},
{
"epoch": 1.6967071057192373,
"grad_norm": 0.2545430511134805,
"learning_rate": 9.516866204744932e-06,
"loss": 0.3185,
"step": 490
},
{
"epoch": 1.731369150779896,
"grad_norm": 0.22010025072217687,
"learning_rate": 9.115068840886418e-06,
"loss": 0.3207,
"step": 500
},
{
"epoch": 1.7660311958405546,
"grad_norm": 0.20716728270454912,
"learning_rate": 8.714706838604056e-06,
"loss": 0.324,
"step": 510
},
{
"epoch": 1.800693240901213,
"grad_norm": 0.22069615326048286,
"learning_rate": 8.316429586529616e-06,
"loss": 0.3199,
"step": 520
},
{
"epoch": 1.8353552859618718,
"grad_norm": 0.23991187483045295,
"learning_rate": 7.92088309182241e-06,
"loss": 0.3211,
"step": 530
},
{
"epoch": 1.8700173310225303,
"grad_norm": 0.2151602225059441,
"learning_rate": 7.5287089323433035e-06,
"loss": 0.321,
"step": 540
},
{
"epoch": 1.9046793760831888,
"grad_norm": 0.20354266611589614,
"learning_rate": 7.140543216013109e-06,
"loss": 0.3169,
"step": 550
},
{
"epoch": 1.9393414211438476,
"grad_norm": 0.18489564276501388,
"learning_rate": 6.757015549043174e-06,
"loss": 0.3217,
"step": 560
},
{
"epoch": 1.974003466204506,
"grad_norm": 0.20059993408138063,
"learning_rate": 6.378748014711834e-06,
"loss": 0.3183,
"step": 570
},
{
"epoch": 2.0069324090121317,
"grad_norm": 0.20918648314950533,
"learning_rate": 6.006354164343047e-06,
"loss": 0.3143,
"step": 580
},
{
"epoch": 2.0415944540727904,
"grad_norm": 0.21120459555296436,
"learning_rate": 5.640438022123898e-06,
"loss": 0.2945,
"step": 590
},
{
"epoch": 2.0762564991334487,
"grad_norm": 0.19259124342978656,
"learning_rate": 5.28159310537518e-06,
"loss": 0.2964,
"step": 600
},
{
"epoch": 2.1109185441941074,
"grad_norm": 0.19456637048215897,
"learning_rate": 4.930401461864099e-06,
"loss": 0.2962,
"step": 610
},
{
"epoch": 2.145580589254766,
"grad_norm": 0.19317531728305584,
"learning_rate": 4.587432725720687e-06,
"loss": 0.2991,
"step": 620
},
{
"epoch": 2.1802426343154244,
"grad_norm": 0.1856506218049172,
"learning_rate": 4.2532431934891646e-06,
"loss": 0.2964,
"step": 630
},
{
"epoch": 2.214904679376083,
"grad_norm": 0.1856754928750834,
"learning_rate": 3.9283749218128885e-06,
"loss": 0.2995,
"step": 640
},
{
"epoch": 2.249566724436742,
"grad_norm": 0.184849964757185,
"learning_rate": 3.6133548482165225e-06,
"loss": 0.295,
"step": 650
},
{
"epoch": 2.2842287694974,
"grad_norm": 0.19565957222744196,
"learning_rate": 3.308693936411421e-06,
"loss": 0.2979,
"step": 660
},
{
"epoch": 2.318890814558059,
"grad_norm": 0.18136403678035376,
"learning_rate": 3.0148863475106315e-06,
"loss": 0.2979,
"step": 670
},
{
"epoch": 2.3535528596187176,
"grad_norm": 0.19360555697632642,
"learning_rate": 2.73240863849777e-06,
"loss": 0.2992,
"step": 680
},
{
"epoch": 2.388214904679376,
"grad_norm": 0.1775757947686451,
"learning_rate": 2.4617189892498326e-06,
"loss": 0.2935,
"step": 690
},
{
"epoch": 2.4228769497400346,
"grad_norm": 0.16925773266551042,
"learning_rate": 2.2032564593677773e-06,
"loss": 0.2954,
"step": 700
},
{
"epoch": 2.4575389948006934,
"grad_norm": 0.16916508338919892,
"learning_rate": 1.9574402760202315e-06,
"loss": 0.2955,
"step": 710
},
{
"epoch": 2.4922010398613517,
"grad_norm": 0.18075348143642583,
"learning_rate": 1.7246691539555027e-06,
"loss": 0.2922,
"step": 720
},
{
"epoch": 2.5268630849220104,
"grad_norm": 0.1850332211086371,
"learning_rate": 1.5053206487847916e-06,
"loss": 0.2955,
"step": 730
},
{
"epoch": 2.561525129982669,
"grad_norm": 0.16621824099266747,
"learning_rate": 1.2997505445856085e-06,
"loss": 0.2962,
"step": 740
},
{
"epoch": 2.5961871750433274,
"grad_norm": 0.16991443069932874,
"learning_rate": 1.1082922768187098e-06,
"loss": 0.2956,
"step": 750
},
{
"epoch": 2.630849220103986,
"grad_norm": 0.16292018023051463,
"learning_rate": 9.312563914945461e-07,
"loss": 0.2943,
"step": 760
},
{
"epoch": 2.665511265164645,
"grad_norm": 0.16599730368446816,
"learning_rate": 7.689300414665124e-07,
"loss": 0.2947,
"step": 770
},
{
"epoch": 2.700173310225303,
"grad_norm": 0.18732619784560842,
"learning_rate": 6.215765206679569e-07,
"loss": 0.2975,
"step": 780
},
{
"epoch": 2.734835355285962,
"grad_norm": 0.3651555640716956,
"learning_rate": 4.894348370484648e-07,
"loss": 0.2976,
"step": 790
},
{
"epoch": 2.76949740034662,
"grad_norm": 0.15975086160134086,
"learning_rate": 3.7271932490209327e-07,
"loss": 0.2961,
"step": 800
},
{
"epoch": 2.804159445407279,
"grad_norm": 0.16718571110994368,
"learning_rate": 2.716192972163556e-07,
"loss": 0.2969,
"step": 810
},
{
"epoch": 2.8388214904679376,
"grad_norm": 0.16340541384128912,
"learning_rate": 1.8629873860586567e-07,
"loss": 0.2958,
"step": 820
},
{
"epoch": 2.873483535528596,
"grad_norm": 0.17190266146012298,
"learning_rate": 1.1689603932869664e-07,
"loss": 0.3001,
"step": 830
},
{
"epoch": 2.9081455805892547,
"grad_norm": 0.1591106267026665,
"learning_rate": 6.352377081687011e-08,
"loss": 0.2943,
"step": 840
},
{
"epoch": 2.9428076256499134,
"grad_norm": 0.1674267163284321,
"learning_rate": 2.6268503085089547e-08,
"loss": 0.2982,
"step": 850
},
{
"epoch": 2.9774696707105717,
"grad_norm": 0.15742123030957314,
"learning_rate": 5.190664313851068e-09,
"loss": 0.2965,
"step": 860
}
],
"logging_steps": 10,
"max_steps": 867,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5763988440743936.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}