{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 590,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.08547008547008547,
"grad_norm": 4.597737789154053,
"learning_rate": 8e-05,
"loss": 3.144,
"step": 5
},
{
"epoch": 0.17094017094017094,
"grad_norm": 3.6257293224334717,
"learning_rate": 0.00018,
"loss": 1.023,
"step": 10
},
{
"epoch": 0.2564102564102564,
"grad_norm": 0.9387032389640808,
"learning_rate": 0.00019862068965517243,
"loss": 0.3362,
"step": 15
},
{
"epoch": 0.3418803418803419,
"grad_norm": 0.41102728247642517,
"learning_rate": 0.00019689655172413795,
"loss": 0.1727,
"step": 20
},
{
"epoch": 0.42735042735042733,
"grad_norm": 0.504966676235199,
"learning_rate": 0.00019517241379310345,
"loss": 0.168,
"step": 25
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.43475794792175293,
"learning_rate": 0.00019344827586206898,
"loss": 0.1699,
"step": 30
},
{
"epoch": 0.5982905982905983,
"grad_norm": 0.5228849649429321,
"learning_rate": 0.0001917241379310345,
"loss": 0.1454,
"step": 35
},
{
"epoch": 0.6837606837606838,
"grad_norm": 0.4822940230369568,
"learning_rate": 0.00019,
"loss": 0.206,
"step": 40
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.4049399197101593,
"learning_rate": 0.00018827586206896554,
"loss": 0.132,
"step": 45
},
{
"epoch": 0.8547008547008547,
"grad_norm": 0.4594310224056244,
"learning_rate": 0.00018655172413793104,
"loss": 0.1541,
"step": 50
},
{
"epoch": 0.9401709401709402,
"grad_norm": 0.3666519820690155,
"learning_rate": 0.00018482758620689654,
"loss": 0.1144,
"step": 55
},
{
"epoch": 1.017094017094017,
"grad_norm": 0.16928212344646454,
"learning_rate": 0.00018310344827586207,
"loss": 0.1208,
"step": 60
},
{
"epoch": 1.1025641025641026,
"grad_norm": 0.09923699498176575,
"learning_rate": 0.0001813793103448276,
"loss": 0.0761,
"step": 65
},
{
"epoch": 1.188034188034188,
"grad_norm": 0.23626509308815002,
"learning_rate": 0.0001796551724137931,
"loss": 0.0786,
"step": 70
},
{
"epoch": 1.2735042735042734,
"grad_norm": 0.45735999941825867,
"learning_rate": 0.00017793103448275862,
"loss": 0.13,
"step": 75
},
{
"epoch": 1.358974358974359,
"grad_norm": 0.24871651828289032,
"learning_rate": 0.00017620689655172415,
"loss": 0.0814,
"step": 80
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.21524538099765778,
"learning_rate": 0.00017448275862068965,
"loss": 0.0737,
"step": 85
},
{
"epoch": 1.5299145299145298,
"grad_norm": 0.4590378701686859,
"learning_rate": 0.00017275862068965518,
"loss": 0.0955,
"step": 90
},
{
"epoch": 1.6153846153846154,
"grad_norm": 0.7036776542663574,
"learning_rate": 0.0001710344827586207,
"loss": 0.0671,
"step": 95
},
{
"epoch": 1.7008547008547008,
"grad_norm": 0.26162663102149963,
"learning_rate": 0.0001693103448275862,
"loss": 0.0828,
"step": 100
},
{
"epoch": 1.7863247863247862,
"grad_norm": 0.4105569124221802,
"learning_rate": 0.00016758620689655173,
"loss": 0.0768,
"step": 105
},
{
"epoch": 1.8717948717948718,
"grad_norm": 0.3037894666194916,
"learning_rate": 0.00016586206896551726,
"loss": 0.1149,
"step": 110
},
{
"epoch": 1.9572649572649574,
"grad_norm": 0.19420042634010315,
"learning_rate": 0.00016413793103448276,
"loss": 0.0635,
"step": 115
},
{
"epoch": 2.034188034188034,
"grad_norm": 0.13855452835559845,
"learning_rate": 0.0001624137931034483,
"loss": 0.0594,
"step": 120
},
{
"epoch": 2.1196581196581197,
"grad_norm": 0.17749273777008057,
"learning_rate": 0.00016068965517241382,
"loss": 0.0725,
"step": 125
},
{
"epoch": 2.2051282051282053,
"grad_norm": 0.13107630610466003,
"learning_rate": 0.00015896551724137932,
"loss": 0.0619,
"step": 130
},
{
"epoch": 2.2905982905982905,
"grad_norm": 0.11133825778961182,
"learning_rate": 0.00015724137931034485,
"loss": 0.0624,
"step": 135
},
{
"epoch": 2.376068376068376,
"grad_norm": 0.187343031167984,
"learning_rate": 0.00015551724137931037,
"loss": 0.0581,
"step": 140
},
{
"epoch": 2.4615384615384617,
"grad_norm": 0.27685755491256714,
"learning_rate": 0.00015379310344827587,
"loss": 0.0613,
"step": 145
},
{
"epoch": 2.547008547008547,
"grad_norm": 0.4320373833179474,
"learning_rate": 0.0001520689655172414,
"loss": 0.0735,
"step": 150
},
{
"epoch": 2.6324786324786325,
"grad_norm": 0.13862545788288116,
"learning_rate": 0.0001503448275862069,
"loss": 0.0582,
"step": 155
},
{
"epoch": 2.717948717948718,
"grad_norm": 0.7963452339172363,
"learning_rate": 0.00014862068965517243,
"loss": 0.0651,
"step": 160
},
{
"epoch": 2.8034188034188032,
"grad_norm": 0.14564156532287598,
"learning_rate": 0.00014689655172413793,
"loss": 0.0559,
"step": 165
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.15069833397865295,
"learning_rate": 0.00014517241379310346,
"loss": 0.0529,
"step": 170
},
{
"epoch": 2.9743589743589745,
"grad_norm": 0.3557753562927246,
"learning_rate": 0.00014344827586206896,
"loss": 0.0773,
"step": 175
},
{
"epoch": 3.051282051282051,
"grad_norm": 0.08716096729040146,
"learning_rate": 0.0001417241379310345,
"loss": 0.0513,
"step": 180
},
{
"epoch": 3.1367521367521367,
"grad_norm": 0.15282496809959412,
"learning_rate": 0.00014,
"loss": 0.0621,
"step": 185
},
{
"epoch": 3.2222222222222223,
"grad_norm": 0.09816001355648041,
"learning_rate": 0.00013827586206896552,
"loss": 0.0648,
"step": 190
},
{
"epoch": 3.3076923076923075,
"grad_norm": 0.13748367130756378,
"learning_rate": 0.00013655172413793104,
"loss": 0.0485,
"step": 195
},
{
"epoch": 3.393162393162393,
"grad_norm": 0.10656469315290451,
"learning_rate": 0.00013482758620689654,
"loss": 0.0531,
"step": 200
},
{
"epoch": 3.4786324786324787,
"grad_norm": 0.1901499480009079,
"learning_rate": 0.00013310344827586207,
"loss": 0.0612,
"step": 205
},
{
"epoch": 3.564102564102564,
"grad_norm": 0.16148889064788818,
"learning_rate": 0.0001313793103448276,
"loss": 0.0546,
"step": 210
},
{
"epoch": 3.6495726495726495,
"grad_norm": 0.19384047389030457,
"learning_rate": 0.0001296551724137931,
"loss": 0.0589,
"step": 215
},
{
"epoch": 3.735042735042735,
"grad_norm": 0.08794084936380386,
"learning_rate": 0.00012793103448275863,
"loss": 0.0573,
"step": 220
},
{
"epoch": 3.8205128205128203,
"grad_norm": 0.10576070100069046,
"learning_rate": 0.00012620689655172415,
"loss": 0.0471,
"step": 225
},
{
"epoch": 3.905982905982906,
"grad_norm": 0.08111118525266647,
"learning_rate": 0.00012448275862068966,
"loss": 0.0572,
"step": 230
},
{
"epoch": 3.9914529914529915,
"grad_norm": 0.4230298101902008,
"learning_rate": 0.00012275862068965518,
"loss": 0.0617,
"step": 235
},
{
"epoch": 4.068376068376068,
"grad_norm": 0.08736063539981842,
"learning_rate": 0.00012103448275862071,
"loss": 0.0493,
"step": 240
},
{
"epoch": 4.153846153846154,
"grad_norm": 0.06979858875274658,
"learning_rate": 0.00011931034482758621,
"loss": 0.0469,
"step": 245
},
{
"epoch": 4.239316239316239,
"grad_norm": 0.10242439806461334,
"learning_rate": 0.00011758620689655173,
"loss": 0.0508,
"step": 250
},
{
"epoch": 4.3247863247863245,
"grad_norm": 0.11685860902070999,
"learning_rate": 0.00011586206896551725,
"loss": 0.0522,
"step": 255
},
{
"epoch": 4.410256410256411,
"grad_norm": 0.1084512323141098,
"learning_rate": 0.00011413793103448275,
"loss": 0.0519,
"step": 260
},
{
"epoch": 4.495726495726496,
"grad_norm": 0.09368503093719482,
"learning_rate": 0.00011241379310344828,
"loss": 0.0494,
"step": 265
},
{
"epoch": 4.581196581196581,
"grad_norm": 0.1777074784040451,
"learning_rate": 0.00011068965517241381,
"loss": 0.0515,
"step": 270
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.056768111884593964,
"learning_rate": 0.00010896551724137931,
"loss": 0.044,
"step": 275
},
{
"epoch": 4.752136752136752,
"grad_norm": 0.08062291890382767,
"learning_rate": 0.00010724137931034484,
"loss": 0.0476,
"step": 280
},
{
"epoch": 4.837606837606837,
"grad_norm": 0.09975454211235046,
"learning_rate": 0.00010551724137931037,
"loss": 0.0522,
"step": 285
},
{
"epoch": 4.923076923076923,
"grad_norm": 0.14652380347251892,
"learning_rate": 0.00010379310344827587,
"loss": 0.0498,
"step": 290
},
{
"epoch": 5.0,
"grad_norm": 0.12409216165542603,
"learning_rate": 0.0001020689655172414,
"loss": 0.0474,
"step": 295
},
{
"epoch": 5.085470085470085,
"grad_norm": 0.09494274109601974,
"learning_rate": 0.0001003448275862069,
"loss": 0.0462,
"step": 300
},
{
"epoch": 5.170940170940171,
"grad_norm": 0.1240062415599823,
"learning_rate": 9.862068965517242e-05,
"loss": 0.0425,
"step": 305
},
{
"epoch": 5.256410256410256,
"grad_norm": 0.1713438332080841,
"learning_rate": 9.689655172413794e-05,
"loss": 0.0431,
"step": 310
},
{
"epoch": 5.3418803418803416,
"grad_norm": 0.1990644931793213,
"learning_rate": 9.517241379310345e-05,
"loss": 0.048,
"step": 315
},
{
"epoch": 5.427350427350428,
"grad_norm": 0.09711036831140518,
"learning_rate": 9.344827586206896e-05,
"loss": 0.0476,
"step": 320
},
{
"epoch": 5.512820512820513,
"grad_norm": 0.11504214257001877,
"learning_rate": 9.172413793103448e-05,
"loss": 0.0494,
"step": 325
},
{
"epoch": 5.598290598290598,
"grad_norm": 0.08380427211523056,
"learning_rate": 9e-05,
"loss": 0.047,
"step": 330
},
{
"epoch": 5.683760683760684,
"grad_norm": 0.08641541749238968,
"learning_rate": 8.827586206896552e-05,
"loss": 0.0457,
"step": 335
},
{
"epoch": 5.769230769230769,
"grad_norm": 0.0935196503996849,
"learning_rate": 8.655172413793103e-05,
"loss": 0.0489,
"step": 340
},
{
"epoch": 5.854700854700854,
"grad_norm": 0.11386577785015106,
"learning_rate": 8.482758620689656e-05,
"loss": 0.0479,
"step": 345
},
{
"epoch": 5.94017094017094,
"grad_norm": 0.08249244838953018,
"learning_rate": 8.310344827586208e-05,
"loss": 0.0469,
"step": 350
},
{
"epoch": 6.017094017094017,
"grad_norm": 0.09115161001682281,
"learning_rate": 8.137931034482759e-05,
"loss": 0.0455,
"step": 355
},
{
"epoch": 6.102564102564102,
"grad_norm": 0.06610054522752762,
"learning_rate": 7.965517241379312e-05,
"loss": 0.0432,
"step": 360
},
{
"epoch": 6.188034188034188,
"grad_norm": 0.09798604249954224,
"learning_rate": 7.793103448275862e-05,
"loss": 0.0442,
"step": 365
},
{
"epoch": 6.273504273504273,
"grad_norm": 0.12107487767934799,
"learning_rate": 7.620689655172413e-05,
"loss": 0.0418,
"step": 370
},
{
"epoch": 6.358974358974359,
"grad_norm": 0.10651250928640366,
"learning_rate": 7.448275862068966e-05,
"loss": 0.0437,
"step": 375
},
{
"epoch": 6.444444444444445,
"grad_norm": 0.09335967153310776,
"learning_rate": 7.275862068965517e-05,
"loss": 0.044,
"step": 380
},
{
"epoch": 6.52991452991453,
"grad_norm": 0.10894130915403366,
"learning_rate": 7.103448275862069e-05,
"loss": 0.0493,
"step": 385
},
{
"epoch": 6.615384615384615,
"grad_norm": 0.09522519260644913,
"learning_rate": 6.931034482758622e-05,
"loss": 0.0463,
"step": 390
},
{
"epoch": 6.700854700854701,
"grad_norm": 0.09910976886749268,
"learning_rate": 6.758620689655173e-05,
"loss": 0.0427,
"step": 395
},
{
"epoch": 6.786324786324786,
"grad_norm": 0.11286190897226334,
"learning_rate": 6.586206896551724e-05,
"loss": 0.0444,
"step": 400
},
{
"epoch": 6.871794871794872,
"grad_norm": 0.07890793681144714,
"learning_rate": 6.413793103448276e-05,
"loss": 0.0407,
"step": 405
},
{
"epoch": 6.957264957264957,
"grad_norm": 0.08769431710243225,
"learning_rate": 6.241379310344829e-05,
"loss": 0.0479,
"step": 410
},
{
"epoch": 7.034188034188034,
"grad_norm": 0.06925784051418304,
"learning_rate": 6.068965517241379e-05,
"loss": 0.0439,
"step": 415
},
{
"epoch": 7.119658119658119,
"grad_norm": 0.08389502763748169,
"learning_rate": 5.896551724137931e-05,
"loss": 0.0437,
"step": 420
},
{
"epoch": 7.205128205128205,
"grad_norm": 0.10391002893447876,
"learning_rate": 5.7241379310344835e-05,
"loss": 0.042,
"step": 425
},
{
"epoch": 7.2905982905982905,
"grad_norm": 0.09842480719089508,
"learning_rate": 5.551724137931035e-05,
"loss": 0.0407,
"step": 430
},
{
"epoch": 7.3760683760683765,
"grad_norm": 0.09367308020591736,
"learning_rate": 5.379310344827586e-05,
"loss": 0.0422,
"step": 435
},
{
"epoch": 7.461538461538462,
"grad_norm": 0.11631827801465988,
"learning_rate": 5.2068965517241384e-05,
"loss": 0.0453,
"step": 440
},
{
"epoch": 7.547008547008547,
"grad_norm": 0.13546331226825714,
"learning_rate": 5.03448275862069e-05,
"loss": 0.0405,
"step": 445
},
{
"epoch": 7.632478632478632,
"grad_norm": 0.1015164852142334,
"learning_rate": 4.862068965517241e-05,
"loss": 0.0433,
"step": 450
},
{
"epoch": 7.717948717948718,
"grad_norm": 0.12304691225290298,
"learning_rate": 4.689655172413793e-05,
"loss": 0.0439,
"step": 455
},
{
"epoch": 7.803418803418803,
"grad_norm": 0.11133451014757156,
"learning_rate": 4.5172413793103454e-05,
"loss": 0.0404,
"step": 460
},
{
"epoch": 7.888888888888889,
"grad_norm": 0.11199292540550232,
"learning_rate": 4.344827586206897e-05,
"loss": 0.0401,
"step": 465
},
{
"epoch": 7.9743589743589745,
"grad_norm": 0.10854869335889816,
"learning_rate": 4.172413793103448e-05,
"loss": 0.047,
"step": 470
},
{
"epoch": 8.051282051282051,
"grad_norm": 0.08034314215183258,
"learning_rate": 4e-05,
"loss": 0.0372,
"step": 475
},
{
"epoch": 8.136752136752136,
"grad_norm": 0.07888869941234589,
"learning_rate": 3.827586206896552e-05,
"loss": 0.0374,
"step": 480
},
{
"epoch": 8.222222222222221,
"grad_norm": 0.08299173414707184,
"learning_rate": 3.655172413793104e-05,
"loss": 0.0415,
"step": 485
},
{
"epoch": 8.307692307692308,
"grad_norm": 0.10082942992448807,
"learning_rate": 3.482758620689655e-05,
"loss": 0.0431,
"step": 490
},
{
"epoch": 8.393162393162394,
"grad_norm": 0.13129588961601257,
"learning_rate": 3.310344827586207e-05,
"loss": 0.0381,
"step": 495
},
{
"epoch": 8.478632478632479,
"grad_norm": 0.0956198126077652,
"learning_rate": 3.137931034482759e-05,
"loss": 0.0391,
"step": 500
},
{
"epoch": 8.564102564102564,
"grad_norm": 0.10935048758983612,
"learning_rate": 2.96551724137931e-05,
"loss": 0.0415,
"step": 505
},
{
"epoch": 8.649572649572649,
"grad_norm": 0.09700857102870941,
"learning_rate": 2.7931034482758622e-05,
"loss": 0.042,
"step": 510
},
{
"epoch": 8.735042735042736,
"grad_norm": 0.09681924432516098,
"learning_rate": 2.620689655172414e-05,
"loss": 0.041,
"step": 515
},
{
"epoch": 8.820512820512821,
"grad_norm": 0.10170122236013412,
"learning_rate": 2.4482758620689654e-05,
"loss": 0.0404,
"step": 520
},
{
"epoch": 8.905982905982906,
"grad_norm": 0.10559462755918503,
"learning_rate": 2.2758620689655175e-05,
"loss": 0.0395,
"step": 525
},
{
"epoch": 8.991452991452991,
"grad_norm": 0.11863423138856888,
"learning_rate": 2.1034482758620692e-05,
"loss": 0.0433,
"step": 530
},
{
"epoch": 9.068376068376068,
"grad_norm": 0.0633588433265686,
"learning_rate": 1.9310344827586207e-05,
"loss": 0.0383,
"step": 535
},
{
"epoch": 9.153846153846153,
"grad_norm": 0.08409127593040466,
"learning_rate": 1.7586206896551724e-05,
"loss": 0.038,
"step": 540
},
{
"epoch": 9.239316239316238,
"grad_norm": 0.12133090943098068,
"learning_rate": 1.586206896551724e-05,
"loss": 0.0366,
"step": 545
},
{
"epoch": 9.324786324786325,
"grad_norm": 0.09883731603622437,
"learning_rate": 1.4137931034482759e-05,
"loss": 0.0386,
"step": 550
},
{
"epoch": 9.41025641025641,
"grad_norm": 0.20076970756053925,
"learning_rate": 1.2413793103448277e-05,
"loss": 0.0375,
"step": 555
},
{
"epoch": 9.495726495726496,
"grad_norm": 0.103940449655056,
"learning_rate": 1.0689655172413794e-05,
"loss": 0.0394,
"step": 560
},
{
"epoch": 9.581196581196581,
"grad_norm": 0.09235844761133194,
"learning_rate": 8.96551724137931e-06,
"loss": 0.0405,
"step": 565
},
{
"epoch": 9.666666666666666,
"grad_norm": 0.07304095476865768,
"learning_rate": 7.241379310344828e-06,
"loss": 0.0352,
"step": 570
},
{
"epoch": 9.752136752136753,
"grad_norm": 0.12776847183704376,
"learning_rate": 5.517241379310345e-06,
"loss": 0.04,
"step": 575
},
{
"epoch": 9.837606837606838,
"grad_norm": 0.11009430885314941,
"learning_rate": 3.793103448275862e-06,
"loss": 0.0374,
"step": 580
},
{
"epoch": 9.923076923076923,
"grad_norm": 0.13841569423675537,
"learning_rate": 2.0689655172413796e-06,
"loss": 0.0401,
"step": 585
},
{
"epoch": 10.0,
"grad_norm": 0.1534666121006012,
"learning_rate": 3.4482758620689656e-07,
"loss": 0.0366,
"step": 590
},
{
"epoch": 10.0,
"step": 590,
"total_flos": 9496524054435840.0,
"train_loss": 0.09625538042036154,
"train_runtime": 681.7113,
"train_samples_per_second": 6.85,
"train_steps_per_second": 0.865
}
],
"logging_steps": 5,
"max_steps": 590,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9496524054435840.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}