Training in progress, step 20000, checkpoint

Browse files

Files changed (7) hide show

last-checkpoint/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
last-checkpoint/global_step20000/mp_rank_00_model_states.pt +3 -0
last-checkpoint/latest +1 -1
last-checkpoint/model.safetensors +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +586 -6

last-checkpoint/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ade9c3e43b2a1550492ecf4b91e9228af429dcf0d7b1c09aea81ebc7a5842d20
+size 761059696

last-checkpoint/global_step20000/mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:58f46b37e83d56bff8e8b49fc01d48e56f7c2f6034abd01b65de03f862980853
+size 129965712

last-checkpoint/latest CHANGED Viewed

	@@ -1 +1 @@
1	- ~~global_step18000~~


1	+ global_step20000

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ad61c1d6676decf968d7cc262cb88d3340a58571f59eb03dc41c8694daf8e28e
 size 181508256

 version https://git-lfs.github.com/spec/v1
+oid sha256:b22de776648c8fc55dbdb37a34986669b21215c0d0cc7d4355ba0090a00314ad
 size 181508256

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:76da6beb47ba6fea32e3903f5fb6715e6c7d9cfa9223676725c0a4f3ab456246
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:907910c4d615478ec9b347b176d82b2a1be77f33469156f9f4b3321b8fe69355
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:389fc41872de99e18419ed46bb961f8c27ddde2cc92d05129c78c005704b1713
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:29c7a79b53a589de48d3b7a21df9c0d024be4dea79f68869f72fdc01ae3b212a
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-  "best_metric": 96.5865157944366,
-  "best_model_checkpoint": "./iteboshi_temp/checkpoint-17000",
-  "epoch": 19.823788546255507,
   "eval_steps": 1000,
-  "global_step": 18000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -5227,6 +5227,586 @@
       "eval_steps_per_second": 3.303,
       "eval_wer": 96.61480433757662,
       "step": 18000
     }
   ],
   "logging_steps": 25,
@@ -5241,12 +5821,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
       },
       "attributes": {}
     }
   },
-  "total_flos": 3.0387073320631665e+19,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

 {
+  "best_metric": 96.57708628005658,
+  "best_model_checkpoint": "./iteboshi_temp/checkpoint-19000",
+  "epoch": 22.026431718061673,
   "eval_steps": 1000,
+  "global_step": 20000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_steps_per_second": 3.303,
       "eval_wer": 96.61480433757662,
       "step": 18000
+    },
+    {
+      "epoch": 19.851321585903083,
+      "grad_norm": 0.0205672699958086,
+      "learning_rate": 2.025641025641026e-06,
+      "loss": 0.0034,
+      "step": 18025
+    },
+    {
+      "epoch": 19.878854625550662,
+      "grad_norm": 0.017052460461854935,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 0.0024,
+      "step": 18050
+    },
+    {
+      "epoch": 19.90638766519824,
+      "grad_norm": 0.023273777216672897,
+      "learning_rate": 1.9743589743589744e-06,
+      "loss": 0.0024,
+      "step": 18075
+    },
+    {
+      "epoch": 19.933920704845814,
+      "grad_norm": 0.01854720339179039,
+      "learning_rate": 1.948717948717949e-06,
+      "loss": 0.0029,
+      "step": 18100
+    },
+    {
+      "epoch": 19.961453744493394,
+      "grad_norm": 0.023288726806640625,
+      "learning_rate": 1.9230769230769234e-06,
+      "loss": 0.0025,
+      "step": 18125
+    },
+    {
+      "epoch": 19.98898678414097,
+      "grad_norm": 0.019170600920915604,
+      "learning_rate": 1.8974358974358975e-06,
+      "loss": 0.0024,
+      "step": 18150
+    },
+    {
+      "epoch": 20.016519823788546,
+      "grad_norm": 0.013864605687558651,
+      "learning_rate": 1.871794871794872e-06,
+      "loss": 0.0021,
+      "step": 18175
+    },
+    {
+      "epoch": 20.044052863436125,
+      "grad_norm": 0.015261122956871986,
+      "learning_rate": 1.8461538461538465e-06,
+      "loss": 0.002,
+      "step": 18200
+    },
+    {
+      "epoch": 20.0715859030837,
+      "grad_norm": 0.015079254284501076,
+      "learning_rate": 1.8205128205128205e-06,
+      "loss": 0.0024,
+      "step": 18225
+    },
+    {
+      "epoch": 20.099118942731277,
+      "grad_norm": 0.013841504231095314,
+      "learning_rate": 1.794871794871795e-06,
+      "loss": 0.003,
+      "step": 18250
+    },
+    {
+      "epoch": 20.126651982378856,
+      "grad_norm": 0.017009438946843147,
+      "learning_rate": 1.7692307692307695e-06,
+      "loss": 0.002,
+      "step": 18275
+    },
+    {
+      "epoch": 20.154185022026432,
+      "grad_norm": 0.01796025224030018,
+      "learning_rate": 1.7435897435897436e-06,
+      "loss": 0.0019,
+      "step": 18300
+    },
+    {
+      "epoch": 20.181718061674008,
+      "grad_norm": 0.020462974905967712,
+      "learning_rate": 1.717948717948718e-06,
+      "loss": 0.002,
+      "step": 18325
+    },
+    {
+      "epoch": 20.209251101321588,
+      "grad_norm": 0.0168469101190567,
+      "learning_rate": 1.6923076923076926e-06,
+      "loss": 0.002,
+      "step": 18350
+    },
+    {
+      "epoch": 20.236784140969164,
+      "grad_norm": 0.015358548611402512,
+      "learning_rate": 1.6666666666666667e-06,
+      "loss": 0.0019,
+      "step": 18375
+    },
+    {
+      "epoch": 20.26431718061674,
+      "grad_norm": 0.01623690128326416,
+      "learning_rate": 1.6410256410256412e-06,
+      "loss": 0.0019,
+      "step": 18400
+    },
+    {
+      "epoch": 20.291850220264315,
+      "grad_norm": 0.016147859394550323,
+      "learning_rate": 1.6153846153846157e-06,
+      "loss": 0.002,
+      "step": 18425
+    },
+    {
+      "epoch": 20.319383259911895,
+      "grad_norm": 0.023021413013339043,
+      "learning_rate": 1.5897435897435897e-06,
+      "loss": 0.0023,
+      "step": 18450
+    },
+    {
+      "epoch": 20.34691629955947,
+      "grad_norm": 0.0137328477576375,
+      "learning_rate": 1.5641025641025642e-06,
+      "loss": 0.0019,
+      "step": 18475
+    },
+    {
+      "epoch": 20.374449339207047,
+      "grad_norm": 0.01765141263604164,
+      "learning_rate": 1.5384615384615387e-06,
+      "loss": 0.0022,
+      "step": 18500
+    },
+    {
+      "epoch": 20.401982378854626,
+      "grad_norm": 0.015655307099223137,
+      "learning_rate": 1.5128205128205128e-06,
+      "loss": 0.0038,
+      "step": 18525
+    },
+    {
+      "epoch": 20.429515418502202,
+      "grad_norm": 0.021192258223891258,
+      "learning_rate": 1.4871794871794873e-06,
+      "loss": 0.0021,
+      "step": 18550
+    },
+    {
+      "epoch": 20.457048458149778,
+      "grad_norm": 0.014702214859426022,
+      "learning_rate": 1.4615384615384618e-06,
+      "loss": 0.0019,
+      "step": 18575
+    },
+    {
+      "epoch": 20.484581497797357,
+      "grad_norm": 0.018568340688943863,
+      "learning_rate": 1.4358974358974359e-06,
+      "loss": 0.0018,
+      "step": 18600
+    },
+    {
+      "epoch": 20.512114537444933,
+      "grad_norm": 0.020032202824950218,
+      "learning_rate": 1.4102564102564104e-06,
+      "loss": 0.002,
+      "step": 18625
+    },
+    {
+      "epoch": 20.53964757709251,
+      "grad_norm": 0.01590747945010662,
+      "learning_rate": 1.3846153846153848e-06,
+      "loss": 0.002,
+      "step": 18650
+    },
+    {
+      "epoch": 20.56718061674009,
+      "grad_norm": 0.014293953776359558,
+      "learning_rate": 1.358974358974359e-06,
+      "loss": 0.002,
+      "step": 18675
+    },
+    {
+      "epoch": 20.594713656387665,
+      "grad_norm": 0.0199781134724617,
+      "learning_rate": 1.3333333333333334e-06,
+      "loss": 0.0019,
+      "step": 18700
+    },
+    {
+      "epoch": 20.62224669603524,
+      "grad_norm": 0.018757140263915062,
+      "learning_rate": 1.307692307692308e-06,
+      "loss": 0.0022,
+      "step": 18725
+    },
+    {
+      "epoch": 20.64977973568282,
+      "grad_norm": 0.021107446402311325,
+      "learning_rate": 1.282051282051282e-06,
+      "loss": 0.0029,
+      "step": 18750
+    },
+    {
+      "epoch": 20.677312775330396,
+      "grad_norm": 0.018470246344804764,
+      "learning_rate": 1.2564102564102565e-06,
+      "loss": 0.0021,
+      "step": 18775
+    },
+    {
+      "epoch": 20.704845814977972,
+      "grad_norm": 0.01821320876479149,
+      "learning_rate": 1.230769230769231e-06,
+      "loss": 0.0022,
+      "step": 18800
+    },
+    {
+      "epoch": 20.73237885462555,
+      "grad_norm": 0.15323257446289062,
+      "learning_rate": 1.2051282051282053e-06,
+      "loss": 0.0024,
+      "step": 18825
+    },
+    {
+      "epoch": 20.759911894273127,
+      "grad_norm": 0.015295284800231457,
+      "learning_rate": 1.1794871794871795e-06,
+      "loss": 0.002,
+      "step": 18850
+    },
+    {
+      "epoch": 20.787444933920703,
+      "grad_norm": 0.015194980427622795,
+      "learning_rate": 1.153846153846154e-06,
+      "loss": 0.0018,
+      "step": 18875
+    },
+    {
+      "epoch": 20.814977973568283,
+      "grad_norm": 0.05270170047879219,
+      "learning_rate": 1.1282051282051283e-06,
+      "loss": 0.0024,
+      "step": 18900
+    },
+    {
+      "epoch": 20.84251101321586,
+      "grad_norm": 0.01960138976573944,
+      "learning_rate": 1.1025641025641026e-06,
+      "loss": 0.0021,
+      "step": 18925
+    },
+    {
+      "epoch": 20.870044052863435,
+      "grad_norm": 0.02073553018271923,
+      "learning_rate": 1.076923076923077e-06,
+      "loss": 0.0019,
+      "step": 18950
+    },
+    {
+      "epoch": 20.897577092511014,
+      "grad_norm": 0.01615351065993309,
+      "learning_rate": 1.0512820512820514e-06,
+      "loss": 0.002,
+      "step": 18975
+    },
+    {
+      "epoch": 20.92511013215859,
+      "grad_norm": 0.021563587710261345,
+      "learning_rate": 1.0256410256410257e-06,
+      "loss": 0.0021,
+      "step": 19000
+    },
+    {
+      "epoch": 20.92511013215859,
+      "eval_cer": 55.589054600896446,
+      "eval_loss": 1.0507194995880127,
+      "eval_runtime": 844.8487,
+      "eval_samples_per_second": 12.524,
+      "eval_steps_per_second": 3.132,
+      "eval_wer": 96.57708628005658,
+      "step": 19000
+    },
+    {
+      "epoch": 20.952643171806166,
+      "grad_norm": 0.016109561547636986,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 0.002,
+      "step": 19025
+    },
+    {
+      "epoch": 20.980176211453745,
+      "grad_norm": 0.016952887177467346,
+      "learning_rate": 9.743589743589745e-07,
+      "loss": 0.002,
+      "step": 19050
+    },
+    {
+      "epoch": 21.00770925110132,
+      "grad_norm": 0.01466713659465313,
+      "learning_rate": 9.487179487179487e-07,
+      "loss": 0.002,
+      "step": 19075
+    },
+    {
+      "epoch": 21.035242290748897,
+      "grad_norm": 0.01427449006587267,
+      "learning_rate": 9.230769230769232e-07,
+      "loss": 0.002,
+      "step": 19100
+    },
+    {
+      "epoch": 21.062775330396477,
+      "grad_norm": 0.016093429177999496,
+      "learning_rate": 8.974358974358975e-07,
+      "loss": 0.0018,
+      "step": 19125
+    },
+    {
+      "epoch": 21.090308370044053,
+      "grad_norm": 0.019426781684160233,
+      "learning_rate": 8.717948717948718e-07,
+      "loss": 0.0018,
+      "step": 19150
+    },
+    {
+      "epoch": 21.11784140969163,
+      "grad_norm": 0.0124832633882761,
+      "learning_rate": 8.461538461538463e-07,
+      "loss": 0.0017,
+      "step": 19175
+    },
+    {
+      "epoch": 21.145374449339208,
+      "grad_norm": 0.01551234070211649,
+      "learning_rate": 8.205128205128206e-07,
+      "loss": 0.0018,
+      "step": 19200
+    },
+    {
+      "epoch": 21.172907488986784,
+      "grad_norm": 0.01290995441377163,
+      "learning_rate": 7.948717948717949e-07,
+      "loss": 0.0019,
+      "step": 19225
+    },
+    {
+      "epoch": 21.20044052863436,
+      "grad_norm": 0.012107312679290771,
+      "learning_rate": 7.692307692307694e-07,
+      "loss": 0.0018,
+      "step": 19250
+    },
+    {
+      "epoch": 21.22797356828194,
+      "grad_norm": 0.013243271969258785,
+      "learning_rate": 7.435897435897436e-07,
+      "loss": 0.0018,
+      "step": 19275
+    },
+    {
+      "epoch": 21.255506607929515,
+      "grad_norm": 0.01567436195909977,
+      "learning_rate": 7.179487179487179e-07,
+      "loss": 0.0017,
+      "step": 19300
+    },
+    {
+      "epoch": 21.28303964757709,
+      "grad_norm": 0.017800329253077507,
+      "learning_rate": 6.923076923076924e-07,
+      "loss": 0.0017,
+      "step": 19325
+    },
+    {
+      "epoch": 21.31057268722467,
+      "grad_norm": 0.012769469991326332,
+      "learning_rate": 6.666666666666667e-07,
+      "loss": 0.0018,
+      "step": 19350
+    },
+    {
+      "epoch": 21.338105726872246,
+      "grad_norm": 0.013936811126768589,
+      "learning_rate": 6.41025641025641e-07,
+      "loss": 0.0018,
+      "step": 19375
+    },
+    {
+      "epoch": 21.365638766519822,
+      "grad_norm": 0.017832236364483833,
+      "learning_rate": 6.153846153846155e-07,
+      "loss": 0.0018,
+      "step": 19400
+    },
+    {
+      "epoch": 21.393171806167402,
+      "grad_norm": 0.016330501064658165,
+      "learning_rate": 5.897435897435898e-07,
+      "loss": 0.0019,
+      "step": 19425
+    },
+    {
+      "epoch": 21.420704845814978,
+      "grad_norm": 0.012162838131189346,
+      "learning_rate": 5.641025641025642e-07,
+      "loss": 0.0018,
+      "step": 19450
+    },
+    {
+      "epoch": 21.448237885462554,
+      "grad_norm": 0.01499269250780344,
+      "learning_rate": 5.384615384615386e-07,
+      "loss": 0.0019,
+      "step": 19475
+    },
+    {
+      "epoch": 21.475770925110133,
+      "grad_norm": 0.013169058598577976,
+      "learning_rate": 5.128205128205128e-07,
+      "loss": 0.0019,
+      "step": 19500
+    },
+    {
+      "epoch": 21.50330396475771,
+      "grad_norm": 0.011718913912773132,
+      "learning_rate": 4.871794871794872e-07,
+      "loss": 0.0018,
+      "step": 19525
+    },
+    {
+      "epoch": 21.530837004405285,
+      "grad_norm": 0.01436688657850027,
+      "learning_rate": 4.615384615384616e-07,
+      "loss": 0.0019,
+      "step": 19550
+    },
+    {
+      "epoch": 21.558370044052865,
+      "grad_norm": 0.012899577617645264,
+      "learning_rate": 4.358974358974359e-07,
+      "loss": 0.0016,
+      "step": 19575
+    },
+    {
+      "epoch": 21.58590308370044,
+      "grad_norm": 0.018741106614470482,
+      "learning_rate": 4.102564102564103e-07,
+      "loss": 0.0018,
+      "step": 19600
+    },
+    {
+      "epoch": 21.613436123348016,
+      "grad_norm": 0.011879649944603443,
+      "learning_rate": 3.846153846153847e-07,
+      "loss": 0.0018,
+      "step": 19625
+    },
+    {
+      "epoch": 21.640969162995596,
+      "grad_norm": 0.01298064086586237,
+      "learning_rate": 3.5897435897435896e-07,
+      "loss": 0.0018,
+      "step": 19650
+    },
+    {
+      "epoch": 21.66850220264317,
+      "grad_norm": 0.0132521390914917,
+      "learning_rate": 3.3333333333333335e-07,
+      "loss": 0.0017,
+      "step": 19675
+    },
+    {
+      "epoch": 21.696035242290748,
+      "grad_norm": 0.012232212349772453,
+      "learning_rate": 3.0769230769230774e-07,
+      "loss": 0.0022,
+      "step": 19700
+    },
+    {
+      "epoch": 21.723568281938327,
+      "grad_norm": 0.0125159602612257,
+      "learning_rate": 2.820512820512821e-07,
+      "loss": 0.0021,
+      "step": 19725
+    },
+    {
+      "epoch": 21.751101321585903,
+      "grad_norm": 0.012911227531731129,
+      "learning_rate": 2.564102564102564e-07,
+      "loss": 0.0018,
+      "step": 19750
+    },
+    {
+      "epoch": 21.77863436123348,
+      "grad_norm": 0.016304660588502884,
+      "learning_rate": 2.307692307692308e-07,
+      "loss": 0.0018,
+      "step": 19775
+    },
+    {
+      "epoch": 21.80616740088106,
+      "grad_norm": 0.0178163331001997,
+      "learning_rate": 2.0512820512820514e-07,
+      "loss": 0.0018,
+      "step": 19800
+    },
+    {
+      "epoch": 21.833700440528634,
+      "grad_norm": 0.013485315255820751,
+      "learning_rate": 1.7948717948717948e-07,
+      "loss": 0.0017,
+      "step": 19825
+    },
+    {
+      "epoch": 21.86123348017621,
+      "grad_norm": 0.021611526608467102,
+      "learning_rate": 1.5384615384615387e-07,
+      "loss": 0.0018,
+      "step": 19850
+    },
+    {
+      "epoch": 21.88876651982379,
+      "grad_norm": 0.014628293924033642,
+      "learning_rate": 1.282051282051282e-07,
+      "loss": 0.0017,
+      "step": 19875
+    },
+    {
+      "epoch": 21.916299559471366,
+      "grad_norm": 0.013321286998689175,
+      "learning_rate": 1.0256410256410257e-07,
+      "loss": 0.0017,
+      "step": 19900
+    },
+    {
+      "epoch": 21.94383259911894,
+      "grad_norm": 0.016186168417334557,
+      "learning_rate": 7.692307692307694e-08,
+      "loss": 0.0018,
+      "step": 19925
+    },
+    {
+      "epoch": 21.97136563876652,
+      "grad_norm": 0.015817852690815926,
+      "learning_rate": 5.1282051282051286e-08,
+      "loss": 0.0017,
+      "step": 19950
+    },
+    {
+      "epoch": 21.998898678414097,
+      "grad_norm": 0.01383238285779953,
+      "learning_rate": 2.5641025641025643e-08,
+      "loss": 0.0018,
+      "step": 19975
+    },
+    {
+      "epoch": 22.026431718061673,
+      "grad_norm": 0.0143059641122818,
+      "learning_rate": 0.0,
+      "loss": 0.0017,
+      "step": 20000
+    },
+    {
+      "epoch": 22.026431718061673,
+      "eval_cer": 54.87888757694909,
+      "eval_loss": 1.0545215606689453,
+      "eval_runtime": 819.2896,
+      "eval_samples_per_second": 12.915,
+      "eval_steps_per_second": 3.23,
+      "eval_wer": 96.57708628005658,
+      "step": 20000
     }
   ],
   "logging_steps": 25,
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
+  "total_flos": 3.376341480070185e+19,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null