Training in progress, step 400, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +711 -3

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dd13ee0cbea58cba9cc7fc01eb302046b4bff7c2c4bc96437408e51e0a258984
 size 373077376

 version https://git-lfs.github.com/spec/v1
+oid sha256:4452d8ee5d1c4b3f248050b462aef67647d83e8aa2c819475c2561ad6988260f
 size 373077376

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2ac14285b251f49d934b2757ab7b76d640647a69a6604df05d4abcea37c572f2
 size 373225675

 version https://git-lfs.github.com/spec/v1
+oid sha256:abc52b4912bc5a69beadb79f29cd708ffaef6d5ab82fd19a670038c92f29c313
 size 373225675

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:793829d79b248c3a7b8954f2cd95073c2ba034f6ee2bb0edff8ce8fef88cb5ad
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:6a3c442dd05d519f184832cb8cb76be210b67395e80e365a8e2c8fc2a9d09440
 size 14645

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:80968dab5c533e2a1cf2f64e5806de56ff6c85624fdd0d13e21f084017ee671b
 size 1401

 version https://git-lfs.github.com/spec/v1
+oid sha256:f438d73941ac2939699522d3048115527267b7c8c06f9f728e1517b0c3c16832
 size 1401

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.6237006237006237,
   "eval_steps": 100,
-  "global_step": 300,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2132,6 +2132,714 @@
       "eval_samples_per_second": 23.972,
       "eval_steps_per_second": 2.996,
       "step": 300
     }
   ],
   "logging_steps": 1,
@@ -2151,7 +2859,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 7.64170916069376e+16,
   "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.8316008316008316,
   "eval_steps": 100,
+  "global_step": 400,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 23.972,
       "eval_steps_per_second": 2.996,
       "step": 300
+    },
+    {
+      "epoch": 0.6257796257796258,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00034091217642278086,
+      "loss": 4.7977,
+      "step": 301
+    },
+    {
+      "epoch": 0.6278586278586279,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0003376502653976583,
+      "loss": 5.6144,
+      "step": 302
+    },
+    {
+      "epoch": 0.6299376299376299,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0003343960602114349,
+      "loss": 5.3662,
+      "step": 303
+    },
+    {
+      "epoch": 0.632016632016632,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0003311497153231305,
+      "loss": 5.5636,
+      "step": 304
+    },
+    {
+      "epoch": 0.6340956340956341,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00032791138481868084,
+      "loss": 5.5391,
+      "step": 305
+    },
+    {
+      "epoch": 0.6361746361746362,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.00032468122240362287,
+      "loss": 5.4312,
+      "step": 306
+    },
+    {
+      "epoch": 0.6382536382536382,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0003214593813958001,
+      "loss": 5.501,
+      "step": 307
+    },
+    {
+      "epoch": 0.6403326403326404,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.000318246014718085,
+      "loss": 5.4276,
+      "step": 308
+    },
+    {
+      "epoch": 0.6424116424116424,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.00031504127489112105,
+      "loss": 5.3774,
+      "step": 309
+    },
+    {
+      "epoch": 0.6444906444906445,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0003118453140260823,
+      "loss": 5.1568,
+      "step": 310
+    },
+    {
+      "epoch": 0.6465696465696466,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0003086582838174551,
+      "loss": 5.7294,
+      "step": 311
+    },
+    {
+      "epoch": 0.6486486486486487,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.000305480335535837,
+      "loss": 5.5416,
+      "step": 312
+    },
+    {
+      "epoch": 0.6507276507276507,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00030231162002075673,
+      "loss": 5.3863,
+      "step": 313
+    },
+    {
+      "epoch": 0.6528066528066528,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0002991522876735154,
+      "loss": 5.1067,
+      "step": 314
+    },
+    {
+      "epoch": 0.6548856548856549,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002960024884500467,
+      "loss": 5.5995,
+      "step": 315
+    },
+    {
+      "epoch": 0.656964656964657,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0002928623718538006,
+      "loss": 5.5833,
+      "step": 316
+    },
+    {
+      "epoch": 0.659043659043659,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002897320869286462,
+      "loss": 4.8974,
+      "step": 317
+    },
+    {
+      "epoch": 0.6611226611226612,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0002866117822517982,
+      "loss": 5.291,
+      "step": 318
+    },
+    {
+      "epoch": 0.6632016632016632,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.000283501605926764,
+      "loss": 5.5008,
+      "step": 319
+    },
+    {
+      "epoch": 0.6652806652806653,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0002804017055763149,
+      "loss": 5.5551,
+      "step": 320
+    },
+    {
+      "epoch": 0.6673596673596673,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00027731222833547844,
+      "loss": 5.6069,
+      "step": 321
+    },
+    {
+      "epoch": 0.6694386694386695,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00027423332084455543,
+      "loss": 5.4693,
+      "step": 322
+    },
+    {
+      "epoch": 0.6715176715176715,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002711651292421593,
+      "loss": 5.2478,
+      "step": 323
+    },
+    {
+      "epoch": 0.6735966735966736,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002681077991582797,
+      "loss": 5.3848,
+      "step": 324
+    },
+    {
+      "epoch": 0.6756756756756757,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00026506147570737093,
+      "loss": 5.4962,
+      "step": 325
+    },
+    {
+      "epoch": 0.6777546777546778,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.0002620263034814632,
+      "loss": 5.1192,
+      "step": 326
+    },
+    {
+      "epoch": 0.6798336798336798,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0002590024265433002,
+      "loss": 5.2134,
+      "step": 327
+    },
+    {
+      "epoch": 0.681912681912682,
+      "grad_norm": 0.58203125,
+      "learning_rate": 0.00025598998841950106,
+      "loss": 5.4984,
+      "step": 328
+    },
+    {
+      "epoch": 0.683991683991684,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.00025298913209374806,
+      "loss": 5.4985,
+      "step": 329
+    },
+    {
+      "epoch": 0.6860706860706861,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002500000000000001,
+      "loss": 5.2018,
+      "step": 330
+    },
+    {
+      "epoch": 0.6881496881496881,
+      "grad_norm": 0.5390625,
+      "learning_rate": 0.0002470227340157316,
+      "loss": 4.8436,
+      "step": 331
+    },
+    {
+      "epoch": 0.6902286902286903,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00024405747545519962,
+      "loss": 5.2047,
+      "step": 332
+    },
+    {
+      "epoch": 0.6923076923076923,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00024110436506273432,
+      "loss": 5.4741,
+      "step": 333
+    },
+    {
+      "epoch": 0.6943866943866944,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.00023816354300606107,
+      "loss": 4.9758,
+      "step": 334
+    },
+    {
+      "epoch": 0.6964656964656964,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0002352351488696457,
+      "loss": 5.4983,
+      "step": 335
+    },
+    {
+      "epoch": 0.6985446985446986,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0002323193216480698,
+      "loss": 5.2683,
+      "step": 336
+    },
+    {
+      "epoch": 0.7006237006237006,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00022941619973943362,
+      "loss": 5.3248,
+      "step": 337
+    },
+    {
+      "epoch": 0.7027027027027027,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.00022652592093878665,
+      "loss": 5.6097,
+      "step": 338
+    },
+    {
+      "epoch": 0.7047817047817048,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00022364862243158767,
+      "loss": 5.3526,
+      "step": 339
+    },
+    {
+      "epoch": 0.7068607068607069,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0002207844407871929,
+      "loss": 5.4411,
+      "step": 340
+    },
+    {
+      "epoch": 0.7089397089397089,
+      "grad_norm": 2.046875,
+      "learning_rate": 0.0002179335119523745,
+      "loss": 5.9201,
+      "step": 341
+    },
+    {
+      "epoch": 0.7110187110187111,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002150959712448669,
+      "loss": 5.0856,
+      "step": 342
+    },
+    {
+      "epoch": 0.7130977130977131,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.000212271953346945,
+      "loss": 5.4956,
+      "step": 343
+    },
+    {
+      "epoch": 0.7151767151767152,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0002094615922990309,
+      "loss": 5.3681,
+      "step": 344
+    },
+    {
+      "epoch": 0.7172557172557172,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00020666502149333215,
+      "loss": 5.342,
+      "step": 345
+    },
+    {
+      "epoch": 0.7193347193347194,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00020388237366751006,
+      "loss": 5.3868,
+      "step": 346
+    },
+    {
+      "epoch": 0.7214137214137214,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.00020111378089837957,
+      "loss": 5.0104,
+      "step": 347
+    },
+    {
+      "epoch": 0.7234927234927235,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.00019835937459564064,
+      "loss": 5.2236,
+      "step": 348
+    },
+    {
+      "epoch": 0.7255717255717256,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00019561928549563967,
+      "loss": 5.4828,
+      "step": 349
+    },
+    {
+      "epoch": 0.7276507276507277,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00019289364365516608,
+      "loss": 5.7081,
+      "step": 350
+    },
+    {
+      "epoch": 0.7297297297297297,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0001901825784452777,
+      "loss": 5.3906,
+      "step": 351
+    },
+    {
+      "epoch": 0.7318087318087318,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.00018748621854516078,
+      "loss": 5.1981,
+      "step": 352
+    },
+    {
+      "epoch": 0.7338877338877339,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0001848046919360225,
+      "loss": 5.4607,
+      "step": 353
+    },
+    {
+      "epoch": 0.735966735966736,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0001821381258950161,
+      "loss": 5.3527,
+      "step": 354
+    },
+    {
+      "epoch": 0.738045738045738,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.00017948664698919987,
+      "loss": 5.4137,
+      "step": 355
+    },
+    {
+      "epoch": 0.7401247401247402,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001768503810695295,
+      "loss": 5.3231,
+      "step": 356
+    },
+    {
+      "epoch": 0.7422037422037422,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00017422945326488553,
+      "loss": 5.3483,
+      "step": 357
+    },
+    {
+      "epoch": 0.7442827442827443,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00017162398797613282,
+      "loss": 5.2544,
+      "step": 358
+    },
+    {
+      "epoch": 0.7463617463617463,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00016903410887021675,
+      "loss": 5.3142,
+      "step": 359
+    },
+    {
+      "epoch": 0.7484407484407485,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.00016645993887429345,
+      "loss": 5.3094,
+      "step": 360
+    },
+    {
+      "epoch": 0.7505197505197505,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00016390160016989486,
+      "loss": 5.4099,
+      "step": 361
+    },
+    {
+      "epoch": 0.7525987525987526,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00016135921418712956,
+      "loss": 4.6715,
+      "step": 362
+    },
+    {
+      "epoch": 0.7546777546777547,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.00015883290159891906,
+      "loss": 5.2596,
+      "step": 363
+    },
+    {
+      "epoch": 0.7567567567567568,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0001563227823152708,
+      "loss": 5.411,
+      "step": 364
+    },
+    {
+      "epoch": 0.7588357588357588,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00015382897547758513,
+      "loss": 5.3466,
+      "step": 365
+    },
+    {
+      "epoch": 0.760914760914761,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0001513515994530023,
+      "loss": 5.3548,
+      "step": 366
+    },
+    {
+      "epoch": 0.762993762993763,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00014889077182878268,
+      "loss": 5.1248,
+      "step": 367
+    },
+    {
+      "epoch": 0.7650727650727651,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00014644660940672628,
+      "loss": 4.9604,
+      "step": 368
+    },
+    {
+      "epoch": 0.7671517671517671,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00014401922819762863,
+      "loss": 5.2443,
+      "step": 369
+    },
+    {
+      "epoch": 0.7692307692307693,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00014160874341577446,
+      "loss": 4.8705,
+      "step": 370
+    },
+    {
+      "epoch": 0.7713097713097713,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.00013921526947346903,
+      "loss": 5.2561,
+      "step": 371
+    },
+    {
+      "epoch": 0.7733887733887734,
+      "grad_norm": 0.625,
+      "learning_rate": 0.0001368389199756075,
+      "loss": 5.5852,
+      "step": 372
+    },
+    {
+      "epoch": 0.7754677754677755,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0001344798077142836,
+      "loss": 5.2821,
+      "step": 373
+    },
+    {
+      "epoch": 0.7775467775467776,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0001321380446634342,
+      "loss": 5.0683,
+      "step": 374
+    },
+    {
+      "epoch": 0.7796257796257796,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.00012981374197352664,
+      "loss": 5.1648,
+      "step": 375
+    },
+    {
+      "epoch": 0.7817047817047817,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0001275070099662815,
+      "loss": 5.2669,
+      "step": 376
+    },
+    {
+      "epoch": 0.7837837837837838,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00012521795812943704,
+      "loss": 5.4582,
+      "step": 377
+    },
+    {
+      "epoch": 0.7858627858627859,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.00012294669511155192,
+      "loss": 5.3977,
+      "step": 378
+    },
+    {
+      "epoch": 0.7879417879417879,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00012069332871684874,
+      "loss": 4.7924,
+      "step": 379
+    },
+    {
+      "epoch": 0.7900207900207901,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.00011845796590009682,
+      "loss": 5.4676,
+      "step": 380
+    },
+    {
+      "epoch": 0.7920997920997921,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.00011624071276153569,
+      "loss": 5.6005,
+      "step": 381
+    },
+    {
+      "epoch": 0.7941787941787942,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.00011404167454183955,
+      "loss": 5.3724,
+      "step": 382
+    },
+    {
+      "epoch": 0.7962577962577962,
+      "grad_norm": 0.451171875,
+      "learning_rate": 0.00011186095561712129,
+      "loss": 5.0408,
+      "step": 383
+    },
+    {
+      "epoch": 0.7983367983367984,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.000109698659493979,
+      "loss": 5.41,
+      "step": 384
+    },
+    {
+      "epoch": 0.8004158004158004,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0001075548888045827,
+      "loss": 4.975,
+      "step": 385
+    },
+    {
+      "epoch": 0.8024948024948025,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00010542974530180327,
+      "loss": 5.4077,
+      "step": 386
+    },
+    {
+      "epoch": 0.8045738045738046,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.00010332332985438247,
+      "loss": 5.1575,
+      "step": 387
+    },
+    {
+      "epoch": 0.8066528066528067,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.00010123574244214551,
+      "loss": 5.6162,
+      "step": 388
+    },
+    {
+      "epoch": 0.8087318087318087,
+      "grad_norm": 0.87890625,
+      "learning_rate": 9.916708215125586e-05,
+      "loss": 5.2549,
+      "step": 389
+    },
+    {
+      "epoch": 0.8108108108108109,
+      "grad_norm": 0.439453125,
+      "learning_rate": 9.711744716951093e-05,
+      "loss": 4.9944,
+      "step": 390
+    },
+    {
+      "epoch": 0.8128898128898129,
+      "grad_norm": 0.470703125,
+      "learning_rate": 9.508693478168346e-05,
+      "loss": 5.1125,
+      "step": 391
+    },
+    {
+      "epoch": 0.814968814968815,
+      "grad_norm": 0.283203125,
+      "learning_rate": 9.307564136490254e-05,
+      "loss": 4.9202,
+      "step": 392
+    },
+    {
+      "epoch": 0.817047817047817,
+      "grad_norm": 0.255859375,
+      "learning_rate": 9.108366238407967e-05,
+      "loss": 5.3454,
+      "step": 393
+    },
+    {
+      "epoch": 0.8191268191268192,
+      "grad_norm": 0.2734375,
+      "learning_rate": 8.911109238737747e-05,
+      "loss": 5.283,
+      "step": 394
+    },
+    {
+      "epoch": 0.8212058212058212,
+      "grad_norm": 0.40234375,
+      "learning_rate": 8.715802500172215e-05,
+      "loss": 4.732,
+      "step": 395
+    },
+    {
+      "epoch": 0.8232848232848233,
+      "grad_norm": 0.28125,
+      "learning_rate": 8.522455292835934e-05,
+      "loss": 4.9245,
+      "step": 396
+    },
+    {
+      "epoch": 0.8253638253638254,
+      "grad_norm": 0.259765625,
+      "learning_rate": 8.331076793845421e-05,
+      "loss": 4.9568,
+      "step": 397
+    },
+    {
+      "epoch": 0.8274428274428275,
+      "grad_norm": 0.263671875,
+      "learning_rate": 8.141676086873573e-05,
+      "loss": 5.3107,
+      "step": 398
+    },
+    {
+      "epoch": 0.8295218295218295,
+      "grad_norm": 0.33203125,
+      "learning_rate": 7.954262161718479e-05,
+      "loss": 4.9182,
+      "step": 399
+    },
+    {
+      "epoch": 0.8316008316008316,
+      "grad_norm": 0.26171875,
+      "learning_rate": 7.768843913876755e-05,
+      "loss": 5.3271,
+      "step": 400
+    },
+    {
+      "epoch": 0.8316008316008316,
+      "eval_loss": 4.718051910400391,
+      "eval_runtime": 0.7111,
+      "eval_samples_per_second": 22.501,
+      "eval_steps_per_second": 2.813,
+      "step": 400
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 1.018894554759168e+17,
   "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null