Azrail commited on
Commit
ab66e32
·
verified ·
1 Parent(s): f4de2c1

Training in progress, step 126000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:811ff470374af97e47c736f298958834b69f1700c42f81f0e13b4a5264484ae8
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:710baf14c92f1a6ab3eef32ca39e73342de5da970d1c32a072279db6a546bd6e
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d11d76e5f8cf7c010d7ddfa9e036517a28b9f13eec8d65dc499e46a38c1f4b3
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3dde6003afedc6dd2fd3bca69826bc4c2467f2fe522f76deae105d064b39f61f
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc320281dd48fee58a87ebe65c5af2ea4c357e61810ad0f123ab838f0f93b01f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90e596b43a0993defe8386429a74c73648ebeab624d8851d1dff893410d726b8
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31f9a4be764158103ef48222bfd8b15ec527d59f5ba7b3fa5af00980fe9404f9
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5773cfed09936b668e41d5a19336896fe4fe897bf551564d5056fa5a83c98331
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.1925086754832632,
6
  "eval_steps": 500,
7
- "global_step": 125000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -22258,11 +22258,189 @@
22258
  "eval_steps_per_second": 15.182,
22259
  "num_input_tokens_seen": 65525493280,
22260
  "step": 125000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22261
  }
22262
  ],
22263
  "logging_steps": 50,
22264
  "max_steps": 140000,
22265
- "num_input_tokens_seen": 65525493280,
22266
  "num_train_epochs": 2,
22267
  "save_steps": 1000,
22268
  "stateful_callbacks": {
@@ -22277,7 +22455,7 @@
22277
  "attributes": {}
22278
  }
22279
  },
22280
- "total_flos": 1.1596817125562573e+20,
22281
  "train_batch_size": 32,
22282
  "trial_name": null,
22283
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.2020487258070287,
6
  "eval_steps": 500,
7
+ "global_step": 126000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
22258
  "eval_steps_per_second": 15.182,
22259
  "num_input_tokens_seen": 65525493280,
22260
  "step": 125000
22261
+ },
22262
+ {
22263
+ "epoch": 1.1929856779994514,
22264
+ "grad_norm": 0.13272584974765778,
22265
+ "learning_rate": 0.0005531940155086557,
22266
+ "loss": 2.0602,
22267
+ "num_input_tokens_seen": 65551700064,
22268
+ "step": 125050
22269
+ },
22270
+ {
22271
+ "epoch": 1.1934626805156396,
22272
+ "grad_norm": 0.14066773653030396,
22273
+ "learning_rate": 0.0005504041188505022,
22274
+ "loss": 2.0695,
22275
+ "num_input_tokens_seen": 65577910784,
22276
+ "step": 125100
22277
+ },
22278
+ {
22279
+ "epoch": 1.193939683031828,
22280
+ "grad_norm": 0.13133113086223602,
22281
+ "learning_rate": 0.0005476126358804593,
22282
+ "loss": 2.0686,
22283
+ "num_input_tokens_seen": 65604124224,
22284
+ "step": 125150
22285
+ },
22286
+ {
22287
+ "epoch": 1.1944166855480163,
22288
+ "grad_norm": 0.13990654051303864,
22289
+ "learning_rate": 0.0005448196544517168,
22290
+ "loss": 2.0532,
22291
+ "num_input_tokens_seen": 65630324960,
22292
+ "step": 125200
22293
+ },
22294
+ {
22295
+ "epoch": 1.1948936880642045,
22296
+ "grad_norm": 0.14154765009880066,
22297
+ "learning_rate": 0.0005420252624646238,
22298
+ "loss": 2.0518,
22299
+ "num_input_tokens_seen": 65656532992,
22300
+ "step": 125250
22301
+ },
22302
+ {
22303
+ "epoch": 1.195370690580393,
22304
+ "grad_norm": 0.13149969279766083,
22305
+ "learning_rate": 0.0005392295478639225,
22306
+ "loss": 2.0619,
22307
+ "num_input_tokens_seen": 65682736768,
22308
+ "step": 125300
22309
+ },
22310
+ {
22311
+ "epoch": 1.1958476930965811,
22312
+ "grad_norm": 0.1339765191078186,
22313
+ "learning_rate": 0.0005364325986359802,
22314
+ "loss": 2.0706,
22315
+ "num_input_tokens_seen": 65708951168,
22316
+ "step": 125350
22317
+ },
22318
+ {
22319
+ "epoch": 1.1963246956127693,
22320
+ "grad_norm": 0.13910150527954102,
22321
+ "learning_rate": 0.0005336345028060199,
22322
+ "loss": 2.0596,
22323
+ "num_input_tokens_seen": 65735165568,
22324
+ "step": 125400
22325
+ },
22326
+ {
22327
+ "epoch": 1.1968016981289575,
22328
+ "grad_norm": 0.1447630077600479,
22329
+ "learning_rate": 0.0005308353484353508,
22330
+ "loss": 2.0518,
22331
+ "num_input_tokens_seen": 65761369888,
22332
+ "step": 125450
22333
+ },
22334
+ {
22335
+ "epoch": 1.197278700645146,
22336
+ "grad_norm": 0.13201679289340973,
22337
+ "learning_rate": 0.0005280352236185959,
22338
+ "loss": 2.0645,
22339
+ "num_input_tokens_seen": 65787582144,
22340
+ "step": 125500
22341
+ },
22342
+ {
22343
+ "epoch": 1.197278700645146,
22344
+ "eval_loss": 1.9799100160598755,
22345
+ "eval_runtime": 83.01,
22346
+ "eval_samples_per_second": 60.234,
22347
+ "eval_steps_per_second": 15.058,
22348
+ "num_input_tokens_seen": 65787582144,
22349
+ "step": 125500
22350
+ },
22351
+ {
22352
+ "epoch": 1.1977557031613342,
22353
+ "grad_norm": 0.1335040032863617,
22354
+ "learning_rate": 0.0005252342164809204,
22355
+ "loss": 2.0597,
22356
+ "num_input_tokens_seen": 65813796352,
22357
+ "step": 125550
22358
+ },
22359
+ {
22360
+ "epoch": 1.1982327056775224,
22361
+ "grad_norm": 0.13693130016326904,
22362
+ "learning_rate": 0.0005224324151752575,
22363
+ "loss": 2.0594,
22364
+ "num_input_tokens_seen": 65840010208,
22365
+ "step": 125600
22366
+ },
22367
+ {
22368
+ "epoch": 1.1987097081937108,
22369
+ "grad_norm": 0.13866880536079407,
22370
+ "learning_rate": 0.0005196299078795343,
22371
+ "loss": 2.0511,
22372
+ "num_input_tokens_seen": 65866216672,
22373
+ "step": 125650
22374
+ },
22375
+ {
22376
+ "epoch": 1.199186710709899,
22377
+ "grad_norm": 0.12740108370780945,
22378
+ "learning_rate": 0.000516826782793897,
22379
+ "loss": 2.0607,
22380
+ "num_input_tokens_seen": 65892430944,
22381
+ "step": 125700
22382
+ },
22383
+ {
22384
+ "epoch": 1.1996637132260872,
22385
+ "grad_norm": 0.13575108349323273,
22386
+ "learning_rate": 0.0005140231281379345,
22387
+ "loss": 2.0555,
22388
+ "num_input_tokens_seen": 65918642496,
22389
+ "step": 125750
22390
+ },
22391
+ {
22392
+ "epoch": 1.2001407157422754,
22393
+ "grad_norm": 0.13791455328464508,
22394
+ "learning_rate": 0.0005112190321479025,
22395
+ "loss": 2.0632,
22396
+ "num_input_tokens_seen": 65944852960,
22397
+ "step": 125800
22398
+ },
22399
+ {
22400
+ "epoch": 1.2006177182584639,
22401
+ "grad_norm": 0.1315431296825409,
22402
+ "learning_rate": 0.0005084145830739461,
22403
+ "loss": 2.0646,
22404
+ "num_input_tokens_seen": 65971066432,
22405
+ "step": 125850
22406
+ },
22407
+ {
22408
+ "epoch": 1.201094720774652,
22409
+ "grad_norm": 0.12288303673267365,
22410
+ "learning_rate": 0.000505609869177323,
22411
+ "loss": 2.0748,
22412
+ "num_input_tokens_seen": 65997277888,
22413
+ "step": 125900
22414
+ },
22415
+ {
22416
+ "epoch": 1.2015717232908403,
22417
+ "grad_norm": 0.12677106261253357,
22418
+ "learning_rate": 0.0005028049787276249,
22419
+ "loss": 2.0595,
22420
+ "num_input_tokens_seen": 66023480960,
22421
+ "step": 125950
22422
+ },
22423
+ {
22424
+ "epoch": 1.2020487258070287,
22425
+ "grad_norm": 0.140994593501091,
22426
+ "learning_rate": 0.0005,
22427
+ "loss": 2.0556,
22428
+ "num_input_tokens_seen": 66049692768,
22429
+ "step": 126000
22430
+ },
22431
+ {
22432
+ "epoch": 1.2020487258070287,
22433
+ "eval_loss": 1.978381633758545,
22434
+ "eval_runtime": 81.8164,
22435
+ "eval_samples_per_second": 61.112,
22436
+ "eval_steps_per_second": 15.278,
22437
+ "num_input_tokens_seen": 66049692768,
22438
+ "step": 126000
22439
  }
22440
  ],
22441
  "logging_steps": 50,
22442
  "max_steps": 140000,
22443
+ "num_input_tokens_seen": 66049692768,
22444
  "num_train_epochs": 2,
22445
  "save_steps": 1000,
22446
  "stateful_callbacks": {
 
22455
  "attributes": {}
22456
  }
22457
  },
22458
+ "total_flos": 1.1689590873539912e+20,
22459
  "train_batch_size": 32,
22460
  "trial_name": null,
22461
  "trial_params": null