mohammadmahdinouri commited on
Commit
83bff14
·
verified ·
1 Parent(s): ac11181

Training in progress, step 16000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1a26dcf49a9adec930252211cc2ebdb39eacfeed22076666661d9caed214679
3
  size 304481530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42301bc164cb007a8e9ffaaebd3b674826efaacc96f02799ea8c54ebdf5beff1
3
  size 304481530
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c979f073b59bd49cc8c35d18515eb7ddd7a81fbd717bb34c90967a1f381c67c5
3
  size 402029570
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e166c3997353d811bb7375dab7e17cf88064b52029e8056c729ba4ae8d2e8f22
3
  size 402029570
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ba6101cd4bf8f7ce3d46d2382a668adc549af4a1a22b84d941c0306451bad54
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8653c4f16bb3c4531444bd438e2a397c259c928e9f5a96f450fc3aa43ef0f5c
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c787066085cf3612c776a9db920df8b50e21936babcea06d25d76b00b6f6481
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91185d0e7a47d1f7979000c680b3a146a800c2ff31f983b75b24ceb331884072
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d923993877b77c02936b0de7d9cac94e6fa202dfaca7291842d5e19deb2cdb37
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be0be34d9684d804e2f3030fceca4c7b93603e6596a44aaf270c97cb1740b1da
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b72344aa5e79ea1dbe308108cb74bb1087ed0d7f8ad9e397ccfa76e1dcdde76c
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e539799e7e99b66c33c364546118319f901c9765aa17eaf7cf8b17906c00c95a
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:51acafd4471f7843be9ffc9528db012939ee248bf40f7127aa783c5f97813694
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccc2a52ae0327def30cc40f7f273a4a1537961b9b580753fe57ec7ecdab69b35
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.029249601474179914,
6
  "eval_steps": 500,
7
- "global_step": 15000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5258,6 +5258,356 @@
5258
  "learning_rate": 0.0004952863526922635,
5259
  "loss": 20.6348,
5260
  "step": 15000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5261
  }
5262
  ],
5263
  "logging_steps": 20,
@@ -5277,7 +5627,7 @@
5277
  "attributes": {}
5278
  }
5279
  },
5280
- "total_flos": 1.102761022435256e+19,
5281
  "train_batch_size": 48,
5282
  "trial_name": null,
5283
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.031199574905791908,
6
  "eval_steps": 500,
7
+ "global_step": 16000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5258
  "learning_rate": 0.0004952863526922635,
5259
  "loss": 20.6348,
5260
  "step": 15000
5261
+ },
5262
+ {
5263
+ "epoch": 0.029288600942812153,
5264
+ "grad_norm": 11.0,
5265
+ "learning_rate": 0.000495279850661354,
5266
+ "loss": 20.6052,
5267
+ "step": 15020
5268
+ },
5269
+ {
5270
+ "epoch": 0.029327600411444395,
5271
+ "grad_norm": 11.125,
5272
+ "learning_rate": 0.0004952733486304447,
5273
+ "loss": 20.6334,
5274
+ "step": 15040
5275
+ },
5276
+ {
5277
+ "epoch": 0.029366599880076633,
5278
+ "grad_norm": 11.3125,
5279
+ "learning_rate": 0.0004952668465995353,
5280
+ "loss": 20.6751,
5281
+ "step": 15060
5282
+ },
5283
+ {
5284
+ "epoch": 0.029405599348708875,
5285
+ "grad_norm": 14.75,
5286
+ "learning_rate": 0.000495260344568626,
5287
+ "loss": 20.5701,
5288
+ "step": 15080
5289
+ },
5290
+ {
5291
+ "epoch": 0.029444598817341114,
5292
+ "grad_norm": 12.0,
5293
+ "learning_rate": 0.0004952538425377166,
5294
+ "loss": 20.5664,
5295
+ "step": 15100
5296
+ },
5297
+ {
5298
+ "epoch": 0.029483598285973352,
5299
+ "grad_norm": 12.625,
5300
+ "learning_rate": 0.0004952473405068073,
5301
+ "loss": 20.5255,
5302
+ "step": 15120
5303
+ },
5304
+ {
5305
+ "epoch": 0.029522597754605594,
5306
+ "grad_norm": 11.0,
5307
+ "learning_rate": 0.000495240838475898,
5308
+ "loss": 20.5584,
5309
+ "step": 15140
5310
+ },
5311
+ {
5312
+ "epoch": 0.029561597223237833,
5313
+ "grad_norm": 13.0625,
5314
+ "learning_rate": 0.0004952343364449886,
5315
+ "loss": 20.6137,
5316
+ "step": 15160
5317
+ },
5318
+ {
5319
+ "epoch": 0.029600596691870075,
5320
+ "grad_norm": 11.125,
5321
+ "learning_rate": 0.0004952278344140793,
5322
+ "loss": 20.5632,
5323
+ "step": 15180
5324
+ },
5325
+ {
5326
+ "epoch": 0.029639596160502314,
5327
+ "grad_norm": 10.8125,
5328
+ "learning_rate": 0.0004952213323831699,
5329
+ "loss": 20.5356,
5330
+ "step": 15200
5331
+ },
5332
+ {
5333
+ "epoch": 0.029678595629134552,
5334
+ "grad_norm": 12.25,
5335
+ "learning_rate": 0.0004952148303522606,
5336
+ "loss": 20.5935,
5337
+ "step": 15220
5338
+ },
5339
+ {
5340
+ "epoch": 0.029717595097766794,
5341
+ "grad_norm": 9.875,
5342
+ "learning_rate": 0.0004952083283213512,
5343
+ "loss": 20.6133,
5344
+ "step": 15240
5345
+ },
5346
+ {
5347
+ "epoch": 0.029756594566399033,
5348
+ "grad_norm": 12.75,
5349
+ "learning_rate": 0.0004952018262904418,
5350
+ "loss": 20.6443,
5351
+ "step": 15260
5352
+ },
5353
+ {
5354
+ "epoch": 0.02979559403503127,
5355
+ "grad_norm": 13.4375,
5356
+ "learning_rate": 0.0004951953242595325,
5357
+ "loss": 20.6207,
5358
+ "step": 15280
5359
+ },
5360
+ {
5361
+ "epoch": 0.029834593503663513,
5362
+ "grad_norm": 10.0625,
5363
+ "learning_rate": 0.0004951888222286231,
5364
+ "loss": 20.5027,
5365
+ "step": 15300
5366
+ },
5367
+ {
5368
+ "epoch": 0.029873592972295752,
5369
+ "grad_norm": 11.8125,
5370
+ "learning_rate": 0.0004951823201977138,
5371
+ "loss": 20.534,
5372
+ "step": 15320
5373
+ },
5374
+ {
5375
+ "epoch": 0.029912592440927994,
5376
+ "grad_norm": 11.0625,
5377
+ "learning_rate": 0.0004951758181668044,
5378
+ "loss": 20.5885,
5379
+ "step": 15340
5380
+ },
5381
+ {
5382
+ "epoch": 0.029951591909560232,
5383
+ "grad_norm": 11.1875,
5384
+ "learning_rate": 0.0004951693161358951,
5385
+ "loss": 20.6046,
5386
+ "step": 15360
5387
+ },
5388
+ {
5389
+ "epoch": 0.02999059137819247,
5390
+ "grad_norm": 11.6875,
5391
+ "learning_rate": 0.0004951628141049857,
5392
+ "loss": 20.4537,
5393
+ "step": 15380
5394
+ },
5395
+ {
5396
+ "epoch": 0.030029590846824713,
5397
+ "grad_norm": 11.1875,
5398
+ "learning_rate": 0.0004951563120740764,
5399
+ "loss": 20.4366,
5400
+ "step": 15400
5401
+ },
5402
+ {
5403
+ "epoch": 0.03006859031545695,
5404
+ "grad_norm": 12.3125,
5405
+ "learning_rate": 0.000495149810043167,
5406
+ "loss": 20.5313,
5407
+ "step": 15420
5408
+ },
5409
+ {
5410
+ "epoch": 0.030107589784089193,
5411
+ "grad_norm": 12.5,
5412
+ "learning_rate": 0.0004951433080122577,
5413
+ "loss": 20.5595,
5414
+ "step": 15440
5415
+ },
5416
+ {
5417
+ "epoch": 0.030146589252721432,
5418
+ "grad_norm": 10.75,
5419
+ "learning_rate": 0.0004951368059813483,
5420
+ "loss": 20.4318,
5421
+ "step": 15460
5422
+ },
5423
+ {
5424
+ "epoch": 0.03018558872135367,
5425
+ "grad_norm": 12.8125,
5426
+ "learning_rate": 0.0004951303039504389,
5427
+ "loss": 20.5356,
5428
+ "step": 15480
5429
+ },
5430
+ {
5431
+ "epoch": 0.030224588189985913,
5432
+ "grad_norm": 11.75,
5433
+ "learning_rate": 0.0004951238019195296,
5434
+ "loss": 20.594,
5435
+ "step": 15500
5436
+ },
5437
+ {
5438
+ "epoch": 0.03026358765861815,
5439
+ "grad_norm": 11.125,
5440
+ "learning_rate": 0.0004951172998886202,
5441
+ "loss": 20.5289,
5442
+ "step": 15520
5443
+ },
5444
+ {
5445
+ "epoch": 0.03030258712725039,
5446
+ "grad_norm": 12.4375,
5447
+ "learning_rate": 0.0004951107978577109,
5448
+ "loss": 20.4482,
5449
+ "step": 15540
5450
+ },
5451
+ {
5452
+ "epoch": 0.03034158659588263,
5453
+ "grad_norm": 11.1875,
5454
+ "learning_rate": 0.0004951042958268015,
5455
+ "loss": 20.4001,
5456
+ "step": 15560
5457
+ },
5458
+ {
5459
+ "epoch": 0.03038058606451487,
5460
+ "grad_norm": 13.0625,
5461
+ "learning_rate": 0.0004950977937958922,
5462
+ "loss": 20.3405,
5463
+ "step": 15580
5464
+ },
5465
+ {
5466
+ "epoch": 0.030419585533147112,
5467
+ "grad_norm": 13.5625,
5468
+ "learning_rate": 0.0004950912917649827,
5469
+ "loss": 20.435,
5470
+ "step": 15600
5471
+ },
5472
+ {
5473
+ "epoch": 0.03045858500177935,
5474
+ "grad_norm": 11.25,
5475
+ "learning_rate": 0.0004950847897340734,
5476
+ "loss": 20.4817,
5477
+ "step": 15620
5478
+ },
5479
+ {
5480
+ "epoch": 0.03049758447041159,
5481
+ "grad_norm": 10.75,
5482
+ "learning_rate": 0.0004950782877031641,
5483
+ "loss": 20.4889,
5484
+ "step": 15640
5485
+ },
5486
+ {
5487
+ "epoch": 0.03053658393904383,
5488
+ "grad_norm": 12.25,
5489
+ "learning_rate": 0.0004950717856722547,
5490
+ "loss": 20.4209,
5491
+ "step": 15660
5492
+ },
5493
+ {
5494
+ "epoch": 0.03057558340767607,
5495
+ "grad_norm": 11.125,
5496
+ "learning_rate": 0.0004950652836413454,
5497
+ "loss": 20.401,
5498
+ "step": 15680
5499
+ },
5500
+ {
5501
+ "epoch": 0.030614582876308312,
5502
+ "grad_norm": 12.0,
5503
+ "learning_rate": 0.000495058781610436,
5504
+ "loss": 20.4579,
5505
+ "step": 15700
5506
+ },
5507
+ {
5508
+ "epoch": 0.03065358234494055,
5509
+ "grad_norm": 10.9375,
5510
+ "learning_rate": 0.0004950522795795267,
5511
+ "loss": 20.4935,
5512
+ "step": 15720
5513
+ },
5514
+ {
5515
+ "epoch": 0.03069258181357279,
5516
+ "grad_norm": 11.75,
5517
+ "learning_rate": 0.0004950457775486173,
5518
+ "loss": 20.4301,
5519
+ "step": 15740
5520
+ },
5521
+ {
5522
+ "epoch": 0.03073158128220503,
5523
+ "grad_norm": 11.6875,
5524
+ "learning_rate": 0.0004950392755177079,
5525
+ "loss": 20.3754,
5526
+ "step": 15760
5527
+ },
5528
+ {
5529
+ "epoch": 0.03077058075083727,
5530
+ "grad_norm": 10.625,
5531
+ "learning_rate": 0.0004950327734867985,
5532
+ "loss": 20.4608,
5533
+ "step": 15780
5534
+ },
5535
+ {
5536
+ "epoch": 0.030809580219469508,
5537
+ "grad_norm": 11.875,
5538
+ "learning_rate": 0.0004950262714558892,
5539
+ "loss": 20.5408,
5540
+ "step": 15800
5541
+ },
5542
+ {
5543
+ "epoch": 0.03084857968810175,
5544
+ "grad_norm": 11.125,
5545
+ "learning_rate": 0.0004950197694249799,
5546
+ "loss": 20.3624,
5547
+ "step": 15820
5548
+ },
5549
+ {
5550
+ "epoch": 0.03088757915673399,
5551
+ "grad_norm": 10.75,
5552
+ "learning_rate": 0.0004950132673940705,
5553
+ "loss": 20.3549,
5554
+ "step": 15840
5555
+ },
5556
+ {
5557
+ "epoch": 0.03092657862536623,
5558
+ "grad_norm": 13.6875,
5559
+ "learning_rate": 0.0004950067653631612,
5560
+ "loss": 20.3933,
5561
+ "step": 15860
5562
+ },
5563
+ {
5564
+ "epoch": 0.03096557809399847,
5565
+ "grad_norm": 12.6875,
5566
+ "learning_rate": 0.0004950002633322518,
5567
+ "loss": 20.3452,
5568
+ "step": 15880
5569
+ },
5570
+ {
5571
+ "epoch": 0.031004577562630708,
5572
+ "grad_norm": 11.0625,
5573
+ "learning_rate": 0.0004949937613013425,
5574
+ "loss": 20.4437,
5575
+ "step": 15900
5576
+ },
5577
+ {
5578
+ "epoch": 0.03104357703126295,
5579
+ "grad_norm": 9.6875,
5580
+ "learning_rate": 0.0004949872592704331,
5581
+ "loss": 20.3318,
5582
+ "step": 15920
5583
+ },
5584
+ {
5585
+ "epoch": 0.03108257649989519,
5586
+ "grad_norm": 10.375,
5587
+ "learning_rate": 0.0004949807572395238,
5588
+ "loss": 20.3704,
5589
+ "step": 15940
5590
+ },
5591
+ {
5592
+ "epoch": 0.03112157596852743,
5593
+ "grad_norm": 11.625,
5594
+ "learning_rate": 0.0004949742552086145,
5595
+ "loss": 20.3497,
5596
+ "step": 15960
5597
+ },
5598
+ {
5599
+ "epoch": 0.03116057543715967,
5600
+ "grad_norm": 11.9375,
5601
+ "learning_rate": 0.0004949677531777051,
5602
+ "loss": 20.4226,
5603
+ "step": 15980
5604
+ },
5605
+ {
5606
+ "epoch": 0.031199574905791908,
5607
+ "grad_norm": 14.125,
5608
+ "learning_rate": 0.0004949612511467957,
5609
+ "loss": 20.3333,
5610
+ "step": 16000
5611
  }
5612
  ],
5613
  "logging_steps": 20,
 
5627
  "attributes": {}
5628
  }
5629
  },
5630
+ "total_flos": 1.176271382718605e+19,
5631
  "train_batch_size": 48,
5632
  "trial_name": null,
5633
  "trial_params": null