Azrail commited on
Commit
88025e6
·
verified ·
1 Parent(s): a405b2d

Training in progress, step 127000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:710baf14c92f1a6ab3eef32ca39e73342de5da970d1c32a072279db6a546bd6e
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9d1695bc9de636b5aaeaf2dd7d5f58cbc5a682eb69ac9b38095e92d54ec5937
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3dde6003afedc6dd2fd3bca69826bc4c2467f2fe522f76deae105d064b39f61f
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dc4491cbd42db47871ad0a656d153441e2ea2d0c5e68c9fdfe29f91fdedede3
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90e596b43a0993defe8386429a74c73648ebeab624d8851d1dff893410d726b8
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8542f0951d699465323349728bdecbda5c5f0e8274e699cbba04806de2fddeeb
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5773cfed09936b668e41d5a19336896fe4fe897bf551564d5056fa5a83c98331
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27a3a16e476801029c30325a569467f804e448c3ecc89accd2bd78b3749ec27f
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.2020487258070287,
6
  "eval_steps": 500,
7
- "global_step": 126000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -22436,11 +22436,189 @@
22436
  "eval_steps_per_second": 15.278,
22437
  "num_input_tokens_seen": 66049692768,
22438
  "step": 126000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22439
  }
22440
  ],
22441
  "logging_steps": 50,
22442
  "max_steps": 140000,
22443
- "num_input_tokens_seen": 66049692768,
22444
  "num_train_epochs": 2,
22445
  "save_steps": 1000,
22446
  "stateful_callbacks": {
@@ -22455,7 +22633,7 @@
22455
  "attributes": {}
22456
  }
22457
  },
22458
- "total_flos": 1.1689590873539912e+20,
22459
  "train_batch_size": 32,
22460
  "trial_name": null,
22461
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.2115887761307942,
6
  "eval_steps": 500,
7
+ "global_step": 127000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
22436
  "eval_steps_per_second": 15.278,
22437
  "num_input_tokens_seen": 66049692768,
22438
  "step": 126000
22439
+ },
22440
+ {
22441
+ "epoch": 1.202525728323217,
22442
+ "grad_norm": 0.1393454372882843,
22443
+ "learning_rate": 0.0004971950212723752,
22444
+ "loss": 2.0569,
22445
+ "num_input_tokens_seen": 66075907072,
22446
+ "step": 126050
22447
+ },
22448
+ {
22449
+ "epoch": 1.2030027308394051,
22450
+ "grad_norm": 0.1390795111656189,
22451
+ "learning_rate": 0.0004943901308226771,
22452
+ "loss": 2.0579,
22453
+ "num_input_tokens_seen": 66102120320,
22454
+ "step": 126100
22455
+ },
22456
+ {
22457
+ "epoch": 1.2034797333555933,
22458
+ "grad_norm": 0.136804461479187,
22459
+ "learning_rate": 0.0004915854169260539,
22460
+ "loss": 2.0594,
22461
+ "num_input_tokens_seen": 66128330880,
22462
+ "step": 126150
22463
+ },
22464
+ {
22465
+ "epoch": 1.2039567358717818,
22466
+ "grad_norm": 0.14418946206569672,
22467
+ "learning_rate": 0.0004887809678520976,
22468
+ "loss": 2.0521,
22469
+ "num_input_tokens_seen": 66154537216,
22470
+ "step": 126200
22471
+ },
22472
+ {
22473
+ "epoch": 1.20443373838797,
22474
+ "grad_norm": 0.1406649798154831,
22475
+ "learning_rate": 0.00048597687186206556,
22476
+ "loss": 2.0604,
22477
+ "num_input_tokens_seen": 66180744192,
22478
+ "step": 126250
22479
+ },
22480
+ {
22481
+ "epoch": 1.2049107409041582,
22482
+ "grad_norm": 0.13004782795906067,
22483
+ "learning_rate": 0.0004831732172061032,
22484
+ "loss": 2.0633,
22485
+ "num_input_tokens_seen": 66206951232,
22486
+ "step": 126300
22487
+ },
22488
+ {
22489
+ "epoch": 1.2053877434203466,
22490
+ "grad_norm": 0.1319655478000641,
22491
+ "learning_rate": 0.00048037009212046586,
22492
+ "loss": 2.0609,
22493
+ "num_input_tokens_seen": 66233151744,
22494
+ "step": 126350
22495
+ },
22496
+ {
22497
+ "epoch": 1.2058647459365348,
22498
+ "grad_norm": 0.13051386177539825,
22499
+ "learning_rate": 0.0004775675848247427,
22500
+ "loss": 2.0591,
22501
+ "num_input_tokens_seen": 66259358592,
22502
+ "step": 126400
22503
+ },
22504
+ {
22505
+ "epoch": 1.206341748452723,
22506
+ "grad_norm": 0.12983474135398865,
22507
+ "learning_rate": 0.0004747657835190795,
22508
+ "loss": 2.0571,
22509
+ "num_input_tokens_seen": 66285559520,
22510
+ "step": 126450
22511
+ },
22512
+ {
22513
+ "epoch": 1.2068187509689114,
22514
+ "grad_norm": 0.12744031846523285,
22515
+ "learning_rate": 0.00047196477638140405,
22516
+ "loss": 2.0581,
22517
+ "num_input_tokens_seen": 66311770112,
22518
+ "step": 126500
22519
+ },
22520
+ {
22521
+ "epoch": 1.2068187509689114,
22522
+ "eval_loss": 1.9767038822174072,
22523
+ "eval_runtime": 82.0094,
22524
+ "eval_samples_per_second": 60.969,
22525
+ "eval_steps_per_second": 15.242,
22526
+ "num_input_tokens_seen": 66311770112,
22527
+ "step": 126500
22528
+ },
22529
+ {
22530
+ "epoch": 1.2072957534850997,
22531
+ "grad_norm": 0.13606679439544678,
22532
+ "learning_rate": 0.00046916465156464924,
22533
+ "loss": 2.062,
22534
+ "num_input_tokens_seen": 66337979200,
22535
+ "step": 126550
22536
+ },
22537
+ {
22538
+ "epoch": 1.2077727560012879,
22539
+ "grad_norm": 0.12876896560192108,
22540
+ "learning_rate": 0.0004663654971939802,
22541
+ "loss": 2.0627,
22542
+ "num_input_tokens_seen": 66364192640,
22543
+ "step": 126600
22544
+ },
22545
+ {
22546
+ "epoch": 1.2082497585174763,
22547
+ "grad_norm": 0.18826884031295776,
22548
+ "learning_rate": 0.00046356740136402,
22549
+ "loss": 2.0573,
22550
+ "num_input_tokens_seen": 66390404768,
22551
+ "step": 126650
22552
+ },
22553
+ {
22554
+ "epoch": 1.2087267610336645,
22555
+ "grad_norm": 0.1488431692123413,
22556
+ "learning_rate": 0.0004607704521360776,
22557
+ "loss": 2.0592,
22558
+ "num_input_tokens_seen": 66416613920,
22559
+ "step": 126700
22560
+ },
22561
+ {
22562
+ "epoch": 1.2092037635498527,
22563
+ "grad_norm": 0.12901978194713593,
22564
+ "learning_rate": 0.0004579747375353763,
22565
+ "loss": 2.0601,
22566
+ "num_input_tokens_seen": 66442820800,
22567
+ "step": 126750
22568
+ },
22569
+ {
22570
+ "epoch": 1.209680766066041,
22571
+ "grad_norm": 0.13032038509845734,
22572
+ "learning_rate": 0.0004551803455482833,
22573
+ "loss": 2.0675,
22574
+ "num_input_tokens_seen": 66469028480,
22575
+ "step": 126800
22576
+ },
22577
+ {
22578
+ "epoch": 1.2101577685822293,
22579
+ "grad_norm": 0.13756315410137177,
22580
+ "learning_rate": 0.00045238736411954073,
22581
+ "loss": 2.0543,
22582
+ "num_input_tokens_seen": 66495230816,
22583
+ "step": 126850
22584
+ },
22585
+ {
22586
+ "epoch": 1.2106347710984176,
22587
+ "grad_norm": 0.13066066801548004,
22588
+ "learning_rate": 0.0004495958811494978,
22589
+ "loss": 2.0545,
22590
+ "num_input_tokens_seen": 66521443360,
22591
+ "step": 126900
22592
+ },
22593
+ {
22594
+ "epoch": 1.2111117736146058,
22595
+ "grad_norm": 0.13837099075317383,
22596
+ "learning_rate": 0.00044680598449134434,
22597
+ "loss": 2.0557,
22598
+ "num_input_tokens_seen": 66547651488,
22599
+ "step": 126950
22600
+ },
22601
+ {
22602
+ "epoch": 1.2115887761307942,
22603
+ "grad_norm": 0.13125094771385193,
22604
+ "learning_rate": 0.0004440177619483461,
22605
+ "loss": 2.0633,
22606
+ "num_input_tokens_seen": 66573856704,
22607
+ "step": 127000
22608
+ },
22609
+ {
22610
+ "epoch": 1.2115887761307942,
22611
+ "eval_loss": 1.9741461277008057,
22612
+ "eval_runtime": 82.3333,
22613
+ "eval_samples_per_second": 60.729,
22614
+ "eval_steps_per_second": 15.182,
22615
+ "num_input_tokens_seen": 66573856704,
22616
+ "step": 127000
22617
  }
22618
  ],
22619
  "logging_steps": 50,
22620
  "max_steps": 140000,
22621
+ "num_input_tokens_seen": 66573856704,
22622
  "num_train_epochs": 2,
22623
  "save_steps": 1000,
22624
  "stateful_callbacks": {
 
22633
  "attributes": {}
22634
  }
22635
  },
22636
+ "total_flos": 1.1782358329461719e+20,
22637
  "train_batch_size": 32,
22638
  "trial_name": null,
22639
  "trial_params": null