Azrail commited on
Commit
c444e26
·
verified ·
1 Parent(s): 9ab4b98

Training in progress, step 49000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0699f2befd5f5fe39f37d9992ad71298c6e825af92f8b9997d530b9228219782
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1fa5e14e336776a0794b601efbdc3db24765810ef38e2b37883f1ad39a38e15
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a6975052cf62584f01dc0b92d80322e1defc71e0703e038bfa5340c5530e8e1a
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51be2ef186eec1ebc97ea07dde4acfe4cb7904323be165f365952918c5ff93ee
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f916f32ed5bd769a4257bf59e71aa59f0b4e6ba66e2f6069ff1d46ad7cda2db
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2815e20f41b0ab873453c57ac2dfc7d374a540c5d47a47423caeebea8ab88de
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de67d78be185ea67aa4ca20dcc37ca7f9d17d76246f8cfa3148b96b4fc56902c
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85feca9ffa4367dad07b4142308894db505807fd169bd3aedff12898c8f097e0
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.457922415540742,
6
  "eval_steps": 500,
7
- "global_step": 48000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8552,11 +8552,189 @@
8552
  "eval_steps_per_second": 15.249,
8553
  "num_input_tokens_seen": 25161718656,
8554
  "step": 48000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8555
  }
8556
  ],
8557
  "logging_steps": 50,
8558
  "max_steps": 70000,
8559
- "num_input_tokens_seen": 25161718656,
8560
  "num_train_epochs": 1,
8561
  "save_steps": 1000,
8562
  "stateful_callbacks": {
@@ -8571,7 +8749,7 @@
8571
  "attributes": {}
8572
  }
8573
  },
8574
- "total_flos": 4.4531652523637146e+19,
8575
  "train_batch_size": 32,
8576
  "trial_name": null,
8577
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.46746246586450746,
6
  "eval_steps": 500,
7
+ "global_step": 49000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8552
  "eval_steps_per_second": 15.249,
8553
  "num_input_tokens_seen": 25161718656,
8554
  "step": 48000
8555
+ },
8556
+ {
8557
+ "epoch": 0.45839941805693024,
8558
+ "grad_norm": 0.13940733671188354,
8559
+ "learning_rate": 0.001,
8560
+ "loss": 2.174,
8561
+ "num_input_tokens_seen": 25187922848,
8562
+ "step": 48050
8563
+ },
8564
+ {
8565
+ "epoch": 0.4588764205731185,
8566
+ "grad_norm": 0.16502974927425385,
8567
+ "learning_rate": 0.001,
8568
+ "loss": 2.1807,
8569
+ "num_input_tokens_seen": 25214132864,
8570
+ "step": 48100
8571
+ },
8572
+ {
8573
+ "epoch": 0.4593534230893068,
8574
+ "grad_norm": 0.15250737965106964,
8575
+ "learning_rate": 0.001,
8576
+ "loss": 2.1831,
8577
+ "num_input_tokens_seen": 25240339520,
8578
+ "step": 48150
8579
+ },
8580
+ {
8581
+ "epoch": 0.4598304256054951,
8582
+ "grad_norm": 0.14336740970611572,
8583
+ "learning_rate": 0.001,
8584
+ "loss": 2.175,
8585
+ "num_input_tokens_seen": 25266553920,
8586
+ "step": 48200
8587
+ },
8588
+ {
8589
+ "epoch": 0.46030742812168335,
8590
+ "grad_norm": 0.1376286745071411,
8591
+ "learning_rate": 0.001,
8592
+ "loss": 2.1733,
8593
+ "num_input_tokens_seen": 25292766560,
8594
+ "step": 48250
8595
+ },
8596
+ {
8597
+ "epoch": 0.4607844306378716,
8598
+ "grad_norm": 0.1339864432811737,
8599
+ "learning_rate": 0.001,
8600
+ "loss": 2.1667,
8601
+ "num_input_tokens_seen": 25318970496,
8602
+ "step": 48300
8603
+ },
8604
+ {
8605
+ "epoch": 0.4612614331540599,
8606
+ "grad_norm": 0.14675366878509521,
8607
+ "learning_rate": 0.001,
8608
+ "loss": 2.1784,
8609
+ "num_input_tokens_seen": 25345180512,
8610
+ "step": 48350
8611
+ },
8612
+ {
8613
+ "epoch": 0.46173843567024814,
8614
+ "grad_norm": 0.14352139830589294,
8615
+ "learning_rate": 0.001,
8616
+ "loss": 2.1915,
8617
+ "num_input_tokens_seen": 25371386368,
8618
+ "step": 48400
8619
+ },
8620
+ {
8621
+ "epoch": 0.46221543818643646,
8622
+ "grad_norm": 0.14589083194732666,
8623
+ "learning_rate": 0.001,
8624
+ "loss": 2.1692,
8625
+ "num_input_tokens_seen": 25397588192,
8626
+ "step": 48450
8627
+ },
8628
+ {
8629
+ "epoch": 0.4626924407026247,
8630
+ "grad_norm": 0.1392335146665573,
8631
+ "learning_rate": 0.001,
8632
+ "loss": 2.1811,
8633
+ "num_input_tokens_seen": 25423801984,
8634
+ "step": 48500
8635
+ },
8636
+ {
8637
+ "epoch": 0.4626924407026247,
8638
+ "eval_loss": 2.0870039463043213,
8639
+ "eval_runtime": 82.4574,
8640
+ "eval_samples_per_second": 60.637,
8641
+ "eval_steps_per_second": 15.159,
8642
+ "num_input_tokens_seen": 25423801984,
8643
+ "step": 48500
8644
+ },
8645
+ {
8646
+ "epoch": 0.463169443218813,
8647
+ "grad_norm": 0.14096789062023163,
8648
+ "learning_rate": 0.001,
8649
+ "loss": 2.1822,
8650
+ "num_input_tokens_seen": 25450016384,
8651
+ "step": 48550
8652
+ },
8653
+ {
8654
+ "epoch": 0.46364644573500124,
8655
+ "grad_norm": 0.13657501339912415,
8656
+ "learning_rate": 0.001,
8657
+ "loss": 2.1633,
8658
+ "num_input_tokens_seen": 25476223712,
8659
+ "step": 48600
8660
+ },
8661
+ {
8662
+ "epoch": 0.4641234482511895,
8663
+ "grad_norm": 0.1375761330127716,
8664
+ "learning_rate": 0.001,
8665
+ "loss": 2.1601,
8666
+ "num_input_tokens_seen": 25502435136,
8667
+ "step": 48650
8668
+ },
8669
+ {
8670
+ "epoch": 0.46460045076737777,
8671
+ "grad_norm": 0.13810068368911743,
8672
+ "learning_rate": 0.001,
8673
+ "loss": 2.1651,
8674
+ "num_input_tokens_seen": 25528648192,
8675
+ "step": 48700
8676
+ },
8677
+ {
8678
+ "epoch": 0.4650774532835661,
8679
+ "grad_norm": 0.1375926285982132,
8680
+ "learning_rate": 0.001,
8681
+ "loss": 2.1766,
8682
+ "num_input_tokens_seen": 25554860256,
8683
+ "step": 48750
8684
+ },
8685
+ {
8686
+ "epoch": 0.46555445579975435,
8687
+ "grad_norm": 0.14654815196990967,
8688
+ "learning_rate": 0.001,
8689
+ "loss": 2.1634,
8690
+ "num_input_tokens_seen": 25581068864,
8691
+ "step": 48800
8692
+ },
8693
+ {
8694
+ "epoch": 0.4660314583159426,
8695
+ "grad_norm": 0.1339625120162964,
8696
+ "learning_rate": 0.001,
8697
+ "loss": 2.1681,
8698
+ "num_input_tokens_seen": 25607278112,
8699
+ "step": 48850
8700
+ },
8701
+ {
8702
+ "epoch": 0.4665084608321309,
8703
+ "grad_norm": 0.13390694558620453,
8704
+ "learning_rate": 0.001,
8705
+ "loss": 2.1789,
8706
+ "num_input_tokens_seen": 25633491968,
8707
+ "step": 48900
8708
+ },
8709
+ {
8710
+ "epoch": 0.46698546334831914,
8711
+ "grad_norm": 0.14397822320461273,
8712
+ "learning_rate": 0.001,
8713
+ "loss": 2.1525,
8714
+ "num_input_tokens_seen": 25659705568,
8715
+ "step": 48950
8716
+ },
8717
+ {
8718
+ "epoch": 0.46746246586450746,
8719
+ "grad_norm": 0.12739968299865723,
8720
+ "learning_rate": 0.001,
8721
+ "loss": 2.1621,
8722
+ "num_input_tokens_seen": 25685912544,
8723
+ "step": 49000
8724
+ },
8725
+ {
8726
+ "epoch": 0.46746246586450746,
8727
+ "eval_loss": 2.0851972103118896,
8728
+ "eval_runtime": 82.4678,
8729
+ "eval_samples_per_second": 60.63,
8730
+ "eval_steps_per_second": 15.157,
8731
+ "num_input_tokens_seen": 25685912544,
8732
+ "step": 49000
8733
  }
8734
  ],
8735
  "logging_steps": 50,
8736
  "max_steps": 70000,
8737
+ "num_input_tokens_seen": 25685912544,
8738
  "num_train_epochs": 1,
8739
  "save_steps": 1000,
8740
  "stateful_callbacks": {
 
8749
  "attributes": {}
8750
  }
8751
  },
8752
+ "total_flos": 4.5459380092431974e+19,
8753
  "train_batch_size": 32,
8754
  "trial_name": null,
8755
  "trial_params": null