Azrail commited on
Commit
f2e8f17
·
verified ·
1 Parent(s): 0ca2a86

Training in progress, step 49000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:83040a9f33c98136e5fdace56390e4f45897e63e4e108ccbd4b366bd299ccd64
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d2189cc3a98b2403601c139a530c8e21835cb6237e1f4942ace6213b73fce5f
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:437a042365907c8955a2fc7d892d047bacaeaffa159edfba16e54b9aa6d50132
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec0537d11321458817927ebac3e783711d3aa86865e3823d0bc93d1e41dfc5d1
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3179ff7b9a01d9e9850f6d8ae042cb4934a5fc48309149cb50bc43cd37884f1d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02dd1579b4e4c484590ab9c87fcdb4df0578497bcab5d7d028a086e5a9506abe
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de67d78be185ea67aa4ca20dcc37ca7f9d17d76246f8cfa3148b96b4fc56902c
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85feca9ffa4367dad07b4142308894db505807fd169bd3aedff12898c8f097e0
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.228961207770371,
6
  "eval_steps": 500,
7
- "global_step": 48000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8552,11 +8552,189 @@
8552
  "eval_steps_per_second": 23.129,
8553
  "num_input_tokens_seen": 12582907456,
8554
  "step": 48000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8555
  }
8556
  ],
8557
  "logging_steps": 50,
8558
  "max_steps": 70000,
8559
- "num_input_tokens_seen": 12582907456,
8560
  "num_train_epochs": 1,
8561
  "save_steps": 1000,
8562
  "stateful_callbacks": {
@@ -8571,7 +8749,7 @@
8571
  "attributes": {}
8572
  }
8573
  },
8574
- "total_flos": 3.3660505536567706e+18,
8575
  "train_batch_size": 64,
8576
  "trial_name": null,
8577
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.23373123293225373,
6
  "eval_steps": 500,
7
+ "global_step": 49000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8552
  "eval_steps_per_second": 23.129,
8553
  "num_input_tokens_seen": 12582907456,
8554
  "step": 48000
8555
+ },
8556
+ {
8557
+ "epoch": 0.22919970902846512,
8558
+ "grad_norm": 0.20247948169708252,
8559
+ "learning_rate": 0.001,
8560
+ "loss": 2.6122,
8561
+ "num_input_tokens_seen": 12596014656,
8562
+ "step": 48050
8563
+ },
8564
+ {
8565
+ "epoch": 0.22943821028655925,
8566
+ "grad_norm": 0.20237554609775543,
8567
+ "learning_rate": 0.001,
8568
+ "loss": 2.6235,
8569
+ "num_input_tokens_seen": 12609121856,
8570
+ "step": 48100
8571
+ },
8572
+ {
8573
+ "epoch": 0.2296767115446534,
8574
+ "grad_norm": 0.19862660765647888,
8575
+ "learning_rate": 0.001,
8576
+ "loss": 2.6264,
8577
+ "num_input_tokens_seen": 12622229056,
8578
+ "step": 48150
8579
+ },
8580
+ {
8581
+ "epoch": 0.22991521280274754,
8582
+ "grad_norm": 0.20839153230190277,
8583
+ "learning_rate": 0.001,
8584
+ "loss": 2.5915,
8585
+ "num_input_tokens_seen": 12635336256,
8586
+ "step": 48200
8587
+ },
8588
+ {
8589
+ "epoch": 0.23015371406084167,
8590
+ "grad_norm": 0.19385166466236115,
8591
+ "learning_rate": 0.001,
8592
+ "loss": 2.5979,
8593
+ "num_input_tokens_seen": 12648443456,
8594
+ "step": 48250
8595
+ },
8596
+ {
8597
+ "epoch": 0.2303922153189358,
8598
+ "grad_norm": 0.197597935795784,
8599
+ "learning_rate": 0.001,
8600
+ "loss": 2.6093,
8601
+ "num_input_tokens_seen": 12661550656,
8602
+ "step": 48300
8603
+ },
8604
+ {
8605
+ "epoch": 0.23063071657702994,
8606
+ "grad_norm": 0.20289985835552216,
8607
+ "learning_rate": 0.001,
8608
+ "loss": 2.6039,
8609
+ "num_input_tokens_seen": 12674657856,
8610
+ "step": 48350
8611
+ },
8612
+ {
8613
+ "epoch": 0.23086921783512407,
8614
+ "grad_norm": 0.1986515372991562,
8615
+ "learning_rate": 0.001,
8616
+ "loss": 2.6048,
8617
+ "num_input_tokens_seen": 12687765056,
8618
+ "step": 48400
8619
+ },
8620
+ {
8621
+ "epoch": 0.23110771909321823,
8622
+ "grad_norm": 0.19720982015132904,
8623
+ "learning_rate": 0.001,
8624
+ "loss": 2.6171,
8625
+ "num_input_tokens_seen": 12700872256,
8626
+ "step": 48450
8627
+ },
8628
+ {
8629
+ "epoch": 0.23134622035131236,
8630
+ "grad_norm": 0.24635523557662964,
8631
+ "learning_rate": 0.001,
8632
+ "loss": 2.6242,
8633
+ "num_input_tokens_seen": 12713979456,
8634
+ "step": 48500
8635
+ },
8636
+ {
8637
+ "epoch": 0.23134622035131236,
8638
+ "eval_loss": 2.495468854904175,
8639
+ "eval_runtime": 53.4259,
8640
+ "eval_samples_per_second": 93.588,
8641
+ "eval_steps_per_second": 23.397,
8642
+ "num_input_tokens_seen": 12713979456,
8643
+ "step": 48500
8644
+ },
8645
+ {
8646
+ "epoch": 0.2315847216094065,
8647
+ "grad_norm": 0.5883195996284485,
8648
+ "learning_rate": 0.001,
8649
+ "loss": 2.6399,
8650
+ "num_input_tokens_seen": 12727086656,
8651
+ "step": 48550
8652
+ },
8653
+ {
8654
+ "epoch": 0.23182322286750062,
8655
+ "grad_norm": 0.20890024304389954,
8656
+ "learning_rate": 0.001,
8657
+ "loss": 2.6325,
8658
+ "num_input_tokens_seen": 12740193856,
8659
+ "step": 48600
8660
+ },
8661
+ {
8662
+ "epoch": 0.23206172412559475,
8663
+ "grad_norm": 0.21251678466796875,
8664
+ "learning_rate": 0.001,
8665
+ "loss": 2.6233,
8666
+ "num_input_tokens_seen": 12753301056,
8667
+ "step": 48650
8668
+ },
8669
+ {
8670
+ "epoch": 0.23230022538368889,
8671
+ "grad_norm": 0.20996986329555511,
8672
+ "learning_rate": 0.001,
8673
+ "loss": 2.6174,
8674
+ "num_input_tokens_seen": 12766408256,
8675
+ "step": 48700
8676
+ },
8677
+ {
8678
+ "epoch": 0.23253872664178304,
8679
+ "grad_norm": 0.23039382696151733,
8680
+ "learning_rate": 0.001,
8681
+ "loss": 2.6305,
8682
+ "num_input_tokens_seen": 12779515456,
8683
+ "step": 48750
8684
+ },
8685
+ {
8686
+ "epoch": 0.23277722789987718,
8687
+ "grad_norm": 0.23922136425971985,
8688
+ "learning_rate": 0.001,
8689
+ "loss": 2.6108,
8690
+ "num_input_tokens_seen": 12792622656,
8691
+ "step": 48800
8692
+ },
8693
+ {
8694
+ "epoch": 0.2330157291579713,
8695
+ "grad_norm": 0.22746366262435913,
8696
+ "learning_rate": 0.001,
8697
+ "loss": 2.6219,
8698
+ "num_input_tokens_seen": 12805729856,
8699
+ "step": 48850
8700
+ },
8701
+ {
8702
+ "epoch": 0.23325423041606544,
8703
+ "grad_norm": 0.22131897509098053,
8704
+ "learning_rate": 0.001,
8705
+ "loss": 2.6205,
8706
+ "num_input_tokens_seen": 12818837056,
8707
+ "step": 48900
8708
+ },
8709
+ {
8710
+ "epoch": 0.23349273167415957,
8711
+ "grad_norm": 0.25431814789772034,
8712
+ "learning_rate": 0.001,
8713
+ "loss": 2.6252,
8714
+ "num_input_tokens_seen": 12831944256,
8715
+ "step": 48950
8716
+ },
8717
+ {
8718
+ "epoch": 0.23373123293225373,
8719
+ "grad_norm": 0.2622738778591156,
8720
+ "learning_rate": 0.001,
8721
+ "loss": 2.6288,
8722
+ "num_input_tokens_seen": 12845051456,
8723
+ "step": 49000
8724
+ },
8725
+ {
8726
+ "epoch": 0.23373123293225373,
8727
+ "eval_loss": 2.498055934906006,
8728
+ "eval_runtime": 53.8861,
8729
+ "eval_samples_per_second": 92.788,
8730
+ "eval_steps_per_second": 23.197,
8731
+ "num_input_tokens_seen": 12845051456,
8732
+ "step": 49000
8733
  }
8734
  ],
8735
  "logging_steps": 50,
8736
  "max_steps": 70000,
8737
+ "num_input_tokens_seen": 12845051456,
8738
  "num_train_epochs": 1,
8739
  "save_steps": 1000,
8740
  "stateful_callbacks": {
 
8749
  "attributes": {}
8750
  }
8751
  },
8752
+ "total_flos": 3.4361766321822106e+18,
8753
  "train_batch_size": 64,
8754
  "trial_name": null,
8755
  "trial_params": null