Azrail commited on
Commit
344b507
·
verified ·
1 Parent(s): a2a725a

Training in progress, step 38000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ce938a644f0cf4d10d231b631256c1bcbd8d98d79787b20ca3ed148b88756be
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a94fdb7295accdf7201fc02029e5ae45ac44dcd9ef16798d70e0f488636e1f9c
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:64656c8de22e45c2941d2ea854ec0d370243cfeea2920fb181966f363dd14777
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d665c90e6bb04abc526448689a0e7c0f687d8ae7453c9e6c300bb8f38a3b48a2
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0451e520bbe84b70e4cd2907956e95cd6d56464539f21e68e26c043e5cf63b1e
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:067fc834db8daa6bcb7d646c19fb2debac62a3ca3a0f0e8b29d38f87eb5e83ea
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90815e584013ee668de6d5b656c515902fbacbb32f54a71d2d1d29e05110019f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b9f11080dc8caed5e3c50cfbb46586ae36896ccfee6afab64bede4080bf44b1
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.8127435999873696,
6
  "eval_steps": 500,
7
- "global_step": 37000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6594,11 +6594,189 @@
6594
  "eval_steps_per_second": 18.779,
6595
  "num_input_tokens_seen": 38797308160,
6596
  "step": 37000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6597
  }
6598
  ],
6599
  "logging_steps": 50,
6600
  "max_steps": 200000,
6601
- "num_input_tokens_seen": 38797308160,
6602
  "num_train_epochs": 5,
6603
  "save_steps": 1000,
6604
  "stateful_callbacks": {
@@ -6613,7 +6791,7 @@
6613
  "attributes": {}
6614
  }
6615
  },
6616
- "total_flos": 2.2095351303794196e+19,
6617
  "train_batch_size": 64,
6618
  "trial_name": null,
6619
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.8347096432302714,
6
  "eval_steps": 500,
7
+ "global_step": 38000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6594
  "eval_steps_per_second": 18.779,
6595
  "num_input_tokens_seen": 38797308160,
6596
  "step": 37000
6597
+ },
6598
+ {
6599
+ "epoch": 0.8138419021495146,
6600
+ "grad_norm": 0.1504666954278946,
6601
+ "learning_rate": 0.001,
6602
+ "loss": 2.6625,
6603
+ "num_input_tokens_seen": 38849736960,
6604
+ "step": 37050
6605
+ },
6606
+ {
6607
+ "epoch": 0.8149402043116597,
6608
+ "grad_norm": 0.15831789374351501,
6609
+ "learning_rate": 0.001,
6610
+ "loss": 2.6566,
6611
+ "num_input_tokens_seen": 38902165760,
6612
+ "step": 37100
6613
+ },
6614
+ {
6615
+ "epoch": 0.8160385064738048,
6616
+ "grad_norm": 0.1391575187444687,
6617
+ "learning_rate": 0.001,
6618
+ "loss": 2.6609,
6619
+ "num_input_tokens_seen": 38954594560,
6620
+ "step": 37150
6621
+ },
6622
+ {
6623
+ "epoch": 0.81713680863595,
6624
+ "grad_norm": 0.22168035805225372,
6625
+ "learning_rate": 0.001,
6626
+ "loss": 2.6768,
6627
+ "num_input_tokens_seen": 39007023360,
6628
+ "step": 37200
6629
+ },
6630
+ {
6631
+ "epoch": 0.818235110798095,
6632
+ "grad_norm": 0.1874976009130478,
6633
+ "learning_rate": 0.001,
6634
+ "loss": 2.679,
6635
+ "num_input_tokens_seen": 39059452160,
6636
+ "step": 37250
6637
+ },
6638
+ {
6639
+ "epoch": 0.8193334129602401,
6640
+ "grad_norm": 0.1796240657567978,
6641
+ "learning_rate": 0.001,
6642
+ "loss": 2.6644,
6643
+ "num_input_tokens_seen": 39111880960,
6644
+ "step": 37300
6645
+ },
6646
+ {
6647
+ "epoch": 0.8204317151223852,
6648
+ "grad_norm": 0.3271934986114502,
6649
+ "learning_rate": 0.001,
6650
+ "loss": 2.6695,
6651
+ "num_input_tokens_seen": 39164309760,
6652
+ "step": 37350
6653
+ },
6654
+ {
6655
+ "epoch": 0.8215300172845302,
6656
+ "grad_norm": 0.13447704911231995,
6657
+ "learning_rate": 0.001,
6658
+ "loss": 2.6656,
6659
+ "num_input_tokens_seen": 39216738560,
6660
+ "step": 37400
6661
+ },
6662
+ {
6663
+ "epoch": 0.8226283194466754,
6664
+ "grad_norm": 0.1367628127336502,
6665
+ "learning_rate": 0.001,
6666
+ "loss": 2.6505,
6667
+ "num_input_tokens_seen": 39269167360,
6668
+ "step": 37450
6669
+ },
6670
+ {
6671
+ "epoch": 0.8237266216088205,
6672
+ "grad_norm": 0.1498686671257019,
6673
+ "learning_rate": 0.001,
6674
+ "loss": 2.6594,
6675
+ "num_input_tokens_seen": 39321596160,
6676
+ "step": 37500
6677
+ },
6678
+ {
6679
+ "epoch": 0.8237266216088205,
6680
+ "eval_loss": 2.5516529083251953,
6681
+ "eval_runtime": 66.8213,
6682
+ "eval_samples_per_second": 74.826,
6683
+ "eval_steps_per_second": 18.707,
6684
+ "num_input_tokens_seen": 39321596160,
6685
+ "step": 37500
6686
+ },
6687
+ {
6688
+ "epoch": 0.8248249237709656,
6689
+ "grad_norm": 0.14790424704551697,
6690
+ "learning_rate": 0.001,
6691
+ "loss": 2.6519,
6692
+ "num_input_tokens_seen": 39374024960,
6693
+ "step": 37550
6694
+ },
6695
+ {
6696
+ "epoch": 0.8259232259331106,
6697
+ "grad_norm": 0.15297918021678925,
6698
+ "learning_rate": 0.001,
6699
+ "loss": 2.6533,
6700
+ "num_input_tokens_seen": 39426453760,
6701
+ "step": 37600
6702
+ },
6703
+ {
6704
+ "epoch": 0.8270215280952558,
6705
+ "grad_norm": 0.15760953724384308,
6706
+ "learning_rate": 0.001,
6707
+ "loss": 2.6584,
6708
+ "num_input_tokens_seen": 39478882560,
6709
+ "step": 37650
6710
+ },
6711
+ {
6712
+ "epoch": 0.8281198302574009,
6713
+ "grad_norm": 0.1545770913362503,
6714
+ "learning_rate": 0.001,
6715
+ "loss": 2.6453,
6716
+ "num_input_tokens_seen": 39531311360,
6717
+ "step": 37700
6718
+ },
6719
+ {
6720
+ "epoch": 0.8292181324195459,
6721
+ "grad_norm": 0.17809870839118958,
6722
+ "learning_rate": 0.001,
6723
+ "loss": 2.6547,
6724
+ "num_input_tokens_seen": 39583740160,
6725
+ "step": 37750
6726
+ },
6727
+ {
6728
+ "epoch": 0.830316434581691,
6729
+ "grad_norm": 0.2712576687335968,
6730
+ "learning_rate": 0.001,
6731
+ "loss": 2.6489,
6732
+ "num_input_tokens_seen": 39636168960,
6733
+ "step": 37800
6734
+ },
6735
+ {
6736
+ "epoch": 0.8314147367438361,
6737
+ "grad_norm": 0.1525331437587738,
6738
+ "learning_rate": 0.001,
6739
+ "loss": 2.6558,
6740
+ "num_input_tokens_seen": 39688597760,
6741
+ "step": 37850
6742
+ },
6743
+ {
6744
+ "epoch": 0.8325130389059812,
6745
+ "grad_norm": 0.1624525785446167,
6746
+ "learning_rate": 0.001,
6747
+ "loss": 2.6465,
6748
+ "num_input_tokens_seen": 39741026560,
6749
+ "step": 37900
6750
+ },
6751
+ {
6752
+ "epoch": 0.8336113410681263,
6753
+ "grad_norm": 0.14974552392959595,
6754
+ "learning_rate": 0.001,
6755
+ "loss": 2.6595,
6756
+ "num_input_tokens_seen": 39793455360,
6757
+ "step": 37950
6758
+ },
6759
+ {
6760
+ "epoch": 0.8347096432302714,
6761
+ "grad_norm": 0.15206202864646912,
6762
+ "learning_rate": 0.001,
6763
+ "loss": 2.6525,
6764
+ "num_input_tokens_seen": 39845884160,
6765
+ "step": 38000
6766
+ },
6767
+ {
6768
+ "epoch": 0.8347096432302714,
6769
+ "eval_loss": 2.549203395843506,
6770
+ "eval_runtime": 66.3732,
6771
+ "eval_samples_per_second": 75.332,
6772
+ "eval_steps_per_second": 18.833,
6773
+ "num_input_tokens_seen": 39845884160,
6774
+ "step": 38000
6775
  }
6776
  ],
6777
  "logging_steps": 50,
6778
  "max_steps": 200000,
6779
+ "num_input_tokens_seen": 39845884160,
6780
  "num_train_epochs": 5,
6781
  "save_steps": 1000,
6782
  "stateful_callbacks": {
 
6791
  "attributes": {}
6792
  }
6793
  },
6794
+ "total_flos": 2.2692523019759124e+19,
6795
  "train_batch_size": 64,
6796
  "trial_name": null,
6797
  "trial_params": null