Azrail commited on
Commit
2431107
·
verified ·
1 Parent(s): 12ee7bf

Training in progress, step 123000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81b6085fb8cdb1171b74b00e5808748cf92ce0ddf8ba548a106b9e635e652ce5
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cef5b67a6a8ef1b7b03d42987cf14119de3a2a743fc8652bcc28538e2c6f502f
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2584540bb683d62bf86744736defc5b1b50bc3492f528f85e121c6574fb37a99
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03925e5e99d9cbfffe2f6300cf8385c7fca65c8ed5a96f6e0b64b1da83665e80
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3efcdbc541e421955fc1801cd719c72805694f44c64389ef735698f77e94dcbf
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:057702d02e4981608a0b19960ab61ff20cc438831297a4986309cdb565b1c450
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd02e3ed8ffd9c6d891f91758bb97fdbe6142d1b35a6390b66d152313f44683b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e98c7489b04ae19323aa5fe9264a9e2511b478d8f623351ee3b05babc6a227f
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.1638885245119668,
6
  "eval_steps": 500,
7
- "global_step": 122000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -21724,11 +21724,189 @@
21724
  "eval_steps_per_second": 15.195,
21725
  "num_input_tokens_seen": 63952872768,
21726
  "step": 122000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21727
  }
21728
  ],
21729
  "logging_steps": 50,
21730
  "max_steps": 140000,
21731
- "num_input_tokens_seen": 63952872768,
21732
  "num_train_epochs": 2,
21733
  "save_steps": 1000,
21734
  "stateful_callbacks": {
@@ -21743,7 +21921,7 @@
21743
  "attributes": {}
21744
  }
21745
  },
21746
- "total_flos": 1.1318491979536712e+20,
21747
  "train_batch_size": 32,
21748
  "trial_name": null,
21749
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.1734285748357323,
6
  "eval_steps": 500,
7
+ "global_step": 123000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
21724
  "eval_steps_per_second": 15.195,
21725
  "num_input_tokens_seen": 63952872768,
21726
  "step": 122000
21727
+ },
21728
+ {
21729
+ "epoch": 1.164365527028155,
21730
+ "grad_norm": 0.13863904774188995,
21731
+ "learning_rate": 0.0007144112572668733,
21732
+ "loss": 2.0703,
21733
+ "num_input_tokens_seen": 63979084224,
21734
+ "step": 122050
21735
+ },
21736
+ {
21737
+ "epoch": 1.1648425295443434,
21738
+ "grad_norm": 0.1426379680633545,
21739
+ "learning_rate": 0.0007118738970516943,
21740
+ "loss": 2.0766,
21741
+ "num_input_tokens_seen": 64005286944,
21742
+ "step": 122100
21743
+ },
21744
+ {
21745
+ "epoch": 1.1653195320605316,
21746
+ "grad_norm": 0.13977181911468506,
21747
+ "learning_rate": 0.0007093298687687141,
21748
+ "loss": 2.0692,
21749
+ "num_input_tokens_seen": 64031487744,
21750
+ "step": 122150
21751
+ },
21752
+ {
21753
+ "epoch": 1.1657965345767198,
21754
+ "grad_norm": 0.1425238400697708,
21755
+ "learning_rate": 0.0007067792524832604,
21756
+ "loss": 2.0662,
21757
+ "num_input_tokens_seen": 64057695552,
21758
+ "step": 122200
21759
+ },
21760
+ {
21761
+ "epoch": 1.1662735370929083,
21762
+ "grad_norm": 0.15061677992343903,
21763
+ "learning_rate": 0.0007042221284679982,
21764
+ "loss": 2.0781,
21765
+ "num_input_tokens_seen": 64083893664,
21766
+ "step": 122250
21767
+ },
21768
+ {
21769
+ "epoch": 1.1667505396090965,
21770
+ "grad_norm": 0.12374892085790634,
21771
+ "learning_rate": 0.0007016585772004026,
21772
+ "loss": 2.0745,
21773
+ "num_input_tokens_seen": 64110107392,
21774
+ "step": 122300
21775
+ },
21776
+ {
21777
+ "epoch": 1.1672275421252847,
21778
+ "grad_norm": 0.1427278071641922,
21779
+ "learning_rate": 0.0006990886793602267,
21780
+ "loss": 2.0861,
21781
+ "num_input_tokens_seen": 64136321792,
21782
+ "step": 122350
21783
+ },
21784
+ {
21785
+ "epoch": 1.1677045446414729,
21786
+ "grad_norm": 0.15141050517559052,
21787
+ "learning_rate": 0.0006965125158269618,
21788
+ "loss": 2.0767,
21789
+ "num_input_tokens_seen": 64162534656,
21790
+ "step": 122400
21791
+ },
21792
+ {
21793
+ "epoch": 1.1681815471576613,
21794
+ "grad_norm": 0.13262976706027985,
21795
+ "learning_rate": 0.0006939301676772927,
21796
+ "loss": 2.0662,
21797
+ "num_input_tokens_seen": 64188740064,
21798
+ "step": 122450
21799
+ },
21800
+ {
21801
+ "epoch": 1.1686585496738495,
21802
+ "grad_norm": 0.13390204310417175,
21803
+ "learning_rate": 0.000691341716182545,
21804
+ "loss": 2.0684,
21805
+ "num_input_tokens_seen": 64214942816,
21806
+ "step": 122500
21807
+ },
21808
+ {
21809
+ "epoch": 1.1686585496738495,
21810
+ "eval_loss": 1.9892343282699585,
21811
+ "eval_runtime": 81.7351,
21812
+ "eval_samples_per_second": 61.173,
21813
+ "eval_steps_per_second": 15.293,
21814
+ "num_input_tokens_seen": 64214942816,
21815
+ "step": 122500
21816
+ },
21817
+ {
21818
+ "epoch": 1.1691355521900377,
21819
+ "grad_norm": 0.14351387321949005,
21820
+ "learning_rate": 0.0006887472428061285,
21821
+ "loss": 2.0611,
21822
+ "num_input_tokens_seen": 64241151872,
21823
+ "step": 122550
21824
+ },
21825
+ {
21826
+ "epoch": 1.1696125547062262,
21827
+ "grad_norm": 0.1321556568145752,
21828
+ "learning_rate": 0.0006861468292009726,
21829
+ "loss": 2.0726,
21830
+ "num_input_tokens_seen": 64267354176,
21831
+ "step": 122600
21832
+ },
21833
+ {
21834
+ "epoch": 1.1700895572224144,
21835
+ "grad_norm": 0.12825502455234528,
21836
+ "learning_rate": 0.0006835405572069572,
21837
+ "loss": 2.0703,
21838
+ "num_input_tokens_seen": 64293568544,
21839
+ "step": 122650
21840
+ },
21841
+ {
21842
+ "epoch": 1.1705665597386026,
21843
+ "grad_norm": 0.1376345157623291,
21844
+ "learning_rate": 0.0006809285088483361,
21845
+ "loss": 2.0789,
21846
+ "num_input_tokens_seen": 64319782944,
21847
+ "step": 122700
21848
+ },
21849
+ {
21850
+ "epoch": 1.1710435622547908,
21851
+ "grad_norm": 0.14178837835788727,
21852
+ "learning_rate": 0.0006783107663311565,
21853
+ "loss": 2.0755,
21854
+ "num_input_tokens_seen": 64345996064,
21855
+ "step": 122750
21856
+ },
21857
+ {
21858
+ "epoch": 1.1715205647709792,
21859
+ "grad_norm": 0.1475340873003006,
21860
+ "learning_rate": 0.0006756874120406714,
21861
+ "loss": 2.0668,
21862
+ "num_input_tokens_seen": 64372202944,
21863
+ "step": 122800
21864
+ },
21865
+ {
21866
+ "epoch": 1.1719975672871674,
21867
+ "grad_norm": 0.13012921810150146,
21868
+ "learning_rate": 0.0006730585285387465,
21869
+ "loss": 2.0618,
21870
+ "num_input_tokens_seen": 64398414944,
21871
+ "step": 122850
21872
+ },
21873
+ {
21874
+ "epoch": 1.1724745698033556,
21875
+ "grad_norm": 0.13203522562980652,
21876
+ "learning_rate": 0.0006704241985612625,
21877
+ "loss": 2.0712,
21878
+ "num_input_tokens_seen": 64424627264,
21879
+ "step": 122900
21880
+ },
21881
+ {
21882
+ "epoch": 1.172951572319544,
21883
+ "grad_norm": 0.13648848235607147,
21884
+ "learning_rate": 0.0006677845050155106,
21885
+ "loss": 2.0694,
21886
+ "num_input_tokens_seen": 64450839392,
21887
+ "step": 122950
21888
+ },
21889
+ {
21890
+ "epoch": 1.1734285748357323,
21891
+ "grad_norm": 0.1383182257413864,
21892
+ "learning_rate": 0.0006651395309775837,
21893
+ "loss": 2.0564,
21894
+ "num_input_tokens_seen": 64477051392,
21895
+ "step": 123000
21896
+ },
21897
+ {
21898
+ "epoch": 1.1734285748357323,
21899
+ "eval_loss": 1.9881237745285034,
21900
+ "eval_runtime": 82.9953,
21901
+ "eval_samples_per_second": 60.244,
21902
+ "eval_steps_per_second": 15.061,
21903
+ "num_input_tokens_seen": 64477051392,
21904
+ "step": 123000
21905
  }
21906
  ],
21907
  "logging_steps": 50,
21908
  "max_steps": 140000,
21909
+ "num_input_tokens_seen": 64477051392,
21910
  "num_train_epochs": 2,
21911
  "save_steps": 1000,
21912
  "stateful_callbacks": {
 
21921
  "attributes": {}
21922
  }
21923
  },
21924
+ "total_flos": 1.141126203496661e+20,
21925
  "train_batch_size": 32,
21926
  "trial_name": null,
21927
  "trial_params": null