Azrail commited on
Commit
5a1efb8
·
verified ·
1 Parent(s): 1549f81

Training in progress, step 67000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a6627f46453f0eddcb5503378a89a14a6529d63c8f3e731e04b523860ef73959
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b6d95b7e811d1f68b64bc7cb8a6aa2be60af9ae27cf26bbdeedecc87fc96939
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:55adb983e10ce2c91d34635b0e2c61b12341302e3599339214fbe162d24db56d
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4764d7e2e901d9dd421188980b44c73e20159a2b530b5e58e042540dbd4ca383
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5506f8ab70fc0520e3fcff77fee663d3576573119296fd847d8ec1a26a45a3cf
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2f0aa502d64898ee3e50486c039d0e2439e7552237090a80d559862b18540a7
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d828325c04baaeca4bef8dd14dbbff2a89fb26da8a22793521965c92d2ced694
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8f163bf0d684bb1f1d6d058d310158a309f623a594242fc874446ccea1105f8
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.44394892569404854,
6
  "eval_steps": 500,
7
- "global_step": 66000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -11756,11 +11756,189 @@
11756
  "eval_steps_per_second": 23.459,
11757
  "num_input_tokens_seen": 17301504000,
11758
  "step": 66000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11759
  }
11760
  ],
11761
  "logging_steps": 50,
11762
  "max_steps": 70000,
11763
- "num_input_tokens_seen": 17301504000,
11764
  "num_train_epochs": 1,
11765
  "save_steps": 1000,
11766
  "stateful_callbacks": {
@@ -11775,7 +11953,7 @@
11775
  "attributes": {}
11776
  }
11777
  },
11778
- "total_flos": 4.62832118267904e+18,
11779
  "train_batch_size": 64,
11780
  "trial_name": null,
11781
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.4506754245682008,
6
  "eval_steps": 500,
7
+ "global_step": 67000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
11756
  "eval_steps_per_second": 23.459,
11757
  "num_input_tokens_seen": 17301504000,
11758
  "step": 66000
11759
+ },
11760
+ {
11761
+ "epoch": 0.4442852506377562,
11762
+ "grad_norm": 0.14435406029224396,
11763
+ "learning_rate": 0.0001429927009743659,
11764
+ "loss": 2.9718,
11765
+ "num_input_tokens_seen": 17314611200,
11766
+ "step": 66050
11767
+ },
11768
+ {
11769
+ "epoch": 0.4446215755814638,
11770
+ "grad_norm": 0.1603071242570877,
11771
+ "learning_rate": 0.0001395732016485406,
11772
+ "loss": 2.9731,
11773
+ "num_input_tokens_seen": 17327718400,
11774
+ "step": 66100
11775
+ },
11776
+ {
11777
+ "epoch": 0.4449579005251714,
11778
+ "grad_norm": 0.14310726523399353,
11779
+ "learning_rate": 0.00013618844100771256,
11780
+ "loss": 2.9665,
11781
+ "num_input_tokens_seen": 17340825600,
11782
+ "step": 66150
11783
+ },
11784
+ {
11785
+ "epoch": 0.44529422546887903,
11786
+ "grad_norm": 0.276594340801239,
11787
+ "learning_rate": 0.00013283874528215734,
11788
+ "loss": 2.9711,
11789
+ "num_input_tokens_seen": 17353932800,
11790
+ "step": 66200
11791
+ },
11792
+ {
11793
+ "epoch": 0.44563055041258665,
11794
+ "grad_norm": 0.1535540074110031,
11795
+ "learning_rate": 0.00012952443732252057,
11796
+ "loss": 2.9693,
11797
+ "num_input_tokens_seen": 17367040000,
11798
+ "step": 66250
11799
+ },
11800
+ {
11801
+ "epoch": 0.44596687535629426,
11802
+ "grad_norm": 0.15807458758354187,
11803
+ "learning_rate": 0.00012624583656870153,
11804
+ "loss": 2.9754,
11805
+ "num_input_tokens_seen": 17380147200,
11806
+ "step": 66300
11807
+ },
11808
+ {
11809
+ "epoch": 0.44630320030000187,
11810
+ "grad_norm": 0.14477893710136414,
11811
+ "learning_rate": 0.00012300325901906528,
11812
+ "loss": 2.9735,
11813
+ "num_input_tokens_seen": 17393254400,
11814
+ "step": 66350
11815
+ },
11816
+ {
11817
+ "epoch": 0.4466395252437095,
11818
+ "grad_norm": 0.14505073428153992,
11819
+ "learning_rate": 0.00011979701719998454,
11820
+ "loss": 2.9783,
11821
+ "num_input_tokens_seen": 17406361600,
11822
+ "step": 66400
11823
+ },
11824
+ {
11825
+ "epoch": 0.4469758501874171,
11826
+ "grad_norm": 0.15850161015987396,
11827
+ "learning_rate": 0.00011662742013571926,
11828
+ "loss": 2.967,
11829
+ "num_input_tokens_seen": 17419468800,
11830
+ "step": 66450
11831
+ },
11832
+ {
11833
+ "epoch": 0.4473121751311247,
11834
+ "grad_norm": 0.14653578400611877,
11835
+ "learning_rate": 0.00011349477331863151,
11836
+ "loss": 2.9651,
11837
+ "num_input_tokens_seen": 17432576000,
11838
+ "step": 66500
11839
+ },
11840
+ {
11841
+ "epoch": 0.4473121751311247,
11842
+ "eval_loss": 2.8710148334503174,
11843
+ "eval_runtime": 53.2889,
11844
+ "eval_samples_per_second": 93.828,
11845
+ "eval_steps_per_second": 23.457,
11846
+ "num_input_tokens_seen": 17432576000,
11847
+ "step": 66500
11848
+ },
11849
+ {
11850
+ "epoch": 0.4476485000748323,
11851
+ "grad_norm": 0.15636616945266724,
11852
+ "learning_rate": 0.00011039937867974164,
11853
+ "loss": 2.9758,
11854
+ "num_input_tokens_seen": 17445683200,
11855
+ "step": 66550
11856
+ },
11857
+ {
11858
+ "epoch": 0.4479848250185399,
11859
+ "grad_norm": 0.14427579939365387,
11860
+ "learning_rate": 0.00010734153455962764,
11861
+ "loss": 2.9594,
11862
+ "num_input_tokens_seen": 17458790400,
11863
+ "step": 66600
11864
+ },
11865
+ {
11866
+ "epoch": 0.44832114996224753,
11867
+ "grad_norm": 0.15148353576660156,
11868
+ "learning_rate": 0.00010432153567966984,
11869
+ "loss": 2.9684,
11870
+ "num_input_tokens_seen": 17471897600,
11871
+ "step": 66650
11872
+ },
11873
+ {
11874
+ "epoch": 0.44865747490595514,
11875
+ "grad_norm": 0.1541094332933426,
11876
+ "learning_rate": 0.0001013396731136465,
11877
+ "loss": 2.9685,
11878
+ "num_input_tokens_seen": 17485004800,
11879
+ "step": 66700
11880
+ },
11881
+ {
11882
+ "epoch": 0.44899379984966276,
11883
+ "grad_norm": 0.14267295598983765,
11884
+ "learning_rate": 9.839623425967759e-05,
11885
+ "loss": 2.9728,
11886
+ "num_input_tokens_seen": 17498112000,
11887
+ "step": 66750
11888
+ },
11889
+ {
11890
+ "epoch": 0.44933012479337037,
11891
+ "grad_norm": 0.1437918245792389,
11892
+ "learning_rate": 9.549150281252633e-05,
11893
+ "loss": 2.9752,
11894
+ "num_input_tokens_seen": 17511219200,
11895
+ "step": 66800
11896
+ },
11897
+ {
11898
+ "epoch": 0.449666449737078,
11899
+ "grad_norm": 0.1517232209444046,
11900
+ "learning_rate": 9.262575873625529e-05,
11901
+ "loss": 2.9729,
11902
+ "num_input_tokens_seen": 17524326400,
11903
+ "step": 66850
11904
+ },
11905
+ {
11906
+ "epoch": 0.4500027746807856,
11907
+ "grad_norm": 0.15286608040332794,
11908
+ "learning_rate": 8.979927823724321e-05,
11909
+ "loss": 2.9687,
11910
+ "num_input_tokens_seen": 17537433600,
11911
+ "step": 66900
11912
+ },
11913
+ {
11914
+ "epoch": 0.4503390996244932,
11915
+ "grad_norm": 0.14875057339668274,
11916
+ "learning_rate": 8.70123337375635e-05,
11917
+ "loss": 2.9758,
11918
+ "num_input_tokens_seen": 17550540800,
11919
+ "step": 66950
11920
+ },
11921
+ {
11922
+ "epoch": 0.4506754245682008,
11923
+ "grad_norm": 0.1493612825870514,
11924
+ "learning_rate": 8.426519384872733e-05,
11925
+ "loss": 2.9704,
11926
+ "num_input_tokens_seen": 17563648000,
11927
+ "step": 67000
11928
+ },
11929
+ {
11930
+ "epoch": 0.4506754245682008,
11931
+ "eval_loss": 2.869231939315796,
11932
+ "eval_runtime": 53.2491,
11933
+ "eval_samples_per_second": 93.898,
11934
+ "eval_steps_per_second": 23.475,
11935
+ "num_input_tokens_seen": 17563648000,
11936
+ "step": 67000
11937
  }
11938
  ],
11939
  "logging_steps": 50,
11940
  "max_steps": 70000,
11941
+ "num_input_tokens_seen": 17563648000,
11942
  "num_train_epochs": 1,
11943
  "save_steps": 1000,
11944
  "stateful_callbacks": {
 
11953
  "attributes": {}
11954
  }
11955
  },
11956
+ "total_flos": 4.69844726120448e+18,
11957
  "train_batch_size": 64,
11958
  "trial_name": null,
11959
  "trial_params": null