Azrail committed on
Commit
9d120c7
·
verified ·
1 Parent(s): e673a15

Training in progress, step 56000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5142916764b6385c48d096b2a7f336531a047dd5a1c0cd7b8aa09a2fdd35007
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75a54732bc39e58afccb21a46f57190dd49c2ae00c7fd73b4d8434827934d2aa
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c871f297ec758cbe8e1e4a52c756dfd036112baba8fbed3f20c9699d23ba9b0
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da1643d7c66b6de7210d626427e81524686db0e0650499f03aeaee61e640ca95
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a5eacfa99e53a8a1de73851121ef39f03223e9cc67398ac06a0e84e6dbf4ae3
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f1d7953b9adf97d81c8d5df7c90f2cd3786e196584c751d3c25ee459604bb2b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aaffe7b6e7bde964bb6e6784b39ca6209cca3589a90aff9795b02fa93025464e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56dc1edb3d2e4264095d54347eab2555bc17fb9d10875074bfbbaaa6e5eeeb69
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2623513839035501,
6
  "eval_steps": 500,
7
- "global_step": 55000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9798,11 +9798,189 @@
9798
  "eval_steps_per_second": 23.444,
9799
  "num_input_tokens_seen": 14417915456,
9800
  "step": 55000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9801
  }
9802
  ],
9803
  "logging_steps": 50,
9804
  "max_steps": 70000,
9805
- "num_input_tokens_seen": 14417915456,
9806
  "num_train_epochs": 1,
9807
  "save_steps": 1000,
9808
  "stateful_callbacks": {
@@ -9817,7 +9995,7 @@
9817
  "attributes": {}
9818
  }
9819
  },
9820
- "total_flos": 3.8569331033348506e+18,
9821
  "train_batch_size": 64,
9822
  "trial_name": null,
9823
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.2671214090654328,
6
  "eval_steps": 500,
7
+ "global_step": 56000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9798
  "eval_steps_per_second": 23.444,
9799
  "num_input_tokens_seen": 14417915456,
9800
  "step": 55000
9801
+ },
9802
+ {
9803
+ "epoch": 0.2625898851616442,
9804
+ "grad_norm": 0.22046101093292236,
9805
+ "learning_rate": 0.001,
9806
+ "loss": 2.6077,
9807
+ "num_input_tokens_seen": 14431022656,
9808
+ "step": 55050
9809
+ },
9810
+ {
9811
+ "epoch": 0.2628283864197384,
9812
+ "grad_norm": 0.4682837724685669,
9813
+ "learning_rate": 0.001,
9814
+ "loss": 2.6065,
9815
+ "num_input_tokens_seen": 14444129856,
9816
+ "step": 55100
9817
+ },
9818
+ {
9819
+ "epoch": 0.2630668876778325,
9820
+ "grad_norm": 0.21442484855651855,
9821
+ "learning_rate": 0.001,
9822
+ "loss": 2.6079,
9823
+ "num_input_tokens_seen": 14457237056,
9824
+ "step": 55150
9825
+ },
9826
+ {
9827
+ "epoch": 0.26330538893592664,
9828
+ "grad_norm": 0.2513403296470642,
9829
+ "learning_rate": 0.001,
9830
+ "loss": 2.6037,
9831
+ "num_input_tokens_seen": 14470344256,
9832
+ "step": 55200
9833
+ },
9834
+ {
9835
+ "epoch": 0.26354389019402075,
9836
+ "grad_norm": 0.21526487171649933,
9837
+ "learning_rate": 0.001,
9838
+ "loss": 2.6049,
9839
+ "num_input_tokens_seen": 14483451456,
9840
+ "step": 55250
9841
+ },
9842
+ {
9843
+ "epoch": 0.2637823914521149,
9844
+ "grad_norm": 0.22567112743854523,
9845
+ "learning_rate": 0.001,
9846
+ "loss": 2.5953,
9847
+ "num_input_tokens_seen": 14496558656,
9848
+ "step": 55300
9849
+ },
9850
+ {
9851
+ "epoch": 0.26402089271020907,
9852
+ "grad_norm": 0.20226064324378967,
9853
+ "learning_rate": 0.001,
9854
+ "loss": 2.609,
9855
+ "num_input_tokens_seen": 14509665856,
9856
+ "step": 55350
9857
+ },
9858
+ {
9859
+ "epoch": 0.26425939396830317,
9860
+ "grad_norm": 0.31736019253730774,
9861
+ "learning_rate": 0.001,
9862
+ "loss": 2.6174,
9863
+ "num_input_tokens_seen": 14522773056,
9864
+ "step": 55400
9865
+ },
9866
+ {
9867
+ "epoch": 0.26449789522639733,
9868
+ "grad_norm": 0.2573414146900177,
9869
+ "learning_rate": 0.001,
9870
+ "loss": 2.612,
9871
+ "num_input_tokens_seen": 14535880256,
9872
+ "step": 55450
9873
+ },
9874
+ {
9875
+ "epoch": 0.26473639648449143,
9876
+ "grad_norm": 0.278160959482193,
9877
+ "learning_rate": 0.001,
9878
+ "loss": 2.6713,
9879
+ "num_input_tokens_seen": 14548987456,
9880
+ "step": 55500
9881
+ },
9882
+ {
9883
+ "epoch": 0.26473639648449143,
9884
+ "eval_loss": 2.5104730129241943,
9885
+ "eval_runtime": 54.2403,
9886
+ "eval_samples_per_second": 92.182,
9887
+ "eval_steps_per_second": 23.046,
9888
+ "num_input_tokens_seen": 14548987456,
9889
+ "step": 55500
9890
+ },
9891
+ {
9892
+ "epoch": 0.2649748977425856,
9893
+ "grad_norm": 0.25843819975852966,
9894
+ "learning_rate": 0.001,
9895
+ "loss": 2.6223,
9896
+ "num_input_tokens_seen": 14562094656,
9897
+ "step": 55550
9898
+ },
9899
+ {
9900
+ "epoch": 0.26521339900067975,
9901
+ "grad_norm": 0.42813193798065186,
9902
+ "learning_rate": 0.001,
9903
+ "loss": 2.6114,
9904
+ "num_input_tokens_seen": 14575201856,
9905
+ "step": 55600
9906
+ },
9907
+ {
9908
+ "epoch": 0.26545190025877385,
9909
+ "grad_norm": 0.23324181139469147,
9910
+ "learning_rate": 0.001,
9911
+ "loss": 2.6149,
9912
+ "num_input_tokens_seen": 14588309056,
9913
+ "step": 55650
9914
+ },
9915
+ {
9916
+ "epoch": 0.265690401516868,
9917
+ "grad_norm": 0.2795487940311432,
9918
+ "learning_rate": 0.001,
9919
+ "loss": 2.6067,
9920
+ "num_input_tokens_seen": 14601416256,
9921
+ "step": 55700
9922
+ },
9923
+ {
9924
+ "epoch": 0.2659289027749621,
9925
+ "grad_norm": 0.6856834888458252,
9926
+ "learning_rate": 0.001,
9927
+ "loss": 2.6135,
9928
+ "num_input_tokens_seen": 14614523456,
9929
+ "step": 55750
9930
+ },
9931
+ {
9932
+ "epoch": 0.2661674040330563,
9933
+ "grad_norm": 0.348906934261322,
9934
+ "learning_rate": 0.001,
9935
+ "loss": 2.6384,
9936
+ "num_input_tokens_seen": 14627630656,
9937
+ "step": 55800
9938
+ },
9939
+ {
9940
+ "epoch": 0.26640590529115044,
9941
+ "grad_norm": 0.2510247528553009,
9942
+ "learning_rate": 0.001,
9943
+ "loss": 2.6224,
9944
+ "num_input_tokens_seen": 14640737856,
9945
+ "step": 55850
9946
+ },
9947
+ {
9948
+ "epoch": 0.26664440654924454,
9949
+ "grad_norm": 0.34429189562797546,
9950
+ "learning_rate": 0.001,
9951
+ "loss": 2.6139,
9952
+ "num_input_tokens_seen": 14653845056,
9953
+ "step": 55900
9954
+ },
9955
+ {
9956
+ "epoch": 0.2668829078073387,
9957
+ "grad_norm": 0.25697243213653564,
9958
+ "learning_rate": 0.001,
9959
+ "loss": 2.6143,
9960
+ "num_input_tokens_seen": 14666952256,
9961
+ "step": 55950
9962
+ },
9963
+ {
9964
+ "epoch": 0.2671214090654328,
9965
+ "grad_norm": 0.2812611758708954,
9966
+ "learning_rate": 0.001,
9967
+ "loss": 2.6172,
9968
+ "num_input_tokens_seen": 14680059456,
9969
+ "step": 56000
9970
+ },
9971
+ {
9972
+ "epoch": 0.2671214090654328,
9973
+ "eval_loss": 2.492490291595459,
9974
+ "eval_runtime": 53.3814,
9975
+ "eval_samples_per_second": 93.666,
9976
+ "eval_steps_per_second": 23.416,
9977
+ "num_input_tokens_seen": 14680059456,
9978
+ "step": 56000
9979
  }
9980
  ],
9981
  "logging_steps": 50,
9982
  "max_steps": 70000,
9983
+ "num_input_tokens_seen": 14680059456,
9984
  "num_train_epochs": 1,
9985
  "save_steps": 1000,
9986
  "stateful_callbacks": {
 
9995
  "attributes": {}
9996
  }
9997
  },
9998
+ "total_flos": 3.9270591818602906e+18,
9999
  "train_batch_size": 64,
10000
  "trial_name": null,
10001
  "trial_params": null