Azrail committed on
Commit
0b379ed
·
verified ·
1 Parent(s): fc76640

Training in progress, step 28000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5b523d0237ff4825791520de6c6899e7d737f3dbfe8441d833895f1e2466285
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:598ec2e422397aad641d528881e643db9612147d6333b5c66a69998965ce9656
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:149a5f50fc47d3d0a29e92a6c18a1d78db3365d41cfd7f18ae74185f9b0fbc4e
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:593d98fe7868eb09c1f3193111558b84e18bf0affb6cefd648708d4d1cba6ae6
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d1d738e0f013e71559a982b5bed46734a8c7b8ac496ca76379bed24380a52a1
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2954868f3cacffad4686728c7094ccb6fc0d9e0b5adf1b06d98602d6248bf938
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5585e9833c9684d1dabff9cec651205ae9bf4f81ab2bb2b589702ce44919fbb3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06d5fef101c9d39d51795e2426ebd97ece14c40eab5611cbd021ffd2d11b16ce
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.5930831675583508,
6
  "eval_steps": 500,
7
- "global_step": 27000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4814,11 +4814,189 @@
4814
  "eval_steps_per_second": 18.877,
4815
  "num_input_tokens_seen": 28311548160,
4816
  "step": 27000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4817
  }
4818
  ],
4819
  "logging_steps": 50,
4820
  "max_steps": 200000,
4821
- "num_input_tokens_seen": 28311548160,
4822
  "num_train_epochs": 5,
4823
  "save_steps": 1000,
4824
  "stateful_callbacks": {
@@ -4833,7 +5011,7 @@
4833
  "attributes": {}
4834
  }
4835
  },
4836
- "total_flos": 1.6123634144144916e+19,
4837
  "train_batch_size": 64,
4838
  "trial_name": null,
4839
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.6150492108012526,
6
  "eval_steps": 500,
7
+ "global_step": 28000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4814
  "eval_steps_per_second": 18.877,
4815
  "num_input_tokens_seen": 28311548160,
4816
  "step": 27000
4817
+ },
4818
+ {
4819
+ "epoch": 0.5941814697204958,
4820
+ "grad_norm": 0.15350718796253204,
4821
+ "learning_rate": 0.001,
4822
+ "loss": 2.6787,
4823
+ "num_input_tokens_seen": 28363976960,
4824
+ "step": 27050
4825
+ },
4826
+ {
4827
+ "epoch": 0.5952797718826409,
4828
+ "grad_norm": 0.1393333077430725,
4829
+ "learning_rate": 0.001,
4830
+ "loss": 2.6759,
4831
+ "num_input_tokens_seen": 28416405760,
4832
+ "step": 27100
4833
+ },
4834
+ {
4835
+ "epoch": 0.596378074044786,
4836
+ "grad_norm": 0.1485709846019745,
4837
+ "learning_rate": 0.001,
4838
+ "loss": 2.6772,
4839
+ "num_input_tokens_seen": 28468834560,
4840
+ "step": 27150
4841
+ },
4842
+ {
4843
+ "epoch": 0.5974763762069311,
4844
+ "grad_norm": 0.13909003138542175,
4845
+ "learning_rate": 0.001,
4846
+ "loss": 2.6729,
4847
+ "num_input_tokens_seen": 28521263360,
4848
+ "step": 27200
4849
+ },
4850
+ {
4851
+ "epoch": 0.5985746783690762,
4852
+ "grad_norm": 0.15117496252059937,
4853
+ "learning_rate": 0.001,
4854
+ "loss": 2.6704,
4855
+ "num_input_tokens_seen": 28573692160,
4856
+ "step": 27250
4857
+ },
4858
+ {
4859
+ "epoch": 0.5996729805312213,
4860
+ "grad_norm": 0.14054876565933228,
4861
+ "learning_rate": 0.001,
4862
+ "loss": 2.6748,
4863
+ "num_input_tokens_seen": 28626120960,
4864
+ "step": 27300
4865
+ },
4866
+ {
4867
+ "epoch": 0.6007712826933664,
4868
+ "grad_norm": 0.15437620878219604,
4869
+ "learning_rate": 0.001,
4870
+ "loss": 2.6778,
4871
+ "num_input_tokens_seen": 28678549760,
4872
+ "step": 27350
4873
+ },
4874
+ {
4875
+ "epoch": 0.6018695848555115,
4876
+ "grad_norm": 0.15858007967472076,
4877
+ "learning_rate": 0.001,
4878
+ "loss": 2.6763,
4879
+ "num_input_tokens_seen": 28730978560,
4880
+ "step": 27400
4881
+ },
4882
+ {
4883
+ "epoch": 0.6029678870176566,
4884
+ "grad_norm": 0.14459487795829773,
4885
+ "learning_rate": 0.001,
4886
+ "loss": 2.6726,
4887
+ "num_input_tokens_seen": 28783407360,
4888
+ "step": 27450
4889
+ },
4890
+ {
4891
+ "epoch": 0.6040661891798017,
4892
+ "grad_norm": 0.17691345512866974,
4893
+ "learning_rate": 0.001,
4894
+ "loss": 2.678,
4895
+ "num_input_tokens_seen": 28835836160,
4896
+ "step": 27500
4897
+ },
4898
+ {
4899
+ "epoch": 0.6040661891798017,
4900
+ "eval_loss": 2.576051950454712,
4901
+ "eval_runtime": 66.9387,
4902
+ "eval_samples_per_second": 74.695,
4903
+ "eval_steps_per_second": 18.674,
4904
+ "num_input_tokens_seen": 28835836160,
4905
+ "step": 27500
4906
+ },
4907
+ {
4908
+ "epoch": 0.6051644913419467,
4909
+ "grad_norm": 0.16200922429561615,
4910
+ "learning_rate": 0.001,
4911
+ "loss": 2.6763,
4912
+ "num_input_tokens_seen": 28888264960,
4913
+ "step": 27550
4914
+ },
4915
+ {
4916
+ "epoch": 0.6062627935040918,
4917
+ "grad_norm": 0.14567038416862488,
4918
+ "learning_rate": 0.001,
4919
+ "loss": 2.6795,
4920
+ "num_input_tokens_seen": 28940693760,
4921
+ "step": 27600
4922
+ },
4923
+ {
4924
+ "epoch": 0.607361095666237,
4925
+ "grad_norm": 0.16075611114501953,
4926
+ "learning_rate": 0.001,
4927
+ "loss": 2.6746,
4928
+ "num_input_tokens_seen": 28993122560,
4929
+ "step": 27650
4930
+ },
4931
+ {
4932
+ "epoch": 0.6084593978283821,
4933
+ "grad_norm": 0.1386987417936325,
4934
+ "learning_rate": 0.001,
4935
+ "loss": 2.6771,
4936
+ "num_input_tokens_seen": 29045551360,
4937
+ "step": 27700
4938
+ },
4939
+ {
4940
+ "epoch": 0.6095576999905271,
4941
+ "grad_norm": 0.14672614634037018,
4942
+ "learning_rate": 0.001,
4943
+ "loss": 2.6792,
4944
+ "num_input_tokens_seen": 29097980160,
4945
+ "step": 27750
4946
+ },
4947
+ {
4948
+ "epoch": 0.6106560021526722,
4949
+ "grad_norm": 0.22614523768424988,
4950
+ "learning_rate": 0.001,
4951
+ "loss": 2.6728,
4952
+ "num_input_tokens_seen": 29150408960,
4953
+ "step": 27800
4954
+ },
4955
+ {
4956
+ "epoch": 0.6117543043148174,
4957
+ "grad_norm": 0.15554341673851013,
4958
+ "learning_rate": 0.001,
4959
+ "loss": 2.676,
4960
+ "num_input_tokens_seen": 29202837760,
4961
+ "step": 27850
4962
+ },
4963
+ {
4964
+ "epoch": 0.6128526064769624,
4965
+ "grad_norm": 0.17181837558746338,
4966
+ "learning_rate": 0.001,
4967
+ "loss": 2.6811,
4968
+ "num_input_tokens_seen": 29255266560,
4969
+ "step": 27900
4970
+ },
4971
+ {
4972
+ "epoch": 0.6139509086391075,
4973
+ "grad_norm": 0.15763437747955322,
4974
+ "learning_rate": 0.001,
4975
+ "loss": 2.6797,
4976
+ "num_input_tokens_seen": 29307695360,
4977
+ "step": 27950
4978
+ },
4979
+ {
4980
+ "epoch": 0.6150492108012526,
4981
+ "grad_norm": 0.14721135795116425,
4982
+ "learning_rate": 0.001,
4983
+ "loss": 2.6762,
4984
+ "num_input_tokens_seen": 29360124160,
4985
+ "step": 28000
4986
+ },
4987
+ {
4988
+ "epoch": 0.6150492108012526,
4989
+ "eval_loss": 2.5763511657714844,
4990
+ "eval_runtime": 66.3236,
4991
+ "eval_samples_per_second": 75.388,
4992
+ "eval_steps_per_second": 18.847,
4993
+ "num_input_tokens_seen": 29360124160,
4994
+ "step": 28000
4995
  }
4996
  ],
4997
  "logging_steps": 50,
4998
  "max_steps": 200000,
4999
+ "num_input_tokens_seen": 29360124160,
5000
  "num_train_epochs": 5,
5001
  "save_steps": 1000,
5002
  "stateful_callbacks": {
 
5011
  "attributes": {}
5012
  }
5013
  },
5014
+ "total_flos": 1.6720805860109844e+19,
5015
  "train_batch_size": 64,
5016
  "trial_name": null,
5017
  "trial_params": null