Azrail commited on
Commit
467845f
·
verified ·
1 Parent(s): 1400fd6

Training in progress, step 29000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:598ec2e422397aad641d528881e643db9612147d6333b5c66a69998965ce9656
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:496d2b46e7c0c2d415917c3f430a70a0aac599fe885f35c60cc3199532b41d7a
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:593d98fe7868eb09c1f3193111558b84e18bf0affb6cefd648708d4d1cba6ae6
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5a0ef03792604564acfd0823f03cfd37314bbc8a8eb68b05d8de1d1cfee687a
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2954868f3cacffad4686728c7094ccb6fc0d9e0b5adf1b06d98602d6248bf938
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:782fee7d7309ad00bf19a629f420a995596231f63b5af04a7f7244e077883f2d
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:06d5fef101c9d39d51795e2426ebd97ece14c40eab5611cbd021ffd2d11b16ce
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6038c3966e5acd5e329cd1d75f036dea625d34bb913a8f0d05452e8d1784e0ba
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.6150492108012526,
6
  "eval_steps": 500,
7
- "global_step": 28000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4992,11 +4992,189 @@
4992
  "eval_steps_per_second": 18.847,
4993
  "num_input_tokens_seen": 29360124160,
4994
  "step": 28000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4995
  }
4996
  ],
4997
  "logging_steps": 50,
4998
  "max_steps": 200000,
4999
- "num_input_tokens_seen": 29360124160,
5000
  "num_train_epochs": 5,
5001
  "save_steps": 1000,
5002
  "stateful_callbacks": {
@@ -5011,7 +5189,7 @@
5011
  "attributes": {}
5012
  }
5013
  },
5014
- "total_flos": 1.6720805860109844e+19,
5015
  "train_batch_size": 64,
5016
  "trial_name": null,
5017
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.6370152540441545,
6
  "eval_steps": 500,
7
+ "global_step": 29000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4992
  "eval_steps_per_second": 18.847,
4993
  "num_input_tokens_seen": 29360124160,
4994
  "step": 28000
4995
+ },
4996
+ {
4997
+ "epoch": 0.6161475129633978,
4998
+ "grad_norm": 0.13857993483543396,
4999
+ "learning_rate": 0.001,
5000
+ "loss": 2.677,
5001
+ "num_input_tokens_seen": 29412552960,
5002
+ "step": 28050
5003
+ },
5004
+ {
5005
+ "epoch": 0.6172458151255428,
5006
+ "grad_norm": 0.14276473224163055,
5007
+ "learning_rate": 0.001,
5008
+ "loss": 2.6669,
5009
+ "num_input_tokens_seen": 29464981760,
5010
+ "step": 28100
5011
+ },
5012
+ {
5013
+ "epoch": 0.6183441172876879,
5014
+ "grad_norm": 0.1536131203174591,
5015
+ "learning_rate": 0.001,
5016
+ "loss": 2.6757,
5017
+ "num_input_tokens_seen": 29517410560,
5018
+ "step": 28150
5019
+ },
5020
+ {
5021
+ "epoch": 0.619442419449833,
5022
+ "grad_norm": 0.15733414888381958,
5023
+ "learning_rate": 0.001,
5024
+ "loss": 2.6735,
5025
+ "num_input_tokens_seen": 29569839360,
5026
+ "step": 28200
5027
+ },
5028
+ {
5029
+ "epoch": 0.620540721611978,
5030
+ "grad_norm": 0.14553523063659668,
5031
+ "learning_rate": 0.001,
5032
+ "loss": 2.6683,
5033
+ "num_input_tokens_seen": 29622268160,
5034
+ "step": 28250
5035
+ },
5036
+ {
5037
+ "epoch": 0.6216390237741232,
5038
+ "grad_norm": 0.15685459971427917,
5039
+ "learning_rate": 0.001,
5040
+ "loss": 2.6692,
5041
+ "num_input_tokens_seen": 29674696960,
5042
+ "step": 28300
5043
+ },
5044
+ {
5045
+ "epoch": 0.6227373259362683,
5046
+ "grad_norm": 0.16553767025470734,
5047
+ "learning_rate": 0.001,
5048
+ "loss": 2.6778,
5049
+ "num_input_tokens_seen": 29727125760,
5050
+ "step": 28350
5051
+ },
5052
+ {
5053
+ "epoch": 0.6238356280984134,
5054
+ "grad_norm": 0.1619853973388672,
5055
+ "learning_rate": 0.001,
5056
+ "loss": 2.6807,
5057
+ "num_input_tokens_seen": 29779554560,
5058
+ "step": 28400
5059
+ },
5060
+ {
5061
+ "epoch": 0.6249339302605584,
5062
+ "grad_norm": 0.12794817984104156,
5063
+ "learning_rate": 0.001,
5064
+ "loss": 2.6776,
5065
+ "num_input_tokens_seen": 29831983360,
5066
+ "step": 28450
5067
+ },
5068
+ {
5069
+ "epoch": 0.6260322324227036,
5070
+ "grad_norm": 0.17001128196716309,
5071
+ "learning_rate": 0.001,
5072
+ "loss": 2.6797,
5073
+ "num_input_tokens_seen": 29884412160,
5074
+ "step": 28500
5075
+ },
5076
+ {
5077
+ "epoch": 0.6260322324227036,
5078
+ "eval_loss": 2.5728061199188232,
5079
+ "eval_runtime": 66.7752,
5080
+ "eval_samples_per_second": 74.878,
5081
+ "eval_steps_per_second": 18.72,
5082
+ "num_input_tokens_seen": 29884412160,
5083
+ "step": 28500
5084
+ },
5085
+ {
5086
+ "epoch": 0.6271305345848487,
5087
+ "grad_norm": 0.12936875224113464,
5088
+ "learning_rate": 0.001,
5089
+ "loss": 2.6677,
5090
+ "num_input_tokens_seen": 29936840960,
5091
+ "step": 28550
5092
+ },
5093
+ {
5094
+ "epoch": 0.6282288367469937,
5095
+ "grad_norm": 0.14839358627796173,
5096
+ "learning_rate": 0.001,
5097
+ "loss": 2.6681,
5098
+ "num_input_tokens_seen": 29989269760,
5099
+ "step": 28600
5100
+ },
5101
+ {
5102
+ "epoch": 0.6293271389091388,
5103
+ "grad_norm": 0.1526126265525818,
5104
+ "learning_rate": 0.001,
5105
+ "loss": 2.6711,
5106
+ "num_input_tokens_seen": 30041698560,
5107
+ "step": 28650
5108
+ },
5109
+ {
5110
+ "epoch": 0.630425441071284,
5111
+ "grad_norm": 11.806962013244629,
5112
+ "learning_rate": 0.001,
5113
+ "loss": 2.7543,
5114
+ "num_input_tokens_seen": 30094127360,
5115
+ "step": 28700
5116
+ },
5117
+ {
5118
+ "epoch": 0.631523743233429,
5119
+ "grad_norm": 0.13446328043937683,
5120
+ "learning_rate": 0.001,
5121
+ "loss": 2.9466,
5122
+ "num_input_tokens_seen": 30146556160,
5123
+ "step": 28750
5124
+ },
5125
+ {
5126
+ "epoch": 0.6326220453955741,
5127
+ "grad_norm": 0.1319582760334015,
5128
+ "learning_rate": 0.001,
5129
+ "loss": 2.7002,
5130
+ "num_input_tokens_seen": 30198984960,
5131
+ "step": 28800
5132
+ },
5133
+ {
5134
+ "epoch": 0.6337203475577192,
5135
+ "grad_norm": 0.13955356180667877,
5136
+ "learning_rate": 0.001,
5137
+ "loss": 2.6814,
5138
+ "num_input_tokens_seen": 30251413760,
5139
+ "step": 28850
5140
+ },
5141
+ {
5142
+ "epoch": 0.6348186497198643,
5143
+ "grad_norm": 0.1295064240694046,
5144
+ "learning_rate": 0.001,
5145
+ "loss": 2.676,
5146
+ "num_input_tokens_seen": 30303842560,
5147
+ "step": 28900
5148
+ },
5149
+ {
5150
+ "epoch": 0.6359169518820094,
5151
+ "grad_norm": 0.1440495401620865,
5152
+ "learning_rate": 0.001,
5153
+ "loss": 2.6778,
5154
+ "num_input_tokens_seen": 30356271360,
5155
+ "step": 28950
5156
+ },
5157
+ {
5158
+ "epoch": 0.6370152540441545,
5159
+ "grad_norm": 0.13806115090847015,
5160
+ "learning_rate": 0.001,
5161
+ "loss": 2.6712,
5162
+ "num_input_tokens_seen": 30408700160,
5163
+ "step": 29000
5164
+ },
5165
+ {
5166
+ "epoch": 0.6370152540441545,
5167
+ "eval_loss": 2.576237440109253,
5168
+ "eval_runtime": 66.9761,
5169
+ "eval_samples_per_second": 74.653,
5170
+ "eval_steps_per_second": 18.663,
5171
+ "num_input_tokens_seen": 30408700160,
5172
+ "step": 29000
5173
  }
5174
  ],
5175
  "logging_steps": 50,
5176
  "max_steps": 200000,
5177
+ "num_input_tokens_seen": 30408700160,
5178
  "num_train_epochs": 5,
5179
  "save_steps": 1000,
5180
  "stateful_callbacks": {
 
5189
  "attributes": {}
5190
  }
5191
  },
5192
+ "total_flos": 1.7317977576074772e+19,
5193
  "train_batch_size": 64,
5194
  "trial_name": null,
5195
  "trial_params": null