Azrail committed on
Commit
e027bbc
·
verified ·
1 Parent(s): 40128a7

Training in progress, step 24000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b09b714fd1dab9349e276a2d1d64f33fa5004406d993d6a429a5d969766a4711
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54c2b1dc0ce252890792fa50a7ced2b1884b184496f8709b1df62b942e4f6173
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c41cb06b8dbe81630708c3aba06b43b83a86488a65bdb7921cf7d53205761b4
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6caaef1143ab01dc77c2601e1c5bde16b77c55e497c5f13366c2442c28ab6fac
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e51d5acf7af721ddb096287cdb18fed327f0e9b8f18a038e8c92ad3d7c982961
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4eb9e5f9b752984653e9c2f4587df901a2cc5f64a95a0121fadf8e7c7c268621
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d95e34f92e07ac3fca3b42c6f704d96b3191058ad871fb9d07f6b2779013efbe
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06f0f3181677433703f6860ec173100c1f71e33282413595313e7174a82f6998
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.5052189945867432,
6
  "eval_steps": 500,
7
- "global_step": 23000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4102,11 +4102,189 @@
4102
  "eval_steps_per_second": 18.892,
4103
  "num_input_tokens_seen": 24117244160,
4104
  "step": 23000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4105
  }
4106
  ],
4107
  "logging_steps": 50,
4108
  "max_steps": 200000,
4109
- "num_input_tokens_seen": 24117244160,
4110
  "num_train_epochs": 5,
4111
  "save_steps": 1000,
4112
  "stateful_callbacks": {
@@ -4121,7 +4299,7 @@
4121
  "attributes": {}
4122
  }
4123
  },
4124
- "total_flos": 1.3734947280285204e+19,
4125
  "train_batch_size": 64,
4126
  "trial_name": null,
4127
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.5271850378296451,
6
  "eval_steps": 500,
7
+ "global_step": 24000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4102
  "eval_steps_per_second": 18.892,
4103
  "num_input_tokens_seen": 24117244160,
4104
  "step": 23000
4105
+ },
4106
+ {
4107
+ "epoch": 0.5063172967488883,
4108
+ "grad_norm": 0.15117652714252472,
4109
+ "learning_rate": 0.001,
4110
+ "loss": 2.696,
4111
+ "num_input_tokens_seen": 24169672960,
4112
+ "step": 23050
4113
+ },
4114
+ {
4115
+ "epoch": 0.5074155989110334,
4116
+ "grad_norm": 0.15605470538139343,
4117
+ "learning_rate": 0.001,
4118
+ "loss": 2.6918,
4119
+ "num_input_tokens_seen": 24222101760,
4120
+ "step": 23100
4121
+ },
4122
+ {
4123
+ "epoch": 0.5085139010731785,
4124
+ "grad_norm": 0.17503651976585388,
4125
+ "learning_rate": 0.001,
4126
+ "loss": 2.688,
4127
+ "num_input_tokens_seen": 24274530560,
4128
+ "step": 23150
4129
+ },
4130
+ {
4131
+ "epoch": 0.5096122032353236,
4132
+ "grad_norm": 0.1622135490179062,
4133
+ "learning_rate": 0.001,
4134
+ "loss": 2.6949,
4135
+ "num_input_tokens_seen": 24326959360,
4136
+ "step": 23200
4137
+ },
4138
+ {
4139
+ "epoch": 0.5107105053974687,
4140
+ "grad_norm": 0.1331271231174469,
4141
+ "learning_rate": 0.001,
4142
+ "loss": 2.6876,
4143
+ "num_input_tokens_seen": 24379388160,
4144
+ "step": 23250
4145
+ },
4146
+ {
4147
+ "epoch": 0.5118088075596138,
4148
+ "grad_norm": 0.14365510642528534,
4149
+ "learning_rate": 0.001,
4150
+ "loss": 2.7027,
4151
+ "num_input_tokens_seen": 24431816960,
4152
+ "step": 23300
4153
+ },
4154
+ {
4155
+ "epoch": 0.5129071097217589,
4156
+ "grad_norm": 0.13621902465820312,
4157
+ "learning_rate": 0.001,
4158
+ "loss": 2.6946,
4159
+ "num_input_tokens_seen": 24484245760,
4160
+ "step": 23350
4161
+ },
4162
+ {
4163
+ "epoch": 0.5140054118839039,
4164
+ "grad_norm": 0.12506547570228577,
4165
+ "learning_rate": 0.001,
4166
+ "loss": 2.6864,
4167
+ "num_input_tokens_seen": 24536674560,
4168
+ "step": 23400
4169
+ },
4170
+ {
4171
+ "epoch": 0.515103714046049,
4172
+ "grad_norm": 0.12824128568172455,
4173
+ "learning_rate": 0.001,
4174
+ "loss": 2.6871,
4175
+ "num_input_tokens_seen": 24589103360,
4176
+ "step": 23450
4177
+ },
4178
+ {
4179
+ "epoch": 0.5162020162081942,
4180
+ "grad_norm": 0.14310036599636078,
4181
+ "learning_rate": 0.001,
4182
+ "loss": 2.6936,
4183
+ "num_input_tokens_seen": 24641532160,
4184
+ "step": 23500
4185
+ },
4186
+ {
4187
+ "epoch": 0.5162020162081942,
4188
+ "eval_loss": 2.592362880706787,
4189
+ "eval_runtime": 66.663,
4190
+ "eval_samples_per_second": 75.004,
4191
+ "eval_steps_per_second": 18.751,
4192
+ "num_input_tokens_seen": 24641532160,
4193
+ "step": 23500
4194
+ },
4195
+ {
4196
+ "epoch": 0.5173003183703393,
4197
+ "grad_norm": 0.1362077295780182,
4198
+ "learning_rate": 0.001,
4199
+ "loss": 2.6924,
4200
+ "num_input_tokens_seen": 24693960960,
4201
+ "step": 23550
4202
+ },
4203
+ {
4204
+ "epoch": 0.5183986205324843,
4205
+ "grad_norm": 0.13662473857402802,
4206
+ "learning_rate": 0.001,
4207
+ "loss": 2.6972,
4208
+ "num_input_tokens_seen": 24746389760,
4209
+ "step": 23600
4210
+ },
4211
+ {
4212
+ "epoch": 0.5194969226946294,
4213
+ "grad_norm": 0.12603560090065002,
4214
+ "learning_rate": 0.001,
4215
+ "loss": 2.6908,
4216
+ "num_input_tokens_seen": 24798818560,
4217
+ "step": 23650
4218
+ },
4219
+ {
4220
+ "epoch": 0.5205952248567746,
4221
+ "grad_norm": 0.16597150266170502,
4222
+ "learning_rate": 0.001,
4223
+ "loss": 2.6882,
4224
+ "num_input_tokens_seen": 24851247360,
4225
+ "step": 23700
4226
+ },
4227
+ {
4228
+ "epoch": 0.5216935270189196,
4229
+ "grad_norm": 0.13665246963500977,
4230
+ "learning_rate": 0.001,
4231
+ "loss": 2.6958,
4232
+ "num_input_tokens_seen": 24903676160,
4233
+ "step": 23750
4234
+ },
4235
+ {
4236
+ "epoch": 0.5227918291810647,
4237
+ "grad_norm": 0.14349523186683655,
4238
+ "learning_rate": 0.001,
4239
+ "loss": 2.6874,
4240
+ "num_input_tokens_seen": 24956104960,
4241
+ "step": 23800
4242
+ },
4243
+ {
4244
+ "epoch": 0.5238901313432098,
4245
+ "grad_norm": 0.15857954323291779,
4246
+ "learning_rate": 0.001,
4247
+ "loss": 2.6882,
4248
+ "num_input_tokens_seen": 25008533760,
4249
+ "step": 23850
4250
+ },
4251
+ {
4252
+ "epoch": 0.524988433505355,
4253
+ "grad_norm": 0.15056300163269043,
4254
+ "learning_rate": 0.001,
4255
+ "loss": 2.694,
4256
+ "num_input_tokens_seen": 25060962560,
4257
+ "step": 23900
4258
+ },
4259
+ {
4260
+ "epoch": 0.5260867356675,
4261
+ "grad_norm": 0.12861080467700958,
4262
+ "learning_rate": 0.001,
4263
+ "loss": 2.6899,
4264
+ "num_input_tokens_seen": 25113391360,
4265
+ "step": 23950
4266
+ },
4267
+ {
4268
+ "epoch": 0.5271850378296451,
4269
+ "grad_norm": 0.14443258941173553,
4270
+ "learning_rate": 0.001,
4271
+ "loss": 2.6929,
4272
+ "num_input_tokens_seen": 25165820160,
4273
+ "step": 24000
4274
+ },
4275
+ {
4276
+ "epoch": 0.5271850378296451,
4277
+ "eval_loss": 2.5910630226135254,
4278
+ "eval_runtime": 66.9014,
4279
+ "eval_samples_per_second": 74.737,
4280
+ "eval_steps_per_second": 18.684,
4281
+ "num_input_tokens_seen": 25165820160,
4282
+ "step": 24000
4283
  }
4284
  ],
4285
  "logging_steps": 50,
4286
  "max_steps": 200000,
4287
+ "num_input_tokens_seen": 25165820160,
4288
  "num_train_epochs": 5,
4289
  "save_steps": 1000,
4290
  "stateful_callbacks": {
 
4299
  "attributes": {}
4300
  }
4301
  },
4302
+ "total_flos": 1.4332118996250132e+19,
4303
  "train_batch_size": 64,
4304
  "trial_name": null,
4305
  "trial_params": null