Training in progress, step 30000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 563074920
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc13ffa23a1f5210f44d10669aa87f3ec7bfb7a2664786f76ce56132b042639e
|
| 3 |
size 563074920
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1125916346
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c2acc68a1693942d243837338503be83794d69c0b95c32e490c2e11f4c4406e
|
| 3 |
size 1125916346
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a5f9d2ea250bcd3507c62c8571a114db63d14fdd2d31f9df1da7534fe6e55434
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:492390519daa872425f50793597ce5e74ef972fc3d656ffa5ca614e3b949a837
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -5170,11 +5170,189 @@
|
|
| 5170 |
"eval_steps_per_second": 8.742,
|
| 5171 |
"num_input_tokens_seen": 7602176000,
|
| 5172 |
"step": 29000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5173 |
}
|
| 5174 |
],
|
| 5175 |
"logging_steps": 50,
|
| 5176 |
"max_steps": 30000,
|
| 5177 |
-
"num_input_tokens_seen":
|
| 5178 |
"num_train_epochs": 1,
|
| 5179 |
"save_steps": 1000,
|
| 5180 |
"stateful_callbacks": {
|
|
@@ -5184,12 +5362,12 @@
|
|
| 5184 |
"should_evaluate": false,
|
| 5185 |
"should_log": false,
|
| 5186 |
"should_save": true,
|
| 5187 |
-
"should_training_stop":
|
| 5188 |
},
|
| 5189 |
"attributes": {}
|
| 5190 |
}
|
| 5191 |
},
|
| 5192 |
-
"total_flos":
|
| 5193 |
"train_batch_size": 64,
|
| 5194 |
"trial_name": null,
|
| 5195 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.20179496622456752,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 30000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 5170 |
"eval_steps_per_second": 8.742,
|
| 5171 |
"num_input_tokens_seen": 7602176000,
|
| 5172 |
"step": 29000
|
| 5173 |
+
},
|
| 5174 |
+
{
|
| 5175 |
+
"epoch": 0.1954047922941229,
|
| 5176 |
+
"grad_norm": 0.6069262027740479,
|
| 5177 |
+
"learning_rate": 1.5444383180638342e-06,
|
| 5178 |
+
"loss": 11.9314,
|
| 5179 |
+
"num_input_tokens_seen": 7615283200,
|
| 5180 |
+
"step": 29050
|
| 5181 |
+
},
|
| 5182 |
+
{
|
| 5183 |
+
"epoch": 0.1957411172378305,
|
| 5184 |
+
"grad_norm": 0.628108561038971,
|
| 5185 |
+
"learning_rate": 1.3862661152405309e-06,
|
| 5186 |
+
"loss": 11.9151,
|
| 5187 |
+
"num_input_tokens_seen": 7628390400,
|
| 5188 |
+
"step": 29100
|
| 5189 |
+
},
|
| 5190 |
+
{
|
| 5191 |
+
"epoch": 0.1960774421815381,
|
| 5192 |
+
"grad_norm": 0.6232333779335022,
|
| 5193 |
+
"learning_rate": 1.236618116485233e-06,
|
| 5194 |
+
"loss": 11.8887,
|
| 5195 |
+
"num_input_tokens_seen": 7641497600,
|
| 5196 |
+
"step": 29150
|
| 5197 |
+
},
|
| 5198 |
+
{
|
| 5199 |
+
"epoch": 0.19641376712524572,
|
| 5200 |
+
"grad_norm": 0.6372972726821899,
|
| 5201 |
+
"learning_rate": 1.0954985936379223e-06,
|
| 5202 |
+
"loss": 11.8873,
|
| 5203 |
+
"num_input_tokens_seen": 7654604800,
|
| 5204 |
+
"step": 29200
|
| 5205 |
+
},
|
| 5206 |
+
{
|
| 5207 |
+
"epoch": 0.19675009206895333,
|
| 5208 |
+
"grad_norm": 0.5991822481155396,
|
| 5209 |
+
"learning_rate": 9.6291157508529e-07,
|
| 5210 |
+
"loss": 11.9405,
|
| 5211 |
+
"num_input_tokens_seen": 7667712000,
|
| 5212 |
+
"step": 29250
|
| 5213 |
+
},
|
| 5214 |
+
{
|
| 5215 |
+
"epoch": 0.19708641701266094,
|
| 5216 |
+
"grad_norm": 0.6108511686325073,
|
| 5217 |
+
"learning_rate": 8.388608456459612e-07,
|
| 5218 |
+
"loss": 11.9085,
|
| 5219 |
+
"num_input_tokens_seen": 7680819200,
|
| 5220 |
+
"step": 29300
|
| 5221 |
+
},
|
| 5222 |
+
{
|
| 5223 |
+
"epoch": 0.19742274195636855,
|
| 5224 |
+
"grad_norm": 0.6104913949966431,
|
| 5225 |
+
"learning_rate": 7.23349946462215e-07,
|
| 5226 |
+
"loss": 11.8859,
|
| 5227 |
+
"num_input_tokens_seen": 7693926400,
|
| 5228 |
+
"step": 29350
|
| 5229 |
+
},
|
| 5230 |
+
{
|
| 5231 |
+
"epoch": 0.1977590669000762,
|
| 5232 |
+
"grad_norm": 0.6084222197532654,
|
| 5233 |
+
"learning_rate": 6.163821748990994e-07,
|
| 5234 |
+
"loss": 11.9059,
|
| 5235 |
+
"num_input_tokens_seen": 7707033600,
|
| 5236 |
+
"step": 29400
|
| 5237 |
+
},
|
| 5238 |
+
{
|
| 5239 |
+
"epoch": 0.1980953918437838,
|
| 5240 |
+
"grad_norm": 0.633105993270874,
|
| 5241 |
+
"learning_rate": 5.179605844501388e-07,
|
| 5242 |
+
"loss": 11.9174,
|
| 5243 |
+
"num_input_tokens_seen": 7720140800,
|
| 5244 |
+
"step": 29450
|
| 5245 |
+
},
|
| 5246 |
+
{
|
| 5247 |
+
"epoch": 0.1984317167874914,
|
| 5248 |
+
"grad_norm": 0.6088514924049377,
|
| 5249 |
+
"learning_rate": 4.280879846503049e-07,
|
| 5250 |
+
"loss": 11.9125,
|
| 5251 |
+
"num_input_tokens_seen": 7733248000,
|
| 5252 |
+
"step": 29500
|
| 5253 |
+
},
|
| 5254 |
+
{
|
| 5255 |
+
"epoch": 0.1984317167874914,
|
| 5256 |
+
"eval_loss": 2.8849411010742188,
|
| 5257 |
+
"eval_runtime": 143.8146,
|
| 5258 |
+
"eval_samples_per_second": 34.767,
|
| 5259 |
+
"eval_steps_per_second": 8.692,
|
| 5260 |
+
"num_input_tokens_seen": 7733248000,
|
| 5261 |
+
"step": 29500
|
| 5262 |
+
},
|
| 5263 |
+
{
|
| 5264 |
+
"epoch": 0.19876804173119902,
|
| 5265 |
+
"grad_norm": 0.6054402589797974,
|
| 5266 |
+
"learning_rate": 3.467669409957463e-07,
|
| 5267 |
+
"loss": 11.9468,
|
| 5268 |
+
"num_input_tokens_seen": 7746355200,
|
| 5269 |
+
"step": 29550
|
| 5270 |
+
},
|
| 5271 |
+
{
|
| 5272 |
+
"epoch": 0.19910436667490664,
|
| 5273 |
+
"grad_norm": 0.6133595705032349,
|
| 5274 |
+
"learning_rate": 2.7399977487051473e-07,
|
| 5275 |
+
"loss": 11.9368,
|
| 5276 |
+
"num_input_tokens_seen": 7759462400,
|
| 5277 |
+
"step": 29600
|
| 5278 |
+
},
|
| 5279 |
+
{
|
| 5280 |
+
"epoch": 0.19944069161861425,
|
| 5281 |
+
"grad_norm": 0.6098650693893433,
|
| 5282 |
+
"learning_rate": 2.097885634804175e-07,
|
| 5283 |
+
"loss": 11.8971,
|
| 5284 |
+
"num_input_tokens_seen": 7772569600,
|
| 5285 |
+
"step": 29650
|
| 5286 |
+
},
|
| 5287 |
+
{
|
| 5288 |
+
"epoch": 0.19977701656232186,
|
| 5289 |
+
"grad_norm": 0.6231054663658142,
|
| 5290 |
+
"learning_rate": 1.541351397936319e-07,
|
| 5291 |
+
"loss": 11.9546,
|
| 5292 |
+
"num_input_tokens_seen": 7785676800,
|
| 5293 |
+
"step": 29700
|
| 5294 |
+
},
|
| 5295 |
+
{
|
| 5296 |
+
"epoch": 0.20011334150602947,
|
| 5297 |
+
"grad_norm": 0.6323234438896179,
|
| 5298 |
+
"learning_rate": 1.0704109248838022e-07,
|
| 5299 |
+
"loss": 11.8848,
|
| 5300 |
+
"num_input_tokens_seen": 7798784000,
|
| 5301 |
+
"step": 29750
|
| 5302 |
+
},
|
| 5303 |
+
{
|
| 5304 |
+
"epoch": 0.20044966644973708,
|
| 5305 |
+
"grad_norm": 0.6294256448745728,
|
| 5306 |
+
"learning_rate": 6.850776590763274e-08,
|
| 5307 |
+
"loss": 11.9027,
|
| 5308 |
+
"num_input_tokens_seen": 7811891200,
|
| 5309 |
+
"step": 29800
|
| 5310 |
+
},
|
| 5311 |
+
{
|
| 5312 |
+
"epoch": 0.2007859913934447,
|
| 5313 |
+
"grad_norm": 0.6184135675430298,
|
| 5314 |
+
"learning_rate": 3.853626002063848e-08,
|
| 5315 |
+
"loss": 11.9454,
|
| 5316 |
+
"num_input_tokens_seen": 7824998400,
|
| 5317 |
+
"step": 29850
|
| 5318 |
+
},
|
| 5319 |
+
{
|
| 5320 |
+
"epoch": 0.2011223163371523,
|
| 5321 |
+
"grad_norm": 0.6376939415931702,
|
| 5322 |
+
"learning_rate": 1.7127430391683516e-08,
|
| 5323 |
+
"loss": 11.8928,
|
| 5324 |
+
"num_input_tokens_seen": 7838105600,
|
| 5325 |
+
"step": 29900
|
| 5326 |
+
},
|
| 5327 |
+
{
|
| 5328 |
+
"epoch": 0.2014586412808599,
|
| 5329 |
+
"grad_norm": 0.6745944619178772,
|
| 5330 |
+
"learning_rate": 4.281888155543978e-09,
|
| 5331 |
+
"loss": 11.9315,
|
| 5332 |
+
"num_input_tokens_seen": 7851212800,
|
| 5333 |
+
"step": 29950
|
| 5334 |
+
},
|
| 5335 |
+
{
|
| 5336 |
+
"epoch": 0.20179496622456752,
|
| 5337 |
+
"grad_norm": 0.6381050944328308,
|
| 5338 |
+
"learning_rate": 0.0,
|
| 5339 |
+
"loss": 11.9242,
|
| 5340 |
+
"num_input_tokens_seen": 7864320000,
|
| 5341 |
+
"step": 30000
|
| 5342 |
+
},
|
| 5343 |
+
{
|
| 5344 |
+
"epoch": 0.20179496622456752,
|
| 5345 |
+
"eval_loss": 2.8848958015441895,
|
| 5346 |
+
"eval_runtime": 142.697,
|
| 5347 |
+
"eval_samples_per_second": 35.039,
|
| 5348 |
+
"eval_steps_per_second": 8.76,
|
| 5349 |
+
"num_input_tokens_seen": 7864320000,
|
| 5350 |
+
"step": 30000
|
| 5351 |
}
|
| 5352 |
],
|
| 5353 |
"logging_steps": 50,
|
| 5354 |
"max_steps": 30000,
|
| 5355 |
+
"num_input_tokens_seen": 7864320000,
|
| 5356 |
"num_train_epochs": 1,
|
| 5357 |
"save_steps": 1000,
|
| 5358 |
"stateful_callbacks": {
|
|
|
|
| 5362 |
"should_evaluate": false,
|
| 5363 |
"should_log": false,
|
| 5364 |
"should_save": true,
|
| 5365 |
+
"should_training_stop": true
|
| 5366 |
},
|
| 5367 |
"attributes": {}
|
| 5368 |
}
|
| 5369 |
},
|
| 5370 |
+
"total_flos": 5.0112805994496e+18,
|
| 5371 |
"train_batch_size": 64,
|
| 5372 |
"trial_name": null,
|
| 5373 |
"trial_params": null
|