Training in progress, step 30000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 517931840
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9965fcf14e783e7e1d55074ea2afa9a825c414e7bb1e05e788c2b6e78b01e868
|
| 3 |
size 517931840
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1035661434
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e44a1859e7ded3de3d773c5abac76c0fc5f7c6f4fc38577dfe331b1a4c391ab7
|
| 3 |
size 1035661434
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bc95bdc35ebe00877717681894afcd7d44f457b0583fea8b14d22f39dd179eb8
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6d0c8a94ae7b3402d9f6c538decfc8292fd64108bb86fd10da3f27734428bf0b
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -5170,11 +5170,189 @@
|
|
| 5170 |
"eval_steps_per_second": 18.663,
|
| 5171 |
"num_input_tokens_seen": 30408700160,
|
| 5172 |
"step": 29000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5173 |
}
|
| 5174 |
],
|
| 5175 |
"logging_steps": 50,
|
| 5176 |
"max_steps": 200000,
|
| 5177 |
-
"num_input_tokens_seen":
|
| 5178 |
"num_train_epochs": 5,
|
| 5179 |
"save_steps": 1000,
|
| 5180 |
"stateful_callbacks": {
|
|
@@ -5189,7 +5367,7 @@
|
|
| 5189 |
"attributes": {}
|
| 5190 |
}
|
| 5191 |
},
|
| 5192 |
-
"total_flos": 1.
|
| 5193 |
"train_batch_size": 64,
|
| 5194 |
"trial_name": null,
|
| 5195 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.6589812972870563,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 30000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 5170 |
"eval_steps_per_second": 18.663,
|
| 5171 |
"num_input_tokens_seen": 30408700160,
|
| 5172 |
"step": 29000
|
| 5173 |
+
},
|
| 5174 |
+
{
|
| 5175 |
+
"epoch": 0.6381135562062996,
|
| 5176 |
+
"grad_norm": 0.13853897154331207,
|
| 5177 |
+
"learning_rate": 0.001,
|
| 5178 |
+
"loss": 2.6719,
|
| 5179 |
+
"num_input_tokens_seen": 30461128960,
|
| 5180 |
+
"step": 29050
|
| 5181 |
+
},
|
| 5182 |
+
{
|
| 5183 |
+
"epoch": 0.6392118583684446,
|
| 5184 |
+
"grad_norm": 0.14228977262973785,
|
| 5185 |
+
"learning_rate": 0.001,
|
| 5186 |
+
"loss": 2.6788,
|
| 5187 |
+
"num_input_tokens_seen": 30513557760,
|
| 5188 |
+
"step": 29100
|
| 5189 |
+
},
|
| 5190 |
+
{
|
| 5191 |
+
"epoch": 0.6403101605305898,
|
| 5192 |
+
"grad_norm": 0.13464143872261047,
|
| 5193 |
+
"learning_rate": 0.001,
|
| 5194 |
+
"loss": 2.6743,
|
| 5195 |
+
"num_input_tokens_seen": 30565986560,
|
| 5196 |
+
"step": 29150
|
| 5197 |
+
},
|
| 5198 |
+
{
|
| 5199 |
+
"epoch": 0.6414084626927349,
|
| 5200 |
+
"grad_norm": 0.15960821509361267,
|
| 5201 |
+
"learning_rate": 0.001,
|
| 5202 |
+
"loss": 2.6729,
|
| 5203 |
+
"num_input_tokens_seen": 30618415360,
|
| 5204 |
+
"step": 29200
|
| 5205 |
+
},
|
| 5206 |
+
{
|
| 5207 |
+
"epoch": 0.64250676485488,
|
| 5208 |
+
"grad_norm": 0.13830585777759552,
|
| 5209 |
+
"learning_rate": 0.001,
|
| 5210 |
+
"loss": 2.6723,
|
| 5211 |
+
"num_input_tokens_seen": 30670844160,
|
| 5212 |
+
"step": 29250
|
| 5213 |
+
},
|
| 5214 |
+
{
|
| 5215 |
+
"epoch": 0.643605067017025,
|
| 5216 |
+
"grad_norm": 0.14440728724002838,
|
| 5217 |
+
"learning_rate": 0.001,
|
| 5218 |
+
"loss": 2.664,
|
| 5219 |
+
"num_input_tokens_seen": 30723272960,
|
| 5220 |
+
"step": 29300
|
| 5221 |
+
},
|
| 5222 |
+
{
|
| 5223 |
+
"epoch": 0.6447033691791701,
|
| 5224 |
+
"grad_norm": 0.14259463548660278,
|
| 5225 |
+
"learning_rate": 0.001,
|
| 5226 |
+
"loss": 2.6675,
|
| 5227 |
+
"num_input_tokens_seen": 30775701760,
|
| 5228 |
+
"step": 29350
|
| 5229 |
+
},
|
| 5230 |
+
{
|
| 5231 |
+
"epoch": 0.6458016713413153,
|
| 5232 |
+
"grad_norm": 0.1462564468383789,
|
| 5233 |
+
"learning_rate": 0.001,
|
| 5234 |
+
"loss": 2.6671,
|
| 5235 |
+
"num_input_tokens_seen": 30828130560,
|
| 5236 |
+
"step": 29400
|
| 5237 |
+
},
|
| 5238 |
+
{
|
| 5239 |
+
"epoch": 0.6468999735034603,
|
| 5240 |
+
"grad_norm": 0.1443469077348709,
|
| 5241 |
+
"learning_rate": 0.001,
|
| 5242 |
+
"loss": 2.6667,
|
| 5243 |
+
"num_input_tokens_seen": 30880559360,
|
| 5244 |
+
"step": 29450
|
| 5245 |
+
},
|
| 5246 |
+
{
|
| 5247 |
+
"epoch": 0.6479982756656054,
|
| 5248 |
+
"grad_norm": 0.143255814909935,
|
| 5249 |
+
"learning_rate": 0.001,
|
| 5250 |
+
"loss": 2.6652,
|
| 5251 |
+
"num_input_tokens_seen": 30932988160,
|
| 5252 |
+
"step": 29500
|
| 5253 |
+
},
|
| 5254 |
+
{
|
| 5255 |
+
"epoch": 0.6479982756656054,
|
| 5256 |
+
"eval_loss": 2.569544792175293,
|
| 5257 |
+
"eval_runtime": 66.8674,
|
| 5258 |
+
"eval_samples_per_second": 74.775,
|
| 5259 |
+
"eval_steps_per_second": 18.694,
|
| 5260 |
+
"num_input_tokens_seen": 30932988160,
|
| 5261 |
+
"step": 29500
|
| 5262 |
+
},
|
| 5263 |
+
{
|
| 5264 |
+
"epoch": 0.6490965778277505,
|
| 5265 |
+
"grad_norm": 0.15149758756160736,
|
| 5266 |
+
"learning_rate": 0.001,
|
| 5267 |
+
"loss": 2.6681,
|
| 5268 |
+
"num_input_tokens_seen": 30985416960,
|
| 5269 |
+
"step": 29550
|
| 5270 |
+
},
|
| 5271 |
+
{
|
| 5272 |
+
"epoch": 0.6501948799898957,
|
| 5273 |
+
"grad_norm": 0.15703468024730682,
|
| 5274 |
+
"learning_rate": 0.001,
|
| 5275 |
+
"loss": 2.6681,
|
| 5276 |
+
"num_input_tokens_seen": 31037845760,
|
| 5277 |
+
"step": 29600
|
| 5278 |
+
},
|
| 5279 |
+
{
|
| 5280 |
+
"epoch": 0.6512931821520407,
|
| 5281 |
+
"grad_norm": 0.14332515001296997,
|
| 5282 |
+
"learning_rate": 0.001,
|
| 5283 |
+
"loss": 2.6622,
|
| 5284 |
+
"num_input_tokens_seen": 31090274560,
|
| 5285 |
+
"step": 29650
|
| 5286 |
+
},
|
| 5287 |
+
{
|
| 5288 |
+
"epoch": 0.6523914843141858,
|
| 5289 |
+
"grad_norm": 0.13763870298862457,
|
| 5290 |
+
"learning_rate": 0.001,
|
| 5291 |
+
"loss": 2.6724,
|
| 5292 |
+
"num_input_tokens_seen": 31142703360,
|
| 5293 |
+
"step": 29700
|
| 5294 |
+
},
|
| 5295 |
+
{
|
| 5296 |
+
"epoch": 0.6534897864763309,
|
| 5297 |
+
"grad_norm": 0.11858976632356644,
|
| 5298 |
+
"learning_rate": 0.001,
|
| 5299 |
+
"loss": 2.6743,
|
| 5300 |
+
"num_input_tokens_seen": 31195132160,
|
| 5301 |
+
"step": 29750
|
| 5302 |
+
},
|
| 5303 |
+
{
|
| 5304 |
+
"epoch": 0.654588088638476,
|
| 5305 |
+
"grad_norm": 0.15627937018871307,
|
| 5306 |
+
"learning_rate": 0.001,
|
| 5307 |
+
"loss": 2.6653,
|
| 5308 |
+
"num_input_tokens_seen": 31247560960,
|
| 5309 |
+
"step": 29800
|
| 5310 |
+
},
|
| 5311 |
+
{
|
| 5312 |
+
"epoch": 0.6556863908006211,
|
| 5313 |
+
"grad_norm": 0.15052759647369385,
|
| 5314 |
+
"learning_rate": 0.001,
|
| 5315 |
+
"loss": 2.6684,
|
| 5316 |
+
"num_input_tokens_seen": 31299989760,
|
| 5317 |
+
"step": 29850
|
| 5318 |
+
},
|
| 5319 |
+
{
|
| 5320 |
+
"epoch": 0.6567846929627662,
|
| 5321 |
+
"grad_norm": 0.1648450791835785,
|
| 5322 |
+
"learning_rate": 0.001,
|
| 5323 |
+
"loss": 2.6783,
|
| 5324 |
+
"num_input_tokens_seen": 31352418560,
|
| 5325 |
+
"step": 29900
|
| 5326 |
+
},
|
| 5327 |
+
{
|
| 5328 |
+
"epoch": 0.6578829951249113,
|
| 5329 |
+
"grad_norm": 0.13318586349487305,
|
| 5330 |
+
"learning_rate": 0.001,
|
| 5331 |
+
"loss": 2.6712,
|
| 5332 |
+
"num_input_tokens_seen": 31404847360,
|
| 5333 |
+
"step": 29950
|
| 5334 |
+
},
|
| 5335 |
+
{
|
| 5336 |
+
"epoch": 0.6589812972870563,
|
| 5337 |
+
"grad_norm": 0.1517287641763687,
|
| 5338 |
+
"learning_rate": 0.001,
|
| 5339 |
+
"loss": 2.6688,
|
| 5340 |
+
"num_input_tokens_seen": 31457276160,
|
| 5341 |
+
"step": 30000
|
| 5342 |
+
},
|
| 5343 |
+
{
|
| 5344 |
+
"epoch": 0.6589812972870563,
|
| 5345 |
+
"eval_loss": 2.5676708221435547,
|
| 5346 |
+
"eval_runtime": 66.0876,
|
| 5347 |
+
"eval_samples_per_second": 75.657,
|
| 5348 |
+
"eval_steps_per_second": 18.914,
|
| 5349 |
+
"num_input_tokens_seen": 31457276160,
|
| 5350 |
+
"step": 30000
|
| 5351 |
}
|
| 5352 |
],
|
| 5353 |
"logging_steps": 50,
|
| 5354 |
"max_steps": 200000,
|
| 5355 |
+
"num_input_tokens_seen": 31457276160,
|
| 5356 |
"num_train_epochs": 5,
|
| 5357 |
"save_steps": 1000,
|
| 5358 |
"stateful_callbacks": {
|
|
|
|
| 5367 |
"attributes": {}
|
| 5368 |
}
|
| 5369 |
},
|
| 5370 |
+
"total_flos": 1.79151492920397e+19,
|
| 5371 |
"train_batch_size": 64,
|
| 5372 |
"trial_name": null,
|
| 5373 |
"trial_params": null
|