Initial commit
Browse files- config.json +1 -1
- optimizer.pt +1 -1
- pytorch_model.bin +1 -1
- scheduler.pt +1 -1
- trainer_state.json +603 -3
- training_args.bin +1 -1
config.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"_name_or_path": "/content/drive/MyDrive/ggpt2/checkpoint-
|
| 3 |
"_num_labels": 1,
|
| 4 |
"activation_function": "gelu_new",
|
| 5 |
"architectures": [
|
|
|
|
| 1 |
{
|
| 2 |
+
"_name_or_path": "/content/drive/MyDrive/ggpt2/checkpoint-360000",
|
| 3 |
"_num_labels": 1,
|
| 4 |
"activation_function": "gelu_new",
|
| 5 |
"architectures": [
|
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 655348487
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:25aad45ca03f3ac4902671d5f30fcb071be626a4dddaf248bee4b6f553ec9a29
|
| 3 |
size 655348487
|
pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 333975623
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7c4d6d6d56f177aadbd34ba09347ff85284717d7a559aac93a6e51fad2a1d41
|
| 3 |
size 333975623
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 623
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:617b3a41c1dd383619fbebee52cdee21b1f69f1f07f755cce1d5f1686ff9115a
|
| 3 |
size 623
|
trainer_state.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
-
"epoch":
|
| 5 |
-
"global_step":
|
| 6 |
"is_hyper_param_search": false,
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
|
@@ -4326,11 +4326,611 @@
|
|
| 4326 |
"learning_rate": 4.5085440874914565e-05,
|
| 4327 |
"loss": 2.0832,
|
| 4328 |
"step": 360000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4329 |
}
|
| 4330 |
],
|
| 4331 |
"max_steps": 3658000,
|
| 4332 |
"num_train_epochs": 1000,
|
| 4333 |
-
"total_flos":
|
| 4334 |
"trial_name": null,
|
| 4335 |
"trial_params": null
|
| 4336 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
+
"epoch": 112.08310552214324,
|
| 5 |
+
"global_step": 410000,
|
| 6 |
"is_hyper_param_search": false,
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
|
|
|
| 4326 |
"learning_rate": 4.5085440874914565e-05,
|
| 4327 |
"loss": 2.0832,
|
| 4328 |
"step": 360000
|
| 4329 |
+
},
|
| 4330 |
+
{
|
| 4331 |
+
"epoch": 98.55,
|
| 4332 |
+
"learning_rate": 4.50786056049214e-05,
|
| 4333 |
+
"loss": 0.0029,
|
| 4334 |
+
"step": 360500
|
| 4335 |
+
},
|
| 4336 |
+
{
|
| 4337 |
+
"epoch": 98.69,
|
| 4338 |
+
"learning_rate": 4.5071770334928234e-05,
|
| 4339 |
+
"loss": 2.0982,
|
| 4340 |
+
"step": 361000
|
| 4341 |
+
},
|
| 4342 |
+
{
|
| 4343 |
+
"epoch": 98.82,
|
| 4344 |
+
"learning_rate": 4.506493506493506e-05,
|
| 4345 |
+
"loss": 2.0996,
|
| 4346 |
+
"step": 361500
|
| 4347 |
+
},
|
| 4348 |
+
{
|
| 4349 |
+
"epoch": 98.96,
|
| 4350 |
+
"learning_rate": 4.5058099794941904e-05,
|
| 4351 |
+
"loss": 2.1084,
|
| 4352 |
+
"step": 362000
|
| 4353 |
+
},
|
| 4354 |
+
{
|
| 4355 |
+
"epoch": 99.1,
|
| 4356 |
+
"learning_rate": 4.505126452494874e-05,
|
| 4357 |
+
"loss": 2.0753,
|
| 4358 |
+
"step": 362500
|
| 4359 |
+
},
|
| 4360 |
+
{
|
| 4361 |
+
"epoch": 99.23,
|
| 4362 |
+
"learning_rate": 4.504442925495557e-05,
|
| 4363 |
+
"loss": 2.0645,
|
| 4364 |
+
"step": 363000
|
| 4365 |
+
},
|
| 4366 |
+
{
|
| 4367 |
+
"epoch": 99.37,
|
| 4368 |
+
"learning_rate": 4.503759398496241e-05,
|
| 4369 |
+
"loss": 2.0722,
|
| 4370 |
+
"step": 363500
|
| 4371 |
+
},
|
| 4372 |
+
{
|
| 4373 |
+
"epoch": 99.51,
|
| 4374 |
+
"learning_rate": 4.503075871496924e-05,
|
| 4375 |
+
"loss": 2.0849,
|
| 4376 |
+
"step": 364000
|
| 4377 |
+
},
|
| 4378 |
+
{
|
| 4379 |
+
"epoch": 99.64,
|
| 4380 |
+
"learning_rate": 4.502392344497608e-05,
|
| 4381 |
+
"loss": 2.0903,
|
| 4382 |
+
"step": 364500
|
| 4383 |
+
},
|
| 4384 |
+
{
|
| 4385 |
+
"epoch": 99.78,
|
| 4386 |
+
"learning_rate": 4.501708817498291e-05,
|
| 4387 |
+
"loss": 2.0973,
|
| 4388 |
+
"step": 365000
|
| 4389 |
+
},
|
| 4390 |
+
{
|
| 4391 |
+
"epoch": 99.92,
|
| 4392 |
+
"learning_rate": 4.501025290498975e-05,
|
| 4393 |
+
"loss": 2.1043,
|
| 4394 |
+
"step": 365500
|
| 4395 |
+
},
|
| 4396 |
+
{
|
| 4397 |
+
"epoch": 100.05,
|
| 4398 |
+
"learning_rate": 4.500341763499659e-05,
|
| 4399 |
+
"loss": 2.0809,
|
| 4400 |
+
"step": 366000
|
| 4401 |
+
},
|
| 4402 |
+
{
|
| 4403 |
+
"epoch": 100.19,
|
| 4404 |
+
"learning_rate": 4.499658236500342e-05,
|
| 4405 |
+
"loss": 2.0633,
|
| 4406 |
+
"step": 366500
|
| 4407 |
+
},
|
| 4408 |
+
{
|
| 4409 |
+
"epoch": 100.33,
|
| 4410 |
+
"learning_rate": 4.498974709501025e-05,
|
| 4411 |
+
"loss": 2.0694,
|
| 4412 |
+
"step": 367000
|
| 4413 |
+
},
|
| 4414 |
+
{
|
| 4415 |
+
"epoch": 100.46,
|
| 4416 |
+
"learning_rate": 4.4982911825017086e-05,
|
| 4417 |
+
"loss": 2.0763,
|
| 4418 |
+
"step": 367500
|
| 4419 |
+
},
|
| 4420 |
+
{
|
| 4421 |
+
"epoch": 100.6,
|
| 4422 |
+
"learning_rate": 4.497607655502393e-05,
|
| 4423 |
+
"loss": 2.0849,
|
| 4424 |
+
"step": 368000
|
| 4425 |
+
},
|
| 4426 |
+
{
|
| 4427 |
+
"epoch": 100.74,
|
| 4428 |
+
"learning_rate": 4.496924128503076e-05,
|
| 4429 |
+
"loss": 2.0855,
|
| 4430 |
+
"step": 368500
|
| 4431 |
+
},
|
| 4432 |
+
{
|
| 4433 |
+
"epoch": 100.87,
|
| 4434 |
+
"learning_rate": 4.49624060150376e-05,
|
| 4435 |
+
"loss": 2.0935,
|
| 4436 |
+
"step": 369000
|
| 4437 |
+
},
|
| 4438 |
+
{
|
| 4439 |
+
"epoch": 101.01,
|
| 4440 |
+
"learning_rate": 4.495557074504443e-05,
|
| 4441 |
+
"loss": 2.1002,
|
| 4442 |
+
"step": 369500
|
| 4443 |
+
},
|
| 4444 |
+
{
|
| 4445 |
+
"epoch": 101.15,
|
| 4446 |
+
"learning_rate": 4.494873547505127e-05,
|
| 4447 |
+
"loss": 2.0488,
|
| 4448 |
+
"step": 370000
|
| 4449 |
+
},
|
| 4450 |
+
{
|
| 4451 |
+
"epoch": 101.28,
|
| 4452 |
+
"learning_rate": 4.49419002050581e-05,
|
| 4453 |
+
"loss": 2.0613,
|
| 4454 |
+
"step": 370500
|
| 4455 |
+
},
|
| 4456 |
+
{
|
| 4457 |
+
"epoch": 101.42,
|
| 4458 |
+
"learning_rate": 4.493506493506494e-05,
|
| 4459 |
+
"loss": 2.0637,
|
| 4460 |
+
"step": 371000
|
| 4461 |
+
},
|
| 4462 |
+
{
|
| 4463 |
+
"epoch": 101.56,
|
| 4464 |
+
"learning_rate": 4.492822966507177e-05,
|
| 4465 |
+
"loss": 2.0769,
|
| 4466 |
+
"step": 371500
|
| 4467 |
+
},
|
| 4468 |
+
{
|
| 4469 |
+
"epoch": 101.69,
|
| 4470 |
+
"learning_rate": 4.4921394395078606e-05,
|
| 4471 |
+
"loss": 2.086,
|
| 4472 |
+
"step": 372000
|
| 4473 |
+
},
|
| 4474 |
+
{
|
| 4475 |
+
"epoch": 101.83,
|
| 4476 |
+
"learning_rate": 4.491455912508544e-05,
|
| 4477 |
+
"loss": 2.0914,
|
| 4478 |
+
"step": 372500
|
| 4479 |
+
},
|
| 4480 |
+
{
|
| 4481 |
+
"epoch": 101.97,
|
| 4482 |
+
"learning_rate": 4.4907723855092276e-05,
|
| 4483 |
+
"loss": 2.0974,
|
| 4484 |
+
"step": 373000
|
| 4485 |
+
},
|
| 4486 |
+
{
|
| 4487 |
+
"epoch": 102.1,
|
| 4488 |
+
"learning_rate": 4.490088858509912e-05,
|
| 4489 |
+
"loss": 2.0534,
|
| 4490 |
+
"step": 373500
|
| 4491 |
+
},
|
| 4492 |
+
{
|
| 4493 |
+
"epoch": 102.24,
|
| 4494 |
+
"learning_rate": 4.489405331510595e-05,
|
| 4495 |
+
"loss": 2.0527,
|
| 4496 |
+
"step": 374000
|
| 4497 |
+
},
|
| 4498 |
+
{
|
| 4499 |
+
"epoch": 102.38,
|
| 4500 |
+
"learning_rate": 4.488721804511278e-05,
|
| 4501 |
+
"loss": 2.0637,
|
| 4502 |
+
"step": 374500
|
| 4503 |
+
},
|
| 4504 |
+
{
|
| 4505 |
+
"epoch": 102.52,
|
| 4506 |
+
"learning_rate": 4.4880382775119615e-05,
|
| 4507 |
+
"loss": 2.0692,
|
| 4508 |
+
"step": 375000
|
| 4509 |
+
},
|
| 4510 |
+
{
|
| 4511 |
+
"epoch": 102.65,
|
| 4512 |
+
"learning_rate": 4.487354750512646e-05,
|
| 4513 |
+
"loss": 2.078,
|
| 4514 |
+
"step": 375500
|
| 4515 |
+
},
|
| 4516 |
+
{
|
| 4517 |
+
"epoch": 102.79,
|
| 4518 |
+
"learning_rate": 4.486671223513329e-05,
|
| 4519 |
+
"loss": 2.0788,
|
| 4520 |
+
"step": 376000
|
| 4521 |
+
},
|
| 4522 |
+
{
|
| 4523 |
+
"epoch": 102.93,
|
| 4524 |
+
"learning_rate": 4.4859876965140126e-05,
|
| 4525 |
+
"loss": 2.089,
|
| 4526 |
+
"step": 376500
|
| 4527 |
+
},
|
| 4528 |
+
{
|
| 4529 |
+
"epoch": 103.06,
|
| 4530 |
+
"learning_rate": 4.485304169514696e-05,
|
| 4531 |
+
"loss": 2.0704,
|
| 4532 |
+
"step": 377000
|
| 4533 |
+
},
|
| 4534 |
+
{
|
| 4535 |
+
"epoch": 103.2,
|
| 4536 |
+
"learning_rate": 4.4846206425153796e-05,
|
| 4537 |
+
"loss": 2.0524,
|
| 4538 |
+
"step": 377500
|
| 4539 |
+
},
|
| 4540 |
+
{
|
| 4541 |
+
"epoch": 103.34,
|
| 4542 |
+
"learning_rate": 4.483937115516063e-05,
|
| 4543 |
+
"loss": 2.0481,
|
| 4544 |
+
"step": 378000
|
| 4545 |
+
},
|
| 4546 |
+
{
|
| 4547 |
+
"epoch": 103.47,
|
| 4548 |
+
"learning_rate": 4.4832535885167465e-05,
|
| 4549 |
+
"loss": 2.0607,
|
| 4550 |
+
"step": 378500
|
| 4551 |
+
},
|
| 4552 |
+
{
|
| 4553 |
+
"epoch": 103.61,
|
| 4554 |
+
"learning_rate": 4.482570061517431e-05,
|
| 4555 |
+
"loss": 2.0742,
|
| 4556 |
+
"step": 379000
|
| 4557 |
+
},
|
| 4558 |
+
{
|
| 4559 |
+
"epoch": 103.75,
|
| 4560 |
+
"learning_rate": 4.4818865345181135e-05,
|
| 4561 |
+
"loss": 2.0698,
|
| 4562 |
+
"step": 379500
|
| 4563 |
+
},
|
| 4564 |
+
{
|
| 4565 |
+
"epoch": 103.88,
|
| 4566 |
+
"learning_rate": 4.481203007518797e-05,
|
| 4567 |
+
"loss": 2.0826,
|
| 4568 |
+
"step": 380000
|
| 4569 |
+
},
|
| 4570 |
+
{
|
| 4571 |
+
"epoch": 104.02,
|
| 4572 |
+
"learning_rate": 4.4805194805194805e-05,
|
| 4573 |
+
"loss": 2.0789,
|
| 4574 |
+
"step": 380500
|
| 4575 |
+
},
|
| 4576 |
+
{
|
| 4577 |
+
"epoch": 104.16,
|
| 4578 |
+
"learning_rate": 4.4798359535201646e-05,
|
| 4579 |
+
"loss": 2.0342,
|
| 4580 |
+
"step": 381000
|
| 4581 |
+
},
|
| 4582 |
+
{
|
| 4583 |
+
"epoch": 104.29,
|
| 4584 |
+
"learning_rate": 4.479152426520848e-05,
|
| 4585 |
+
"loss": 2.0422,
|
| 4586 |
+
"step": 381500
|
| 4587 |
+
},
|
| 4588 |
+
{
|
| 4589 |
+
"epoch": 104.43,
|
| 4590 |
+
"learning_rate": 4.4784688995215316e-05,
|
| 4591 |
+
"loss": 2.0582,
|
| 4592 |
+
"step": 382000
|
| 4593 |
+
},
|
| 4594 |
+
{
|
| 4595 |
+
"epoch": 104.57,
|
| 4596 |
+
"learning_rate": 4.4777853725222144e-05,
|
| 4597 |
+
"loss": 2.0651,
|
| 4598 |
+
"step": 382500
|
| 4599 |
+
},
|
| 4600 |
+
{
|
| 4601 |
+
"epoch": 104.7,
|
| 4602 |
+
"learning_rate": 4.4771018455228985e-05,
|
| 4603 |
+
"loss": 2.0687,
|
| 4604 |
+
"step": 383000
|
| 4605 |
+
},
|
| 4606 |
+
{
|
| 4607 |
+
"epoch": 104.84,
|
| 4608 |
+
"learning_rate": 4.476418318523582e-05,
|
| 4609 |
+
"loss": 2.0763,
|
| 4610 |
+
"step": 383500
|
| 4611 |
+
},
|
| 4612 |
+
{
|
| 4613 |
+
"epoch": 104.98,
|
| 4614 |
+
"learning_rate": 4.4757347915242655e-05,
|
| 4615 |
+
"loss": 2.0866,
|
| 4616 |
+
"step": 384000
|
| 4617 |
+
},
|
| 4618 |
+
{
|
| 4619 |
+
"epoch": 105.11,
|
| 4620 |
+
"learning_rate": 4.475051264524949e-05,
|
| 4621 |
+
"loss": 2.0387,
|
| 4622 |
+
"step": 384500
|
| 4623 |
+
},
|
| 4624 |
+
{
|
| 4625 |
+
"epoch": 105.25,
|
| 4626 |
+
"learning_rate": 4.4743677375256325e-05,
|
| 4627 |
+
"loss": 2.0398,
|
| 4628 |
+
"step": 385000
|
| 4629 |
+
},
|
| 4630 |
+
{
|
| 4631 |
+
"epoch": 105.39,
|
| 4632 |
+
"learning_rate": 4.473684210526316e-05,
|
| 4633 |
+
"loss": 2.0542,
|
| 4634 |
+
"step": 385500
|
| 4635 |
+
},
|
| 4636 |
+
{
|
| 4637 |
+
"epoch": 105.52,
|
| 4638 |
+
"learning_rate": 4.4730006835269994e-05,
|
| 4639 |
+
"loss": 2.0551,
|
| 4640 |
+
"step": 386000
|
| 4641 |
+
},
|
| 4642 |
+
{
|
| 4643 |
+
"epoch": 105.66,
|
| 4644 |
+
"learning_rate": 4.472317156527683e-05,
|
| 4645 |
+
"loss": 2.0639,
|
| 4646 |
+
"step": 386500
|
| 4647 |
+
},
|
| 4648 |
+
{
|
| 4649 |
+
"epoch": 105.8,
|
| 4650 |
+
"learning_rate": 4.471633629528367e-05,
|
| 4651 |
+
"loss": 2.0706,
|
| 4652 |
+
"step": 387000
|
| 4653 |
+
},
|
| 4654 |
+
{
|
| 4655 |
+
"epoch": 105.93,
|
| 4656 |
+
"learning_rate": 4.47095010252905e-05,
|
| 4657 |
+
"loss": 2.0742,
|
| 4658 |
+
"step": 387500
|
| 4659 |
+
},
|
| 4660 |
+
{
|
| 4661 |
+
"epoch": 106.07,
|
| 4662 |
+
"learning_rate": 4.470266575529733e-05,
|
| 4663 |
+
"loss": 2.0517,
|
| 4664 |
+
"step": 388000
|
| 4665 |
+
},
|
| 4666 |
+
{
|
| 4667 |
+
"epoch": 106.21,
|
| 4668 |
+
"learning_rate": 4.4695830485304175e-05,
|
| 4669 |
+
"loss": 2.0298,
|
| 4670 |
+
"step": 388500
|
| 4671 |
+
},
|
| 4672 |
+
{
|
| 4673 |
+
"epoch": 106.34,
|
| 4674 |
+
"learning_rate": 4.468899521531101e-05,
|
| 4675 |
+
"loss": 2.0385,
|
| 4676 |
+
"step": 389000
|
| 4677 |
+
},
|
| 4678 |
+
{
|
| 4679 |
+
"epoch": 106.48,
|
| 4680 |
+
"learning_rate": 4.4682159945317844e-05,
|
| 4681 |
+
"loss": 2.051,
|
| 4682 |
+
"step": 389500
|
| 4683 |
+
},
|
| 4684 |
+
{
|
| 4685 |
+
"epoch": 106.62,
|
| 4686 |
+
"learning_rate": 4.467532467532467e-05,
|
| 4687 |
+
"loss": 2.0592,
|
| 4688 |
+
"step": 390000
|
| 4689 |
+
},
|
| 4690 |
+
{
|
| 4691 |
+
"epoch": 106.75,
|
| 4692 |
+
"learning_rate": 4.4668489405331514e-05,
|
| 4693 |
+
"loss": 2.0676,
|
| 4694 |
+
"step": 390500
|
| 4695 |
+
},
|
| 4696 |
+
{
|
| 4697 |
+
"epoch": 106.89,
|
| 4698 |
+
"learning_rate": 4.466165413533835e-05,
|
| 4699 |
+
"loss": 2.0695,
|
| 4700 |
+
"step": 391000
|
| 4701 |
+
},
|
| 4702 |
+
{
|
| 4703 |
+
"epoch": 107.03,
|
| 4704 |
+
"learning_rate": 4.4654818865345184e-05,
|
| 4705 |
+
"loss": 2.0598,
|
| 4706 |
+
"step": 391500
|
| 4707 |
+
},
|
| 4708 |
+
{
|
| 4709 |
+
"epoch": 107.16,
|
| 4710 |
+
"learning_rate": 4.464798359535202e-05,
|
| 4711 |
+
"loss": 2.024,
|
| 4712 |
+
"step": 392000
|
| 4713 |
+
},
|
| 4714 |
+
{
|
| 4715 |
+
"epoch": 107.3,
|
| 4716 |
+
"learning_rate": 4.464114832535885e-05,
|
| 4717 |
+
"loss": 2.0372,
|
| 4718 |
+
"step": 392500
|
| 4719 |
+
},
|
| 4720 |
+
{
|
| 4721 |
+
"epoch": 107.44,
|
| 4722 |
+
"learning_rate": 4.463431305536569e-05,
|
| 4723 |
+
"loss": 2.0433,
|
| 4724 |
+
"step": 393000
|
| 4725 |
+
},
|
| 4726 |
+
{
|
| 4727 |
+
"epoch": 107.57,
|
| 4728 |
+
"learning_rate": 4.462747778537252e-05,
|
| 4729 |
+
"loss": 2.0472,
|
| 4730 |
+
"step": 393500
|
| 4731 |
+
},
|
| 4732 |
+
{
|
| 4733 |
+
"epoch": 107.71,
|
| 4734 |
+
"learning_rate": 4.462064251537936e-05,
|
| 4735 |
+
"loss": 2.0579,
|
| 4736 |
+
"step": 394000
|
| 4737 |
+
},
|
| 4738 |
+
{
|
| 4739 |
+
"epoch": 107.85,
|
| 4740 |
+
"learning_rate": 4.46138072453862e-05,
|
| 4741 |
+
"loss": 2.0605,
|
| 4742 |
+
"step": 394500
|
| 4743 |
+
},
|
| 4744 |
+
{
|
| 4745 |
+
"epoch": 107.98,
|
| 4746 |
+
"learning_rate": 4.460697197539303e-05,
|
| 4747 |
+
"loss": 2.0745,
|
| 4748 |
+
"step": 395000
|
| 4749 |
+
},
|
| 4750 |
+
{
|
| 4751 |
+
"epoch": 108.12,
|
| 4752 |
+
"learning_rate": 4.460013670539986e-05,
|
| 4753 |
+
"loss": 2.026,
|
| 4754 |
+
"step": 395500
|
| 4755 |
+
},
|
| 4756 |
+
{
|
| 4757 |
+
"epoch": 108.26,
|
| 4758 |
+
"learning_rate": 4.45933014354067e-05,
|
| 4759 |
+
"loss": 2.0251,
|
| 4760 |
+
"step": 396000
|
| 4761 |
+
},
|
| 4762 |
+
{
|
| 4763 |
+
"epoch": 108.39,
|
| 4764 |
+
"learning_rate": 4.458646616541354e-05,
|
| 4765 |
+
"loss": 2.0438,
|
| 4766 |
+
"step": 396500
|
| 4767 |
+
},
|
| 4768 |
+
{
|
| 4769 |
+
"epoch": 108.53,
|
| 4770 |
+
"learning_rate": 4.457963089542037e-05,
|
| 4771 |
+
"loss": 2.0407,
|
| 4772 |
+
"step": 397000
|
| 4773 |
+
},
|
| 4774 |
+
{
|
| 4775 |
+
"epoch": 108.67,
|
| 4776 |
+
"learning_rate": 4.457279562542721e-05,
|
| 4777 |
+
"loss": 2.0477,
|
| 4778 |
+
"step": 397500
|
| 4779 |
+
},
|
| 4780 |
+
{
|
| 4781 |
+
"epoch": 108.8,
|
| 4782 |
+
"learning_rate": 4.456596035543404e-05,
|
| 4783 |
+
"loss": 2.0536,
|
| 4784 |
+
"step": 398000
|
| 4785 |
+
},
|
| 4786 |
+
{
|
| 4787 |
+
"epoch": 108.94,
|
| 4788 |
+
"learning_rate": 4.455912508544088e-05,
|
| 4789 |
+
"loss": 2.065,
|
| 4790 |
+
"step": 398500
|
| 4791 |
+
},
|
| 4792 |
+
{
|
| 4793 |
+
"epoch": 109.08,
|
| 4794 |
+
"learning_rate": 4.455228981544771e-05,
|
| 4795 |
+
"loss": 2.0385,
|
| 4796 |
+
"step": 399000
|
| 4797 |
+
},
|
| 4798 |
+
{
|
| 4799 |
+
"epoch": 109.21,
|
| 4800 |
+
"learning_rate": 4.454545454545455e-05,
|
| 4801 |
+
"loss": 2.0245,
|
| 4802 |
+
"step": 399500
|
| 4803 |
+
},
|
| 4804 |
+
{
|
| 4805 |
+
"epoch": 109.35,
|
| 4806 |
+
"learning_rate": 4.453861927546138e-05,
|
| 4807 |
+
"loss": 2.0304,
|
| 4808 |
+
"step": 400000
|
| 4809 |
+
},
|
| 4810 |
+
{
|
| 4811 |
+
"epoch": 109.49,
|
| 4812 |
+
"learning_rate": 4.453178400546822e-05,
|
| 4813 |
+
"loss": 2.0395,
|
| 4814 |
+
"step": 400500
|
| 4815 |
+
},
|
| 4816 |
+
{
|
| 4817 |
+
"epoch": 109.62,
|
| 4818 |
+
"learning_rate": 4.452494873547505e-05,
|
| 4819 |
+
"loss": 2.0405,
|
| 4820 |
+
"step": 401000
|
| 4821 |
+
},
|
| 4822 |
+
{
|
| 4823 |
+
"epoch": 109.76,
|
| 4824 |
+
"learning_rate": 4.4518113465481886e-05,
|
| 4825 |
+
"loss": 2.0475,
|
| 4826 |
+
"step": 401500
|
| 4827 |
+
},
|
| 4828 |
+
{
|
| 4829 |
+
"epoch": 109.9,
|
| 4830 |
+
"learning_rate": 4.451127819548873e-05,
|
| 4831 |
+
"loss": 2.0538,
|
| 4832 |
+
"step": 402000
|
| 4833 |
+
},
|
| 4834 |
+
{
|
| 4835 |
+
"epoch": 110.03,
|
| 4836 |
+
"learning_rate": 4.450444292549556e-05,
|
| 4837 |
+
"loss": 2.0518,
|
| 4838 |
+
"step": 402500
|
| 4839 |
+
},
|
| 4840 |
+
{
|
| 4841 |
+
"epoch": 110.17,
|
| 4842 |
+
"learning_rate": 4.449760765550239e-05,
|
| 4843 |
+
"loss": 2.0132,
|
| 4844 |
+
"step": 403000
|
| 4845 |
+
},
|
| 4846 |
+
{
|
| 4847 |
+
"epoch": 110.31,
|
| 4848 |
+
"learning_rate": 4.4490772385509225e-05,
|
| 4849 |
+
"loss": 2.0259,
|
| 4850 |
+
"step": 403500
|
| 4851 |
+
},
|
| 4852 |
+
{
|
| 4853 |
+
"epoch": 110.44,
|
| 4854 |
+
"learning_rate": 4.448393711551607e-05,
|
| 4855 |
+
"loss": 2.0302,
|
| 4856 |
+
"step": 404000
|
| 4857 |
+
},
|
| 4858 |
+
{
|
| 4859 |
+
"epoch": 110.58,
|
| 4860 |
+
"learning_rate": 4.44771018455229e-05,
|
| 4861 |
+
"loss": 2.0392,
|
| 4862 |
+
"step": 404500
|
| 4863 |
+
},
|
| 4864 |
+
{
|
| 4865 |
+
"epoch": 110.72,
|
| 4866 |
+
"learning_rate": 4.4470266575529737e-05,
|
| 4867 |
+
"loss": 2.0429,
|
| 4868 |
+
"step": 405000
|
| 4869 |
+
},
|
| 4870 |
+
{
|
| 4871 |
+
"epoch": 110.85,
|
| 4872 |
+
"learning_rate": 4.446343130553657e-05,
|
| 4873 |
+
"loss": 2.0444,
|
| 4874 |
+
"step": 405500
|
| 4875 |
+
},
|
| 4876 |
+
{
|
| 4877 |
+
"epoch": 110.99,
|
| 4878 |
+
"learning_rate": 4.4456596035543406e-05,
|
| 4879 |
+
"loss": 2.0542,
|
| 4880 |
+
"step": 406000
|
| 4881 |
+
},
|
| 4882 |
+
{
|
| 4883 |
+
"epoch": 111.13,
|
| 4884 |
+
"learning_rate": 4.444976076555024e-05,
|
| 4885 |
+
"loss": 2.0086,
|
| 4886 |
+
"step": 406500
|
| 4887 |
+
},
|
| 4888 |
+
{
|
| 4889 |
+
"epoch": 111.26,
|
| 4890 |
+
"learning_rate": 4.4442925495557076e-05,
|
| 4891 |
+
"loss": 2.0216,
|
| 4892 |
+
"step": 407000
|
| 4893 |
+
},
|
| 4894 |
+
{
|
| 4895 |
+
"epoch": 111.4,
|
| 4896 |
+
"learning_rate": 4.443609022556392e-05,
|
| 4897 |
+
"loss": 2.0197,
|
| 4898 |
+
"step": 407500
|
| 4899 |
+
},
|
| 4900 |
+
{
|
| 4901 |
+
"epoch": 111.54,
|
| 4902 |
+
"learning_rate": 4.4429254955570745e-05,
|
| 4903 |
+
"loss": 2.032,
|
| 4904 |
+
"step": 408000
|
| 4905 |
+
},
|
| 4906 |
+
{
|
| 4907 |
+
"epoch": 111.67,
|
| 4908 |
+
"learning_rate": 4.442241968557758e-05,
|
| 4909 |
+
"loss": 2.0413,
|
| 4910 |
+
"step": 408500
|
| 4911 |
+
},
|
| 4912 |
+
{
|
| 4913 |
+
"epoch": 111.81,
|
| 4914 |
+
"learning_rate": 4.4415584415584415e-05,
|
| 4915 |
+
"loss": 2.0456,
|
| 4916 |
+
"step": 409000
|
| 4917 |
+
},
|
| 4918 |
+
{
|
| 4919 |
+
"epoch": 111.95,
|
| 4920 |
+
"learning_rate": 4.4408749145591257e-05,
|
| 4921 |
+
"loss": 2.047,
|
| 4922 |
+
"step": 409500
|
| 4923 |
+
},
|
| 4924 |
+
{
|
| 4925 |
+
"epoch": 112.08,
|
| 4926 |
+
"learning_rate": 4.440191387559809e-05,
|
| 4927 |
+
"loss": 2.0248,
|
| 4928 |
+
"step": 410000
|
| 4929 |
}
|
| 4930 |
],
|
| 4931 |
"max_steps": 3658000,
|
| 4932 |
"num_train_epochs": 1000,
|
| 4933 |
+
"total_flos": 825321947035336704,
|
| 4934 |
"trial_name": null,
|
| 4935 |
"trial_params": null
|
| 4936 |
}
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2031
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:93d2fc2ca4a3f894c9391bd5e40a96e46714817c7477b582da0320beb734993f
|
| 3 |
size 2031
|