Commit ·
ce06b5e
1
Parent(s): 272cc03
huggingartists
Browse files- README.md +3 -3
- config.json +2 -2
- evaluation.txt +1 -1
- flax_model.msgpack +1 -1
- optimizer.pt +1 -1
- pytorch_model.bin +2 -2
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- tokenizer.json +0 -0
- trainer_state.json +430 -6
- training_args.bin +2 -2
README.md
CHANGED
|
@@ -45,15 +45,15 @@ from datasets import load_dataset
|
|
| 45 |
dataset = load_dataset("huggingartists/bob-dylan")
|
| 46 |
```
|
| 47 |
|
| 48 |
-
[Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/
|
| 49 |
|
| 50 |
## Training procedure
|
| 51 |
|
| 52 |
The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Bob Dylan's lyrics.
|
| 53 |
|
| 54 |
-
Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/
|
| 55 |
|
| 56 |
-
At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/
|
| 57 |
|
| 58 |
## How to use
|
| 59 |
|
|
|
|
| 45 |
dataset = load_dataset("huggingartists/bob-dylan")
|
| 46 |
```
|
| 47 |
|
| 48 |
+
[Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/31a7e0lm/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
|
| 49 |
|
| 50 |
## Training procedure
|
| 51 |
|
| 52 |
The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Bob Dylan's lyrics.
|
| 53 |
|
| 54 |
+
Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/1h7wqver) for full transparency and reproducibility.
|
| 55 |
|
| 56 |
+
At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/1h7wqver/artifacts) is logged and versioned.
|
| 57 |
|
| 58 |
## How to use
|
| 59 |
|
config.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"_name_or_path": "
|
| 3 |
"activation_function": "gelu_new",
|
| 4 |
"architectures": [
|
| 5 |
"GPT2LMHeadModel"
|
|
@@ -36,7 +36,7 @@
|
|
| 36 |
}
|
| 37 |
},
|
| 38 |
"torch_dtype": "float32",
|
| 39 |
-
"transformers_version": "4.
|
| 40 |
"use_cache": true,
|
| 41 |
"vocab_size": 50257
|
| 42 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"_name_or_path": "bob-dylan",
|
| 3 |
"activation_function": "gelu_new",
|
| 4 |
"architectures": [
|
| 5 |
"GPT2LMHeadModel"
|
|
|
|
| 36 |
}
|
| 37 |
},
|
| 38 |
"torch_dtype": "float32",
|
| 39 |
+
"transformers_version": "4.19.2",
|
| 40 |
"use_cache": true,
|
| 41 |
"vocab_size": 50257
|
| 42 |
}
|
evaluation.txt
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"eval_loss": 1.
|
|
|
|
| 1 |
+
{"eval_loss": 1.1156859397888184, "eval_runtime": 5.2897, "eval_samples_per_second": 82.046, "eval_steps_per_second": 10.398, "epoch": 11.0}
|
flax_model.msgpack
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 497764120
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52eb735612f0abe86a36c99bbc88e4b736d213924b487ddc439a7fda4f3738ba
|
| 3 |
size 497764120
|
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 995604017
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:20df6e4328ecf349bf08cc74a4faa3ceabf0373ff1ce5c11ee4657c56c5ebe05
|
| 3 |
size 995604017
|
pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6cbda632eba71ca8a5a6d2b2a32f60a0e0d89f0b2b5f27757234f2f9dea5b2bc
|
| 3 |
+
size 510396521
|
rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14567
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7446d5500cdd6761e0d9b127f879a785bc53369d1cd3923b64bfed4fdcf6b5a3
|
| 3 |
size 14567
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 623
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:07ca2fdd8c3e336181f82585738bd2cd39530e31bea6189b6d35d926f6c48442
|
| 3 |
size 623
|
tokenizer.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
trainer_state.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
{
|
| 2 |
-
"best_metric": 1.
|
| 3 |
-
"best_model_checkpoint": "output/bob-dylan/checkpoint-
|
| 4 |
-
"epoch":
|
| 5 |
-
"global_step":
|
| 6 |
"is_hyper_param_search": false,
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
|
@@ -3902,11 +3902,435 @@
|
|
| 3902 |
"eval_samples_per_second": 22.062,
|
| 3903 |
"eval_steps_per_second": 2.801,
|
| 3904 |
"step": 3180
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3905 |
}
|
| 3906 |
],
|
| 3907 |
-
"max_steps":
|
| 3908 |
"num_train_epochs": 11,
|
| 3909 |
-
"total_flos":
|
| 3910 |
"trial_name": null,
|
| 3911 |
"trial_params": null
|
| 3912 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_metric": 1.1156859397888184,
|
| 3 |
+
"best_model_checkpoint": "output/bob-dylan/checkpoint-3520",
|
| 4 |
+
"epoch": 11.0,
|
| 5 |
+
"global_step": 3520,
|
| 6 |
"is_hyper_param_search": false,
|
| 7 |
"is_local_process_zero": true,
|
| 8 |
"is_world_process_zero": true,
|
|
|
|
| 3902 |
"eval_samples_per_second": 22.062,
|
| 3903 |
"eval_steps_per_second": 2.801,
|
| 3904 |
"step": 3180
|
| 3905 |
+
},
|
| 3906 |
+
{
|
| 3907 |
+
"epoch": 9.95,
|
| 3908 |
+
"learning_rate": 0.00013645750858358395,
|
| 3909 |
+
"loss": 1.2433,
|
| 3910 |
+
"step": 3185
|
| 3911 |
+
},
|
| 3912 |
+
{
|
| 3913 |
+
"epoch": 9.97,
|
| 3914 |
+
"learning_rate": 0.0001368696722497127,
|
| 3915 |
+
"loss": 1.547,
|
| 3916 |
+
"step": 3190
|
| 3917 |
+
},
|
| 3918 |
+
{
|
| 3919 |
+
"epoch": 9.98,
|
| 3920 |
+
"learning_rate": 0.00013711736829567482,
|
| 3921 |
+
"loss": 1.4594,
|
| 3922 |
+
"step": 3195
|
| 3923 |
+
},
|
| 3924 |
+
{
|
| 3925 |
+
"epoch": 10.0,
|
| 3926 |
+
"learning_rate": 0.0001372,
|
| 3927 |
+
"loss": 1.3407,
|
| 3928 |
+
"step": 3200
|
| 3929 |
+
},
|
| 3930 |
+
{
|
| 3931 |
+
"epoch": 10.0,
|
| 3932 |
+
"eval_loss": 1.139600157737732,
|
| 3933 |
+
"eval_runtime": 5.2723,
|
| 3934 |
+
"eval_samples_per_second": 82.317,
|
| 3935 |
+
"eval_steps_per_second": 10.432,
|
| 3936 |
+
"step": 3200
|
| 3937 |
+
},
|
| 3938 |
+
{
|
| 3939 |
+
"epoch": 10.02,
|
| 3940 |
+
"learning_rate": 0.00013711736829567482,
|
| 3941 |
+
"loss": 1.4415,
|
| 3942 |
+
"step": 3205
|
| 3943 |
+
},
|
| 3944 |
+
{
|
| 3945 |
+
"epoch": 10.03,
|
| 3946 |
+
"learning_rate": 0.00013686967224971273,
|
| 3947 |
+
"loss": 1.2348,
|
| 3948 |
+
"step": 3210
|
| 3949 |
+
},
|
| 3950 |
+
{
|
| 3951 |
+
"epoch": 10.05,
|
| 3952 |
+
"learning_rate": 0.00013645750858358398,
|
| 3953 |
+
"loss": 1.4623,
|
| 3954 |
+
"step": 3215
|
| 3955 |
+
},
|
| 3956 |
+
{
|
| 3957 |
+
"epoch": 10.06,
|
| 3958 |
+
"learning_rate": 0.00013588187023566163,
|
| 3959 |
+
"loss": 1.437,
|
| 3960 |
+
"step": 3220
|
| 3961 |
+
},
|
| 3962 |
+
{
|
| 3963 |
+
"epoch": 10.08,
|
| 3964 |
+
"learning_rate": 0.00013514414396914573,
|
| 3965 |
+
"loss": 1.6916,
|
| 3966 |
+
"step": 3225
|
| 3967 |
+
},
|
| 3968 |
+
{
|
| 3969 |
+
"epoch": 10.09,
|
| 3970 |
+
"learning_rate": 0.00013424610703122958,
|
| 3971 |
+
"loss": 1.7023,
|
| 3972 |
+
"step": 3230
|
| 3973 |
+
},
|
| 3974 |
+
{
|
| 3975 |
+
"epoch": 10.11,
|
| 3976 |
+
"learning_rate": 0.00013318992287155525,
|
| 3977 |
+
"loss": 1.3172,
|
| 3978 |
+
"step": 3235
|
| 3979 |
+
},
|
| 3980 |
+
{
|
| 3981 |
+
"epoch": 10.12,
|
| 3982 |
+
"learning_rate": 0.00013197813593027435,
|
| 3983 |
+
"loss": 1.2053,
|
| 3984 |
+
"step": 3240
|
| 3985 |
+
},
|
| 3986 |
+
{
|
| 3987 |
+
"epoch": 10.14,
|
| 3988 |
+
"learning_rate": 0.00013061366550826825,
|
| 3989 |
+
"loss": 1.1869,
|
| 3990 |
+
"step": 3245
|
| 3991 |
+
},
|
| 3992 |
+
{
|
| 3993 |
+
"epoch": 10.16,
|
| 3994 |
+
"learning_rate": 0.00012909979873429724,
|
| 3995 |
+
"loss": 1.2981,
|
| 3996 |
+
"step": 3250
|
| 3997 |
+
},
|
| 3998 |
+
{
|
| 3999 |
+
"epoch": 10.17,
|
| 4000 |
+
"learning_rate": 0.0001274401826460187,
|
| 4001 |
+
"loss": 1.6608,
|
| 4002 |
+
"step": 3255
|
| 4003 |
+
},
|
| 4004 |
+
{
|
| 4005 |
+
"epoch": 10.19,
|
| 4006 |
+
"learning_rate": 0.00012563881540395474,
|
| 4007 |
+
"loss": 1.3115,
|
| 4008 |
+
"step": 3260
|
| 4009 |
+
},
|
| 4010 |
+
{
|
| 4011 |
+
"epoch": 10.2,
|
| 4012 |
+
"learning_rate": 0.00012370003665957216,
|
| 4013 |
+
"loss": 1.2824,
|
| 4014 |
+
"step": 3265
|
| 4015 |
+
},
|
| 4016 |
+
{
|
| 4017 |
+
"epoch": 10.22,
|
| 4018 |
+
"learning_rate": 0.00012162851710068375,
|
| 4019 |
+
"loss": 1.4082,
|
| 4020 |
+
"step": 3270
|
| 4021 |
+
},
|
| 4022 |
+
{
|
| 4023 |
+
"epoch": 10.23,
|
| 4024 |
+
"learning_rate": 0.00011942924719935029,
|
| 4025 |
+
"loss": 1.3048,
|
| 4026 |
+
"step": 3275
|
| 4027 |
+
},
|
| 4028 |
+
{
|
| 4029 |
+
"epoch": 10.25,
|
| 4030 |
+
"learning_rate": 0.00011710752518939736,
|
| 4031 |
+
"loss": 1.3276,
|
| 4032 |
+
"step": 3280
|
| 4033 |
+
},
|
| 4034 |
+
{
|
| 4035 |
+
"epoch": 10.27,
|
| 4036 |
+
"learning_rate": 0.0001146689443025054,
|
| 4037 |
+
"loss": 1.4064,
|
| 4038 |
+
"step": 3285
|
| 4039 |
+
},
|
| 4040 |
+
{
|
| 4041 |
+
"epoch": 10.28,
|
| 4042 |
+
"learning_rate": 0.00011211937929362613,
|
| 4043 |
+
"loss": 1.2408,
|
| 4044 |
+
"step": 3290
|
| 4045 |
+
},
|
| 4046 |
+
{
|
| 4047 |
+
"epoch": 10.3,
|
| 4048 |
+
"learning_rate": 0.00010946497228818107,
|
| 4049 |
+
"loss": 1.3932,
|
| 4050 |
+
"step": 3295
|
| 4051 |
+
},
|
| 4052 |
+
{
|
| 4053 |
+
"epoch": 10.31,
|
| 4054 |
+
"learning_rate": 0.00010671211798514499,
|
| 4055 |
+
"loss": 1.4576,
|
| 4056 |
+
"step": 3300
|
| 4057 |
+
},
|
| 4058 |
+
{
|
| 4059 |
+
"epoch": 10.33,
|
| 4060 |
+
"learning_rate": 0.00010386744825165496,
|
| 4061 |
+
"loss": 1.455,
|
| 4062 |
+
"step": 3305
|
| 4063 |
+
},
|
| 4064 |
+
{
|
| 4065 |
+
"epoch": 10.34,
|
| 4066 |
+
"learning_rate": 0.00010093781614626351,
|
| 4067 |
+
"loss": 1.3289,
|
| 4068 |
+
"step": 3310
|
| 4069 |
+
},
|
| 4070 |
+
{
|
| 4071 |
+
"epoch": 10.36,
|
| 4072 |
+
"learning_rate": 9.793027940931756e-05,
|
| 4073 |
+
"loss": 1.2645,
|
| 4074 |
+
"step": 3315
|
| 4075 |
+
},
|
| 4076 |
+
{
|
| 4077 |
+
"epoch": 10.38,
|
| 4078 |
+
"learning_rate": 9.485208346024504e-05,
|
| 4079 |
+
"loss": 1.39,
|
| 4080 |
+
"step": 3320
|
| 4081 |
+
},
|
| 4082 |
+
{
|
| 4083 |
+
"epoch": 10.39,
|
| 4084 |
+
"learning_rate": 9.17106439427063e-05,
|
| 4085 |
+
"loss": 1.3945,
|
| 4086 |
+
"step": 3325
|
| 4087 |
+
},
|
| 4088 |
+
{
|
| 4089 |
+
"epoch": 10.41,
|
| 4090 |
+
"learning_rate": 8.851352885965625e-05,
|
| 4091 |
+
"loss": 1.5375,
|
| 4092 |
+
"step": 3330
|
| 4093 |
+
},
|
| 4094 |
+
{
|
| 4095 |
+
"epoch": 10.42,
|
| 4096 |
+
"learning_rate": 8.526844034136417e-05,
|
| 4097 |
+
"loss": 1.4077,
|
| 4098 |
+
"step": 3335
|
| 4099 |
+
},
|
| 4100 |
+
{
|
| 4101 |
+
"epoch": 10.44,
|
| 4102 |
+
"learning_rate": 8.198319609030632e-05,
|
| 4103 |
+
"loss": 1.4331,
|
| 4104 |
+
"step": 3340
|
| 4105 |
+
},
|
| 4106 |
+
{
|
| 4107 |
+
"epoch": 10.45,
|
| 4108 |
+
"learning_rate": 7.866571054763788e-05,
|
| 4109 |
+
"loss": 1.8602,
|
| 4110 |
+
"step": 3345
|
| 4111 |
+
},
|
| 4112 |
+
{
|
| 4113 |
+
"epoch": 10.47,
|
| 4114 |
+
"learning_rate": 7.532397582660805e-05,
|
| 4115 |
+
"loss": 1.4865,
|
| 4116 |
+
"step": 3350
|
| 4117 |
+
},
|
| 4118 |
+
{
|
| 4119 |
+
"epoch": 10.48,
|
| 4120 |
+
"learning_rate": 7.19660424588612e-05,
|
| 4121 |
+
"loss": 1.2815,
|
| 4122 |
+
"step": 3355
|
| 4123 |
+
},
|
| 4124 |
+
{
|
| 4125 |
+
"epoch": 10.5,
|
| 4126 |
+
"learning_rate": 6.859999999999997e-05,
|
| 4127 |
+
"loss": 1.4705,
|
| 4128 |
+
"step": 3360
|
| 4129 |
+
},
|
| 4130 |
+
{
|
| 4131 |
+
"epoch": 10.52,
|
| 4132 |
+
"learning_rate": 6.523395754113922e-05,
|
| 4133 |
+
"loss": 1.1969,
|
| 4134 |
+
"step": 3365
|
| 4135 |
+
},
|
| 4136 |
+
{
|
| 4137 |
+
"epoch": 10.53,
|
| 4138 |
+
"learning_rate": 6.187602417339237e-05,
|
| 4139 |
+
"loss": 1.4564,
|
| 4140 |
+
"step": 3370
|
| 4141 |
+
},
|
| 4142 |
+
{
|
| 4143 |
+
"epoch": 10.55,
|
| 4144 |
+
"learning_rate": 5.853428945236207e-05,
|
| 4145 |
+
"loss": 1.4113,
|
| 4146 |
+
"step": 3375
|
| 4147 |
+
},
|
| 4148 |
+
{
|
| 4149 |
+
"epoch": 10.56,
|
| 4150 |
+
"learning_rate": 5.521680390969362e-05,
|
| 4151 |
+
"loss": 1.4642,
|
| 4152 |
+
"step": 3380
|
| 4153 |
+
},
|
| 4154 |
+
{
|
| 4155 |
+
"epoch": 10.58,
|
| 4156 |
+
"learning_rate": 5.193155965863624e-05,
|
| 4157 |
+
"loss": 1.4196,
|
| 4158 |
+
"step": 3385
|
| 4159 |
+
},
|
| 4160 |
+
{
|
| 4161 |
+
"epoch": 10.59,
|
| 4162 |
+
"learning_rate": 4.8686471140344147e-05,
|
| 4163 |
+
"loss": 1.3666,
|
| 4164 |
+
"step": 3390
|
| 4165 |
+
},
|
| 4166 |
+
{
|
| 4167 |
+
"epoch": 10.61,
|
| 4168 |
+
"learning_rate": 4.548935605729363e-05,
|
| 4169 |
+
"loss": 1.3908,
|
| 4170 |
+
"step": 3395
|
| 4171 |
+
},
|
| 4172 |
+
{
|
| 4173 |
+
"epoch": 10.62,
|
| 4174 |
+
"learning_rate": 4.23479165397549e-05,
|
| 4175 |
+
"loss": 1.4785,
|
| 4176 |
+
"step": 3400
|
| 4177 |
+
},
|
| 4178 |
+
{
|
| 4179 |
+
"epoch": 10.64,
|
| 4180 |
+
"learning_rate": 3.926972059068282e-05,
|
| 4181 |
+
"loss": 1.4775,
|
| 4182 |
+
"step": 3405
|
| 4183 |
+
},
|
| 4184 |
+
{
|
| 4185 |
+
"epoch": 10.66,
|
| 4186 |
+
"learning_rate": 3.626218385373685e-05,
|
| 4187 |
+
"loss": 1.4841,
|
| 4188 |
+
"step": 3410
|
| 4189 |
+
},
|
| 4190 |
+
{
|
| 4191 |
+
"epoch": 10.67,
|
| 4192 |
+
"learning_rate": 3.333255174834496e-05,
|
| 4193 |
+
"loss": 1.4263,
|
| 4194 |
+
"step": 3415
|
| 4195 |
+
},
|
| 4196 |
+
{
|
| 4197 |
+
"epoch": 10.69,
|
| 4198 |
+
"learning_rate": 3.0487882014855373e-05,
|
| 4199 |
+
"loss": 1.4815,
|
| 4200 |
+
"step": 3420
|
| 4201 |
+
},
|
| 4202 |
+
{
|
| 4203 |
+
"epoch": 10.7,
|
| 4204 |
+
"learning_rate": 2.7735027711819264e-05,
|
| 4205 |
+
"loss": 1.3612,
|
| 4206 |
+
"step": 3425
|
| 4207 |
+
},
|
| 4208 |
+
{
|
| 4209 |
+
"epoch": 10.72,
|
| 4210 |
+
"learning_rate": 2.508062070637383e-05,
|
| 4211 |
+
"loss": 1.3586,
|
| 4212 |
+
"step": 3430
|
| 4213 |
+
},
|
| 4214 |
+
{
|
| 4215 |
+
"epoch": 10.73,
|
| 4216 |
+
"learning_rate": 2.253105569749455e-05,
|
| 4217 |
+
"loss": 1.4036,
|
| 4218 |
+
"step": 3435
|
| 4219 |
+
},
|
| 4220 |
+
{
|
| 4221 |
+
"epoch": 10.75,
|
| 4222 |
+
"learning_rate": 2.0092474810602945e-05,
|
| 4223 |
+
"loss": 1.2455,
|
| 4224 |
+
"step": 3440
|
| 4225 |
+
},
|
| 4226 |
+
{
|
| 4227 |
+
"epoch": 10.77,
|
| 4228 |
+
"learning_rate": 1.7770752800649997e-05,
|
| 4229 |
+
"loss": 1.3747,
|
| 4230 |
+
"step": 3445
|
| 4231 |
+
},
|
| 4232 |
+
{
|
| 4233 |
+
"epoch": 10.78,
|
| 4234 |
+
"learning_rate": 1.5571482899316204e-05,
|
| 4235 |
+
"loss": 1.2848,
|
| 4236 |
+
"step": 3450
|
| 4237 |
+
},
|
| 4238 |
+
{
|
| 4239 |
+
"epoch": 10.8,
|
| 4240 |
+
"learning_rate": 1.3499963340427795e-05,
|
| 4241 |
+
"loss": 1.5623,
|
| 4242 |
+
"step": 3455
|
| 4243 |
+
},
|
| 4244 |
+
{
|
| 4245 |
+
"epoch": 10.81,
|
| 4246 |
+
"learning_rate": 1.1561184596045504e-05,
|
| 4247 |
+
"loss": 1.4704,
|
| 4248 |
+
"step": 3460
|
| 4249 |
+
},
|
| 4250 |
+
{
|
| 4251 |
+
"epoch": 10.83,
|
| 4252 |
+
"learning_rate": 9.759817353981509e-06,
|
| 4253 |
+
"loss": 1.3271,
|
| 4254 |
+
"step": 3465
|
| 4255 |
+
},
|
| 4256 |
+
{
|
| 4257 |
+
"epoch": 10.84,
|
| 4258 |
+
"learning_rate": 8.100201265702836e-06,
|
| 4259 |
+
"loss": 1.2696,
|
| 4260 |
+
"step": 3470
|
| 4261 |
+
},
|
| 4262 |
+
{
|
| 4263 |
+
"epoch": 10.86,
|
| 4264 |
+
"learning_rate": 6.586334491731833e-06,
|
| 4265 |
+
"loss": 1.5138,
|
| 4266 |
+
"step": 3475
|
| 4267 |
+
},
|
| 4268 |
+
{
|
| 4269 |
+
"epoch": 10.88,
|
| 4270 |
+
"learning_rate": 5.221864069725821e-06,
|
| 4271 |
+
"loss": 1.344,
|
| 4272 |
+
"step": 3480
|
| 4273 |
+
},
|
| 4274 |
+
{
|
| 4275 |
+
"epoch": 10.89,
|
| 4276 |
+
"learning_rate": 4.010077128444735e-06,
|
| 4277 |
+
"loss": 1.3544,
|
| 4278 |
+
"step": 3485
|
| 4279 |
+
},
|
| 4280 |
+
{
|
| 4281 |
+
"epoch": 10.91,
|
| 4282 |
+
"learning_rate": 2.9538929687704825e-06,
|
| 4283 |
+
"loss": 1.6602,
|
| 4284 |
+
"step": 3490
|
| 4285 |
+
},
|
| 4286 |
+
{
|
| 4287 |
+
"epoch": 10.92,
|
| 4288 |
+
"learning_rate": 2.0558560308543213e-06,
|
| 4289 |
+
"loss": 1.3761,
|
| 4290 |
+
"step": 3495
|
| 4291 |
+
},
|
| 4292 |
+
{
|
| 4293 |
+
"epoch": 10.94,
|
| 4294 |
+
"learning_rate": 1.3181297643384459e-06,
|
| 4295 |
+
"loss": 1.3709,
|
| 4296 |
+
"step": 3500
|
| 4297 |
+
},
|
| 4298 |
+
{
|
| 4299 |
+
"epoch": 10.95,
|
| 4300 |
+
"learning_rate": 7.424914164160148e-07,
|
| 4301 |
+
"loss": 1.3595,
|
| 4302 |
+
"step": 3505
|
| 4303 |
+
},
|
| 4304 |
+
{
|
| 4305 |
+
"epoch": 10.97,
|
| 4306 |
+
"learning_rate": 3.303277502872983e-07,
|
| 4307 |
+
"loss": 1.4077,
|
| 4308 |
+
"step": 3510
|
| 4309 |
+
},
|
| 4310 |
+
{
|
| 4311 |
+
"epoch": 10.98,
|
| 4312 |
+
"learning_rate": 8.263170432518063e-08,
|
| 4313 |
+
"loss": 1.4356,
|
| 4314 |
+
"step": 3515
|
| 4315 |
+
},
|
| 4316 |
+
{
|
| 4317 |
+
"epoch": 11.0,
|
| 4318 |
+
"learning_rate": 0.0,
|
| 4319 |
+
"loss": 1.7243,
|
| 4320 |
+
"step": 3520
|
| 4321 |
+
},
|
| 4322 |
+
{
|
| 4323 |
+
"epoch": 11.0,
|
| 4324 |
+
"eval_loss": 1.1156859397888184,
|
| 4325 |
+
"eval_runtime": 5.2715,
|
| 4326 |
+
"eval_samples_per_second": 82.33,
|
| 4327 |
+
"eval_steps_per_second": 10.433,
|
| 4328 |
+
"step": 3520
|
| 4329 |
}
|
| 4330 |
],
|
| 4331 |
+
"max_steps": 3520,
|
| 4332 |
"num_train_epochs": 11,
|
| 4333 |
+
"total_flos": 3668148191232000.0,
|
| 4334 |
"trial_name": null,
|
| 4335 |
"trial_params": null
|
| 4336 |
}
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8998c8154106cd43a7d424edf953518beb4d146ebea8364f94c30b8bca6902f7
|
| 3 |
+
size 3247
|