Upload folder using huggingface_hub
Browse files- attention_kindselective_n_heads8_seed1341/args.json +1 -0
- attention_kindselective_n_heads8_seed1341/dataloader_02500.pt +3 -0
- attention_kindselective_n_heads8_seed1341/dataloader_05000.pt +3 -0
- attention_kindselective_n_heads8_seed1341/dataloader_07500.pt +3 -0
- attention_kindselective_n_heads8_seed1341/dataloader_09999.pt +3 -0
- attention_kindselective_n_heads8_seed1341/log2.txt +1209 -0
- attention_kindselective_n_heads8_seed1341/model_02500.pt +3 -0
- attention_kindselective_n_heads8_seed1341/model_05000.pt +3 -0
- attention_kindselective_n_heads8_seed1341/model_07500.pt +3 -0
- attention_kindselective_n_heads8_seed1341/model_09999.pt +3 -0
- attention_kindselective_n_heads8_seed1341/optimizer_02500.pt +3 -0
- attention_kindselective_n_heads8_seed1341/optimizer_05000.pt +3 -0
- attention_kindselective_n_heads8_seed1341/optimizer_07500.pt +3 -0
- attention_kindselective_n_heads8_seed1341/optimizer_09999.pt +3 -0
attention_kindselective_n_heads8_seed1341/args.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_6/attention_kindselective_n_heads8_seed1341", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 8, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_6", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1341, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 3e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "3e-5_10240_8_1341", "n_embd": 512}
|
attention_kindselective_n_heads8_seed1341/dataloader_02500.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b2ea67f78ff5a7970d0db044ff7ee527b3dc065f295fd30f588df4b44b568d0
|
| 3 |
+
size 964
|
attention_kindselective_n_heads8_seed1341/dataloader_05000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7f03ed2ebf741f15e13c79e6cc1e9a19b308450d81cc3b4d8d0338c63d77ca59
|
| 3 |
+
size 964
|
attention_kindselective_n_heads8_seed1341/dataloader_07500.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:82590037fb2eecbec961f7967a8dd1b8d85515d31a252f66b92b8139858a8b7c
|
| 3 |
+
size 964
|
attention_kindselective_n_heads8_seed1341/dataloader_09999.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c544303717d12355a69b8ffb1eb109434e4fdccfd5a61279b6e8ba2e870d6700
|
| 3 |
+
size 964
|
attention_kindselective_n_heads8_seed1341/log2.txt
ADDED
|
@@ -0,0 +1,1209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
max_steps: 10000
|
| 2 |
+
0 val loss 11.0680
|
| 3 |
+
0 val perplexity 64089.5781
|
| 4 |
+
0 val loss 11.0680
|
| 5 |
+
0 val perplexity 64089.5781
|
| 6 |
+
0 val loss 11.0680
|
| 7 |
+
0 val perplexity 64089.5781
|
| 8 |
+
0 val loss 11.0680
|
| 9 |
+
0 val perplexity 64089.5781
|
| 10 |
+
0 train 11.048663 (lr=1.0000e-07) (hash(x)=47078120)
|
| 11 |
+
0 train 11.048663 (lr=2.5000e-07) (hash(x)=47078120)
|
| 12 |
+
0 train 11.048663 (lr=3.5000e-07) (hash(x)=47078120)
|
| 13 |
+
0 train 11.048676 (lr=1.5000e-07) (hash(x)=47078120)
|
| 14 |
+
100 val loss 9.8889
|
| 15 |
+
100 val perplexity 19711.1328
|
| 16 |
+
100 train 9.874242 (lr=1.0100e-05) (hash(x)=43429388)
|
| 17 |
+
100 val loss 9.4787
|
| 18 |
+
100 val perplexity 13077.7705
|
| 19 |
+
100 val loss 9.7282
|
| 20 |
+
100 val perplexity 16783.9766
|
| 21 |
+
100 train 9.448478 (lr=3.5350e-05) (hash(x)=43429388)
|
| 22 |
+
100 val loss 9.5889
|
| 23 |
+
100 val perplexity 14601.5840
|
| 24 |
+
100 train 9.713765 (lr=1.5150e-05) (hash(x)=43429388)
|
| 25 |
+
100 train 9.563825 (lr=2.5250e-05) (hash(x)=43429388)
|
| 26 |
+
200 val loss 9.4174
|
| 27 |
+
200 val perplexity 12300.9268
|
| 28 |
+
200 train 9.477206 (lr=2.0000e-05) (hash(x)=52929681)
|
| 29 |
+
200 val loss 8.3919
|
| 30 |
+
200 val perplexity 4411.0986
|
| 31 |
+
200 val loss 8.0984
|
| 32 |
+
200 val perplexity 3289.3396
|
| 33 |
+
200 train 8.515907 (lr=5.0000e-05) (hash(x)=52929681)
|
| 34 |
+
200 val loss 9.1730
|
| 35 |
+
200 val perplexity 9633.6963
|
| 36 |
+
200 train 8.248425 (lr=7.0000e-05) (hash(x)=52929681)
|
| 37 |
+
200 train 9.244697 (lr=3.0000e-05) (hash(x)=52929681)
|
| 38 |
+
300 val loss 8.8455
|
| 39 |
+
300 val perplexity 6943.1738
|
| 40 |
+
300 train 8.874369 (lr=1.9995e-05) (hash(x)=49930367)
|
| 41 |
+
300 val loss 7.7464
|
| 42 |
+
300 val perplexity 2313.2307
|
| 43 |
+
300 train 7.757118 (lr=4.9988e-05) (hash(x)=49930367)
|
| 44 |
+
300 val loss 7.6647
|
| 45 |
+
300 val perplexity 2131.7766
|
| 46 |
+
300 val loss 8.2538
|
| 47 |
+
300 val perplexity 3842.1523
|
| 48 |
+
300 train 7.684315 (lr=6.9984e-05) (hash(x)=49930367)
|
| 49 |
+
300 train 8.276509 (lr=2.9993e-05) (hash(x)=49930367)
|
| 50 |
+
400 val loss 8.2382
|
| 51 |
+
400 val perplexity 3782.5420
|
| 52 |
+
400 train 8.121539 (lr=1.9982e-05) (hash(x)=48542946)
|
| 53 |
+
400 val loss 7.5870
|
| 54 |
+
400 val perplexity 1972.4014
|
| 55 |
+
400 train 7.386682 (lr=4.9954e-05) (hash(x)=48542946)
|
| 56 |
+
400 val loss 7.5243
|
| 57 |
+
400 val perplexity 1852.4517
|
| 58 |
+
400 val loss 7.8717
|
| 59 |
+
400 val perplexity 2621.8926
|
| 60 |
+
400 train 7.326735 (lr=6.9935e-05) (hash(x)=48542946)
|
| 61 |
+
400 train 7.709966 (lr=2.9972e-05) (hash(x)=48542946)
|
| 62 |
+
500 val loss 7.9134
|
| 63 |
+
500 val perplexity 2733.5874
|
| 64 |
+
500 train 8.054297 (lr=1.9958e-05) (hash(x)=55286048)
|
| 65 |
+
500 val loss 7.6640
|
| 66 |
+
500 val perplexity 2130.2502
|
| 67 |
+
500 val loss 7.5074
|
| 68 |
+
500 val perplexity 1821.5359
|
| 69 |
+
500 val loss 7.4518
|
| 70 |
+
500 val perplexity 1722.9878
|
| 71 |
+
500 train 7.837950 (lr=2.9938e-05) (hash(x)=55286048)
|
| 72 |
+
500 train 7.714735 (lr=4.9896e-05) (hash(x)=55286048)
|
| 73 |
+
500 train 7.667075 (lr=6.9854e-05) (hash(x)=55286048)
|
| 74 |
+
600 val loss 7.7274
|
| 75 |
+
600 val perplexity 2269.6853
|
| 76 |
+
600 train 7.917661 (lr=1.9926e-05) (hash(x)=51577760)
|
| 77 |
+
600 val loss 7.5601
|
| 78 |
+
600 val perplexity 1920.0514
|
| 79 |
+
600 train 7.793221 (lr=2.9889e-05) (hash(x)=51577760)
|
| 80 |
+
600 val loss 7.4491
|
| 81 |
+
600 val perplexity 1718.2578
|
| 82 |
+
600 val loss 7.3973
|
| 83 |
+
600 val perplexity 1631.6017
|
| 84 |
+
600 train 7.691466 (lr=4.9815e-05) (hash(x)=51577760)
|
| 85 |
+
600 train 7.632826 (lr=6.9741e-05) (hash(x)=51577760)
|
| 86 |
+
700 val loss 7.6300
|
| 87 |
+
700 val perplexity 2059.1208
|
| 88 |
+
700 train 7.758367 (lr=1.9885e-05) (hash(x)=57433471)
|
| 89 |
+
700 val loss 7.4870
|
| 90 |
+
700 val perplexity 1784.6235
|
| 91 |
+
700 train 7.647078 (lr=2.9827e-05) (hash(x)=57433471)
|
| 92 |
+
700 val loss 7.3347
|
| 93 |
+
700 val perplexity 1532.5104
|
| 94 |
+
700 train 7.507759 (lr=6.9596e-05) (hash(x)=57433471)
|
| 95 |
+
700 val loss 7.3973
|
| 96 |
+
700 val perplexity 1631.5386
|
| 97 |
+
700 train 7.570853 (lr=4.9712e-05) (hash(x)=57433471)
|
| 98 |
+
800 val loss 7.5663
|
| 99 |
+
800 val perplexity 1931.9979
|
| 100 |
+
800 train 7.500276 (lr=1.9834e-05) (hash(x)=49799291)
|
| 101 |
+
800 val loss 7.4398
|
| 102 |
+
800 val perplexity 1702.4897
|
| 103 |
+
800 train 7.366629 (lr=2.9751e-05) (hash(x)=49799291)
|
| 104 |
+
800 val loss 7.3000
|
| 105 |
+
800 val perplexity 1480.3511
|
| 106 |
+
800 train 7.210431 (lr=6.9419e-05) (hash(x)=49799291)
|
| 107 |
+
800 val loss 7.3632
|
| 108 |
+
800 val perplexity 1576.9290
|
| 109 |
+
800 train 7.281087 (lr=4.9585e-05) (hash(x)=49799291)
|
| 110 |
+
900 val loss 7.5209
|
| 111 |
+
900 val perplexity 1846.1362
|
| 112 |
+
900 train 7.526892 (lr=1.9774e-05) (hash(x)=49502839)
|
| 113 |
+
900 val loss 7.4191
|
| 114 |
+
900 val perplexity 1667.5374
|
| 115 |
+
900 train 7.422843 (lr=2.9662e-05) (hash(x)=49502839)
|
| 116 |
+
900 val loss 7.2387
|
| 117 |
+
900 val perplexity 1392.2841
|
| 118 |
+
900 train 7.215050 (lr=6.9210e-05) (hash(x)=49502839)
|
| 119 |
+
900 val loss 7.3366
|
| 120 |
+
900 val perplexity 1535.4969
|
| 121 |
+
900 train 7.333570 (lr=4.9436e-05) (hash(x)=49502839)
|
| 122 |
+
1000 val loss 7.4822
|
| 123 |
+
1000 val perplexity 1776.1934
|
| 124 |
+
1000 train 7.707015 (lr=1.9706e-05) (hash(x)=51142904)
|
| 125 |
+
1000 val loss 7.3874
|
| 126 |
+
1000 val perplexity 1615.5121
|
| 127 |
+
1000 train 7.649211 (lr=2.9558e-05) (hash(x)=51142904)
|
| 128 |
+
1000 val loss 7.1808
|
| 129 |
+
1000 val perplexity 1313.9176
|
| 130 |
+
1000 train 7.483747 (lr=6.8970e-05) (hash(x)=51142904)
|
| 131 |
+
1000 val loss 7.3008
|
| 132 |
+
1000 val perplexity 1481.4540
|
| 133 |
+
1000 train 7.578658 (lr=4.9264e-05) (hash(x)=51142904)
|
| 134 |
+
1100 val loss 7.4532
|
| 135 |
+
1100 val perplexity 1725.2938
|
| 136 |
+
1100 train 7.515146 (lr=1.9628e-05) (hash(x)=52751086)
|
| 137 |
+
1100 val loss 7.3649
|
| 138 |
+
1100 val perplexity 1579.5704
|
| 139 |
+
1100 train 7.431870 (lr=2.9442e-05) (hash(x)=52751086)
|
| 140 |
+
1100 val loss 7.1476
|
| 141 |
+
1100 val perplexity 1271.1102
|
| 142 |
+
1100 train 7.207986 (lr=6.8698e-05) (hash(x)=52751086)
|
| 143 |
+
1100 val loss 7.2667
|
| 144 |
+
1100 val perplexity 1431.8098
|
| 145 |
+
1100 train 7.324548 (lr=4.9070e-05) (hash(x)=52751086)
|
| 146 |
+
1200 val loss 7.4319
|
| 147 |
+
1200 val perplexity 1688.9976
|
| 148 |
+
1200 train 7.388729 (lr=1.9542e-05) (hash(x)=51538621)
|
| 149 |
+
1200 val loss 7.3563
|
| 150 |
+
1200 val perplexity 1566.0082
|
| 151 |
+
1200 train 7.301937 (lr=2.9312e-05) (hash(x)=51538621)
|
| 152 |
+
1200 val loss 7.1481
|
| 153 |
+
1200 val perplexity 1271.6545
|
| 154 |
+
1200 train 7.103180 (lr=6.8395e-05) (hash(x)=51538621)
|
| 155 |
+
1200 val loss 7.2521
|
| 156 |
+
1200 val perplexity 1411.0454
|
| 157 |
+
1200 train 7.200856 (lr=4.8854e-05) (hash(x)=51538621)
|
| 158 |
+
1300 val loss 7.4095
|
| 159 |
+
1300 val perplexity 1651.6470
|
| 160 |
+
1300 train 7.466151 (lr=1.9446e-05) (hash(x)=52034040)
|
| 161 |
+
1300 val loss 7.3408
|
| 162 |
+
1300 val perplexity 1541.8721
|
| 163 |
+
1300 train 7.412338 (lr=2.9169e-05) (hash(x)=52034040)
|
| 164 |
+
1300 val loss 7.0932
|
| 165 |
+
1300 val perplexity 1203.7023
|
| 166 |
+
1300 train 7.166739 (lr=6.8062e-05) (hash(x)=52034040)
|
| 167 |
+
1300 val loss 7.2236
|
| 168 |
+
1300 val perplexity 1371.4453
|
| 169 |
+
1300 train 7.305515 (lr=4.8616e-05) (hash(x)=52034040)
|
| 170 |
+
1400 val loss 7.3843
|
| 171 |
+
1400 val perplexity 1610.4229
|
| 172 |
+
1400 train 7.408330 (lr=1.9342e-05) (hash(x)=50640105)
|
| 173 |
+
1400 val loss 7.3167
|
| 174 |
+
1400 val perplexity 1505.1840
|
| 175 |
+
1400 train 7.352315 (lr=2.9013e-05) (hash(x)=50640105)
|
| 176 |
+
1400 val loss 7.0789
|
| 177 |
+
1400 val perplexity 1186.6193
|
| 178 |
+
1400 train 7.134275 (lr=6.7698e-05) (hash(x)=50640105)
|
| 179 |
+
1400 val loss 7.2080
|
| 180 |
+
1400 val perplexity 1350.1521
|
| 181 |
+
1400 train 7.266603 (lr=4.8356e-05) (hash(x)=50640105)
|
| 182 |
+
1500 val loss 7.3619
|
| 183 |
+
1500 val perplexity 1574.9045
|
| 184 |
+
1500 train 7.287918 (lr=1.9230e-05) (hash(x)=49016270)
|
| 185 |
+
1500 val loss 7.2903
|
| 186 |
+
1500 val perplexity 1465.9382
|
| 187 |
+
1500 train 7.219070 (lr=2.8845e-05) (hash(x)=49016270)
|
| 188 |
+
1500 val loss 7.0480
|
| 189 |
+
1500 val perplexity 1150.5200
|
| 190 |
+
1500 train 6.976660 (lr=6.7304e-05) (hash(x)=49016270)
|
| 191 |
+
1500 val loss 7.1885
|
| 192 |
+
1500 val perplexity 1324.1281
|
| 193 |
+
1500 train 7.115997 (lr=4.8074e-05) (hash(x)=49016270)
|
| 194 |
+
1600 val loss 7.3388
|
| 195 |
+
1600 val perplexity 1538.8591
|
| 196 |
+
1600 train 7.111701 (lr=1.9109e-05) (hash(x)=46100488)
|
| 197 |
+
1600 val loss 7.2736
|
| 198 |
+
1600 val perplexity 1441.7178
|
| 199 |
+
1600 train 7.037468 (lr=2.8663e-05) (hash(x)=46100488)
|
| 200 |
+
1600 val loss 7.0191
|
| 201 |
+
1600 val perplexity 1117.7505
|
| 202 |
+
1600 train 6.790306 (lr=6.6881e-05) (hash(x)=46100488)
|
| 203 |
+
1600 val loss 7.1701
|
| 204 |
+
1600 val perplexity 1299.9185
|
| 205 |
+
1600 train 6.944131 (lr=4.7772e-05) (hash(x)=46100488)
|
| 206 |
+
1700 val loss 7.3212
|
| 207 |
+
1700 val perplexity 1512.0359
|
| 208 |
+
1700 train 7.369852 (lr=1.8979e-05) (hash(x)=49185350)
|
| 209 |
+
1700 val loss 7.2640
|
| 210 |
+
1700 val perplexity 1427.8956
|
| 211 |
+
1700 train 7.317546 (lr=2.8469e-05) (hash(x)=49185350)
|
| 212 |
+
1700 val loss 7.0044
|
| 213 |
+
1700 val perplexity 1101.4866
|
| 214 |
+
1700 train 7.065816 (lr=6.6428e-05) (hash(x)=49185350)
|
| 215 |
+
1700 val loss 7.1566
|
| 216 |
+
1700 val perplexity 1282.5233
|
| 217 |
+
1700 train 7.217639 (lr=4.7448e-05) (hash(x)=49185350)
|
| 218 |
+
1800 val loss 7.3080
|
| 219 |
+
1800 val perplexity 1492.1678
|
| 220 |
+
1800 train 7.221244 (lr=1.8842e-05) (hash(x)=48024574)
|
| 221 |
+
1800 val loss 7.2471
|
| 222 |
+
1800 val perplexity 1403.9862
|
| 223 |
+
1800 train 7.151842 (lr=2.8263e-05) (hash(x)=48024574)
|
| 224 |
+
1800 val loss 6.9785
|
| 225 |
+
1800 val perplexity 1073.2799
|
| 226 |
+
1800 train 6.877046 (lr=6.5947e-05) (hash(x)=48024574)
|
| 227 |
+
1800 val loss 7.1461
|
| 228 |
+
1800 val perplexity 1269.1606
|
| 229 |
+
1900 val loss 7.2891
|
| 230 |
+
1900 val perplexity 1464.3209
|
| 231 |
+
1800 train 7.040640 (lr=4.7105e-05) (hash(x)=48024574)
|
| 232 |
+
1900 train 7.115648 (lr=1.8696e-05) (hash(x)=45823189)
|
| 233 |
+
1900 val loss 7.2267
|
| 234 |
+
1900 val perplexity 1375.6698
|
| 235 |
+
1900 train 7.034698 (lr=2.8044e-05) (hash(x)=45823189)
|
| 236 |
+
1900 val loss 6.9555
|
| 237 |
+
1900 val perplexity 1048.9285
|
| 238 |
+
1900 train 6.739227 (lr=6.5437e-05) (hash(x)=45823189)
|
| 239 |
+
2000 val loss 7.2767
|
| 240 |
+
2000 val perplexity 1446.2552
|
| 241 |
+
2000 train 7.108987 (lr=1.8543e-05) (hash(x)=45703932)
|
| 242 |
+
1900 val loss 7.1339
|
| 243 |
+
1900 val perplexity 1253.7621
|
| 244 |
+
1900 train 6.941078 (lr=4.6741e-05) (hash(x)=45823189)
|
| 245 |
+
2000 val loss 7.2035
|
| 246 |
+
2000 val perplexity 1344.1517
|
| 247 |
+
2000 train 7.027834 (lr=2.7814e-05) (hash(x)=45703932)
|
| 248 |
+
2000 val loss 6.9407
|
| 249 |
+
2000 val perplexity 1033.5043
|
| 250 |
+
2000 train 6.743909 (lr=6.4900e-05) (hash(x)=45703932)
|
| 251 |
+
2100 val loss 7.2564
|
| 252 |
+
2100 val perplexity 1417.1031
|
| 253 |
+
2100 train 7.887122 (lr=1.8382e-05) (hash(x)=58570170)
|
| 254 |
+
2000 val loss 7.1274
|
| 255 |
+
2000 val perplexity 1245.6257
|
| 256 |
+
2000 train 6.943665 (lr=4.6357e-05) (hash(x)=45703932)
|
| 257 |
+
2100 val loss 7.1834
|
| 258 |
+
2100 val perplexity 1317.3862
|
| 259 |
+
2100 train 7.836891 (lr=2.7572e-05) (hash(x)=58570170)
|
| 260 |
+
2100 val loss 6.9112
|
| 261 |
+
2100 val perplexity 1003.4402
|
| 262 |
+
2100 train 7.606760 (lr=6.4335e-05) (hash(x)=58570170)
|
| 263 |
+
2200 val loss 7.2513
|
| 264 |
+
2200 val perplexity 1409.9761
|
| 265 |
+
2200 train 7.275453 (lr=1.8213e-05) (hash(x)=55262880)
|
| 266 |
+
2100 val loss 7.1219
|
| 267 |
+
2100 val perplexity 1238.8298
|
| 268 |
+
2100 train 7.786014 (lr=4.5954e-05) (hash(x)=58570170)
|
| 269 |
+
2200 val loss 7.1710
|
| 270 |
+
2200 val perplexity 1301.1643
|
| 271 |
+
2200 train 7.189610 (lr=2.7319e-05) (hash(x)=55262880)
|
| 272 |
+
2200 val loss 6.9619
|
| 273 |
+
2200 val perplexity 1055.6140
|
| 274 |
+
2200 train 6.981012 (lr=6.3745e-05) (hash(x)=55262880)
|
| 275 |
+
2300 val loss 7.2239
|
| 276 |
+
2300 val perplexity 1371.8573
|
| 277 |
+
2300 train 6.981198 (lr=1.8036e-05) (hash(x)=46415497)
|
| 278 |
+
2200 val loss 7.1250
|
| 279 |
+
2200 val perplexity 1242.6696
|
| 280 |
+
2200 train 7.142995 (lr=4.5532e-05) (hash(x)=55262880)
|
| 281 |
+
2300 val loss 7.1466
|
| 282 |
+
2300 val perplexity 1269.8204
|
| 283 |
+
2300 train 6.890901 (lr=2.7055e-05) (hash(x)=46415497)
|
| 284 |
+
2400 val loss 7.2045
|
| 285 |
+
2400 val perplexity 1345.4766
|
| 286 |
+
2400 train 7.125453 (lr=1.7853e-05) (hash(x)=49272278)
|
| 287 |
+
2300 val loss 6.8938
|
| 288 |
+
2300 val perplexity 986.1673
|
| 289 |
+
2300 train 6.622870 (lr=6.3128e-05) (hash(x)=46415497)
|
| 290 |
+
2300 val loss 7.1014
|
| 291 |
+
2300 val perplexity 1213.7175
|
| 292 |
+
2300 train 6.864444 (lr=4.5091e-05) (hash(x)=46415497)
|
| 293 |
+
2400 val loss 7.1125
|
| 294 |
+
2400 val perplexity 1227.2383
|
| 295 |
+
2400 train 7.022389 (lr=2.6780e-05) (hash(x)=49272278)
|
| 296 |
+
2500 val loss 7.1825
|
| 297 |
+
2500 val perplexity 1316.1329
|
| 298 |
+
2500 train 7.040866 (lr=1.7663e-05) (hash(x)=48390803)
|
| 299 |
+
2400 val loss 6.8477
|
| 300 |
+
2400 val perplexity 941.7234
|
| 301 |
+
2400 train 6.740791 (lr=6.2486e-05) (hash(x)=49272278)
|
| 302 |
+
2400 val loss 7.0815
|
| 303 |
+
2400 val perplexity 1189.7360
|
| 304 |
+
2400 train 6.993971 (lr=4.4633e-05) (hash(x)=49272278)
|
| 305 |
+
2500 val loss 7.0937
|
| 306 |
+
2500 val perplexity 1204.3481
|
| 307 |
+
2500 train 6.950256 (lr=2.6494e-05) (hash(x)=48390803)
|
| 308 |
+
2500 val loss 6.8303
|
| 309 |
+
2500 val perplexity 925.4961
|
| 310 |
+
2600 val loss 7.1665
|
| 311 |
+
2600 val perplexity 1295.2786
|
| 312 |
+
2600 train 7.031013 (lr=1.7465e-05) (hash(x)=47450116)
|
| 313 |
+
2500 train 6.699334 (lr=6.1819e-05) (hash(x)=48390803)
|
| 314 |
+
2500 val loss 7.0623
|
| 315 |
+
2500 val perplexity 1167.1173
|
| 316 |
+
2500 train 6.923334 (lr=4.4156e-05) (hash(x)=48390803)
|
| 317 |
+
2600 val loss 7.0709
|
| 318 |
+
2600 val perplexity 1177.1604
|
| 319 |
+
2600 train 6.918849 (lr=2.6198e-05) (hash(x)=47450116)
|
| 320 |
+
2700 val loss 7.1567
|
| 321 |
+
2700 val perplexity 1282.6627
|
| 322 |
+
2700 train 7.217981 (lr=1.7261e-05) (hash(x)=52681152)
|
| 323 |
+
2600 val loss 6.8076
|
| 324 |
+
2600 val perplexity 904.6650
|
| 325 |
+
2600 train 6.673070 (lr=6.1128e-05) (hash(x)=47450116)
|
| 326 |
+
2600 val loss 7.0478
|
| 327 |
+
2600 val perplexity 1150.3018
|
| 328 |
+
2600 train 6.902493 (lr=4.3663e-05) (hash(x)=47450116)
|
| 329 |
+
2700 val loss 7.0496
|
| 330 |
+
2700 val perplexity 1152.3412
|
| 331 |
+
2700 train 7.117827 (lr=2.5892e-05) (hash(x)=52681152)
|
| 332 |
+
2800 val loss 7.1363
|
| 333 |
+
2800 val perplexity 1256.7675
|
| 334 |
+
2800 train 7.209763 (lr=1.7051e-05) (hash(x)=50664094)
|
| 335 |
+
2700 val loss 6.7852
|
| 336 |
+
2700 val perplexity 884.6849
|
| 337 |
+
2700 train 6.864418 (lr=6.0414e-05) (hash(x)=52681152)
|
| 338 |
+
2700 val loss 7.0395
|
| 339 |
+
2700 val perplexity 1140.7711
|
| 340 |
+
2700 train 7.108398 (lr=4.3153e-05) (hash(x)=52681152)
|
| 341 |
+
2800 val loss 7.0391
|
| 342 |
+
2800 val perplexity 1140.3605
|
| 343 |
+
2800 train 7.111782 (lr=2.5576e-05) (hash(x)=50664094)
|
| 344 |
+
2900 val loss 7.1164
|
| 345 |
+
2900 val perplexity 1231.9611
|
| 346 |
+
2900 train 6.889511 (lr=1.6834e-05) (hash(x)=47067144)
|
| 347 |
+
2800 val loss 6.7762
|
| 348 |
+
2800 val perplexity 876.7094
|
| 349 |
+
2800 train 6.850021 (lr=5.9677e-05) (hash(x)=50664094)
|
| 350 |
+
2800 val loss 7.0292
|
| 351 |
+
2800 val perplexity 1129.1125
|
| 352 |
+
2800 train 7.094847 (lr=4.2627e-05) (hash(x)=50664094)
|
| 353 |
+
2900 val loss 7.0108
|
| 354 |
+
2900 val perplexity 1108.5176
|
| 355 |
+
2900 train 6.776527 (lr=2.5251e-05) (hash(x)=47067144)
|
| 356 |
+
3000 val loss 7.0995
|
| 357 |
+
3000 val perplexity 1211.4059
|
| 358 |
+
3000 train 6.896778 (lr=1.6611e-05) (hash(x)=45015009)
|
| 359 |
+
2900 val loss 6.7621
|
| 360 |
+
2900 val perplexity 864.4884
|
| 361 |
+
2900 train 6.564690 (lr=5.8919e-05) (hash(x)=47067144)
|
| 362 |
+
2900 val loss 7.0175
|
| 363 |
+
2900 val perplexity 1115.9957
|
| 364 |
+
2900 train 6.791795 (lr=4.2085e-05) (hash(x)=47067144)
|
| 365 |
+
3000 val loss 6.9916
|
| 366 |
+
3000 val perplexity 1087.4424
|
| 367 |
+
3000 train 6.785407 (lr=2.4917e-05) (hash(x)=45015009)
|
| 368 |
+
3100 val loss 7.0851
|
| 369 |
+
3100 val perplexity 1194.0815
|
| 370 |
+
3100 train 6.896208 (lr=1.6383e-05) (hash(x)=45245896)
|
| 371 |
+
3000 val loss 6.7417
|
| 372 |
+
3000 val perplexity 847.0032
|
| 373 |
+
3000 train 6.542316 (lr=5.8140e-05) (hash(x)=45015009)
|
| 374 |
+
3000 val loss 7.0100
|
| 375 |
+
3000 val perplexity 1107.6675
|
| 376 |
+
3000 train 6.804945 (lr=4.1529e-05) (hash(x)=45015009)
|
| 377 |
+
3100 val loss 6.9760
|
| 378 |
+
3100 val perplexity 1070.6133
|
| 379 |
+
3100 train 6.779830 (lr=2.4574e-05) (hash(x)=45245896)
|
| 380 |
+
3200 val loss 7.0673
|
| 381 |
+
3200 val perplexity 1172.9346
|
| 382 |
+
3200 train 7.026747 (lr=1.6149e-05) (hash(x)=49995942)
|
| 383 |
+
3100 val loss 6.7318
|
| 384 |
+
3100 val perplexity 838.6891
|
| 385 |
+
3100 train 6.545393 (lr=5.7340e-05) (hash(x)=45245896)
|
| 386 |
+
3100 val loss 6.9952
|
| 387 |
+
3100 val perplexity 1091.3998
|
| 388 |
+
3100 train 6.805078 (lr=4.0957e-05) (hash(x)=45245896)
|
| 389 |
+
3200 val loss 6.9604
|
| 390 |
+
3200 val perplexity 1054.0768
|
| 391 |
+
3200 train 6.913863 (lr=2.4224e-05) (hash(x)=49995942)
|
| 392 |
+
3300 val loss 7.0569
|
| 393 |
+
3300 val perplexity 1160.8708
|
| 394 |
+
3300 train 6.930760 (lr=1.5910e-05) (hash(x)=52311504)
|
| 395 |
+
3200 val loss 6.7166
|
| 396 |
+
3200 val perplexity 825.9999
|
| 397 |
+
3200 train 6.681448 (lr=5.6522e-05) (hash(x)=49995942)
|
| 398 |
+
3200 val loss 6.9841
|
| 399 |
+
3200 val perplexity 1079.2999
|
| 400 |
+
3200 train 6.941720 (lr=4.0373e-05) (hash(x)=49995942)
|
| 401 |
+
3300 val loss 6.9465
|
| 402 |
+
3300 val perplexity 1039.4871
|
| 403 |
+
3300 train 6.819232 (lr=2.3865e-05) (hash(x)=52311504)
|
| 404 |
+
3400 val loss 7.0490
|
| 405 |
+
3400 val perplexity 1151.7167
|
| 406 |
+
3400 train 7.042104 (lr=1.5666e-05) (hash(x)=44332917)
|
| 407 |
+
3300 val loss 6.7106
|
| 408 |
+
3300 val perplexity 821.0791
|
| 409 |
+
3300 train 6.580021 (lr=5.5684e-05) (hash(x)=52311504)
|
| 410 |
+
3300 val loss 6.9826
|
| 411 |
+
3300 val perplexity 1077.7489
|
| 412 |
+
3300 train 6.858939 (lr=3.9775e-05) (hash(x)=52311504)
|
| 413 |
+
3400 val loss 6.9332
|
| 414 |
+
3400 val perplexity 1025.7731
|
| 415 |
+
3400 train 6.936850 (lr=2.3498e-05) (hash(x)=44332917)
|
| 416 |
+
3500 val loss 7.0323
|
| 417 |
+
3500 val perplexity 1132.6359
|
| 418 |
+
3500 train 7.139286 (lr=1.5416e-05) (hash(x)=56517159)
|
| 419 |
+
3400 val loss 6.7010
|
| 420 |
+
3400 val perplexity 813.2463
|
| 421 |
+
3400 train 6.689954 (lr=5.4829e-05) (hash(x)=44332917)
|
| 422 |
+
3400 val loss 6.9820
|
| 423 |
+
3400 val perplexity 1077.1057
|
| 424 |
+
3400 train 6.967134 (lr=3.9164e-05) (hash(x)=44332917)
|
| 425 |
+
3500 val loss 6.9179
|
| 426 |
+
3500 val perplexity 1010.1714
|
| 427 |
+
3500 train 7.036409 (lr=2.3125e-05) (hash(x)=56517159)
|
| 428 |
+
3600 val loss 7.0287
|
| 429 |
+
3600 val perplexity 1128.5656
|
| 430 |
+
3600 train 6.907561 (lr=1.5163e-05) (hash(x)=50720920)
|
| 431 |
+
3500 val loss 6.6896
|
| 432 |
+
3500 val perplexity 803.9983
|
| 433 |
+
3500 train 6.777810 (lr=5.3958e-05) (hash(x)=56517159)
|
| 434 |
+
3500 val loss 6.9596
|
| 435 |
+
3500 val perplexity 1053.2172
|
| 436 |
+
3500 train 7.056179 (lr=3.8541e-05) (hash(x)=56517159)
|
| 437 |
+
3600 val loss 6.9100
|
| 438 |
+
3600 val perplexity 1002.2887
|
| 439 |
+
3600 train 6.787700 (lr=2.2744e-05) (hash(x)=50720920)
|
| 440 |
+
3700 val loss 7.0122
|
| 441 |
+
3700 val perplexity 1110.0436
|
| 442 |
+
3700 train 7.504549 (lr=1.4905e-05) (hash(x)=62727701)
|
| 443 |
+
3600 val loss 6.6984
|
| 444 |
+
3600 val perplexity 811.0815
|
| 445 |
+
3600 train 6.556833 (lr=5.3070e-05) (hash(x)=50720920)
|
| 446 |
+
3600 val loss 6.9576
|
| 447 |
+
3600 val perplexity 1051.0614
|
| 448 |
+
3600 train 6.823028 (lr=3.7907e-05) (hash(x)=50720920)
|
| 449 |
+
3700 val loss 6.8997
|
| 450 |
+
3700 val perplexity 991.9734
|
| 451 |
+
3700 train 7.375038 (lr=2.2357e-05) (hash(x)=62727701)
|
| 452 |
+
3800 val loss 7.0017
|
| 453 |
+
3800 val perplexity 1098.4895
|
| 454 |
+
3800 train 6.844926 (lr=1.4643e-05) (hash(x)=54772539)
|
| 455 |
+
3700 val loss 6.6738
|
| 456 |
+
3700 val perplexity 791.3810
|
| 457 |
+
3700 train 7.154129 (lr=5.2167e-05) (hash(x)=62727701)
|
| 458 |
+
3700 val loss 6.9509
|
| 459 |
+
3700 val perplexity 1044.1140
|
| 460 |
+
3700 train 7.432554 (lr=3.7262e-05) (hash(x)=62727701)
|
| 461 |
+
3800 val loss 6.8858
|
| 462 |
+
3800 val perplexity 978.2759
|
| 463 |
+
3800 train 6.742421 (lr=2.1965e-05) (hash(x)=54772539)
|
| 464 |
+
3900 val loss 6.9911
|
| 465 |
+
3900 val perplexity 1086.9178
|
| 466 |
+
3900 train 6.996634 (lr=1.4377e-05) (hash(x)=52274485)
|
| 467 |
+
3800 val loss 6.6617
|
| 468 |
+
3800 val perplexity 781.8557
|
| 469 |
+
3800 train 6.517062 (lr=5.1251e-05) (hash(x)=54772539)
|
| 470 |
+
3800 val loss 6.9500
|
| 471 |
+
3800 val perplexity 1043.1793
|
| 472 |
+
3800 train 6.794149 (lr=3.6608e-05) (hash(x)=54772539)
|
| 473 |
+
3900 val loss 6.8733
|
| 474 |
+
3900 val perplexity 966.1602
|
| 475 |
+
3900 train 6.879457 (lr=2.1566e-05) (hash(x)=52274485)
|
| 476 |
+
4000 val loss 6.9853
|
| 477 |
+
4000 val perplexity 1080.6085
|
| 478 |
+
4000 train 6.692829 (lr=1.4108e-05) (hash(x)=50118307)
|
| 479 |
+
3900 val loss 6.6532
|
| 480 |
+
3900 val perplexity 775.2562
|
| 481 |
+
3900 train 6.649825 (lr=5.0321e-05) (hash(x)=52274485)
|
| 482 |
+
4000 val loss 6.8697
|
| 483 |
+
4000 val perplexity 962.6872
|
| 484 |
+
3900 val loss 6.9389
|
| 485 |
+
3900 val perplexity 1031.6731
|
| 486 |
+
4000 train 6.566363 (lr=2.1162e-05) (hash(x)=50118307)
|
| 487 |
+
3900 train 6.947895 (lr=3.5944e-05) (hash(x)=52274485)
|
| 488 |
+
4100 val loss 6.9691
|
| 489 |
+
4100 val perplexity 1063.2795
|
| 490 |
+
4100 train 6.554935 (lr=1.3836e-05) (hash(x)=42771647)
|
| 491 |
+
4000 val loss 6.6577
|
| 492 |
+
4000 val perplexity 778.7485
|
| 493 |
+
4000 train 6.355518 (lr=4.9379e-05) (hash(x)=50118307)
|
| 494 |
+
4100 val loss 6.8494
|
| 495 |
+
4100 val perplexity 943.3229
|
| 496 |
+
4100 train 6.423832 (lr=2.0754e-05) (hash(x)=42771647)
|
| 497 |
+
4000 val loss 6.9340
|
| 498 |
+
4000 val perplexity 1026.6221
|
| 499 |
+
4000 train 6.642421 (lr=3.5271e-05) (hash(x)=50118307)
|
| 500 |
+
4200 val loss 6.9589
|
| 501 |
+
4200 val perplexity 1052.4305
|
| 502 |
+
4200 train 7.068559 (lr=1.3561e-05) (hash(x)=51748836)
|
| 503 |
+
4100 val loss 6.6376
|
| 504 |
+
4100 val perplexity 763.2496
|
| 505 |
+
4100 train 6.206556 (lr=4.8426e-05) (hash(x)=42771647)
|
| 506 |
+
4200 val loss 6.8396
|
| 507 |
+
4200 val perplexity 934.1204
|
| 508 |
+
4200 train 6.950802 (lr=2.0341e-05) (hash(x)=51748836)
|
| 509 |
+
4100 val loss 6.9343
|
| 510 |
+
4100 val perplexity 1026.8899
|
| 511 |
+
4100 train 6.515369 (lr=3.4590e-05) (hash(x)=42771647)
|
| 512 |
+
4300 val loss 6.9447
|
| 513 |
+
4300 val perplexity 1037.6448
|
| 514 |
+
4300 train 6.956426 (lr=1.3283e-05) (hash(x)=49021280)
|
| 515 |
+
4200 val loss 6.6271
|
| 516 |
+
4200 val perplexity 755.2573
|
| 517 |
+
4200 train 6.723958 (lr=4.7463e-05) (hash(x)=51748836)
|
| 518 |
+
4300 val loss 6.8180
|
| 519 |
+
4300 val perplexity 914.1560
|
| 520 |
+
4300 train 6.831802 (lr=1.9924e-05) (hash(x)=49021280)
|
| 521 |
+
4400 val loss 6.9332
|
| 522 |
+
4400 val perplexity 1025.7461
|
| 523 |
+
4400 train 7.020908 (lr=1.3003e-05) (hash(x)=55200309)
|
| 524 |
+
4200 val loss 6.9201
|
| 525 |
+
4200 val perplexity 1012.4133
|
| 526 |
+
4200 train 7.022127 (lr=3.3902e-05) (hash(x)=51748836)
|
| 527 |
+
4300 val loss 6.6103
|
| 528 |
+
4300 val perplexity 742.6902
|
| 529 |
+
4300 train 6.629007 (lr=4.6490e-05) (hash(x)=49021280)
|
| 530 |
+
4400 val loss 6.7991
|
| 531 |
+
4400 val perplexity 897.0301
|
| 532 |
+
4400 train 6.883133 (lr=1.9504e-05) (hash(x)=55200309)
|
| 533 |
+
4500 val loss 6.9237
|
| 534 |
+
4500 val perplexity 1016.1121
|
| 535 |
+
4500 train 7.022612 (lr=1.2720e-05) (hash(x)=52085049)
|
| 536 |
+
4300 val loss 6.9037
|
| 537 |
+
4300 val perplexity 995.9319
|
| 538 |
+
4300 train 6.916924 (lr=3.3207e-05) (hash(x)=49021280)
|
| 539 |
+
4400 val loss 6.6076
|
| 540 |
+
4400 val perplexity 740.7267
|
| 541 |
+
4400 train 6.694966 (lr=4.5509e-05) (hash(x)=55200309)
|
| 542 |
+
4500 val loss 6.7862
|
| 543 |
+
4500 val perplexity 885.5211
|
| 544 |
+
4500 train 6.876517 (lr=1.9081e-05) (hash(x)=52085049)
|
| 545 |
+
4600 val loss 6.9170
|
| 546 |
+
4600 val perplexity 1009.3023
|
| 547 |
+
4600 train 6.909331 (lr=1.2436e-05) (hash(x)=48935595)
|
| 548 |
+
4400 val loss 6.8934
|
| 549 |
+
4400 val perplexity 985.7658
|
| 550 |
+
4400 train 6.987580 (lr=3.2507e-05) (hash(x)=55200309)
|
| 551 |
+
4500 val loss 6.5932
|
| 552 |
+
4500 val perplexity 730.0834
|
| 553 |
+
4500 train 6.665695 (lr=4.4521e-05) (hash(x)=52085049)
|
| 554 |
+
4600 val loss 6.7758
|
| 555 |
+
4600 val perplexity 876.3883
|
| 556 |
+
4600 train 6.771263 (lr=1.8655e-05) (hash(x)=48935595)
|
| 557 |
+
4700 val loss 6.9094
|
| 558 |
+
4700 val perplexity 1001.5992
|
| 559 |
+
4700 train 7.105185 (lr=1.2151e-05) (hash(x)=49182380)
|
| 560 |
+
4500 val loss 6.8863
|
| 561 |
+
4500 val perplexity 978.7505
|
| 562 |
+
4500 train 6.974458 (lr=3.1801e-05) (hash(x)=52085049)
|
| 563 |
+
4600 val loss 6.5769
|
| 564 |
+
4600 val perplexity 718.3427
|
| 565 |
+
4600 train 6.586899 (lr=4.3527e-05) (hash(x)=48935595)
|
| 566 |
+
4800 val loss 6.8964
|
| 567 |
+
4800 val perplexity 988.7254
|
| 568 |
+
4700 val loss 6.7589
|
| 569 |
+
4700 val perplexity 861.7277
|
| 570 |
+
4800 train 6.694540 (lr=1.1864e-05) (hash(x)=43941929)
|
| 571 |
+
4700 train 6.979102 (lr=1.8226e-05) (hash(x)=49182380)
|
| 572 |
+
4600 val loss 6.8921
|
| 573 |
+
4600 val perplexity 984.4585
|
| 574 |
+
4600 train 6.891096 (lr=3.1091e-05) (hash(x)=48935595)
|
| 575 |
+
4700 val loss 6.5731
|
| 576 |
+
4700 val perplexity 715.5541
|
| 577 |
+
4700 train 6.820342 (lr=4.2528e-05) (hash(x)=49182380)
|
| 578 |
+
4800 val loss 6.7492
|
| 579 |
+
4800 val perplexity 853.3676
|
| 580 |
+
4900 val loss 6.8910
|
| 581 |
+
4900 val perplexity 983.3855
|
| 582 |
+
4900 train 7.058427 (lr=1.1577e-05) (hash(x)=51852773)
|
| 583 |
+
4800 train 6.550427 (lr=1.7796e-05) (hash(x)=43941929)
|
| 584 |
+
4700 val loss 6.8718
|
| 585 |
+
4700 val perplexity 964.7123
|
| 586 |
+
4700 train 7.057541 (lr=3.0377e-05) (hash(x)=49182380)
|
| 587 |
+
4800 val loss 6.5603
|
| 588 |
+
4800 val perplexity 706.4680
|
| 589 |
+
4800 train 6.383238 (lr=4.1525e-05) (hash(x)=43941929)
|
| 590 |
+
5000 val loss 6.8868
|
| 591 |
+
5000 val perplexity 979.2640
|
| 592 |
+
4900 val loss 6.7417
|
| 593 |
+
4900 val perplexity 846.9903
|
| 594 |
+
4900 train 6.903447 (lr=1.7365e-05) (hash(x)=51852773)
|
| 595 |
+
5000 train 6.477450 (lr=1.1288e-05) (hash(x)=40509616)
|
| 596 |
+
4800 val loss 6.8656
|
| 597 |
+
4800 val perplexity 958.6776
|
| 598 |
+
4800 train 6.658672 (lr=2.9661e-05) (hash(x)=43941929)
|
| 599 |
+
4900 val loss 6.5601
|
| 600 |
+
4900 val perplexity 706.3693
|
| 601 |
+
4900 train 6.714787 (lr=4.0518e-05) (hash(x)=51852773)
|
| 602 |
+
5000 val loss 6.7326
|
| 603 |
+
5000 val perplexity 839.3625
|
| 604 |
+
5100 val loss 6.8764
|
| 605 |
+
5100 val perplexity 969.1464
|
| 606 |
+
5100 train 7.171636 (lr=1.1000e-05) (hash(x)=57585369)
|
| 607 |
+
5000 train 6.303533 (lr=1.6933e-05) (hash(x)=40509616)
|
| 608 |
+
4900 val loss 6.8608
|
| 609 |
+
4900 val perplexity 954.1130
|
| 610 |
+
4900 train 7.019123 (lr=2.8942e-05) (hash(x)=51852773)
|
| 611 |
+
5000 val loss 6.5599
|
| 612 |
+
5000 val perplexity 706.1871
|
| 613 |
+
5000 train 6.147030 (lr=3.9510e-05) (hash(x)=40509616)
|
| 614 |
+
5200 val loss 6.8709
|
| 615 |
+
5200 val perplexity 963.7726
|
| 616 |
+
5200 train 6.894734 (lr=1.0712e-05) (hash(x)=51042313)
|
| 617 |
+
5100 val loss 6.7199
|
| 618 |
+
5100 val perplexity 828.7194
|
| 619 |
+
5100 train 6.972271 (lr=1.6500e-05) (hash(x)=57585369)
|
| 620 |
+
5000 val loss 6.8607
|
| 621 |
+
5000 val perplexity 954.0266
|
| 622 |
+
5000 train 6.446211 (lr=2.8221e-05) (hash(x)=40509616)
|
| 623 |
+
5100 val loss 6.5459
|
| 624 |
+
5100 val perplexity 696.3658
|
| 625 |
+
5100 train 6.742117 (lr=3.8500e-05) (hash(x)=57585369)
|
| 626 |
+
5300 val loss 6.8614
|
| 627 |
+
5300 val perplexity 954.7220
|
| 628 |
+
5300 train 7.021107 (lr=1.0423e-05) (hash(x)=52001684)
|
| 629 |
+
5200 val loss 6.7183
|
| 630 |
+
5200 val perplexity 827.4274
|
| 631 |
+
5200 train 6.726495 (lr=1.6067e-05) (hash(x)=51042313)
|
| 632 |
+
5100 val loss 6.8445
|
| 633 |
+
5100 val perplexity 938.6766
|
| 634 |
+
5100 train 7.109215 (lr=2.7500e-05) (hash(x)=57585369)
|
| 635 |
+
5200 val loss 6.5457
|
| 636 |
+
5200 val perplexity 696.2642
|
| 637 |
+
5400 val loss 6.8580
|
| 638 |
+
5400 val perplexity 951.4784
|
| 639 |
+
5200 train 6.539655 (lr=3.7490e-05) (hash(x)=51042313)
|
| 640 |
+
5400 train 6.782125 (lr=1.0136e-05) (hash(x)=48831647)
|
| 641 |
+
5300 val loss 6.7052
|
| 642 |
+
5300 val perplexity 816.6333
|
| 643 |
+
5300 train 6.861244 (lr=1.5635e-05) (hash(x)=52001684)
|
| 644 |
+
5200 val loss 6.8397
|
| 645 |
+
5200 val perplexity 934.1863
|
| 646 |
+
5200 train 6.856593 (lr=2.6779e-05) (hash(x)=51042313)
|
| 647 |
+
5500 val loss 6.8556
|
| 648 |
+
5500 val perplexity 949.1456
|
| 649 |
+
5500 train 7.206544 (lr=9.8491e-06) (hash(x)=50192069)
|
| 650 |
+
5300 val loss 6.5281
|
| 651 |
+
5300 val perplexity 684.1060
|
| 652 |
+
5300 train 6.689152 (lr=3.6482e-05) (hash(x)=52001684)
|
| 653 |
+
5400 val loss 6.7006
|
| 654 |
+
5400 val perplexity 812.8881
|
| 655 |
+
5400 train 6.625731 (lr=1.5204e-05) (hash(x)=48831647)
|
| 656 |
+
5300 val loss 6.8333
|
| 657 |
+
5300 val perplexity 928.2460
|
| 658 |
+
5300 train 6.993705 (lr=2.6058e-05) (hash(x)=52001684)
|
| 659 |
+
5600 val loss 6.8466
|
| 660 |
+
5600 val perplexity 940.6655
|
| 661 |
+
5600 train 6.654557 (lr=9.5636e-06) (hash(x)=47208852)
|
| 662 |
+
5400 val loss 6.5233
|
| 663 |
+
5400 val perplexity 680.8095
|
| 664 |
+
5400 train 6.443679 (lr=3.5475e-05) (hash(x)=48831647)
|
| 665 |
+
5500 val loss 6.6987
|
| 666 |
+
5500 val perplexity 811.3495
|
| 667 |
+
5500 train 7.087494 (lr=1.4774e-05) (hash(x)=50192069)
|
| 668 |
+
5400 val loss 6.8253
|
| 669 |
+
5400 val perplexity 920.8331
|
| 670 |
+
5400 train 6.749354 (lr=2.5339e-05) (hash(x)=48831647)
|
| 671 |
+
5700 val loss 6.8428
|
| 672 |
+
5700 val perplexity 937.1461
|
| 673 |
+
5700 train 6.436399 (lr=9.2796e-06) (hash(x)=44061694)
|
| 674 |
+
5500 val loss 6.5335
|
| 675 |
+
5500 val perplexity 687.8297
|
| 676 |
+
5500 train 6.943909 (lr=3.4472e-05) (hash(x)=50192069)
|
| 677 |
+
5600 val loss 6.6884
|
| 678 |
+
5600 val perplexity 803.0125
|
| 679 |
+
5600 train 6.500677 (lr=1.4345e-05) (hash(x)=47208852)
|
| 680 |
+
5500 val loss 6.8219
|
| 681 |
+
5500 val perplexity 917.7541
|
| 682 |
+
5500 train 7.200629 (lr=2.4623e-05) (hash(x)=50192069)
|
| 683 |
+
5800 val loss 6.8371
|
| 684 |
+
5800 val perplexity 931.7457
|
| 685 |
+
5800 train 7.215187 (lr=8.9973e-06) (hash(x)=56513279)
|
| 686 |
+
5600 val loss 6.5178
|
| 687 |
+
5600 val perplexity 677.0574
|
| 688 |
+
5700 val loss 6.6834
|
| 689 |
+
5700 val perplexity 799.0583
|
| 690 |
+
5600 train 6.333476 (lr=3.3473e-05) (hash(x)=47208852)
|
| 691 |
+
5700 train 6.274059 (lr=1.3919e-05) (hash(x)=44061694)
|
| 692 |
+
5600 val loss 6.8152
|
| 693 |
+
5600 val perplexity 911.5791
|
| 694 |
+
5600 train 6.629575 (lr=2.3909e-05) (hash(x)=47208852)
|
| 695 |
+
5900 val loss 6.8312
|
| 696 |
+
5900 val perplexity 926.2846
|
| 697 |
+
5900 train 6.992841 (lr=8.7171e-06) (hash(x)=50412818)
|
| 698 |
+
5800 val loss 6.6761
|
| 699 |
+
5800 val perplexity 793.2364
|
| 700 |
+
5700 val loss 6.5278
|
| 701 |
+
5700 val perplexity 683.8582
|
| 702 |
+
5800 train 7.079706 (lr=1.3496e-05) (hash(x)=56513279)
|
| 703 |
+
5700 train 6.099132 (lr=3.2479e-05) (hash(x)=44061694)
|
| 704 |
+
5700 val loss 6.8144
|
| 705 |
+
5700 val perplexity 910.8314
|
| 706 |
+
5700 train 6.405194 (lr=2.3199e-05) (hash(x)=44061694)
|
| 707 |
+
6000 val loss 6.8262
|
| 708 |
+
6000 val perplexity 921.7257
|
| 709 |
+
6000 train 6.613876 (lr=8.4393e-06) (hash(x)=47159634)
|
| 710 |
+
5900 val loss 6.6691
|
| 711 |
+
5900 val perplexity 787.6990
|
| 712 |
+
5900 train 6.819257 (lr=1.3076e-05) (hash(x)=50412818)
|
| 713 |
+
5800 val loss 6.5141
|
| 714 |
+
5800 val perplexity 674.6172
|
| 715 |
+
5800 train 6.898224 (lr=3.1491e-05) (hash(x)=56513279)
|
| 716 |
+
5800 val loss 6.8042
|
| 717 |
+
5800 val perplexity 901.6212
|
| 718 |
+
5800 train 7.188510 (lr=2.2493e-05) (hash(x)=56513279)
|
| 719 |
+
6100 val loss 6.8191
|
| 720 |
+
6100 val perplexity 915.1422
|
| 721 |
+
6100 train 6.918497 (lr=8.1640e-06) (hash(x)=54312795)
|
| 722 |
+
6000 val loss 6.6627
|
| 723 |
+
6000 val perplexity 782.6782
|
| 724 |
+
6000 train 6.458968 (lr=1.2659e-05) (hash(x)=47159634)
|
| 725 |
+
5900 val loss 6.5100
|
| 726 |
+
5900 val perplexity 671.8426
|
| 727 |
+
5900 train 6.649952 (lr=3.0510e-05) (hash(x)=50412818)
|
| 728 |
+
5900 val loss 6.8038
|
| 729 |
+
5900 val perplexity 901.3005
|
| 730 |
+
5900 train 6.949226 (lr=2.1793e-05) (hash(x)=50412818)
|
| 731 |
+
6200 val loss 6.8130
|
| 732 |
+
6200 val perplexity 909.6113
|
| 733 |
+
6200 train 6.990555 (lr=7.8917e-06) (hash(x)=54187587)
|
| 734 |
+
6100 val loss 6.6554
|
| 735 |
+
6100 val perplexity 776.9844
|
| 736 |
+
6100 train 6.756692 (lr=1.2246e-05) (hash(x)=54312795)
|
| 737 |
+
6000 val loss 6.5064
|
| 738 |
+
6000 val perplexity 669.4238
|
| 739 |
+
6000 train 6.328054 (lr=2.9537e-05) (hash(x)=47159634)
|
| 740 |
+
6000 val loss 6.7957
|
| 741 |
+
6000 val perplexity 894.0056
|
| 742 |
+
6000 train 6.589294 (lr=2.1098e-05) (hash(x)=47159634)
|
| 743 |
+
6300 val loss 6.8062
|
| 744 |
+
6300 val perplexity 903.3907
|
| 745 |
+
6300 train 6.916306 (lr=7.6226e-06) (hash(x)=53620387)
|
| 746 |
+
6200 val loss 6.6462
|
| 747 |
+
6200 val perplexity 769.8491
|
| 748 |
+
6200 train 6.825405 (lr=1.1838e-05) (hash(x)=54187587)
|
| 749 |
+
6100 val loss 6.4902
|
| 750 |
+
6100 val perplexity 658.6664
|
| 751 |
+
6100 train 6.581672 (lr=2.8574e-05) (hash(x)=54312795)
|
| 752 |
+
6100 val loss 6.7919
|
| 753 |
+
6100 val perplexity 890.6199
|
| 754 |
+
6100 train 6.885747 (lr=2.0410e-05) (hash(x)=54312795)
|
| 755 |
+
6400 val loss 6.7978
|
| 756 |
+
6400 val perplexity 895.8444
|
| 757 |
+
6400 train 6.737978 (lr=7.3569e-06) (hash(x)=48761774)
|
| 758 |
+
6300 val loss 6.6364
|
| 759 |
+
6300 val perplexity 762.3516
|
| 760 |
+
6300 train 6.762627 (lr=1.1434e-05) (hash(x)=53620387)
|
| 761 |
+
6200 val loss 6.4772
|
| 762 |
+
6200 val perplexity 650.1536
|
| 763 |
+
6200 train 6.615796 (lr=2.7621e-05) (hash(x)=54187587)
|
| 764 |
+
6200 val loss 6.7828
|
| 765 |
+
6200 val perplexity 882.5361
|
| 766 |
+
6200 train 6.953031 (lr=1.9729e-05) (hash(x)=54187587)
|
| 767 |
+
6500 val loss 6.7933
|
| 768 |
+
6500 val perplexity 891.8179
|
| 769 |
+
6500 train 7.034630 (lr=7.0950e-06) (hash(x)=56690281)
|
| 770 |
+
6400 val loss 6.6275
|
| 771 |
+
6400 val perplexity 755.6107
|
| 772 |
+
6400 train 6.551711 (lr=1.1035e-05) (hash(x)=48761774)
|
| 773 |
+
6300 val loss 6.4718
|
| 774 |
+
6300 val perplexity 646.6768
|
| 775 |
+
6300 train 6.592690 (lr=2.6679e-05) (hash(x)=53620387)
|
| 776 |
+
6300 val loss 6.7704
|
| 777 |
+
6300 val perplexity 871.6821
|
| 778 |
+
6300 train 6.878792 (lr=1.9056e-05) (hash(x)=53620387)
|
| 779 |
+
6600 val loss 6.7888
|
| 780 |
+
6600 val perplexity 887.8833
|
| 781 |
+
6600 train 6.580529 (lr=6.8372e-06) (hash(x)=42985269)
|
| 782 |
+
6500 val loss 6.6192
|
| 783 |
+
6500 val perplexity 749.3109
|
| 784 |
+
6500 train 6.858943 (lr=1.0643e-05) (hash(x)=56690281)
|
| 785 |
+
6400 val loss 6.4743
|
| 786 |
+
6400 val perplexity 648.2544
|
| 787 |
+
6400 train 6.393586 (lr=2.5749e-05) (hash(x)=48761774)
|
| 788 |
+
6400 val loss 6.7612
|
| 789 |
+
6400 val perplexity 863.6722
|
| 790 |
+
6400 train 6.697858 (lr=1.8392e-05) (hash(x)=48761774)
|
| 791 |
+
6700 val loss 6.7863
|
| 792 |
+
6700 val perplexity 885.5966
|
| 793 |
+
6700 train 6.804174 (lr=6.5835e-06) (hash(x)=53315447)
|
| 794 |
+
6600 val loss 6.6162
|
| 795 |
+
6600 val perplexity 747.0964
|
| 796 |
+
6600 train 6.430020 (lr=1.0256e-05) (hash(x)=42985269)
|
| 797 |
+
6500 val loss 6.4627
|
| 798 |
+
6500 val perplexity 640.7616
|
| 799 |
+
6500 train 6.703842 (lr=2.4833e-05) (hash(x)=56690281)
|
| 800 |
+
6500 val loss 6.7577
|
| 801 |
+
6500 val perplexity 860.6866
|
| 802 |
+
6500 train 7.011350 (lr=1.7738e-05) (hash(x)=56690281)
|
| 803 |
+
6800 val loss 6.7796
|
| 804 |
+
6800 val perplexity 879.7517
|
| 805 |
+
6800 train 7.271560 (lr=6.3345e-06) (hash(x)=61577166)
|
| 806 |
+
6700 val loss 6.6127
|
| 807 |
+
6700 val perplexity 744.4901
|
| 808 |
+
6700 train 6.654109 (lr=9.8753e-06) (hash(x)=53315447)
|
| 809 |
+
6600 val loss 6.4587
|
| 810 |
+
6600 val perplexity 638.2580
|
| 811 |
+
6600 train 6.275012 (lr=2.3930e-05) (hash(x)=42985269)
|
| 812 |
+
6900 val loss 6.7779
|
| 813 |
+
6900 val perplexity 878.2637
|
| 814 |
+
6900 train 6.883226 (lr=6.0902e-06) (hash(x)=54641005)
|
| 815 |
+
6600 val loss 6.7546
|
| 816 |
+
6600 val perplexity 857.9875
|
| 817 |
+
6600 train 6.544014 (lr=1.7093e-05) (hash(x)=42985269)
|
| 818 |
+
6800 val loss 6.6055
|
| 819 |
+
6800 val perplexity 739.1291
|
| 820 |
+
6800 train 7.100903 (lr=9.5017e-06) (hash(x)=61577166)
|
| 821 |
+
6700 val loss 6.4604
|
| 822 |
+
6700 val perplexity 639.3189
|
| 823 |
+
6700 train 6.494969 (lr=2.3042e-05) (hash(x)=53315447)
|
| 824 |
+
7000 val loss 6.7713
|
| 825 |
+
7000 val perplexity 872.4722
|
| 826 |
+
7000 train 7.330594 (lr=5.8510e-06) (hash(x)=60579512)
|
| 827 |
+
6700 val loss 6.7507
|
| 828 |
+
6700 val perplexity 854.6246
|
| 829 |
+
6700 train 6.784648 (lr=1.6459e-05) (hash(x)=53315447)
|
| 830 |
+
6900 val loss 6.5959
|
| 831 |
+
6900 val perplexity 732.0701
|
| 832 |
+
6900 train 6.713835 (lr=9.1353e-06) (hash(x)=54641005)
|
| 833 |
+
6800 val loss 6.4498
|
| 834 |
+
6800 val perplexity 632.5809
|
| 835 |
+
6800 train 6.904761 (lr=2.2171e-05) (hash(x)=61577166)
|
| 836 |
+
7100 val loss 6.7694
|
| 837 |
+
7100 val perplexity 870.7997
|
| 838 |
+
7100 train 6.665070 (lr=5.6170e-06) (hash(x)=53151549)
|
| 839 |
+
6800 val loss 6.7475
|
| 840 |
+
6800 val perplexity 851.9592
|
| 841 |
+
6800 train 7.248035 (lr=1.5836e-05) (hash(x)=61577166)
|
| 842 |
+
7000 val loss 6.5925
|
| 843 |
+
7000 val perplexity 729.6163
|
| 844 |
+
7000 train 7.168230 (lr=8.7764e-06) (hash(x)=60579512)
|
| 845 |
+
6900 val loss 6.4431
|
| 846 |
+
6900 val perplexity 628.3522
|
| 847 |
+
6900 train 6.581858 (lr=2.1316e-05) (hash(x)=54641005)
|
| 848 |
+
7200 val loss 6.7639
|
| 849 |
+
7200 val perplexity 865.9951
|
| 850 |
+
7200 train 7.761684 (lr=5.3886e-06) (hash(x)=71842455)
|
| 851 |
+
6900 val loss 6.7337
|
| 852 |
+
6900 val perplexity 840.2455
|
| 853 |
+
6900 train 6.854381 (lr=1.5225e-05) (hash(x)=54641005)
|
| 854 |
+
7100 val loss 6.5893
|
| 855 |
+
7100 val perplexity 727.2419
|
| 856 |
+
7100 train 6.488562 (lr=8.4255e-06) (hash(x)=53151549)
|
| 857 |
+
7000 val loss 6.4382
|
| 858 |
+
7000 val perplexity 625.2898
|
| 859 |
+
7000 train 6.964822 (lr=2.0478e-05) (hash(x)=60579512)
|
| 860 |
+
7300 val loss 6.7625
|
| 861 |
+
7300 val perplexity 864.8347
|
| 862 |
+
7300 train 6.528047 (lr=5.1659e-06) (hash(x)=44516452)
|
| 863 |
+
7000 val loss 6.7305
|
| 864 |
+
7000 val perplexity 837.5577
|
| 865 |
+
7000 train 7.277111 (lr=1.4627e-05) (hash(x)=60579512)
|
| 866 |
+
7200 val loss 6.5813
|
| 867 |
+
7200 val perplexity 721.4806
|
| 868 |
+
7200 train 7.597327 (lr=8.0829e-06) (hash(x)=71842455)
|
| 869 |
+
7100 val loss 6.4414
|
| 870 |
+
7100 val perplexity 627.2758
|
| 871 |
+
7100 train 6.345554 (lr=1.9660e-05) (hash(x)=53151549)
|
| 872 |
+
7400 val loss 6.7592
|
| 873 |
+
7400 val perplexity 861.9483
|
| 874 |
+
7400 train 6.410480 (lr=4.9493e-06) (hash(x)=42667710)
|
| 875 |
+
7100 val loss 6.7267
|
| 876 |
+
7100 val perplexity 834.4161
|
| 877 |
+
7100 train 6.616548 (lr=1.4043e-05) (hash(x)=53151549)
|
| 878 |
+
7300 val loss 6.5815
|
| 879 |
+
7300 val perplexity 721.5986
|
| 880 |
+
7300 train 6.349890 (lr=7.7489e-06) (hash(x)=44516452)
|
| 881 |
+
7200 val loss 6.4361
|
| 882 |
+
7200 val perplexity 623.9594
|
| 883 |
+
7200 train 7.351032 (lr=1.8860e-05) (hash(x)=71842455)
|
| 884 |
+
7500 val loss 6.7555
|
| 885 |
+
7500 val perplexity 858.8078
|
| 886 |
+
7500 train 6.505149 (lr=4.7389e-06) (hash(x)=47050797)
|
| 887 |
+
7200 val loss 6.7204
|
| 888 |
+
7200 val perplexity 829.1766
|
| 889 |
+
7200 train 7.733423 (lr=1.3471e-05) (hash(x)=71842455)
|
| 890 |
+
7400 val loss 6.5739
|
| 891 |
+
7400 val perplexity 716.1360
|
| 892 |
+
7400 train 6.221677 (lr=7.4239e-06) (hash(x)=42667710)
|
| 893 |
+
7300 val loss 6.4313
|
| 894 |
+
7300 val perplexity 620.9935
|
| 895 |
+
7300 train 6.198883 (lr=1.8081e-05) (hash(x)=44516452)
|
| 896 |
+
7600 val loss 6.7532
|
| 897 |
+
7600 val perplexity 856.8362
|
| 898 |
+
7600 train 6.666946 (lr=4.5349e-06) (hash(x)=49785056)
|
| 899 |
+
7300 val loss 6.7178
|
| 900 |
+
7300 val perplexity 827.0081
|
| 901 |
+
7300 train 6.492526 (lr=1.2915e-05) (hash(x)=44516452)
|
| 902 |
+
7500 val loss 6.5720
|
| 903 |
+
7500 val perplexity 714.8130
|
| 904 |
+
7500 train 6.316886 (lr=7.1083e-06) (hash(x)=47050797)
|
| 905 |
+
7400 val loss 6.4278
|
| 906 |
+
7400 val perplexity 618.8268
|
| 907 |
+
7400 train 6.063058 (lr=1.7323e-05) (hash(x)=42667710)
|
| 908 |
+
7700 val loss 6.7499
|
| 909 |
+
7700 val perplexity 853.9936
|
| 910 |
+
7700 train 6.595833 (lr=4.3375e-06) (hash(x)=53232030)
|
| 911 |
+
7400 val loss 6.7131
|
| 912 |
+
7400 val perplexity 823.0838
|
| 913 |
+
7400 train 6.368190 (lr=1.2373e-05) (hash(x)=42667710)
|
| 914 |
+
7600 val loss 6.5660
|
| 915 |
+
7600 val perplexity 710.5325
|
| 916 |
+
7600 train 6.483896 (lr=6.8023e-06) (hash(x)=49785056)
|
| 917 |
+
7500 val loss 6.4225
|
| 918 |
+
7500 val perplexity 615.5386
|
| 919 |
+
7500 train 6.168694 (lr=1.6586e-05) (hash(x)=47050797)
|
| 920 |
+
7800 val loss 6.7466
|
| 921 |
+
7800 val perplexity 851.1527
|
| 922 |
+
7800 train 6.579307 (lr=4.1470e-06) (hash(x)=48049749)
|
| 923 |
+
7500 val loss 6.7074
|
| 924 |
+
7500 val perplexity 818.4573
|
| 925 |
+
7500 train 6.456196 (lr=1.1847e-05) (hash(x)=47050797)
|
| 926 |
+
7700 val loss 6.5620
|
| 927 |
+
7700 val perplexity 707.7010
|
| 928 |
+
7700 train 6.402081 (lr=6.5062e-06) (hash(x)=53232030)
|
| 929 |
+
7600 val loss 6.4167
|
| 930 |
+
7600 val perplexity 611.9720
|
| 931 |
+
7600 train 6.321131 (lr=1.5872e-05) (hash(x)=49785056)
|
| 932 |
+
7900 val loss 6.7407
|
| 933 |
+
7900 val perplexity 846.1240
|
| 934 |
+
7900 train 6.525646 (lr=3.9635e-06) (hash(x)=44768513)
|
| 935 |
+
7600 val loss 6.7034
|
| 936 |
+
7600 val perplexity 815.2051
|
| 937 |
+
7600 train 6.616943 (lr=1.1337e-05) (hash(x)=49785056)
|
| 938 |
+
7800 val loss 6.5586
|
| 939 |
+
7800 val perplexity 705.2940
|
| 940 |
+
7800 train 6.375649 (lr=6.2205e-06) (hash(x)=48049749)
|
| 941 |
+
7700 val loss 6.4153
|
| 942 |
+
7700 val perplexity 611.1429
|
| 943 |
+
7700 train 6.237316 (lr=1.5181e-05) (hash(x)=53232030)
|
| 944 |
+
8000 val loss 6.7368
|
| 945 |
+
8000 val perplexity 842.8249
|
| 946 |
+
8000 train 6.609796 (lr=3.7873e-06) (hash(x)=46228039)
|
| 947 |
+
7700 val loss 6.7032
|
| 948 |
+
7700 val perplexity 814.9871
|
| 949 |
+
7700 train 6.551203 (lr=1.0844e-05) (hash(x)=53232030)
|
| 950 |
+
7900 val loss 6.5509
|
| 951 |
+
7900 val perplexity 699.8884
|
| 952 |
+
7900 train 6.348945 (lr=5.9453e-06) (hash(x)=44768513)
|
| 953 |
+
7800 val loss 6.4185
|
| 954 |
+
7800 val perplexity 613.0842
|
| 955 |
+
7800 train 6.233474 (lr=1.4514e-05) (hash(x)=48049749)
|
| 956 |
+
8100 val loss 6.7341
|
| 957 |
+
8100 val perplexity 840.5965
|
| 958 |
+
8100 train 7.155693 (lr=3.6184e-06) (hash(x)=60017091)
|
| 959 |
+
7800 val loss 6.6980
|
| 960 |
+
7800 val perplexity 810.7551
|
| 961 |
+
7800 train 6.527575 (lr=1.0367e-05) (hash(x)=48049749)
|
| 962 |
+
8000 val loss 6.5460
|
| 963 |
+
8000 val perplexity 696.4471
|
| 964 |
+
8000 train 6.408615 (lr=5.6809e-06) (hash(x)=46228039)
|
| 965 |
+
7900 val loss 6.4048
|
| 966 |
+
7900 val perplexity 604.7163
|
| 967 |
+
7900 train 6.222796 (lr=1.3872e-05) (hash(x)=44768513)
|
| 968 |
+
8200 val loss 6.7300
|
| 969 |
+
8200 val perplexity 837.1589
|
| 970 |
+
8200 train 6.721200 (lr=3.4572e-06) (hash(x)=49910198)
|
| 971 |
+
7900 val loss 6.6898
|
| 972 |
+
7900 val perplexity 804.1443
|
| 973 |
+
7900 train 6.489498 (lr=9.9088e-06) (hash(x)=44768513)
|
| 974 |
+
8100 val loss 6.5439
|
| 975 |
+
8100 val perplexity 695.0140
|
| 976 |
+
8100 train 6.955059 (lr=5.4277e-06) (hash(x)=60017091)
|
| 977 |
+
8300 val loss 6.7268
|
| 978 |
+
8300 val perplexity 834.4623
|
| 979 |
+
8300 train 7.089801 (lr=3.3037e-06) (hash(x)=57919055)
|
| 980 |
+
8000 val loss 6.4018
|
| 981 |
+
8000 val perplexity 602.9116
|
| 982 |
+
8000 train 6.253847 (lr=1.3255e-05) (hash(x)=46228039)
|
| 983 |
+
8000 val loss 6.6854
|
| 984 |
+
8000 val perplexity 800.5988
|
| 985 |
+
8000 train 6.562236 (lr=9.4682e-06) (hash(x)=46228039)
|
| 986 |
+
8200 val loss 6.5399
|
| 987 |
+
8200 val perplexity 692.2427
|
| 988 |
+
8200 train 6.549590 (lr=5.1858e-06) (hash(x)=49910198)
|
| 989 |
+
8400 val loss 6.7268
|
| 990 |
+
8400 val perplexity 834.4695
|
| 991 |
+
8400 train 6.833617 (lr=3.1581e-06) (hash(x)=49694964)
|
| 992 |
+
8100 val loss 6.3971
|
| 993 |
+
8100 val perplexity 600.1071
|
| 994 |
+
8100 train 6.769569 (lr=1.2665e-05) (hash(x)=60017091)
|
| 995 |
+
8100 val loss 6.6832
|
| 996 |
+
8100 val perplexity 798.8370
|
| 997 |
+
8100 train 7.114765 (lr=9.0461e-06) (hash(x)=60017091)
|
| 998 |
+
8300 val loss 6.5345
|
| 999 |
+
8300 val perplexity 688.5184
|
| 1000 |
+
8300 train 6.907079 (lr=4.9556e-06) (hash(x)=57919055)
|
| 1001 |
+
8500 val loss 6.7234
|
| 1002 |
+
8500 val perplexity 831.6531
|
| 1003 |
+
8500 train 6.736595 (lr=3.0206e-06) (hash(x)=53762585)
|
| 1004 |
+
8200 val loss 6.3928
|
| 1005 |
+
8200 val perplexity 597.5019
|
| 1006 |
+
8200 train 6.411548 (lr=1.2100e-05) (hash(x)=49910198)
|
| 1007 |
+
8200 val loss 6.6775
|
| 1008 |
+
8200 val perplexity 794.3458
|
| 1009 |
+
8200 train 6.668144 (lr=8.6430e-06) (hash(x)=49910198)
|
| 1010 |
+
8400 val loss 6.5352
|
| 1011 |
+
8400 val perplexity 688.9457
|
| 1012 |
+
8400 train 6.652549 (lr=4.7372e-06) (hash(x)=49694964)
|
| 1013 |
+
8600 val loss 6.7207
|
| 1014 |
+
8600 val perplexity 829.4001
|
| 1015 |
+
8600 train 6.771939 (lr=2.8913e-06) (hash(x)=51166973)
|
| 1016 |
+
8300 val loss 6.3882
|
| 1017 |
+
8300 val perplexity 594.7954
|
| 1018 |
+
8300 train 6.720931 (lr=1.1563e-05) (hash(x)=57919055)
|
| 1019 |
+
8300 val loss 6.6720
|
| 1020 |
+
8300 val perplexity 789.9393
|
| 1021 |
+
8300 train 7.051010 (lr=8.2593e-06) (hash(x)=57919055)
|
| 1022 |
+
8500 val loss 6.5307
|
| 1023 |
+
8500 val perplexity 685.8688
|
| 1024 |
+
8500 train 6.550485 (lr=4.5309e-06) (hash(x)=53762585)
|
| 1025 |
+
8700 val loss 6.7181
|
| 1026 |
+
8700 val perplexity 827.2309
|
| 1027 |
+
8700 train 6.789652 (lr=2.7703e-06) (hash(x)=53968049)
|
| 1028 |
+
8400 val loss 6.3887
|
| 1029 |
+
8400 val perplexity 595.0958
|
| 1030 |
+
8400 train 6.482622 (lr=1.1053e-05) (hash(x)=49694964)
|
| 1031 |
+
8400 val loss 6.6700
|
| 1032 |
+
8400 val perplexity 788.4261
|
| 1033 |
+
8400 train 6.787225 (lr=7.8953e-06) (hash(x)=49694964)
|
| 1034 |
+
8600 val loss 6.5288
|
| 1035 |
+
8600 val perplexity 684.6053
|
| 1036 |
+
8600 train 6.580200 (lr=4.3369e-06) (hash(x)=51166973)
|
| 1037 |
+
8800 val loss 6.7160
|
| 1038 |
+
8800 val perplexity 825.5129
|
| 1039 |
+
8800 train 6.815679 (lr=2.6577e-06) (hash(x)=59231056)
|
| 1040 |
+
8500 val loss 6.3836
|
| 1041 |
+
8500 val perplexity 592.0306
|
| 1042 |
+
8500 train 6.408396 (lr=1.0572e-05) (hash(x)=53762585)
|
| 1043 |
+
8500 val loss 6.6665
|
| 1044 |
+
8500 val perplexity 785.6351
|
| 1045 |
+
8500 train 6.681557 (lr=7.5515e-06) (hash(x)=53762585)
|
| 1046 |
+
8700 val loss 6.5256
|
| 1047 |
+
8700 val perplexity 682.3744
|
| 1048 |
+
8900 val loss 6.7144
|
| 1049 |
+
8900 val perplexity 824.1870
|
| 1050 |
+
8700 train 6.605652 (lr=4.1554e-06) (hash(x)=53968049)
|
| 1051 |
+
8900 train 6.623099 (lr=2.5538e-06) (hash(x)=50488048)
|
| 1052 |
+
8600 val loss 6.3830
|
| 1053 |
+
8600 val perplexity 591.6888
|
| 1054 |
+
8600 train 6.420757 (lr=1.0119e-05) (hash(x)=51166973)
|
| 1055 |
+
8600 val loss 6.6634
|
| 1056 |
+
8600 val perplexity 783.2065
|
| 1057 |
+
8600 train 6.713762 (lr=7.2282e-06) (hash(x)=51166973)
|
| 1058 |
+
9000 val loss 6.7123
|
| 1059 |
+
9000 val perplexity 822.4427
|
| 1060 |
+
9000 train 6.414406 (lr=2.4585e-06) (hash(x)=44492956)
|
| 1061 |
+
8800 val loss 6.5224
|
| 1062 |
+
8800 val perplexity 680.2186
|
| 1063 |
+
8800 train 6.619614 (lr=3.9866e-06) (hash(x)=59231056)
|
| 1064 |
+
8700 val loss 6.3795
|
| 1065 |
+
8700 val perplexity 589.6207
|
| 1066 |
+
8700 train 6.453899 (lr=9.6960e-06) (hash(x)=53968049)
|
| 1067 |
+
8700 val loss 6.6606
|
| 1068 |
+
8700 val perplexity 781.0009
|
| 1069 |
+
8700 train 6.732889 (lr=6.9257e-06) (hash(x)=53968049)
|
| 1070 |
+
9100 val loss 6.7135
|
| 1071 |
+
9100 val perplexity 823.4442
|
| 1072 |
+
9100 train 6.769123 (lr=2.3720e-06) (hash(x)=51134989)
|
| 1073 |
+
8900 val loss 6.5213
|
| 1074 |
+
8900 val perplexity 679.4908
|
| 1075 |
+
8900 train 6.445029 (lr=3.8307e-06) (hash(x)=50488048)
|
| 1076 |
+
8800 val loss 6.3741
|
| 1077 |
+
8800 val perplexity 586.4824
|
| 1078 |
+
8800 train 6.473434 (lr=9.3021e-06) (hash(x)=59231056)
|
| 1079 |
+
8800 val loss 6.6562
|
| 1080 |
+
8800 val perplexity 777.5667
|
| 1081 |
+
8800 train 6.754422 (lr=6.6444e-06) (hash(x)=59231056)
|
| 1082 |
+
9200 val loss 6.7107
|
| 1083 |
+
9200 val perplexity 821.1844
|
| 1084 |
+
9200 train 6.514919 (lr=2.2943e-06) (hash(x)=48636056)
|
| 1085 |
+
9000 val loss 6.5202
|
| 1086 |
+
9000 val perplexity 678.7302
|
| 1087 |
+
9000 train 6.222732 (lr=3.6877e-06) (hash(x)=44492956)
|
| 1088 |
+
8900 val loss 6.3754
|
| 1089 |
+
8900 val perplexity 587.2310
|
| 1090 |
+
8900 train 6.260254 (lr=8.9382e-06) (hash(x)=50488048)
|
| 1091 |
+
9300 val loss 6.7099
|
| 1092 |
+
9300 val perplexity 820.5175
|
| 1093 |
+
9300 train 6.680036 (lr=2.2256e-06) (hash(x)=50200551)
|
| 1094 |
+
8900 val loss 6.6543
|
| 1095 |
+
8900 val perplexity 776.0824
|
| 1096 |
+
8900 train 6.573340 (lr=6.3845e-06) (hash(x)=50488048)
|
| 1097 |
+
9100 val loss 6.5226
|
| 1098 |
+
9100 val perplexity 680.3639
|
| 1099 |
+
9100 train 6.545186 (lr=3.5580e-06) (hash(x)=51134989)
|
| 1100 |
+
9000 val loss 6.3725
|
| 1101 |
+
9000 val perplexity 585.4980
|
| 1102 |
+
9000 train 6.081764 (lr=8.6047e-06) (hash(x)=44492956)
|
| 1103 |
+
9400 val loss 6.7092
|
| 1104 |
+
9400 val perplexity 819.8851
|
| 1105 |
+
9400 train 6.530241 (lr=2.1660e-06) (hash(x)=48057228)
|
| 1106 |
+
9000 val loss 6.6524
|
| 1107 |
+
9000 val perplexity 774.6767
|
| 1108 |
+
9000 train 6.361135 (lr=6.1462e-06) (hash(x)=44492956)
|
| 1109 |
+
9200 val loss 6.5177
|
| 1110 |
+
9200 val perplexity 676.9999
|
| 1111 |
+
9200 train 6.318551 (lr=3.4415e-06) (hash(x)=48636056)
|
| 1112 |
+
9100 val loss 6.3738
|
| 1113 |
+
9100 val perplexity 586.2674
|
| 1114 |
+
9100 train 6.352215 (lr=8.3020e-06) (hash(x)=51134989)
|
| 1115 |
+
9500 val loss 6.7056
|
| 1116 |
+
9500 val perplexity 816.9913
|
| 1117 |
+
9500 train 6.484961 (lr=2.1154e-06) (hash(x)=48125171)
|
| 1118 |
+
9100 val loss 6.6548
|
| 1119 |
+
9100 val perplexity 776.4929
|
| 1120 |
+
9100 train 6.679288 (lr=5.9300e-06) (hash(x)=51134989)
|
| 1121 |
+
9300 val loss 6.5164
|
| 1122 |
+
9300 val perplexity 676.1420
|
| 1123 |
+
9300 train 6.469476 (lr=3.3385e-06) (hash(x)=50200551)
|
| 1124 |
+
9200 val loss 6.3685
|
| 1125 |
+
9200 val perplexity 583.1851
|
| 1126 |
+
9200 train 6.174442 (lr=8.0302e-06) (hash(x)=48636056)
|
| 1127 |
+
9600 val loss 6.7056
|
| 1128 |
+
9600 val perplexity 816.9484
|
| 1129 |
+
9600 train 6.674135 (lr=2.0739e-06) (hash(x)=53375853)
|
| 1130 |
+
9200 val loss 6.6493
|
| 1131 |
+
9200 val perplexity 772.2683
|
| 1132 |
+
9200 train 6.453731 (lr=5.7359e-06) (hash(x)=48636056)
|
| 1133 |
+
9400 val loss 6.5164
|
| 1134 |
+
9400 val perplexity 676.1276
|
| 1135 |
+
9400 train 6.336080 (lr=3.2490e-06) (hash(x)=48057228)
|
| 1136 |
+
9300 val loss 6.3682
|
| 1137 |
+
9300 val perplexity 583.0174
|
| 1138 |
+
9300 train 6.318146 (lr=7.7898e-06) (hash(x)=50200551)
|
| 1139 |
+
9700 val loss 6.7019
|
| 1140 |
+
9700 val perplexity 813.9532
|
| 1141 |
+
9700 train 7.475645 (lr=2.0416e-06) (hash(x)=53924631)
|
| 1142 |
+
9300 val loss 6.6490
|
| 1143 |
+
9300 val perplexity 772.0294
|
| 1144 |
+
9300 train 6.610506 (lr=5.5641e-06) (hash(x)=50200551)
|
| 1145 |
+
9500 val loss 6.5129
|
| 1146 |
+
9500 val perplexity 673.7589
|
| 1147 |
+
9500 train 6.297124 (lr=3.1730e-06) (hash(x)=48125171)
|
| 1148 |
+
9400 val loss 6.3696
|
| 1149 |
+
9400 val perplexity 583.8489
|
| 1150 |
+
9400 train 6.185252 (lr=7.5809e-06) (hash(x)=48057228)
|
| 1151 |
+
9800 val loss 6.6990
|
| 1152 |
+
9800 val perplexity 811.5999
|
| 1153 |
+
9800 train 6.732530 (lr=2.0185e-06) (hash(x)=48895047)
|
| 1154 |
+
9400 val loss 6.6472
|
| 1155 |
+
9400 val perplexity 770.5902
|
| 1156 |
+
9400 train 6.478499 (lr=5.4149e-06) (hash(x)=48057228)
|
| 1157 |
+
9600 val loss 6.5126
|
| 1158 |
+
9600 val perplexity 673.5715
|
| 1159 |
+
9600 train 6.459769 (lr=3.1108e-06) (hash(x)=53375853)
|
| 1160 |
+
9500 val loss 6.3674
|
| 1161 |
+
9500 val perplexity 582.5411
|
| 1162 |
+
9500 train 6.146292 (lr=7.4038e-06) (hash(x)=48125171)
|
| 1163 |
+
9900 val loss 6.6963
|
| 1164 |
+
9900 val perplexity 809.4128
|
| 1165 |
+
9900 train 6.532602 (lr=2.0046e-06) (hash(x)=44269923)
|
| 1166 |
+
9500 val loss 6.6440
|
| 1167 |
+
9500 val perplexity 768.1942
|
| 1168 |
+
9500 train 6.421329 (lr=5.2884e-06) (hash(x)=48125171)
|
| 1169 |
+
9700 val loss 6.5079
|
| 1170 |
+
9700 val perplexity 670.4332
|
| 1171 |
+
9700 train 7.313867 (lr=3.0624e-06) (hash(x)=53924631)
|
| 1172 |
+
9600 val loss 6.3645
|
| 1173 |
+
9600 val perplexity 580.8505
|
| 1174 |
+
9600 train 6.295037 (lr=7.2586e-06) (hash(x)=53375853)
|
| 1175 |
+
9999 val loss 6.6978
|
| 1176 |
+
9999 val perplexity 810.6125
|
| 1177 |
+
9600 val loss 6.6432
|
| 1178 |
+
9600 val perplexity 767.5720
|
| 1179 |
+
9600 train 6.602069 (lr=5.1847e-06) (hash(x)=53375853)
|
| 1180 |
+
9800 val loss 6.5053
|
| 1181 |
+
9800 val perplexity 668.6689
|
| 1182 |
+
9800 train 6.551248 (lr=3.0277e-06) (hash(x)=48895047)
|
| 1183 |
+
9700 val loss 6.3601
|
| 1184 |
+
9700 val perplexity 578.3121
|
| 1185 |
+
9700 train 7.180431 (lr=7.1456e-06) (hash(x)=53924631)
|
| 1186 |
+
9700 val loss 6.6379
|
| 1187 |
+
9700 val perplexity 763.5204
|
| 1188 |
+
9700 train 7.429144 (lr=5.1040e-06) (hash(x)=53924631)
|
| 1189 |
+
9900 val loss 6.5029
|
| 1190 |
+
9900 val perplexity 667.0677
|
| 1191 |
+
9900 train 6.344713 (lr=3.0069e-06) (hash(x)=44269923)
|
| 1192 |
+
9800 val loss 6.3598
|
| 1193 |
+
9800 val perplexity 578.1113
|
| 1194 |
+
9800 train 6.405945 (lr=7.0647e-06) (hash(x)=48895047)
|
| 1195 |
+
9800 val loss 6.6355
|
| 1196 |
+
9800 val perplexity 761.6445
|
| 1197 |
+
9800 train 6.674919 (lr=5.0462e-06) (hash(x)=48895047)
|
| 1198 |
+
9999 val loss 6.5035
|
| 1199 |
+
9999 val perplexity 667.4902
|
| 1200 |
+
9900 val loss 6.6316
|
| 1201 |
+
9900 val perplexity 758.6907
|
| 1202 |
+
9900 train 6.478441 (lr=5.0116e-06) (hash(x)=44269923)
|
| 1203 |
+
9900 val loss 6.3549
|
| 1204 |
+
9900 val perplexity 575.2765
|
| 1205 |
+
9900 train 6.186326 (lr=7.0162e-06) (hash(x)=44269923)
|
| 1206 |
+
9999 val loss 6.6315
|
| 1207 |
+
9999 val perplexity 758.6360
|
| 1208 |
+
9999 val loss 6.3557
|
| 1209 |
+
9999 val perplexity 575.7891
|
attention_kindselective_n_heads8_seed1341/model_02500.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e860e923a556180e7e2dade3cce2e338a0c6e4ced0183df1bef4b801ac4ed341
|
| 3 |
+
size 257976706
|
attention_kindselective_n_heads8_seed1341/model_05000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f185c3159438e039d4fc515e37f75f445947587e36943760fb5202a8375b6af4
|
| 3 |
+
size 257976706
|
attention_kindselective_n_heads8_seed1341/model_07500.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ee0c71f6868299f0a2ff54ed1ec8d3aeaf7c4572b36fbacc656c3c31f9cbf2a1
|
| 3 |
+
size 257976706
|
attention_kindselective_n_heads8_seed1341/model_09999.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ef70a6883ed7ace4af08d40c7efad1fdddd7b9a3923d2826bd87c81631ed7525
|
| 3 |
+
size 257976706
|
attention_kindselective_n_heads8_seed1341/optimizer_02500.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5bd67e3d6826de3f55ad5d5f1c04b26f72da5c15b4892f1263072c7dfd955a14
|
| 3 |
+
size 509672838
|
attention_kindselective_n_heads8_seed1341/optimizer_05000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ba33e8595171564264bbf96f4a16fc0a8b17e06aca96550034da8efea6848d93
|
| 3 |
+
size 509672838
|
attention_kindselective_n_heads8_seed1341/optimizer_07500.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a08c66319f568af505a5def305e7297e4f583c6ba4f94fce35a6492eaaf6622d
|
| 3 |
+
size 509672838
|
attention_kindselective_n_heads8_seed1341/optimizer_09999.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0de2886281b4cfd84dfa959b6a06d6011db4a724bda0944d7119a668fadebcb4
|
| 3 |
+
size 509672838
|