diff --git a/attention_kindselective_n_heads2_seed1339/args.json b/attention_kindselective_n_heads2_seed1339/args.json index 50a1e7ac0f2c781c2ec69abf626963c397df6db9..5ac40dd1169985c27456a4b9f409a73759217bfe 100644 --- a/attention_kindselective_n_heads2_seed1339/args.json +++ b/attention_kindselective_n_heads2_seed1339/args.json @@ -1 +1 @@ -{"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_6/attention_kindselective_n_heads2_seed1339", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 2, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_6", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1339, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 0.00015, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "15e-5_10240_2_1339", "n_embd": 128} \ No newline at end of file +{"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_7/attention_kindselective_n_heads2_seed1339", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 2, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 50000, "warmup_steps": 200, "group": "wider_is_better_7", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1339, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "5e-5_10240_2_1339", "n_embd": 128} \ No newline at end of file diff --git a/attention_kindselective_n_heads2_seed1339/dataloader_10000.pt b/attention_kindselective_n_heads2_seed1339/dataloader_10000.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1452fd777edd0ccb65d6b47710c50208c14b312 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/dataloader_10000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3858f6c832feea78a674d8c5c384061cc7d4f22cddbd0a2be6de33bc91e2c72 +size 964 diff --git a/attention_kindselective_n_heads2_seed1339/dataloader_12500.pt b/attention_kindselective_n_heads2_seed1339/dataloader_12500.pt new file mode 100644 index 0000000000000000000000000000000000000000..f63806424a0177a7f2d678c2c63138219ed021f3 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/dataloader_12500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab3779d33c2e0a7873fcd8c39402e44260740665950323ad1445480ec339965a +size 964 diff --git a/attention_kindselective_n_heads2_seed1339/dataloader_15000.pt b/attention_kindselective_n_heads2_seed1339/dataloader_15000.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ea0f5a1bfab75667c4ebb0ca01b358cdc836a54 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/dataloader_15000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:450020c7c306c73e5c07c463518ab937102d657515ea5a38da6f2e7291f20324 +size 964 diff --git a/attention_kindselective_n_heads2_seed1339/dataloader_17500.pt b/attention_kindselective_n_heads2_seed1339/dataloader_17500.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb9392348b1209c827e3e376b05eeda80e779aa8 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/dataloader_17500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0696a655d7c8a9d7d275c7489f74a5a948ee029ac3941b045d6abaf12544a5b1 +size 964 diff --git a/attention_kindselective_n_heads2_seed1339/dataloader_20000.pt b/attention_kindselective_n_heads2_seed1339/dataloader_20000.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba32fca059aec40e4f758de96bdac6df23b9d9f5 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/dataloader_20000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4eb226bdcee777fd1ca493533704dae226c077ef79c842fc9dc59a534d5381c1 +size 964 diff --git a/attention_kindselective_n_heads2_seed1339/dataloader_22500.pt b/attention_kindselective_n_heads2_seed1339/dataloader_22500.pt new file mode 100644 index 0000000000000000000000000000000000000000..498dc444f528d893090328a5bd1e2f37da46dc12 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/dataloader_22500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88833bfd821adac4edd1dd0772083ae007c7b8d33041f66e53a679e1fa8993e0 +size 964 diff --git a/attention_kindselective_n_heads2_seed1339/dataloader_25000.pt b/attention_kindselective_n_heads2_seed1339/dataloader_25000.pt new file mode 100644 index 0000000000000000000000000000000000000000..b657bc134192d0ea956f984c289d0c682979a1f4 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/dataloader_25000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:554555a425bac43d626c36f1c81c2b0aba51eda3281dab27a9cb56b61f413354 +size 964 diff --git a/attention_kindselective_n_heads2_seed1339/dataloader_27500.pt b/attention_kindselective_n_heads2_seed1339/dataloader_27500.pt new file mode 100644 index 0000000000000000000000000000000000000000..d92d43d89390714f43db4f0782e49af0145b4a90 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/dataloader_27500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a52940b3b45414e6cdbec0dbaeba848f52d681c2daf78c269027057332d7fbd +size 964 diff --git a/attention_kindselective_n_heads2_seed1339/dataloader_30000.pt b/attention_kindselective_n_heads2_seed1339/dataloader_30000.pt new file mode 100644 index 0000000000000000000000000000000000000000..06856f53253b6c8cbefd7d595d9b9b7266b22621 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/dataloader_30000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:775519ea26122cb70d533c496bcdbbc19f759f3d096e83e98cca1dc10275fe8e +size 964 diff --git a/attention_kindselective_n_heads2_seed1339/dataloader_32500.pt b/attention_kindselective_n_heads2_seed1339/dataloader_32500.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc0b4b40e57f41aa1046b3bd2697256635160c09 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/dataloader_32500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab3422c8205fe45210246fed3dd6c317b75df02228cd8b75fba669574ce3b2d9 +size 964 diff --git a/attention_kindselective_n_heads2_seed1339/dataloader_35000.pt b/attention_kindselective_n_heads2_seed1339/dataloader_35000.pt new file mode 100644 index 0000000000000000000000000000000000000000..62647b540fa7925361626b9f8dfa3959eebb7608 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/dataloader_35000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:199ed3be67b88981f686112c97a2261729a37e0be3d4b0f4a289985a95d3cdf1 +size 964 diff --git a/attention_kindselective_n_heads2_seed1339/dataloader_37500.pt b/attention_kindselective_n_heads2_seed1339/dataloader_37500.pt new file mode 100644 index 0000000000000000000000000000000000000000..475d8b538138a8e39b76a4cf04c8eaeac074d295 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/dataloader_37500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f77178b3be9dd3f8cd03c935236251f73fde6da7948ba9feda0c888fb8912dfe +size 964 diff --git a/attention_kindselective_n_heads2_seed1339/dataloader_40000.pt b/attention_kindselective_n_heads2_seed1339/dataloader_40000.pt new file mode 100644 index 0000000000000000000000000000000000000000..50ec00ef6be330bbdb4cdf88e9a1097345da0d4d --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/dataloader_40000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84f58ba3b4a1f9be7da4e697e06782f0e1ce4d3aca49f1997087fc83aa466dd9 +size 964 diff --git a/attention_kindselective_n_heads2_seed1339/dataloader_42500.pt b/attention_kindselective_n_heads2_seed1339/dataloader_42500.pt new file mode 100644 index 0000000000000000000000000000000000000000..74e133f73a7293d5f4d6407784703c91f705d6e3 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/dataloader_42500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf6d24c78d89100d146bce9f26be940db3d71092473d9b55db97d6b35531eac2 +size 964 diff --git a/attention_kindselective_n_heads2_seed1339/dataloader_45000.pt b/attention_kindselective_n_heads2_seed1339/dataloader_45000.pt new file mode 100644 index 0000000000000000000000000000000000000000..718bc149d695b1e9498bdd0693053d7417207818 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/dataloader_45000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22bb90b43d81f3da5454f91a70e1ed29aeb2f470a727ce38390ff8a5c4924889 +size 964 diff --git a/attention_kindselective_n_heads2_seed1339/dataloader_47500.pt b/attention_kindselective_n_heads2_seed1339/dataloader_47500.pt new file mode 100644 index 0000000000000000000000000000000000000000..3eab97c7c6d4b86d90405bf1c4f3435727495da4 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/dataloader_47500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55507725e6988f190e4963078652fafa6b68e8d4f79221387612612babf3e1c1 +size 964 diff --git a/attention_kindselective_n_heads2_seed1339/dataloader_49999.pt b/attention_kindselective_n_heads2_seed1339/dataloader_49999.pt new file mode 100644 index 0000000000000000000000000000000000000000..d87f88b62a343a49411f8a6feee8f527879fcd1f --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/dataloader_49999.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47776cddb8021172f048a950b83f25b692cb340214b800ce3837c15ceb58907c +size 964 diff --git a/attention_kindselective_n_heads2_seed1339/log2.txt b/attention_kindselective_n_heads2_seed1339/log2.txt index 8ad368ead434986e7b8a8cd14b9e9597e5d34d1f..e1d99402134e29b9be8c7f24c46f40fa6c5b8db1 100644 --- a/attention_kindselective_n_heads2_seed1339/log2.txt +++ b/attention_kindselective_n_heads2_seed1339/log2.txt @@ -1,578 +1,4482 @@ -max_steps: 10000 -900 val loss 7.5969 -900 val perplexity 1991.9968 -900 train 7.572263 (lr=9.8872e-05) (hash(x)=49436061) -1000 val loss 7.5734 -1000 val perplexity 1945.8105 -1000 train 7.514424 (lr=9.8528e-05) (hash(x)=48818282) +max_steps: 50000 +max_steps: 50000 0 val loss 11.7444 0 val perplexity 126046.6875 -1100 val loss 7.5473 -1100 val perplexity 1895.5912 -1100 train 7.608188 (lr=9.8140e-05) (hash(x)=55387425) -1200 val loss 7.5217 -1200 val perplexity 1847.7664 -1200 train 7.468529 (lr=9.7708e-05) (hash(x)=48328378) -1300 val loss 7.4985 -1300 val perplexity 1805.4044 -1300 train 7.419605 (lr=9.7231e-05) (hash(x)=48956717) -0 train 11.717561 (lr=7.5000e-07) (hash(x)=55241167) -1400 val loss 7.5026 -1400 val perplexity 1812.7057 -1400 train 7.386037 (lr=9.6711e-05) (hash(x)=49782571) -100 val loss 9.6965 -100 val perplexity 16261.2285 -100 train 9.628764 (lr=7.5750e-05) (hash(x)=46387919) -1500 val loss 7.4773 -1500 val perplexity 1767.5096 -1500 train 7.258651 (lr=9.6149e-05) (hash(x)=51081994) -200 val loss 8.0333 -200 val perplexity 3081.7817 -200 train 8.088078 (lr=1.5000e-04) (hash(x)=51276369) -1600 val loss 7.4579 -1600 val perplexity 1733.5146 -1600 train 7.523229 (lr=9.5544e-05) (hash(x)=48791204) -300 val loss 7.7162 -300 val perplexity 2244.4336 -300 train 7.639675 (lr=1.4997e-04) (hash(x)=48572728) -1700 val loss 7.4387 -1700 val perplexity 1700.5919 -1700 train 7.161916 (lr=9.4897e-05) (hash(x)=46425011) -400 val loss 7.6960 -400 val perplexity 2199.5452 -400 train 7.408287 (lr=1.4986e-04) (hash(x)=44943400) -1800 val loss 7.4088 -1800 val perplexity 1650.4590 -1800 train 7.065498 (lr=9.4209e-05) (hash(x)=44919141) -500 val loss 7.7190 -500 val perplexity 2250.7332 -500 train 7.623987 (lr=1.4969e-04) (hash(x)=48434590) -1900 val loss 7.3505 -1900 val perplexity 1556.9557 -1900 train 7.455896 (lr=9.3481e-05) (hash(x)=48299675) -600 val loss 7.6594 -600 val perplexity 2120.3806 -600 train 7.504135 (lr=1.4945e-04) (hash(x)=47442266) -2000 val loss 7.3041 -2000 val perplexity 1486.4043 -2000 train 8.246761 (lr=9.2714e-05) (hash(x)=61804797) -700 val loss 7.6184 -700 val perplexity 2035.3838 -700 train 7.673785 (lr=1.4913e-04) (hash(x)=56495712) -2100 val loss 7.2584 -2100 val perplexity 1420.0369 -2100 train 7.342447 (lr=9.1908e-05) (hash(x)=53638030) -800 val loss 7.5712 -800 val perplexity 1941.3823 -800 train 7.564387 (lr=1.4876e-04) (hash(x)=50093738) -2200 val loss 7.2501 -2200 val perplexity 1408.2015 -2200 train 7.447799 (lr=9.1064e-05) (hash(x)=56364593) -900 val loss 7.5101 -900 val perplexity 1826.3429 -900 train 7.466010 (lr=1.4831e-04) (hash(x)=49436061) -2300 val loss 7.2280 -2300 val perplexity 1377.4703 -2300 train 7.307809 (lr=9.0182e-05) (hash(x)=52945580) -1000 val loss 7.4440 -1000 val perplexity 1709.5248 -1000 train 7.373216 (lr=1.4779e-04) (hash(x)=48818282) -2400 val loss 7.1955 -2400 val perplexity 1333.4473 -2400 train 7.284088 (lr=8.9265e-05) (hash(x)=54595211) -1100 val loss 7.3930 -1100 val perplexity 1624.5348 -1100 train 7.437781 (lr=1.4721e-04) (hash(x)=55387425) -2500 val loss 7.1905 -2500 val perplexity 1326.7396 -2500 train 7.198493 (lr=8.8313e-05) (hash(x)=52323363) -1200 val loss 7.3864 -1200 val perplexity 1613.8221 -1200 train 7.297882 (lr=1.4656e-04) (hash(x)=48328378) -2600 val loss 7.1661 -2600 val perplexity 1294.8401 -2600 train 6.969779 (lr=8.7326e-05) (hash(x)=42652707) -1300 val loss 7.3559 -1300 val perplexity 1565.3617 -1300 train 7.271660 (lr=1.4585e-04) (hash(x)=48956717) -2700 val loss 7.1652 -2700 val perplexity 1293.5731 -2700 train 6.893122 (lr=8.6306e-05) (hash(x)=45687855) -1400 val loss 7.3621 -1400 val perplexity 1575.2109 -1400 train 7.228504 (lr=1.4507e-04) (hash(x)=49782571) -2800 val loss 7.1404 -2800 val perplexity 1261.9781 -2800 train 7.139996 (lr=8.5254e-05) (hash(x)=49962830) -1500 val loss 7.3566 -1500 val perplexity 1566.5236 -1500 train 7.131782 (lr=1.4422e-04) (hash(x)=51081994) -2900 val loss 7.1124 -2900 val perplexity 1227.1224 -2900 train 7.157219 (lr=8.4170e-05) (hash(x)=52958451) -1600 val loss 7.2937 -1600 val perplexity 1470.9889 -1600 train 7.357556 (lr=1.4332e-04) (hash(x)=48791204) -3000 val loss 7.0789 -3000 val perplexity 1186.6748 -3000 train 7.020242 (lr=8.3057e-05) (hash(x)=48896990) -1700 val loss 7.2649 -1700 val perplexity 1429.2798 -1700 train 6.956748 (lr=1.4235e-04) (hash(x)=46425011) -3100 val loss 7.0680 -3100 val perplexity 1173.8483 -3100 train 7.037431 (lr=8.1915e-05) (hash(x)=47064930) -1800 val loss 7.2177 -1800 val perplexity 1363.3796 -1800 train 6.878641 (lr=1.4131e-04) (hash(x)=44919141) -3200 val loss 7.0466 -3200 val perplexity 1148.9044 -3200 train 6.800650 (lr=8.0745e-05) (hash(x)=45145921) -1900 val loss 7.1876 -1900 val perplexity 1322.8931 -1900 train 7.273633 (lr=1.4022e-04) (hash(x)=48299675) -3300 val loss 7.0223 -3300 val perplexity 1121.3967 -3300 train 6.873117 (lr=7.9549e-05) (hash(x)=48085018) -2000 val loss 7.1324 -2000 val perplexity 1251.8201 -2000 train 7.983136 (lr=1.3907e-04) (hash(x)=61804797) -3400 val loss 6.9982 -3400 val perplexity 1094.6818 -3400 train 7.005905 (lr=7.8328e-05) (hash(x)=49596053) -2100 val loss 7.0736 -2100 val perplexity 1180.3406 -2100 train 7.182455 (lr=1.3786e-04) (hash(x)=53638030) -3500 val loss 6.9891 -3500 val perplexity 1084.7898 -3500 train 6.591727 (lr=7.7082e-05) (hash(x)=41135634) -2200 val loss 7.0376 -2200 val perplexity 1138.6738 -2200 train 7.247160 (lr=1.3660e-04) (hash(x)=56364593) -3600 val loss 6.9701 -3600 val perplexity 1064.2880 -3600 train 6.984326 (lr=7.5814e-05) (hash(x)=54286330) -2300 val loss 7.0180 -2300 val perplexity 1116.5631 -2300 train 7.111286 (lr=1.3527e-04) (hash(x)=52945580) -3700 val loss 6.9778 -3700 val perplexity 1072.5734 -3700 train 6.879121 (lr=7.4525e-05) (hash(x)=57061993) -2400 val loss 6.9654 -2400 val perplexity 1059.3838 -2400 train 7.038701 (lr=1.3390e-04) (hash(x)=54595211) -3800 val loss 6.9521 -3800 val perplexity 1045.3574 -3800 train 6.536635 (lr=7.3215e-05) (hash(x)=46544884) -2500 val loss 6.9307 -2500 val perplexity 1023.1781 -2500 train 6.927703 (lr=1.3247e-04) (hash(x)=52323363) -3900 val loss 6.9418 -3900 val perplexity 1034.6000 -3900 train 6.773740 (lr=7.1887e-05) (hash(x)=48937435) -2600 val loss 6.8754 -2600 val perplexity 968.1622 -2600 train 6.629884 (lr=1.3099e-04) (hash(x)=42652707) -4000 val loss 6.9220 -4000 val perplexity 1014.3887 -4000 train 6.967721 (lr=7.0541e-05) (hash(x)=54466186) -2700 val loss 6.8620 -2700 val perplexity 955.2903 -2700 train 6.563602 (lr=1.2946e-04) (hash(x)=45687855) -4100 val loss 6.9034 -4100 val perplexity 995.6447 -4100 train 6.734161 (lr=6.9180e-05) (hash(x)=51079995) -2800 val loss 6.8310 -2800 val perplexity 926.1013 -2800 train 6.844923 (lr=1.2788e-04) (hash(x)=49962830) -4200 val loss 6.8986 -4200 val perplexity 990.8748 -4200 train 7.022826 (lr=6.7804e-05) (hash(x)=56495581) -2900 val loss 6.8384 -2900 val perplexity 933.0102 -2900 train 6.857624 (lr=1.2626e-04) (hash(x)=52958451) -4300 val loss 6.8568 -4300 val perplexity 950.3548 -4300 train 6.723518 (lr=6.6414e-05) (hash(x)=45095478) -3000 val loss 6.7960 -3000 val perplexity 894.2563 -3000 train 6.743637 (lr=1.2459e-04) (hash(x)=48896990) -4400 val loss 6.8469 -4400 val perplexity 940.9572 -4400 train 6.786045 (lr=6.5013e-05) (hash(x)=49954028) -4500 val loss 6.8305 -4500 val perplexity 925.6757 -4500 train 6.764406 (lr=6.3602e-05) (hash(x)=48975821) -3100 val loss 6.7878 -3100 val perplexity 887.0014 -3100 train 6.793625 (lr=1.2287e-04) (hash(x)=47064930) -4600 val loss 6.8016 -4600 val perplexity 899.3201 -4600 train 6.895534 (lr=6.2182e-05) (hash(x)=49293028) -3200 val loss 6.7811 -3200 val perplexity 881.0615 -3200 train 6.505876 (lr=1.2112e-04) (hash(x)=45145921) -4700 val loss 6.7950 -4700 val perplexity 893.3932 -4700 train 6.722027 (lr=6.0754e-05) (hash(x)=48077753) -3300 val loss 6.7835 -3300 val perplexity 883.1204 -3300 train 6.615720 (lr=1.1932e-04) (hash(x)=48085018) -4800 val loss 6.7895 -4800 val perplexity 888.4474 -4800 train 6.651279 (lr=5.9321e-05) (hash(x)=45037879) -3400 val loss 6.7838 -3400 val perplexity 883.4468 -3400 train 6.771408 (lr=1.1749e-04) (hash(x)=49596053) -4900 val loss 6.7730 -4900 val perplexity 873.9567 -4900 train 6.689004 (lr=5.7883e-05) (hash(x)=50320154) -3500 val loss 6.7643 -3500 val perplexity 866.3450 -3500 train 6.378889 (lr=1.1562e-04) (hash(x)=41135634) -5000 val loss 6.7669 -5000 val perplexity 868.6503 -5000 train 6.983054 (lr=5.6442e-05) (hash(x)=55024523) -3600 val loss 6.7291 -3600 val perplexity 836.3896 -3600 train 6.714438 (lr=1.1372e-04) (hash(x)=54286330) -5100 val loss 6.7350 -5100 val perplexity 841.3091 -5100 train 6.674947 (lr=5.5000e-05) (hash(x)=47570607) -3700 val loss 6.7328 -3700 val perplexity 839.4858 -3700 train 6.590438 (lr=1.1179e-04) (hash(x)=57061993) -5200 val loss 6.7208 -5200 val perplexity 829.4543 -5200 train 6.841348 (lr=5.3558e-05) (hash(x)=54311349) -3800 val loss 6.7147 -3800 val perplexity 824.4700 -3800 train 6.275104 (lr=1.0982e-04) (hash(x)=46544884) -5300 val loss 6.7038 -5300 val perplexity 815.4897 -5300 train 6.609777 (lr=5.2117e-05) (hash(x)=47178524) -3900 val loss 6.7199 -3900 val perplexity 828.7628 -3900 train 6.531300 (lr=1.0783e-04) (hash(x)=48937435) -5400 val loss 6.6995 -5400 val perplexity 812.0095 -5400 train 6.718117 (lr=5.0679e-05) (hash(x)=49425088) -4000 val loss 6.7199 -4000 val perplexity 828.7194 -4000 train 6.746758 (lr=1.0581e-04) (hash(x)=54466186) -5500 val loss 6.6891 -5500 val perplexity 803.6185 -5500 train 6.531772 (lr=4.9246e-05) (hash(x)=46383189) -4100 val loss 6.6825 -4100 val perplexity 798.3271 -4100 train 6.493564 (lr=1.0377e-04) (hash(x)=51079995) -5600 val loss 6.6879 -5600 val perplexity 802.6320 -5600 train 6.864735 (lr=4.7818e-05) (hash(x)=54084990) -4200 val loss 6.6782 -4200 val perplexity 794.8940 -4200 train 6.765399 (lr=1.0171e-04) (hash(x)=56495581) -5700 val loss 6.6801 -5700 val perplexity 796.3665 -5700 train 6.365303 (lr=4.6398e-05) (hash(x)=47384182) -4300 val loss 6.6532 -4300 val perplexity 775.2820 -4300 train 6.498065 (lr=9.9622e-05) (hash(x)=45095478) -5800 val loss 6.6799 -5800 val perplexity 796.2286 -5800 train 6.573336 (lr=4.4987e-05) (hash(x)=51683744) -4400 val loss 6.6397 -4400 val perplexity 764.8487 -4400 train 6.632647 (lr=9.7520e-05) (hash(x)=49954028) -5900 val loss 6.6699 -5900 val perplexity 788.3209 -5900 train 6.679185 (lr=4.3586e-05) (hash(x)=52798836) -4500 val loss 6.6403 -4500 val perplexity 765.3331 -4500 train 6.555310 (lr=9.5403e-05) (hash(x)=48975821) -6000 val loss 6.6785 -6000 val perplexity 795.1404 -6000 train 6.579535 (lr=4.2196e-05) (hash(x)=50945000) -4600 val loss 6.6170 -4600 val perplexity 747.6994 -4600 train 6.699085 (lr=9.3273e-05) (hash(x)=49293028) -6100 val loss 6.6605 -6100 val perplexity 780.9373 -6100 train 6.376235 (lr=4.0820e-05) (hash(x)=48964427) -4700 val loss 6.6228 -4700 val perplexity 752.0621 -6200 val loss 6.6637 -6200 val perplexity 783.4239 -4700 train 6.526177 (lr=9.1132e-05) (hash(x)=48077753) -6200 train 6.446042 (lr=3.9459e-05) (hash(x)=47148610) -6300 val loss 6.6562 -6300 val perplexity 777.5964 -6300 train 6.572690 (lr=3.8113e-05) (hash(x)=50860553) -4800 val loss 6.6178 -4800 val perplexity 748.3293 -4800 train 6.518622 (lr=8.8982e-05) (hash(x)=45037879) -6400 val loss 6.6432 -6400 val perplexity 767.5582 -6400 train 6.425728 (lr=3.6785e-05) (hash(x)=50553992) -4900 val loss 6.5986 -4900 val perplexity 734.0506 -4900 train 6.506639 (lr=8.6825e-05) (hash(x)=50320154) -6500 val loss 6.6449 -6500 val perplexity 768.8839 -6500 train 6.475725 (lr=3.5475e-05) (hash(x)=52521332) -5000 val loss 6.5903 -5000 val perplexity 728.0233 -5000 train 6.842143 (lr=8.4663e-05) (hash(x)=55024523) -6600 val loss 6.6217 -6600 val perplexity 751.2403 -6600 train 6.966245 (lr=3.4186e-05) (hash(x)=52609843) -5100 val loss 6.5743 -5100 val perplexity 716.4435 -5100 train 6.487578 (lr=8.2500e-05) (hash(x)=47570607) -6700 val loss 6.6003 -6700 val perplexity 735.3451 -6700 train 6.636974 (lr=3.2918e-05) (hash(x)=53632957) -5200 val loss 6.5754 -5200 val perplexity 717.2426 -5200 train 6.693417 (lr=8.0337e-05) (hash(x)=54311349) -6800 val loss 6.5942 -6800 val perplexity 730.8583 -6800 train 6.607226 (lr=3.1672e-05) (hash(x)=46927608) -5300 val loss 6.5566 -5300 val perplexity 703.8453 -5300 train 6.460115 (lr=7.8175e-05) (hash(x)=47178524) -6900 val loss 6.5882 -6900 val perplexity 726.4496 -6900 train 6.615112 (lr=3.0451e-05) (hash(x)=54483057) -5400 val loss 6.5604 -5400 val perplexity 706.5711 -5400 train 6.593896 (lr=7.6018e-05) (hash(x)=49425088) -7000 val loss 6.5774 -7000 val perplexity 718.6606 -7000 train 6.763172 (lr=2.9255e-05) (hash(x)=51493884) -5500 val loss 6.5462 -5500 val perplexity 696.6125 -5500 train 6.380437 (lr=7.3868e-05) (hash(x)=46383189) -7100 val loss 6.5651 -7100 val perplexity 709.8884 -7100 train 6.717602 (lr=2.8085e-05) (hash(x)=53303341) -5600 val loss 6.5517 -5600 val perplexity 700.4420 -5600 train 6.718967 (lr=7.1727e-05) (hash(x)=54084990) -7200 val loss 6.5605 -7200 val perplexity 706.6425 -7200 train 6.264062 (lr=2.6943e-05) (hash(x)=45272178) -5700 val loss 6.5331 -5700 val perplexity 687.5358 -5700 train 6.188586 (lr=6.9597e-05) (hash(x)=47384182) -7300 val loss 6.5546 -7300 val perplexity 702.4502 -7300 train 6.485512 (lr=2.5830e-05) (hash(x)=50389872) -5800 val loss 6.5450 -5800 val perplexity 695.7803 -5800 train 6.420528 (lr=6.7480e-05) (hash(x)=51683744) -7400 val loss 6.5482 -7400 val perplexity 698.0001 -7400 train 6.201846 (lr=2.4746e-05) (hash(x)=43796301) -5900 val loss 6.5304 -5900 val perplexity 685.6703 -5900 train 6.529302 (lr=6.5378e-05) (hash(x)=52798836) -7500 val loss 6.5417 -7500 val perplexity 693.4697 -7500 train 6.477395 (lr=2.3694e-05) (hash(x)=47808686) -6000 val loss 6.5321 -6000 val perplexity 686.8110 -6000 train 6.415793 (lr=6.3294e-05) (hash(x)=50945000) -7600 val loss 6.5397 -7600 val perplexity 692.0661 -7600 train 6.286121 (lr=2.2674e-05) (hash(x)=41936898) -7700 val loss 6.5449 -7700 val perplexity 695.6599 -7700 train 6.755160 (lr=2.1687e-05) (hash(x)=57550318) -6100 val loss 6.5229 -6100 val perplexity 680.5154 -6100 train 6.207138 (lr=6.1230e-05) (hash(x)=48964427) -7800 val loss 6.5337 -7800 val perplexity 687.9454 -7800 train 6.407978 (lr=2.0735e-05) (hash(x)=47485210) -6200 val loss 6.5283 -6200 val perplexity 684.2143 -6200 train 6.284555 (lr=5.9188e-05) (hash(x)=47148610) -7900 val loss 6.5314 -7900 val perplexity 686.3399 -7900 train 6.898339 (lr=1.9818e-05) (hash(x)=53228688) -6300 val loss 6.5233 -6300 val perplexity 680.8536 -6300 train 6.436521 (lr=5.7169e-05) (hash(x)=50860553) -8000 val loss 6.5283 -8000 val perplexity 684.2267 -8000 train 6.578187 (lr=1.8936e-05) (hash(x)=52018673) -6400 val loss 6.5221 -6400 val perplexity 679.9780 -6400 train 6.280699 (lr=5.5177e-05) (hash(x)=50553992) -8100 val loss 6.5242 -8100 val perplexity 681.4324 -8100 train 6.353445 (lr=1.8092e-05) (hash(x)=47079349) -6500 val loss 6.5162 -6500 val perplexity 675.9912 -6500 train 6.320277 (lr=5.3213e-05) (hash(x)=52521332) -8200 val loss 6.5191 -8200 val perplexity 677.9484 -8200 train 6.559124 (lr=1.7286e-05) (hash(x)=57921563) -6600 val loss 6.4862 -6600 val perplexity 656.0344 -6600 train 6.825990 (lr=5.1279e-05) (hash(x)=52609843) -8300 val loss 6.5156 -8300 val perplexity 675.6302 -8300 train 6.059009 (lr=1.6519e-05) (hash(x)=45038933) -6700 val loss 6.4759 -6700 val perplexity 649.3335 -6700 train 6.509449 (lr=4.9377e-05) (hash(x)=53632957) -8400 val loss 6.5269 -8400 val perplexity 683.2643 -8400 train 6.374324 (lr=1.5791e-05) (hash(x)=47763246) -6800 val loss 6.4683 -6800 val perplexity 644.3602 -6800 train 6.460664 (lr=4.7509e-05) (hash(x)=46927608) -8500 val loss 6.5188 -8500 val perplexity 677.7935 -8500 train 6.563421 (lr=1.5103e-05) (hash(x)=56176595) -6900 val loss 6.4662 -6900 val perplexity 643.0048 -6900 train 6.484306 (lr=4.5676e-05) (hash(x)=54483057) -8600 val loss 6.5157 -8600 val perplexity 675.6860 -8600 train 6.585588 (lr=1.4456e-05) (hash(x)=55184249) -7000 val loss 6.4562 -7000 val perplexity 636.6522 -7000 train 6.601372 (lr=4.3882e-05) (hash(x)=51493884) -8700 val loss 6.5128 -8700 val perplexity 673.7424 -8700 train 6.452381 (lr=1.3851e-05) (hash(x)=46471646) -7100 val loss 6.4422 -7100 val perplexity 627.7880 -7100 train 6.557719 (lr=4.2128e-05) (hash(x)=53303341) -8800 val loss 6.5137 -8800 val perplexity 674.3239 -8800 train 6.222188 (lr=1.3289e-05) (hash(x)=46233162) -7200 val loss 6.4518 -7200 val perplexity 633.8185 -7200 train 6.174304 (lr=4.0414e-05) (hash(x)=45272178) -8900 val loss 6.5114 -8900 val perplexity 672.7723 -8900 train 6.433084 (lr=1.2769e-05) (hash(x)=47233684) -7300 val loss 6.4358 -7300 val perplexity 623.7919 -7300 train 6.357533 (lr=3.8745e-05) (hash(x)=50389872) -9000 val loss 6.5053 -9000 val perplexity 668.6711 -9000 train 6.403587 (lr=1.2292e-05) (hash(x)=48374529) -7400 val loss 6.4336 -7400 val perplexity 622.3898 -7400 train 6.076984 (lr=3.7120e-05) (hash(x)=43796301) -9100 val loss 6.4932 -9100 val perplexity 660.6213 -9100 train 6.443924 (lr=1.1860e-05) (hash(x)=48065371) -7500 val loss 6.4282 -7500 val perplexity 619.0293 -9200 val loss 6.4862 -9200 val perplexity 656.0491 -9200 train 6.478733 (lr=1.1472e-05) (hash(x)=47408078) -7500 train 6.375185 (lr=3.5541e-05) (hash(x)=47808686) -9300 val loss 6.4858 -9300 val perplexity 655.7463 -9300 train 6.474623 (lr=1.1128e-05) (hash(x)=50749781) -7600 val loss 6.4303 -7600 val perplexity 620.3874 -7600 train 6.170215 (lr=3.4011e-05) (hash(x)=41936898) -9400 val loss 6.4775 -9400 val perplexity 650.3433 -9400 train 6.675423 (lr=1.0830e-05) (hash(x)=48560169) -7700 val loss 6.4319 -7700 val perplexity 621.3718 -7700 train 6.636626 (lr=3.2531e-05) (hash(x)=57550318) -9500 val loss 6.4770 -9500 val perplexity 650.0038 -9500 train 6.594353 (lr=1.0577e-05) (hash(x)=50936392) -7800 val loss 6.4213 -7800 val perplexity 614.7798 -7800 train 6.287605 (lr=3.1102e-05) (hash(x)=47485210) -9600 val loss 6.4729 -9600 val perplexity 647.3536 -9600 train 6.489072 (lr=1.0369e-05) (hash(x)=50651714) -7900 val loss 6.4279 -7900 val perplexity 618.8944 -7900 train 6.767558 (lr=2.9726e-05) (hash(x)=53228688) -9700 val loss 6.4730 -9700 val perplexity 647.3935 -9700 train 6.543466 (lr=1.0208e-05) (hash(x)=47311384) -8000 val loss 6.4221 -8000 val perplexity 615.2637 -8000 train 6.469327 (lr=2.8405e-05) (hash(x)=52018673) -9800 val loss 6.4721 -9800 val perplexity 646.8399 -9800 train 6.569210 (lr=1.0092e-05) (hash(x)=50921139) -8100 val loss 6.4145 -8100 val perplexity 610.6134 -8100 train 6.244730 (lr=2.7138e-05) (hash(x)=47079349) -9900 val loss 6.4766 -9900 val perplexity 649.7541 -9900 train 6.602388 (lr=1.0023e-05) (hash(x)=48142455) -8200 val loss 6.4152 -8200 val perplexity 611.0695 -8200 train 6.430655 (lr=2.5929e-05) (hash(x)=57921563) -9999 val loss 6.4663 -9999 val perplexity 643.0909 -8300 val loss 6.4102 -8300 val perplexity 607.9884 -8300 train 5.933331 (lr=2.4778e-05) (hash(x)=45038933) -8400 val loss 6.4243 -8400 val perplexity 616.6609 -8400 train 6.261151 (lr=2.3686e-05) (hash(x)=47763246) -8500 val loss 6.4191 -8500 val perplexity 613.4542 -8500 train 6.478737 (lr=2.2655e-05) (hash(x)=56176595) -8600 val loss 6.4165 -8600 val perplexity 611.8454 -8600 train 6.476886 (lr=2.1685e-05) (hash(x)=55184249) -8700 val loss 6.4175 -8700 val perplexity 612.4998 -8700 train 6.344475 (lr=2.0777e-05) (hash(x)=46471646) -8800 val loss 6.4215 -8800 val perplexity 614.9026 -8800 train 6.112791 (lr=1.9933e-05) (hash(x)=46233162) -8900 val loss 6.4167 -8900 val perplexity 611.9857 -8900 train 6.344871 (lr=1.9153e-05) (hash(x)=47233684) -9000 val loss 6.4089 -9000 val perplexity 607.2270 -9000 train 6.273678 (lr=1.8439e-05) (hash(x)=48374529) -9100 val loss 6.3961 -9100 val perplexity 599.5264 -9100 train 6.349834 (lr=1.7790e-05) (hash(x)=48065371) -9200 val loss 6.3860 -9200 val perplexity 593.4916 -9200 train 6.378371 (lr=1.7208e-05) (hash(x)=47408078) -9300 val loss 6.3823 -9300 val perplexity 591.3016 -9300 train 6.362077 (lr=1.6692e-05) (hash(x)=50749781) -9400 val loss 6.3772 -9400 val perplexity 588.2960 -9400 train 6.570877 (lr=1.6245e-05) (hash(x)=48560169) -9500 val loss 6.3806 -9500 val perplexity 590.2719 -9500 train 6.477550 (lr=1.5865e-05) (hash(x)=50936392) -9600 val loss 6.3755 -9600 val perplexity 587.2906 -9600 train 6.395058 (lr=1.5554e-05) (hash(x)=50651714) -9700 val loss 6.3734 -9700 val perplexity 586.0524 -9700 train 6.412555 (lr=1.5312e-05) (hash(x)=47311384) -9800 val loss 6.3752 -9800 val perplexity 587.0786 -9800 train 6.476438 (lr=1.5139e-05) (hash(x)=50921139) -9900 val loss 6.3796 -9900 val perplexity 589.6859 -9900 train 6.516968 (lr=1.5035e-05) (hash(x)=48142455) -9999 val loss 6.3698 -9999 val perplexity 583.9539 +0 val loss 11.7444 +0 val perplexity 126046.6875 +0 val loss 11.7444 +0 val perplexity 126046.6875 +0 train 11.717561 (lr=5.0000e-07) (hash(x)=55241167) +0 train 11.717561 (lr=2.5000e-07) (hash(x)=55241167) +0 train 11.717561 (lr=3.5000e-07) (hash(x)=55241167) +100 val loss 10.0232 +100 val perplexity 22544.5508 +100 train 9.954894 (lr=5.0500e-05) (hash(x)=46387919) +100 val loss 10.4176 +100 val perplexity 33443.2539 +100 train 10.353876 (lr=2.5250e-05) (hash(x)=46387919) +100 val loss 10.2775 +100 val perplexity 29071.3027 +100 train 10.212311 (lr=3.5350e-05) (hash(x)=46387919) +200 val loss 8.1644 +200 val perplexity 3513.4387 +200 train 8.214122 (lr=1.0000e-04) (hash(x)=51276369) +200 val loss 9.3411 +200 val perplexity 11397.3809 +200 train 9.368240 (lr=5.0000e-05) (hash(x)=51276369) +200 val loss 8.7332 +200 val perplexity 6205.6846 +200 train 8.767446 (lr=7.0000e-05) (hash(x)=51276369) +300 val loss 7.7253 +300 val perplexity 2264.8604 +300 train 7.652946 (lr=9.9999e-05) (hash(x)=48572728) +300 val loss 8.2242 +300 val perplexity 3730.2830 +300 train 8.183640 (lr=5.0000e-05) (hash(x)=48572728) +300 val loss 7.9303 +300 val perplexity 2780.2734 +300 train 7.884811 (lr=6.9999e-05) (hash(x)=48572728) +400 val loss 7.6629 +400 val perplexity 2127.9680 +400 train 7.383858 (lr=9.9996e-05) (hash(x)=44943400) +400 val loss 7.9362 +400 val perplexity 2796.7278 +400 train 7.703554 (lr=4.9998e-05) (hash(x)=44943400) +400 val loss 7.7163 +400 val perplexity 2244.7075 +400 train 7.450461 (lr=6.9997e-05) (hash(x)=44943400) +500 val loss 7.6514 +500 val perplexity 2103.5808 +500 train 7.535083 (lr=9.9992e-05) (hash(x)=48434590) +500 val loss 7.7930 +500 val perplexity 2423.4624 +500 train 7.663220 (lr=4.9996e-05) (hash(x)=48434590) +500 val loss 7.6693 +500 val perplexity 2141.6780 +500 train 7.557061 (lr=6.9994e-05) (hash(x)=48434590) +600 val loss 7.6385 +600 val perplexity 2076.6013 +600 train 7.480948 (lr=9.9986e-05) (hash(x)=47442266) +600 val loss 7.7135 +600 val perplexity 2238.3938 +600 train 7.570283 (lr=4.9993e-05) (hash(x)=47442266) +700 val loss 7.6417 +700 val perplexity 2083.2205 +700 train 7.680826 (lr=9.9978e-05) (hash(x)=56495712) +600 val loss 7.6519 +600 val perplexity 2104.6392 +600 train 7.508933 (lr=6.9990e-05) (hash(x)=47442266) +700 val loss 7.6768 +700 val perplexity 2157.8054 +700 train 7.719896 (lr=4.9989e-05) (hash(x)=56495712) +800 val loss 7.6275 +800 val perplexity 2053.9451 +800 train 7.610380 (lr=9.9968e-05) (hash(x)=50093738) +700 val loss 7.6300 +700 val perplexity 2059.0032 +700 train 7.672153 (lr=6.9984e-05) (hash(x)=56495712) +800 val loss 7.6636 +800 val perplexity 2129.4907 +800 train 7.659448 (lr=4.9984e-05) (hash(x)=50093738) +900 val loss 7.5995 +900 val perplexity 1997.1936 +900 train 7.568697 (lr=9.9956e-05) (hash(x)=49436061) +800 val loss 7.6213 +800 val perplexity 2041.2787 +800 train 7.620076 (lr=6.9977e-05) (hash(x)=50093738) +900 val loss 7.6571 +900 val perplexity 2115.5845 +900 train 7.622772 (lr=4.9978e-05) (hash(x)=49436061) +1000 val loss 7.5903 +1000 val perplexity 1978.9233 +1000 train 7.524452 (lr=9.9943e-05) (hash(x)=48818282) +900 val loss 7.6047 +900 val perplexity 2007.5801 +900 train 7.571390 (lr=6.9969e-05) (hash(x)=49436061) +1000 val loss 7.6521 +1000 val perplexity 2105.0488 +1000 train 7.600875 (lr=4.9971e-05) (hash(x)=48818282) +1100 val loss 7.5619 +1100 val perplexity 1923.4355 +1100 train 7.634329 (lr=9.9927e-05) (hash(x)=55387425) +1000 val loss 7.5996 +1000 val perplexity 1997.3289 +1000 train 7.541608 (lr=6.9960e-05) (hash(x)=48818282) +1100 val loss 7.6374 +1100 val perplexity 2074.3606 +1100 train 7.700971 (lr=4.9964e-05) (hash(x)=55387425) +1200 val loss 7.5663 +1200 val perplexity 1932.0634 +1200 train 7.518967 (lr=9.9910e-05) (hash(x)=48328378) +1100 val loss 7.5900 +1100 val perplexity 1978.3015 +1100 train 7.645202 (lr=6.9949e-05) (hash(x)=55387425) +1200 val loss 7.6194 +1200 val perplexity 2037.2762 +1200 train 7.575632 (lr=4.9955e-05) (hash(x)=48328378) +1300 val loss 7.5576 +1300 val perplexity 1915.2864 +1300 train 7.491983 (lr=9.9892e-05) (hash(x)=48956717) +1200 val loss 7.5793 +1200 val perplexity 1957.2340 +1200 train 7.525269 (lr=6.9937e-05) (hash(x)=48328378) +1300 val loss 7.6127 +1300 val perplexity 2023.8110 +1300 train 7.551956 (lr=4.9946e-05) (hash(x)=48956717) +1400 val loss 7.5658 +1400 val perplexity 1930.9194 +1400 train 7.451519 (lr=9.9871e-05) (hash(x)=49782571) +1300 val loss 7.5621 +1300 val perplexity 1923.8492 +1300 train 7.494945 (lr=6.9924e-05) (hash(x)=48956717) +1400 val loss 7.6143 +1400 val perplexity 2026.9111 +1400 train 7.516979 (lr=4.9936e-05) (hash(x)=49782571) +1500 val loss 7.5407 +1500 val perplexity 1883.0624 +1500 train 7.340364 (lr=9.9849e-05) (hash(x)=51081994) +1400 val loss 7.5519 +1400 val perplexity 1904.3701 +1400 train 7.434339 (lr=6.9910e-05) (hash(x)=49782571) +1500 val loss 7.6199 +1500 val perplexity 2038.2916 +1500 train 7.440731 (lr=4.9924e-05) (hash(x)=51081994) +1600 val loss 7.5355 +1600 val perplexity 1873.3113 +1600 train 7.621713 (lr=9.9825e-05) (hash(x)=48791204) +1500 val loss 7.5278 +1500 val perplexity 1858.9670 +1500 train 7.315996 (lr=6.9894e-05) (hash(x)=51081994) +1600 val loss 7.6135 +1600 val perplexity 2025.2590 +1600 train 7.676984 (lr=4.9912e-05) (hash(x)=48791204) +1700 val loss 7.5043 +1700 val perplexity 1815.8582 +1700 train 7.220179 (lr=9.9799e-05) (hash(x)=46425011) +1700 val loss 7.6135 +1700 val perplexity 2025.4175 +1700 train 7.341025 (lr=4.9899e-05) (hash(x)=46425011) +1600 val loss 7.5079 +1600 val perplexity 1822.3456 +1600 train 7.567111 (lr=6.9877e-05) (hash(x)=48791204) +1800 val loss 7.4488 +1800 val perplexity 1717.8859 +1800 train 7.120200 (lr=9.9771e-05) (hash(x)=44919141) +1800 val loss 7.6094 +1800 val perplexity 2017.1632 +1800 train 7.280530 (lr=4.9885e-05) (hash(x)=44919141) +1700 val loss 7.4883 +1700 val perplexity 1787.0634 +1700 train 7.190652 (lr=6.9859e-05) (hash(x)=46425011) +1900 val loss 7.3889 +1900 val perplexity 1617.9668 +1900 train 7.489644 (lr=9.9741e-05) (hash(x)=48299675) +1900 val loss 7.5958 +1900 val perplexity 1989.8569 +1900 train 7.690263 (lr=4.9871e-05) (hash(x)=48299675) +2000 val loss 7.3729 +2000 val perplexity 1592.3092 +2000 train 8.364852 (lr=9.9710e-05) (hash(x)=61804797) +1800 val loss 7.4852 +1800 val perplexity 1781.5627 +1800 train 7.144979 (lr=6.9840e-05) (hash(x)=44919141) +2000 val loss 7.5710 +2000 val perplexity 1941.0167 +2000 train 8.491461 (lr=4.9855e-05) (hash(x)=61804797) +2100 val loss 7.3058 +2100 val perplexity 1488.9027 +2100 train 7.396472 (lr=9.9677e-05) (hash(x)=53638030) +1900 val loss 7.4389 +1900 val perplexity 1700.9318 +1900 train 7.527557 (lr=6.9819e-05) (hash(x)=48299675) +2100 val loss 7.5529 +2100 val perplexity 1906.1926 +2100 train 7.650662 (lr=4.9839e-05) (hash(x)=53638030) +2200 val loss 7.2681 +2200 val perplexity 1433.8151 +2200 train 7.489449 (lr=9.9642e-05) (hash(x)=56364593) +2000 val loss 7.4157 +2000 val perplexity 1661.8918 +2000 train 8.400645 (lr=6.9797e-05) (hash(x)=61804797) +2200 val loss 7.5384 +2200 val perplexity 1878.8729 +2200 train 7.776010 (lr=4.9821e-05) (hash(x)=56364593) +2300 val loss 7.2275 +2300 val perplexity 1376.7749 +2300 train 7.310844 (lr=9.9606e-05) (hash(x)=52945580) +2100 val loss 7.3636 +2100 val perplexity 1577.5614 +2100 train 7.461798 (lr=6.9774e-05) (hash(x)=53638030) +2300 val loss 7.5429 +2300 val perplexity 1887.3053 +2300 train 7.644155 (lr=4.9803e-05) (hash(x)=52945580) +2400 val loss 7.2015 +2400 val perplexity 1341.4010 +2400 train 7.285708 (lr=9.9567e-05) (hash(x)=54595211) +2200 val loss 7.3501 +2200 val perplexity 1556.3634 +2200 train 7.590878 (lr=6.9750e-05) (hash(x)=56364593) +2400 val loss 7.5313 +2400 val perplexity 1865.5729 +2400 train 7.610705 (lr=4.9784e-05) (hash(x)=54595211) +2500 val loss 7.1696 +2500 val perplexity 1299.3694 +2500 train 7.171182 (lr=9.9527e-05) (hash(x)=52323363) +2300 val loss 7.3272 +2300 val perplexity 1521.1348 +2300 train 7.414377 (lr=6.9724e-05) (hash(x)=52945580) +2500 val loss 7.5244 +2500 val perplexity 1852.6257 +2500 train 7.541839 (lr=4.9764e-05) (hash(x)=52323363) +2600 val loss 7.1320 +2600 val perplexity 1251.3873 +2600 train 6.928236 (lr=9.9485e-05) (hash(x)=42652707) +2400 val loss 7.2980 +2400 val perplexity 1477.4061 +2400 train 7.387866 (lr=6.9697e-05) (hash(x)=54595211) +2600 val loss 7.5463 +2600 val perplexity 1893.6759 +2600 train 7.360832 (lr=4.9743e-05) (hash(x)=42652707) +2700 val loss 7.1148 +2700 val perplexity 1230.0439 +2700 train 6.847325 (lr=9.9442e-05) (hash(x)=45687855) +2700 val loss 7.5004 +2700 val perplexity 1808.7952 +2700 train 7.232375 (lr=4.9721e-05) (hash(x)=45687855) +2500 val loss 7.2751 +2500 val perplexity 1443.9042 +2500 train 7.286755 (lr=6.9669e-05) (hash(x)=52323363) +2800 val loss 7.0853 +2800 val perplexity 1194.2666 +2800 train 7.076136 (lr=9.9396e-05) (hash(x)=49962830) +2800 val loss 7.4797 +2800 val perplexity 1771.7085 +2800 train 7.449294 (lr=4.9698e-05) (hash(x)=49962830) +2600 val loss 7.2549 +2600 val perplexity 1415.0634 +2600 train 7.026134 (lr=6.9640e-05) (hash(x)=42652707) +2900 val loss 7.0704 +2900 val perplexity 1176.6239 +2900 train 7.107570 (lr=9.9349e-05) (hash(x)=52958451) +2900 val loss 7.4806 +2900 val perplexity 1773.2273 +2900 train 7.575212 (lr=4.9674e-05) (hash(x)=52958451) +2700 val loss 7.2088 +2700 val perplexity 1351.2257 +2700 train 6.914386 (lr=6.9609e-05) (hash(x)=45687855) +3000 val loss 7.0224 +3000 val perplexity 1121.4812 +3000 train 6.969501 (lr=9.9300e-05) (hash(x)=48896990) +3000 val loss 7.4718 +3000 val perplexity 1757.7021 +3000 train 7.401915 (lr=4.9650e-05) (hash(x)=48896990) +2800 val loss 7.2104 +2800 val perplexity 1353.4434 +2800 train 7.161264 (lr=6.9577e-05) (hash(x)=49962830) +3100 val loss 6.9968 +3100 val perplexity 1093.1571 +3100 train 6.989478 (lr=9.9249e-05) (hash(x)=47064930) +3100 val loss 7.4412 +3100 val perplexity 1704.8423 +3100 train 7.387419 (lr=4.9625e-05) (hash(x)=47064930) +2900 val loss 7.2341 +2900 val perplexity 1385.8876 +2900 train 7.283199 (lr=6.9544e-05) (hash(x)=52958451) +3200 val loss 7.0087 +3200 val perplexity 1106.1842 +3200 train 6.756989 (lr=9.9197e-05) (hash(x)=45145921) +3200 val loss 7.4138 +3200 val perplexity 1658.7645 +3200 train 7.175171 (lr=4.9598e-05) (hash(x)=45145921) +3300 val loss 6.9522 +3300 val perplexity 1045.4287 +3300 train 6.791659 (lr=9.9142e-05) (hash(x)=48085018) +3000 val loss 7.2728 +3000 val perplexity 1440.6320 +3000 train 7.216793 (lr=6.9510e-05) (hash(x)=48896990) +3300 val loss 7.3939 +3300 val perplexity 1626.0817 +3300 train 7.266852 (lr=4.9571e-05) (hash(x)=48085018) +3400 val loss 6.9438 +3400 val perplexity 1036.7441 +3400 train 6.943289 (lr=9.9086e-05) (hash(x)=49596053) +3100 val loss 7.1621 +3100 val perplexity 1289.6044 +3100 train 7.132561 (lr=6.9474e-05) (hash(x)=47064930) +3400 val loss 7.3480 +3400 val perplexity 1553.1340 +3400 train 7.367152 (lr=4.9543e-05) (hash(x)=49596053) +3500 val loss 6.9038 +3500 val perplexity 996.0829 +3500 train 6.516293 (lr=9.9028e-05) (hash(x)=41135634) +3200 val loss 7.1516 +3200 val perplexity 1276.0901 +3200 train 6.933980 (lr=6.9438e-05) (hash(x)=45145921) +3500 val loss 7.3144 +3500 val perplexity 1501.7457 +3500 train 6.877497 (lr=4.9514e-05) (hash(x)=41135634) +3600 val loss 6.8964 +3600 val perplexity 988.7259 +3600 train 6.909090 (lr=9.8969e-05) (hash(x)=54286330) +3300 val loss 7.1531 +3300 val perplexity 1278.0381 +3300 train 7.020805 (lr=6.9400e-05) (hash(x)=48085018) +3600 val loss 7.3016 +3600 val perplexity 1482.5974 +3600 train 7.303183 (lr=4.9484e-05) (hash(x)=54286330) +3700 val loss 6.8708 +3700 val perplexity 963.7280 +3700 train 6.759006 (lr=9.8908e-05) (hash(x)=57061993) +3700 val loss 7.2850 +3700 val perplexity 1458.2623 +3700 train 7.198061 (lr=4.9454e-05) (hash(x)=57061993) +3400 val loss 7.1262 +3400 val perplexity 1244.1886 +3400 train 7.148568 (lr=6.9360e-05) (hash(x)=49596053) +3800 val loss 6.8487 +3800 val perplexity 942.6844 +3800 train 6.403491 (lr=9.8845e-05) (hash(x)=46544884) +3800 val loss 7.2615 +3800 val perplexity 1424.3975 +3800 train 6.839896 (lr=4.9422e-05) (hash(x)=46544884) +3500 val loss 7.1148 +3500 val perplexity 1230.0734 +3500 train 6.743076 (lr=6.9320e-05) (hash(x)=41135634) +3900 val loss 6.8487 +3900 val perplexity 942.6713 +3900 train 6.683726 (lr=9.8780e-05) (hash(x)=48937435) +3900 val loss 7.2531 +3900 val perplexity 1412.5129 +3900 train 7.092195 (lr=4.9390e-05) (hash(x)=48937435) +3600 val loss 7.1074 +3600 val perplexity 1220.9386 +3600 train 7.123626 (lr=6.9278e-05) (hash(x)=54286330) +4000 val loss 6.8492 +4000 val perplexity 943.0823 +4000 train 6.922632 (lr=9.8713e-05) (hash(x)=54466186) +4000 val loss 7.2354 +4000 val perplexity 1387.7087 +4000 train 7.246126 (lr=4.9357e-05) (hash(x)=54466186) +3700 val loss 7.1050 +3700 val perplexity 1217.9886 +3700 train 7.032697 (lr=6.9235e-05) (hash(x)=57061993) +4100 val loss 6.8203 +4100 val perplexity 916.2958 +4100 train 6.640280 (lr=9.8645e-05) (hash(x)=51079995) +4100 val loss 7.2099 +4100 val perplexity 1352.7820 +4100 train 7.045227 (lr=4.9322e-05) (hash(x)=51079995) +3800 val loss 7.0810 +3800 val perplexity 1189.1099 +3800 train 6.669867 (lr=6.9191e-05) (hash(x)=46544884) +4200 val loss 6.8007 +4200 val perplexity 898.4398 +4200 train 6.927116 (lr=9.8575e-05) (hash(x)=56495581) +4200 val loss 7.1831 +4200 val perplexity 1317.0364 +4200 train 7.338355 (lr=4.9287e-05) (hash(x)=56495581) +3900 val loss 7.0475 +3900 val perplexity 1149.9277 +3900 train 6.894968 (lr=6.9146e-05) (hash(x)=48937435) +4300 val loss 6.7575 +4300 val perplexity 860.4601 +4300 train 6.609615 (lr=9.8503e-05) (hash(x)=45095478) +4300 val loss 7.1410 +4300 val perplexity 1262.6746 +4300 train 6.996377 (lr=4.9252e-05) (hash(x)=45095478) +4000 val loss 7.0571 +4000 val perplexity 1161.0270 +4000 train 7.094718 (lr=6.9099e-05) (hash(x)=54466186) +4400 val loss 6.7344 +4400 val perplexity 840.8615 +4400 train 6.716791 (lr=9.8430e-05) (hash(x)=49954028) +4400 val loss 7.1261 +4400 val perplexity 1244.0100 +4400 train 7.071599 (lr=4.9215e-05) (hash(x)=49954028) +4100 val loss 6.9831 +4100 val perplexity 1078.2644 +4100 train 6.825128 (lr=6.9051e-05) (hash(x)=51079995) +4500 val loss 6.7151 +4500 val perplexity 824.7881 +4500 train 6.654397 (lr=9.8355e-05) (hash(x)=48975821) +4500 val loss 7.0778 +4500 val perplexity 1185.3824 +4500 train 6.998955 (lr=4.9177e-05) (hash(x)=48975821) +4200 val loss 6.9663 +4200 val perplexity 1060.3020 +4200 train 7.100050 (lr=6.9002e-05) (hash(x)=56495581) +4600 val loss 6.6988 +4600 val perplexity 811.4537 +4600 train 6.786911 (lr=9.8278e-05) (hash(x)=49293028) +4600 val loss 7.0381 +4600 val perplexity 1139.2338 +4600 train 7.101270 (lr=4.9139e-05) (hash(x)=49293028) +4300 val loss 6.9039 +4300 val perplexity 996.1081 +4300 train 6.771300 (lr=6.8952e-05) (hash(x)=45095478) +4700 val loss 6.6957 +4700 val perplexity 808.9425 +4700 train 6.622647 (lr=9.8199e-05) (hash(x)=48077753) +4700 val loss 7.0090 +4700 val perplexity 1106.5409 +4700 train 6.926502 (lr=4.9099e-05) (hash(x)=48077753) +4400 val loss 6.8956 +4400 val perplexity 987.9341 +4400 train 6.820930 (lr=6.8901e-05) (hash(x)=49954028) +4800 val loss 6.9764 +4800 val perplexity 1071.0530 +4800 train 6.803982 (lr=4.9059e-05) (hash(x)=45037879) +4800 val loss 6.6808 +4800 val perplexity 796.9500 +4800 train 6.553126 (lr=9.8119e-05) (hash(x)=45037879) +4500 val loss 6.8893 +4500 val perplexity 981.6792 +4500 train 6.835466 (lr=6.8848e-05) (hash(x)=48975821) +4900 val loss 6.9487 +4900 val perplexity 1041.8422 +4900 train 6.884708 (lr=4.9018e-05) (hash(x)=50320154) +4900 val loss 6.6928 +4900 val perplexity 806.5449 +4900 train 6.624340 (lr=9.8036e-05) (hash(x)=50320154) +4600 val loss 6.8234 +4600 val perplexity 919.1371 +4600 train 6.901464 (lr=6.8794e-05) (hash(x)=49293028) +5000 val loss 6.9164 +5000 val perplexity 1008.6340 +5000 train 7.118251 (lr=4.8976e-05) (hash(x)=55024523) +5000 val loss 6.6927 +5000 val perplexity 806.4706 +5000 train 6.928228 (lr=9.7953e-05) (hash(x)=55024523) +4700 val loss 6.8218 +4700 val perplexity 917.6364 +4700 train 6.741679 (lr=6.8739e-05) (hash(x)=48077753) +5100 val loss 6.8859 +5100 val perplexity 978.3375 +5100 train 6.834711 (lr=4.8934e-05) (hash(x)=47570607) +5100 val loss 6.6925 +5100 val perplexity 806.3434 +5100 train 6.637020 (lr=9.7867e-05) (hash(x)=47570607) +4800 val loss 6.7892 +4800 val perplexity 888.1746 +4800 train 6.649376 (lr=6.8683e-05) (hash(x)=45037879) +5200 val loss 6.8648 +5200 val perplexity 957.9620 +5200 train 6.989194 (lr=4.8890e-05) (hash(x)=54311349) +5200 val loss 6.6713 +5200 val perplexity 789.3846 +5200 train 6.798645 (lr=9.7780e-05) (hash(x)=54311349) +4900 val loss 6.7833 +4900 val perplexity 883.0000 +4900 train 6.700462 (lr=6.8626e-05) (hash(x)=50320154) +5300 val loss 6.8514 +5300 val perplexity 945.1937 +5300 train 6.747072 (lr=4.8846e-05) (hash(x)=47178524) +5300 val loss 6.6621 +5300 val perplexity 782.1596 +5300 train 6.567628 (lr=9.7691e-05) (hash(x)=47178524) +5000 val loss 6.7733 +5000 val perplexity 874.1767 +5400 val loss 6.8309 +5400 val perplexity 926.0236 +5400 train 6.862480 (lr=4.8800e-05) (hash(x)=49425088) +5000 train 6.995894 (lr=6.8567e-05) (hash(x)=55024523) +5400 val loss 6.6605 +5400 val perplexity 780.9615 +5400 train 6.674005 (lr=9.7600e-05) (hash(x)=49425088) +5500 val loss 6.7999 +5500 val perplexity 897.7542 +5500 train 6.627831 (lr=4.8754e-05) (hash(x)=46383189) +5100 val loss 6.7533 +5100 val perplexity 856.8628 +5100 train 6.695008 (lr=6.8507e-05) (hash(x)=47570607) +5500 val loss 6.6548 +5500 val perplexity 776.5374 +5500 train 6.504308 (lr=9.7508e-05) (hash(x)=46383189) +5600 val loss 6.7902 +5600 val perplexity 889.1043 +5600 train 6.988300 (lr=4.8707e-05) (hash(x)=54084990) +5200 val loss 6.7284 +5200 val perplexity 835.7720 +5200 train 6.850222 (lr=6.8446e-05) (hash(x)=54311349) +5600 val loss 6.6525 +5600 val perplexity 774.7244 +5600 train 6.827147 (lr=9.7414e-05) (hash(x)=54084990) +5700 val loss 6.7643 +5700 val perplexity 866.3623 +5700 train 6.442833 (lr=4.8659e-05) (hash(x)=47384182) +5300 val loss 6.7114 +5300 val perplexity 821.7026 +5300 train 6.605969 (lr=6.8384e-05) (hash(x)=47178524) +5700 val loss 6.6368 +5700 val perplexity 762.6723 +5700 train 6.298797 (lr=9.7318e-05) (hash(x)=47384182) +5800 val loss 6.7417 +5800 val perplexity 846.9810 +5800 train 6.638907 (lr=4.8611e-05) (hash(x)=51683744) +5400 val loss 6.7027 +5400 val perplexity 814.5709 +5400 train 6.726990 (lr=6.8320e-05) (hash(x)=49425088) +5800 val loss 6.6430 +5800 val perplexity 767.3810 +5800 train 6.526483 (lr=9.7221e-05) (hash(x)=51683744) +5900 val loss 6.7302 +5900 val perplexity 837.2730 +5900 train 6.729558 (lr=4.8561e-05) (hash(x)=52798836) +5500 val loss 6.6886 +5500 val perplexity 803.1932 +5500 train 6.527968 (lr=6.8256e-05) (hash(x)=46383189) +5900 val loss 6.6413 +5900 val perplexity 766.0853 +5900 train 6.645696 (lr=9.7122e-05) (hash(x)=52798836) +6000 val loss 6.7192 +6000 val perplexity 828.1390 +6000 train 6.623009 (lr=4.8511e-05) (hash(x)=50945000) +5600 val loss 6.7087 +5600 val perplexity 819.5231 +6000 val loss 6.6610 +6000 val perplexity 781.2982 +6000 train 6.546430 (lr=9.7021e-05) (hash(x)=50945000) +5600 train 6.890930 (lr=6.8190e-05) (hash(x)=54084990) +6100 val loss 6.7036 +6100 val perplexity 815.3730 +6100 train 6.419569 (lr=4.8459e-05) (hash(x)=48964427) +6100 val loss 6.6431 +6100 val perplexity 767.4487 +6100 train 6.336549 (lr=9.6919e-05) (hash(x)=48964427) +5700 val loss 6.6840 +5700 val perplexity 799.5497 +5700 train 6.347425 (lr=6.8123e-05) (hash(x)=47384182) +6200 val loss 6.7055 +6200 val perplexity 816.8487 +6200 train 6.473526 (lr=4.8407e-05) (hash(x)=47148610) +6200 val loss 6.6407 +6200 val perplexity 765.6679 +6200 train 6.396455 (lr=9.6815e-05) (hash(x)=47148610) +5800 val loss 6.6849 +5800 val perplexity 800.2454 +5800 train 6.568721 (lr=6.8055e-05) (hash(x)=51683744) +6300 val loss 6.6806 +6300 val perplexity 796.8170 +6300 train 6.611919 (lr=4.8355e-05) (hash(x)=50860553) +6300 val loss 6.6401 +6300 val perplexity 765.1675 +6300 train 6.538082 (lr=9.6709e-05) (hash(x)=50860553) +5900 val loss 6.6704 +5900 val perplexity 788.7311 +5900 train 6.678335 (lr=6.7985e-05) (hash(x)=52798836) +6400 val loss 6.6896 +6400 val perplexity 803.9791 +6400 train 6.461349 (lr=4.8301e-05) (hash(x)=50553992) +6400 val loss 6.6315 +6400 val perplexity 758.6357 +6400 train 6.403618 (lr=9.6602e-05) (hash(x)=50553992) +6500 val loss 6.6780 +6500 val perplexity 794.6962 +6500 train 6.493553 (lr=4.8246e-05) (hash(x)=52521332) +6000 val loss 6.6903 +6000 val perplexity 804.5701 +6000 train 6.592606 (lr=6.7915e-05) (hash(x)=50945000) +6500 val loss 6.6315 +6500 val perplexity 758.6274 +6500 train 6.440781 (lr=9.6493e-05) (hash(x)=52521332) +6600 val loss 6.6464 +6600 val perplexity 770.0264 +6600 train 7.018993 (lr=4.8191e-05) (hash(x)=52609843) +6100 val loss 6.6652 +6100 val perplexity 784.6075 +6100 train 6.380037 (lr=6.7843e-05) (hash(x)=48964427) +6700 val loss 6.6126 +6700 val perplexity 744.4439 +6700 train 6.647134 (lr=4.8135e-05) (hash(x)=53632957) +6600 val loss 6.5836 +6600 val perplexity 723.1738 +6600 train 6.929787 (lr=9.6382e-05) (hash(x)=52609843) +6200 val loss 6.6869 +6200 val perplexity 801.8214 +6200 train 6.454489 (lr=6.7770e-05) (hash(x)=47148610) +6800 val loss 6.6050 +6800 val perplexity 738.7873 +6800 train 6.612069 (lr=4.8078e-05) (hash(x)=46927608) +6700 val loss 6.5615 +6700 val perplexity 707.3563 +6700 train 6.595787 (lr=9.6270e-05) (hash(x)=53632957) +6300 val loss 6.6527 +6300 val perplexity 774.9099 +6300 train 6.581693 (lr=6.7696e-05) (hash(x)=50860553) +6900 val loss 6.5811 +6900 val perplexity 721.3437 +6900 train 6.601569 (lr=4.8020e-05) (hash(x)=54483057) +6800 val loss 6.5531 +6800 val perplexity 701.4470 +6800 train 6.572984 (lr=9.6156e-05) (hash(x)=46927608) +6400 val loss 6.6662 +6400 val perplexity 785.3917 +6400 train 6.447389 (lr=6.7621e-05) (hash(x)=50553992) +7000 val loss 6.5629 +7000 val perplexity 708.3310 +7000 train 6.755750 (lr=4.7961e-05) (hash(x)=51493884) +6900 val loss 6.5342 +6900 val perplexity 688.2824 +6900 train 6.548564 (lr=9.6040e-05) (hash(x)=54483057) +6500 val loss 6.6694 +6500 val perplexity 787.9563 +6500 train 6.480408 (lr=6.7545e-05) (hash(x)=52521332) +7100 val loss 6.5530 +7100 val perplexity 701.3360 +7100 train 6.707948 (lr=4.7902e-05) (hash(x)=53303341) +7000 val loss 6.5230 +7000 val perplexity 680.5875 +7000 train 6.697546 (lr=9.5923e-05) (hash(x)=51493884) +6600 val loss 6.6223 +6600 val perplexity 751.6710 +6600 train 6.960495 (lr=6.7467e-05) (hash(x)=52609843) +7200 val loss 6.5556 +7200 val perplexity 703.1818 +7200 train 6.249412 (lr=4.7842e-05) (hash(x)=45272178) +7100 val loss 6.5120 +7100 val perplexity 673.1962 +7100 train 6.653329 (lr=9.5804e-05) (hash(x)=53303341) +6700 val loss 6.5993 +6700 val perplexity 734.6102 +6700 train 6.636389 (lr=6.7389e-05) (hash(x)=53632957) +7300 val loss 6.5582 +7300 val perplexity 705.0065 +7300 train 6.503459 (lr=4.7781e-05) (hash(x)=50389872) +7200 val loss 6.5314 +7200 val perplexity 686.3867 +7200 train 6.234296 (lr=9.5683e-05) (hash(x)=45272178) +6800 val loss 6.5990 +6800 val perplexity 734.3454 +6800 train 6.596207 (lr=6.7309e-05) (hash(x)=46927608) +7400 val loss 6.5284 +7400 val perplexity 684.2949 +7400 train 6.174231 (lr=4.7719e-05) (hash(x)=43796301) +7300 val loss 6.5203 +7300 val perplexity 678.7845 +7300 train 6.458115 (lr=9.5561e-05) (hash(x)=50389872) +6900 val loss 6.5833 +6900 val perplexity 722.9094 +6900 train 6.621687 (lr=6.7228e-05) (hash(x)=54483057) +7500 val loss 6.5215 +7500 val perplexity 679.5922 +7500 train 6.475193 (lr=4.7656e-05) (hash(x)=47808686) +7400 val loss 6.5153 +7400 val perplexity 675.3668 +7400 train 6.160817 (lr=9.5437e-05) (hash(x)=43796301) +7600 val loss 6.5104 +7600 val perplexity 672.1002 +7600 train 6.245540 (lr=4.7593e-05) (hash(x)=41936898) +7000 val loss 6.5737 +7000 val perplexity 716.0107 +7000 train 6.746213 (lr=6.7146e-05) (hash(x)=51493884) +7500 val loss 6.4960 +7500 val perplexity 662.4863 +7500 train 6.448035 (lr=9.5312e-05) (hash(x)=47808686) +7700 val loss 6.5437 +7700 val perplexity 694.8350 +7700 train 6.765839 (lr=4.7528e-05) (hash(x)=57550318) +7100 val loss 6.5556 +7100 val perplexity 703.1359 +7100 train 6.689147 (lr=6.7063e-05) (hash(x)=53303341) +7600 val loss 6.5110 +7600 val perplexity 672.5247 +7600 train 6.242159 (lr=9.5185e-05) (hash(x)=41936898) +7800 val loss 6.5072 +7800 val perplexity 669.9379 +7800 train 6.386351 (lr=4.7463e-05) (hash(x)=47485210) +7200 val loss 6.5631 +7200 val perplexity 708.4371 +7200 train 6.267578 (lr=6.6978e-05) (hash(x)=45272178) +7700 val loss 6.5560 +7700 val perplexity 703.4413 +7700 train 6.755744 (lr=9.5057e-05) (hash(x)=57550318) +7900 val loss 6.4952 +7900 val perplexity 661.9788 +7900 train 6.856887 (lr=4.7397e-05) (hash(x)=53228688) +7300 val loss 6.5509 +7300 val perplexity 699.8741 +7300 train 6.476492 (lr=6.6893e-05) (hash(x)=50389872) +7800 val loss 6.5042 +7800 val perplexity 667.9736 +7800 train 6.364461 (lr=9.4926e-05) (hash(x)=47485210) +8000 val loss 6.4845 +8000 val perplexity 654.9376 +8000 train 6.538146 (lr=4.7331e-05) (hash(x)=52018673) +7400 val loss 6.5347 +7400 val perplexity 688.6169 +7400 train 6.193424 (lr=6.6806e-05) (hash(x)=43796301) +7900 val loss 6.5301 +7900 val perplexity 685.4624 +7900 train 6.866262 (lr=9.4795e-05) (hash(x)=53228688) +8100 val loss 6.4782 +8100 val perplexity 650.7988 +8100 train 6.300193 (lr=4.7263e-05) (hash(x)=47079349) +7500 val loss 6.5268 +7500 val perplexity 683.2115 +7500 train 6.448314 (lr=6.6718e-05) (hash(x)=47808686) +8000 val loss 6.5068 +8000 val perplexity 669.6859 +8000 train 6.577403 (lr=9.4661e-05) (hash(x)=52018673) +8200 val loss 6.4762 +8200 val perplexity 649.5302 +8200 train 6.523792 (lr=4.7195e-05) (hash(x)=57921563) +7600 val loss 6.5195 +7600 val perplexity 678.2145 +7600 train 6.261240 (lr=6.6630e-05) (hash(x)=41936898) +8300 val loss 6.4670 +8300 val perplexity 643.5186 +8100 val loss 6.5174 +8100 val perplexity 676.8450 +8300 train 5.990251 (lr=4.7126e-05) (hash(x)=45038933) +8100 train 6.335436 (lr=9.4526e-05) (hash(x)=47079349) +7700 val loss 6.5438 +7700 val perplexity 694.8987 +7700 train 6.748104 (lr=6.6540e-05) (hash(x)=57550318) +8400 val loss 6.4961 +8400 val perplexity 662.5320 +8400 train 6.334754 (lr=4.7056e-05) (hash(x)=47763246) +8200 val loss 6.4995 +8200 val perplexity 664.7935 +8200 train 6.527027 (lr=9.4390e-05) (hash(x)=57921563) +7800 val loss 6.5203 +7800 val perplexity 678.7521 +7800 train 6.382823 (lr=6.6448e-05) (hash(x)=47485210) +8500 val loss 6.4677 +8500 val perplexity 644.0254 +8500 train 6.522591 (lr=4.6986e-05) (hash(x)=56176595) +8300 val loss 6.5019 +8300 val perplexity 666.3915 +8300 train 6.023483 (lr=9.4252e-05) (hash(x)=45038933) +7900 val loss 6.4990 +7900 val perplexity 664.4810 +8600 val loss 6.4592 +8600 val perplexity 638.5692 +7900 train 6.858734 (lr=6.6356e-05) (hash(x)=53228688) +8600 train 6.534775 (lr=4.6914e-05) (hash(x)=55184249) +8400 val loss 6.5294 +8400 val perplexity 684.9655 +8400 train 6.357156 (lr=9.4112e-05) (hash(x)=47763246) +8700 val loss 6.4525 +8700 val perplexity 634.2612 +8700 train 6.378607 (lr=4.6842e-05) (hash(x)=46471646) +8000 val loss 6.4971 +8000 val perplexity 663.2284 +8000 train 6.537724 (lr=6.6263e-05) (hash(x)=52018673) +8500 val loss 6.4983 +8500 val perplexity 663.9872 +8500 train 6.559092 (lr=9.3971e-05) (hash(x)=56176595) +8800 val loss 6.4490 +8800 val perplexity 632.0590 +8800 train 6.138185 (lr=4.6769e-05) (hash(x)=46233162) +8100 val loss 6.4872 +8100 val perplexity 656.7007 +8100 train 6.310978 (lr=6.6169e-05) (hash(x)=47079349) +8600 val loss 6.4893 +8600 val perplexity 658.0417 +8600 train 6.562383 (lr=9.3828e-05) (hash(x)=55184249) +8900 val loss 6.4592 +8900 val perplexity 638.5201 +8900 train 6.371769 (lr=4.6696e-05) (hash(x)=47233684) +8200 val loss 6.4736 +8200 val perplexity 647.7810 +8200 train 6.520305 (lr=6.6073e-05) (hash(x)=57921563) +8700 val loss 6.4823 +8700 val perplexity 653.5045 +8700 train 6.430116 (lr=9.3684e-05) (hash(x)=46471646) +9000 val loss 6.4335 +9000 val perplexity 622.3281 +9000 train 6.317858 (lr=4.6621e-05) (hash(x)=48374529) +8800 val loss 6.4801 +8800 val perplexity 652.0471 +8800 train 6.171477 (lr=9.3538e-05) (hash(x)=46233162) +8300 val loss 6.4695 +8300 val perplexity 645.1451 +8300 train 6.009447 (lr=6.5976e-05) (hash(x)=45038933) +9100 val loss 6.3986 +9100 val perplexity 601.0143 +9100 train 6.379299 (lr=4.6546e-05) (hash(x)=48065371) +8900 val loss 6.5132 +8900 val perplexity 674.0120 +8900 train 6.434764 (lr=9.3391e-05) (hash(x)=47233684) +8400 val loss 6.5067 +8400 val perplexity 669.5898 +8400 train 6.319866 (lr=6.5879e-05) (hash(x)=47763246) +9200 val loss 6.4103 +9200 val perplexity 608.0588 +9200 train 6.404684 (lr=4.6470e-05) (hash(x)=47408078) +9000 val loss 6.4669 +9000 val perplexity 643.4824 +9000 train 6.383555 (lr=9.3242e-05) (hash(x)=48374529) +8500 val loss 6.4861 +8500 val perplexity 655.9434 +8500 train 6.526380 (lr=6.5780e-05) (hash(x)=56176595) +9300 val loss 6.3951 +9300 val perplexity 598.9264 +9300 train 6.380902 (lr=4.6393e-05) (hash(x)=50749781) +9100 val loss 6.4501 +9100 val perplexity 632.7797 +9100 train 6.407254 (lr=9.3092e-05) (hash(x)=48065371) +8600 val loss 6.4796 +8600 val perplexity 651.7400 +8600 train 6.548368 (lr=6.5680e-05) (hash(x)=55184249) +9400 val loss 6.3808 +9400 val perplexity 590.4025 +9400 train 6.584859 (lr=4.6316e-05) (hash(x)=48560169) +9200 val loss 6.4469 +9200 val perplexity 630.7631 +9200 train 6.404437 (lr=9.2940e-05) (hash(x)=47408078) +8700 val loss 6.4779 +8700 val perplexity 650.6334 +8700 train 6.409275 (lr=6.5579e-05) (hash(x)=46471646) +9500 val loss 6.3702 +9500 val perplexity 584.1541 +9500 train 6.499002 (lr=4.6238e-05) (hash(x)=50936392) +9300 val loss 6.4336 +9300 val perplexity 622.4111 +9300 train 6.417411 (lr=9.2786e-05) (hash(x)=50749781) +9600 val loss 6.3660 +9600 val perplexity 581.7228 +8800 val loss 6.4744 +8800 val perplexity 648.3469 +9600 train 6.397464 (lr=4.6159e-05) (hash(x)=50651714) +8800 train 6.155792 (lr=6.5477e-05) (hash(x)=46233162) +9400 val loss 6.4373 +9400 val perplexity 624.7006 +9400 train 6.645349 (lr=9.2632e-05) (hash(x)=48560169) +9700 val loss 6.3622 +9700 val perplexity 579.5178 +9700 train 6.442993 (lr=4.6079e-05) (hash(x)=47311384) +8900 val loss 6.5010 +8900 val perplexity 665.8210 +8900 train 6.413991 (lr=6.5374e-05) (hash(x)=47233684) +9500 val loss 6.4286 +9500 val perplexity 619.3281 +9500 train 6.495954 (lr=9.2475e-05) (hash(x)=50936392) +9800 val loss 6.3523 +9800 val perplexity 573.8036 +9800 train 6.467707 (lr=4.5999e-05) (hash(x)=50921139) +9000 val loss 6.4552 +9000 val perplexity 635.9965 +9000 train 6.314552 (lr=6.5270e-05) (hash(x)=48374529) +9600 val loss 6.4161 +9600 val perplexity 611.5986 +9600 train 6.447667 (lr=9.2317e-05) (hash(x)=50651714) +9900 val loss 6.3963 +9900 val perplexity 599.6365 +9900 train 6.523573 (lr=4.5917e-05) (hash(x)=48142455) +9100 val loss 6.4225 +9100 val perplexity 615.5554 +9100 train 6.385074 (lr=6.5164e-05) (hash(x)=48065371) +10000 val loss 6.3595 +10000 val perplexity 577.9396 +9700 val loss 6.4472 +9700 val perplexity 630.9060 +9700 train 6.521022 (lr=9.2158e-05) (hash(x)=47311384) +10000 train 6.527015 (lr=4.5835e-05) (hash(x)=50757167) +9200 val loss 6.4102 +9200 val perplexity 608.0118 +9200 train 6.397487 (lr=6.5058e-05) (hash(x)=47408078) +10100 val loss 6.3427 +10100 val perplexity 568.3453 +10100 train 6.306806 (lr=4.5753e-05) (hash(x)=46128585) +9800 val loss 6.4369 +9800 val perplexity 624.4707 +9800 train 6.547096 (lr=9.1997e-05) (hash(x)=50921139) +9300 val loss 6.4153 +9300 val perplexity 611.1243 +9300 train 6.385008 (lr=6.4951e-05) (hash(x)=50749781) +10200 val loss 6.3368 +10200 val perplexity 564.9656 +10200 train 6.583423 (lr=4.5669e-05) (hash(x)=57016296) +9900 val loss 6.4813 +9900 val perplexity 652.8093 +9900 train 6.580163 (lr=9.1835e-05) (hash(x)=48142455) +9400 val loss 6.3939 +9400 val perplexity 598.1655 +9400 train 6.605084 (lr=6.4842e-05) (hash(x)=48560169) +10300 val loss 6.3370 +10300 val perplexity 565.0995 +10300 train 6.398365 (lr=4.5585e-05) (hash(x)=49839464) +10000 val loss 6.4333 +10000 val perplexity 622.2515 +10000 train 6.574889 (lr=9.1671e-05) (hash(x)=50757167) +9500 val loss 6.3949 +9500 val perplexity 598.7739 +9500 train 6.505234 (lr=6.4733e-05) (hash(x)=50936392) +10400 val loss 6.3355 +10400 val perplexity 564.2524 +10400 train 6.375431 (lr=4.5501e-05) (hash(x)=50237852) +10100 val loss 6.4271 +10100 val perplexity 618.3655 +10100 train 6.385709 (lr=9.1506e-05) (hash(x)=46128585) +9600 val loss 6.3899 +9600 val perplexity 595.8224 +9600 train 6.408211 (lr=6.4622e-05) (hash(x)=50651714) +10500 val loss 6.3327 +10500 val perplexity 562.6645 +10500 train 6.463687 (lr=4.5415e-05) (hash(x)=48003282) +10200 val loss 6.4375 +10200 val perplexity 624.8270 +10200 train 6.647473 (lr=9.1339e-05) (hash(x)=57016296) +9700 val loss 6.4123 +9700 val perplexity 609.2880 +9700 train 6.483752 (lr=6.4511e-05) (hash(x)=47311384) +10600 val loss 6.3217 +10600 val perplexity 556.5027 +10600 train 6.243126 (lr=4.5329e-05) (hash(x)=53088427) +10300 val loss 6.4242 +10300 val perplexity 616.5942 +10300 train 6.493999 (lr=9.1171e-05) (hash(x)=49839464) +9800 val loss 6.3913 +9800 val perplexity 596.6329 +9800 train 6.521517 (lr=6.4398e-05) (hash(x)=50921139) +10700 val loss 6.3183 +10700 val perplexity 554.6146 +10700 train 6.143308 (lr=4.5242e-05) (hash(x)=47687508) +10400 val loss 6.4189 +10400 val perplexity 613.3044 +10400 train 6.429158 (lr=9.1001e-05) (hash(x)=50237852) +9900 val loss 6.4089 +9900 val perplexity 607.2403 +9900 train 6.524827 (lr=6.4284e-05) (hash(x)=48142455) +10800 val loss 6.3158 +10800 val perplexity 553.2683 +10800 train 6.176620 (lr=4.5154e-05) (hash(x)=47658698) +10500 val loss 6.4138 +10500 val perplexity 610.1808 +10500 train 6.476384 (lr=9.0830e-05) (hash(x)=48003282) +10000 val loss 6.3896 +10000 val perplexity 595.6434 +10000 train 6.556873 (lr=6.4170e-05) (hash(x)=50757167) +10900 val loss 6.3140 +10900 val perplexity 552.2707 +10900 train 6.319444 (lr=4.5066e-05) (hash(x)=52957012) +10600 val loss 6.4125 +10600 val perplexity 609.4178 +10600 train 6.330734 (lr=9.0658e-05) (hash(x)=53088427) +10100 val loss 6.3830 +10100 val perplexity 591.6942 +10100 train 6.365413 (lr=6.4054e-05) (hash(x)=46128585) +11000 val loss 6.3041 +11000 val perplexity 546.7923 +11000 train 6.431828 (lr=4.4977e-05) (hash(x)=54632498) +10700 val loss 6.4377 +10700 val perplexity 624.9506 +10700 train 6.248556 (lr=9.0484e-05) (hash(x)=47687508) +10200 val loss 6.3851 +10200 val perplexity 592.9426 +10200 train 6.623396 (lr=6.3937e-05) (hash(x)=57016296) +11100 val loss 6.2974 +11100 val perplexity 543.1434 +11100 train 6.717603 (lr=4.4887e-05) (hash(x)=54028593) +10800 val loss 6.4308 +10800 val perplexity 620.6729 +10300 val loss 6.3942 +10300 val perplexity 598.3644 +10800 train 6.281879 (lr=9.0308e-05) (hash(x)=47658698) +10300 train 6.451300 (lr=6.3820e-05) (hash(x)=49839464) +11200 val loss 6.2905 +11200 val perplexity 539.4050 +11200 train 6.437254 (lr=4.4797e-05) (hash(x)=54084412) +10900 val loss 6.4203 +10900 val perplexity 614.1601 +10900 train 6.417076 (lr=9.0132e-05) (hash(x)=52957012) +11300 val loss 6.2949 +11300 val perplexity 541.8032 +10400 val loss 6.3741 +10400 val perplexity 586.4368 +11300 train 6.081520 (lr=4.4706e-05) (hash(x)=48170961) +10400 train 6.395322 (lr=6.3701e-05) (hash(x)=50237852) +11400 val loss 6.2732 +11400 val perplexity 530.1591 +11000 val loss 6.4158 +11000 val perplexity 611.4429 +11400 train 6.277826 (lr=4.4614e-05) (hash(x)=52124243) +11000 train 6.532582 (lr=8.9954e-05) (hash(x)=54632498) +10500 val loss 6.3744 +10500 val perplexity 586.6580 +10500 train 6.436301 (lr=6.3581e-05) (hash(x)=48003282) +11500 val loss 6.2693 +11500 val perplexity 528.1030 +11500 train 6.208919 (lr=4.4521e-05) (hash(x)=46985517) +11100 val loss 6.3796 +11100 val perplexity 589.7199 +11100 train 6.831900 (lr=8.9774e-05) (hash(x)=54028593) +10600 val loss 6.3612 +10600 val perplexity 578.9550 +10600 train 6.263483 (lr=6.3460e-05) (hash(x)=53088427) +11600 val loss 6.2706 +11600 val perplexity 528.8131 +11600 train 6.018594 (lr=4.4428e-05) (hash(x)=46741028) +11200 val loss 6.3781 +11200 val perplexity 588.8362 +11200 train 6.515458 (lr=8.9593e-05) (hash(x)=54084412) +10700 val loss 6.3604 +10700 val perplexity 578.4891 +10700 train 6.186274 (lr=6.3339e-05) (hash(x)=47687508) +11700 val loss 6.2734 +11700 val perplexity 530.2776 +11700 train 6.162641 (lr=4.4334e-05) (hash(x)=45729773) +11300 val loss 6.3881 +11300 val perplexity 594.7457 +11300 train 6.200679 (lr=8.9411e-05) (hash(x)=48170961) +10800 val loss 6.3703 +10800 val perplexity 584.2268 +10800 train 6.229922 (lr=6.3216e-05) (hash(x)=47658698) +11800 val loss 6.2669 +11800 val perplexity 526.8268 +11800 train 6.216451 (lr=4.4240e-05) (hash(x)=50540350) +11400 val loss 6.3756 +11400 val perplexity 587.3629 +11400 train 6.401515 (lr=8.9227e-05) (hash(x)=52124243) +10900 val loss 6.3589 +10900 val perplexity 577.6041 +10900 train 6.340561 (lr=6.3092e-05) (hash(x)=52957012) +11900 val loss 6.2600 +11900 val perplexity 523.2253 +11900 train 6.204460 (lr=4.4145e-05) (hash(x)=50313941) +11500 val loss 6.3683 +11500 val perplexity 583.0668 +11500 train 6.308839 (lr=8.9043e-05) (hash(x)=46985517) +11000 val loss 6.3422 +11000 val perplexity 568.0190 +11000 train 6.441959 (lr=6.2968e-05) (hash(x)=54632498) +12000 val loss 6.2546 +12000 val perplexity 520.3788 +12000 train 6.188013 (lr=4.4049e-05) (hash(x)=49621720) +11600 val loss 6.3659 +11600 val perplexity 581.6612 +11600 train 6.130044 (lr=8.8856e-05) (hash(x)=46741028) +11100 val loss 6.3410 +11100 val perplexity 567.3729 +11100 train 6.690869 (lr=6.2842e-05) (hash(x)=54028593) +12100 val loss 6.2551 +12100 val perplexity 520.6556 +12100 train 6.121255 (lr=4.3952e-05) (hash(x)=48903010) +11700 val loss 6.3635 +11700 val perplexity 580.2996 +11700 train 6.248008 (lr=8.8668e-05) (hash(x)=45729773) +11200 val loss 6.3367 +11200 val perplexity 564.9144 +11200 train 6.474953 (lr=6.2715e-05) (hash(x)=54084412) +12200 val loss 6.2480 +12200 val perplexity 516.9927 +12200 train 6.215145 (lr=4.3855e-05) (hash(x)=51643030) +11800 val loss 6.3663 +11800 val perplexity 581.9139 +11800 train 6.320728 (lr=8.8479e-05) (hash(x)=50540350) +11300 val loss 6.3327 +11300 val perplexity 562.6554 +11300 train 6.125880 (lr=6.2588e-05) (hash(x)=48170961) +12300 val loss 6.2489 +12300 val perplexity 517.4391 +12300 train 6.211147 (lr=4.3757e-05) (hash(x)=48863143) +11900 val loss 6.3436 +11900 val perplexity 568.8187 +11900 train 6.294075 (lr=8.8289e-05) (hash(x)=50313941) +11400 val loss 6.3326 +11400 val perplexity 562.6205 +11400 train 6.364399 (lr=6.2459e-05) (hash(x)=52124243) +12400 val loss 6.2573 +12400 val perplexity 521.8071 +12400 train 6.121604 (lr=4.3659e-05) (hash(x)=53251144) +12000 val loss 6.3498 +12000 val perplexity 572.3559 +12000 train 6.286205 (lr=8.8097e-05) (hash(x)=49621720) +12500 val loss 6.2475 +12500 val perplexity 516.7026 +12500 train 6.015054 (lr=4.3560e-05) (hash(x)=45178705) +11500 val loss 6.3383 +11500 val perplexity 565.8513 +11500 train 6.276237 (lr=6.2330e-05) (hash(x)=46985517) +12100 val loss 6.3515 +12100 val perplexity 573.3371 +12100 train 6.240752 (lr=8.7904e-05) (hash(x)=48903010) +12600 val loss 6.2939 +12600 val perplexity 541.2366 +12600 train 6.276508 (lr=4.3460e-05) (hash(x)=52779738) +11600 val loss 6.3257 +11600 val perplexity 558.7335 +11600 train 6.083837 (lr=6.2199e-05) (hash(x)=46741028) +12200 val loss 6.3486 +12200 val perplexity 571.7062 +12200 train 6.312387 (lr=8.7710e-05) (hash(x)=51643030) +12700 val loss 6.2382 +12700 val perplexity 511.9407 +12700 train 6.075450 (lr=4.3360e-05) (hash(x)=50253395) +11700 val loss 6.3377 +11700 val perplexity 565.5016 +11700 train 6.222089 (lr=6.2068e-05) (hash(x)=45729773) +12300 val loss 6.3547 +12300 val perplexity 575.1711 +12300 train 6.307712 (lr=8.7515e-05) (hash(x)=48863143) +12800 val loss 6.2423 +12800 val perplexity 514.0385 +12800 train 6.154401 (lr=4.3259e-05) (hash(x)=50786101) +11800 val loss 6.3221 +11800 val perplexity 556.7155 +11800 train 6.265420 (lr=6.1936e-05) (hash(x)=50540350) +12400 val loss 6.3711 +12400 val perplexity 584.6923 +12400 train 6.238752 (lr=8.7318e-05) (hash(x)=53251144) +12900 val loss 6.2225 +12900 val perplexity 503.9471 +12900 train 6.303534 (lr=4.3157e-05) (hash(x)=51296645) +11900 val loss 6.3115 +11900 val perplexity 550.8672 +11900 train 6.256864 (lr=6.1802e-05) (hash(x)=50313941) +12500 val loss 6.3411 +12500 val perplexity 567.4178 +12500 train 6.125415 (lr=8.7119e-05) (hash(x)=45178705) +13000 val loss 6.2215 +13000 val perplexity 503.4427 +13000 train 5.943898 (lr=4.3055e-05) (hash(x)=44184494) +12000 val loss 6.3016 +12000 val perplexity 545.4577 +12000 train 6.224243 (lr=6.1668e-05) (hash(x)=49621720) +12600 val loss 6.4105 +12600 val perplexity 608.1832 +12600 train 6.389183 (lr=8.6920e-05) (hash(x)=52779738) +13100 val loss 6.2167 +13100 val perplexity 501.0674 +13100 train 6.167193 (lr=4.2952e-05) (hash(x)=48196980) +12100 val loss 6.3094 +12100 val perplexity 549.7292 +12100 train 6.192972 (lr=6.1533e-05) (hash(x)=48903010) +12700 val loss 6.3322 +12700 val perplexity 562.3773 +13200 val loss 6.1949 +13200 val perplexity 490.2583 +12700 train 6.191623 (lr=8.6719e-05) (hash(x)=50253395) +13200 train 6.291950 (lr=4.2848e-05) (hash(x)=52207779) +12200 val loss 6.3106 +12200 val perplexity 550.3957 +12200 train 6.270916 (lr=6.1397e-05) (hash(x)=51643030) +13300 val loss 6.2002 +13300 val perplexity 492.8639 +13300 train 6.222753 (lr=4.2744e-05) (hash(x)=51012914) +12800 val loss 6.3396 +12800 val perplexity 566.5978 +12800 train 6.268835 (lr=8.6517e-05) (hash(x)=50786101) +12300 val loss 6.3107 +12300 val perplexity 550.4548 +12300 train 6.258844 (lr=6.1260e-05) (hash(x)=48863143) +13400 val loss 6.1958 +13400 val perplexity 490.7067 +13400 train 6.207230 (lr=4.2640e-05) (hash(x)=48426131) +12900 val loss 6.3150 +12900 val perplexity 552.7924 +12900 train 6.392570 (lr=8.6314e-05) (hash(x)=51296645) +12400 val loss 6.3090 +12400 val perplexity 549.5101 +12400 train 6.195606 (lr=6.1122e-05) (hash(x)=53251144) +13500 val loss 6.2016 +13500 val perplexity 493.5369 +13500 train 6.103122 (lr=4.2534e-05) (hash(x)=50588044) +13000 val loss 6.3299 +13000 val perplexity 561.1127 +13000 train 6.058338 (lr=8.6110e-05) (hash(x)=44184494) +12500 val loss 6.3015 +12500 val perplexity 545.4150 +12500 train 6.088305 (lr=6.0984e-05) (hash(x)=45178705) +13600 val loss 6.1881 +13600 val perplexity 486.9337 +13600 train 6.327810 (lr=4.2428e-05) (hash(x)=57937534) +13100 val loss 6.3223 +13100 val perplexity 556.8345 +13100 train 6.285286 (lr=8.5904e-05) (hash(x)=48196980) +13700 val loss 6.1814 +13700 val perplexity 483.6738 +13700 train 6.291908 (lr=4.2322e-05) (hash(x)=51168202) +12600 val loss 6.3457 +12600 val perplexity 570.0136 +12600 train 6.344426 (lr=6.0844e-05) (hash(x)=52779738) +13200 val loss 6.3261 +13200 val perplexity 558.9976 +13200 train 6.399488 (lr=8.5697e-05) (hash(x)=52207779) +13800 val loss 6.1834 +13800 val perplexity 484.6241 +13800 train 5.926325 (lr=4.2215e-05) (hash(x)=44144400) +12700 val loss 6.3009 +12700 val perplexity 545.0581 +12700 train 6.161895 (lr=6.0703e-05) (hash(x)=50253395) +13300 val loss 6.3418 +13300 val perplexity 567.8043 +13300 train 6.346761 (lr=8.5489e-05) (hash(x)=51012914) +13900 val loss 6.1828 +13900 val perplexity 484.3473 +13900 train 6.293851 (lr=4.2107e-05) (hash(x)=54816018) +12800 val loss 6.2970 +12800 val perplexity 542.9264 +12800 train 6.208713 (lr=6.0562e-05) (hash(x)=50786101) +13400 val loss 6.3301 +13400 val perplexity 561.1954 +13400 train 6.325660 (lr=8.5279e-05) (hash(x)=48426131) +14000 val loss 6.1765 +14000 val perplexity 481.2950 +14000 train 6.058903 (lr=4.1999e-05) (hash(x)=46103642) +12900 val loss 6.2792 +12900 val perplexity 533.3705 +12900 train 6.347450 (lr=6.0420e-05) (hash(x)=51296645) +13500 val loss 6.3255 +13500 val perplexity 558.6429 +13500 train 6.220666 (lr=8.5069e-05) (hash(x)=50588044) +14100 val loss 6.1726 +14100 val perplexity 479.4149 +14100 train 6.112087 (lr=4.1890e-05) (hash(x)=49605955) +13000 val loss 6.2710 +13000 val perplexity 529.0005 +13000 train 6.011692 (lr=6.0277e-05) (hash(x)=44184494) +13600 val loss 6.3303 +13600 val perplexity 561.3222 +13600 train 6.466724 (lr=8.4857e-05) (hash(x)=57937534) +14200 val loss 6.1753 +14200 val perplexity 480.7287 +14200 train 6.044846 (lr=4.1781e-05) (hash(x)=48663080) +13100 val loss 6.2728 +13100 val perplexity 529.9665 +13100 train 6.225151 (lr=6.0133e-05) (hash(x)=48196980) +13700 val loss 6.3313 +13700 val perplexity 561.8926 +13700 train 6.424407 (lr=8.4644e-05) (hash(x)=51168202) +14300 val loss 6.1719 +14300 val perplexity 479.1125 +14300 train 6.139426 (lr=4.1671e-05) (hash(x)=52021000) +13200 val loss 6.2550 +13200 val perplexity 520.5853 +13200 train 6.336420 (lr=5.9988e-05) (hash(x)=52207779) +13800 val loss 6.3428 +13800 val perplexity 568.4006 +13800 train 6.084524 (lr=8.4430e-05) (hash(x)=44144400) +14400 val loss 6.1754 +14400 val perplexity 480.7878 +14400 train 6.561809 (lr=4.1560e-05) (hash(x)=55092055) +13900 val loss 6.3320 +13900 val perplexity 562.3038 +13900 train 6.447861 (lr=8.4214e-05) (hash(x)=54816018) +13300 val loss 6.2692 +13300 val perplexity 528.0549 +13300 train 6.280471 (lr=5.9842e-05) (hash(x)=51012914) +14500 val loss 6.1745 +14500 val perplexity 480.3194 +14500 train 6.131366 (lr=4.1449e-05) (hash(x)=51163106) +14000 val loss 6.3341 +14000 val perplexity 563.4412 +14000 train 6.227144 (lr=8.3998e-05) (hash(x)=46103642) +13400 val loss 6.2616 +13400 val perplexity 524.0313 +13400 train 6.268219 (lr=5.9695e-05) (hash(x)=48426131) +14600 val loss 6.1576 +14600 val perplexity 472.3103 +14600 train 5.969269 (lr=4.1337e-05) (hash(x)=43587182) +14100 val loss 6.3377 +14100 val perplexity 565.5149 +14100 train 6.258365 (lr=8.3780e-05) (hash(x)=49605955) +13500 val loss 6.2441 +13500 val perplexity 514.9732 +13500 train 6.153737 (lr=5.9548e-05) (hash(x)=50588044) +14700 val loss 6.1536 +14700 val perplexity 470.4221 +14700 train 6.195409 (lr=4.1225e-05) (hash(x)=52950516) +14200 val loss 6.3335 +14200 val perplexity 563.1345 +14200 train 6.197090 (lr=8.3561e-05) (hash(x)=48663080) +13600 val loss 6.2546 +13600 val perplexity 520.4012 +13600 train 6.391332 (lr=5.9400e-05) (hash(x)=57937534) +14800 val loss 6.1599 +14800 val perplexity 473.3930 +14800 train 6.328089 (lr=4.1113e-05) (hash(x)=55136517) +14300 val loss 6.3323 +14300 val perplexity 562.4716 +14300 train 6.312100 (lr=8.3341e-05) (hash(x)=52021000) +14900 val loss 6.1520 +14900 val perplexity 469.6723 +14900 train 6.109107 (lr=4.0999e-05) (hash(x)=49852275) +13700 val loss 6.2527 +13700 val perplexity 519.3945 +13700 train 6.347453 (lr=5.9251e-05) (hash(x)=51168202) +14400 val loss 6.3345 +14400 val perplexity 563.6721 +14400 train 6.707219 (lr=8.3120e-05) (hash(x)=55092055) +15000 val loss 6.1329 +15000 val perplexity 460.7704 +15000 train 6.169803 (lr=4.0885e-05) (hash(x)=49783972) +13800 val loss 6.2505 +13800 val perplexity 518.2626 +13800 train 5.985782 (lr=5.9101e-05) (hash(x)=44144400) +14500 val loss 6.3373 +14500 val perplexity 565.2563 +14500 train 6.299037 (lr=8.2898e-05) (hash(x)=51163106) +15100 val loss 6.1280 +15100 val perplexity 458.5122 +15100 train 6.142279 (lr=4.0771e-05) (hash(x)=52480637) +13900 val loss 6.2596 +13900 val perplexity 523.0073 +13900 train 6.392067 (lr=5.8950e-05) (hash(x)=54816018) +14600 val loss 6.3722 +14600 val perplexity 585.3548 +14600 train 6.249589 (lr=8.2675e-05) (hash(x)=43587182) +15200 val loss 6.1326 +15200 val perplexity 460.6166 +15200 train 6.066558 (lr=4.0656e-05) (hash(x)=52067533) +14000 val loss 6.2445 +14000 val perplexity 515.1687 +14000 train 6.133444 (lr=5.8799e-05) (hash(x)=46103642) +14700 val loss 6.3391 +14700 val perplexity 566.2710 +14700 train 6.373418 (lr=8.2451e-05) (hash(x)=52950516) +15300 val loss 6.1288 +15300 val perplexity 458.8762 +15300 train 6.001804 (lr=4.0541e-05) (hash(x)=49883403) +14100 val loss 6.2404 +14100 val perplexity 513.0829 +14100 train 6.164306 (lr=5.8646e-05) (hash(x)=49605955) +14800 val loss 6.3538 +14800 val perplexity 574.6692 +15400 val loss 6.1339 +15400 val perplexity 461.2274 +14800 train 6.504078 (lr=8.2225e-05) (hash(x)=55136517) +15400 train 5.989538 (lr=4.0425e-05) (hash(x)=46270791) +14200 val loss 6.2435 +14200 val perplexity 514.6627 +14200 train 6.101449 (lr=5.8493e-05) (hash(x)=48663080) +15500 val loss 6.1181 +15500 val perplexity 453.9840 +15500 train 6.218023 (lr=4.0308e-05) (hash(x)=51703939) +14900 val loss 6.3411 +14900 val perplexity 567.4089 +14900 train 6.331109 (lr=8.1998e-05) (hash(x)=49852275) +14300 val loss 6.2520 +14300 val perplexity 519.0565 +14300 train 6.222289 (lr=5.8339e-05) (hash(x)=52021000) +15600 val loss 6.1166 +15600 val perplexity 453.3385 +15600 train 6.961124 (lr=4.0191e-05) (hash(x)=62165678) +15000 val loss 6.3416 +15000 val perplexity 567.6938 +15000 train 6.368067 (lr=8.1771e-05) (hash(x)=49783972) +14400 val loss 6.2433 +14400 val perplexity 514.5604 +14400 train 6.645755 (lr=5.8184e-05) (hash(x)=55092055) +15700 val loss 6.1198 +15700 val perplexity 454.7714 +15700 train 6.021963 (lr=4.0074e-05) (hash(x)=47311753) +15100 val loss 6.3420 +15100 val perplexity 567.9161 +15100 train 6.362710 (lr=8.1542e-05) (hash(x)=52480637) +14500 val loss 6.2333 +14500 val perplexity 509.4116 +14500 train 6.186269 (lr=5.8029e-05) (hash(x)=51163106) +15800 val loss 6.1125 +15800 val perplexity 451.4527 +15800 train 6.074830 (lr=3.9956e-05) (hash(x)=52207302) +15200 val loss 6.3447 +15200 val perplexity 569.4559 +15200 train 6.297228 (lr=8.1312e-05) (hash(x)=52067533) +14600 val loss 6.2358 +14600 val perplexity 510.7221 +14600 train 6.062140 (lr=5.7872e-05) (hash(x)=43587182) +15900 val loss 6.1070 +15900 val perplexity 448.9798 +15900 train 5.941743 (lr=3.9838e-05) (hash(x)=51909196) +15300 val loss 6.3473 +15300 val perplexity 570.9769 +15300 train 6.217721 (lr=8.1082e-05) (hash(x)=49883403) +14700 val loss 6.2510 +14700 val perplexity 518.5546 +14700 train 6.288167 (lr=5.7715e-05) (hash(x)=52950516) +16000 val loss 6.1025 +16000 val perplexity 446.9606 +16000 train 5.896904 (lr=3.9719e-05) (hash(x)=41044250) +15400 val loss 6.3756 +15400 val perplexity 587.3416 +15400 train 6.242879 (lr=8.0850e-05) (hash(x)=46270791) +14800 val loss 6.2332 +14800 val perplexity 509.3771 +14800 train 6.394971 (lr=5.7558e-05) (hash(x)=55136517) +16100 val loss 6.0981 +16100 val perplexity 444.9907 +16100 train 6.036687 (lr=3.9599e-05) (hash(x)=51378388) +15500 val loss 6.3377 +15500 val perplexity 565.4774 +15500 train 6.379775 (lr=8.0617e-05) (hash(x)=51703939) +14900 val loss 6.2365 +14900 val perplexity 511.0609 +14900 train 6.204981 (lr=5.7399e-05) (hash(x)=49852275) +16200 val loss 6.0978 +16200 val perplexity 444.8690 +16200 train 5.888139 (lr=3.9479e-05) (hash(x)=46846254) +15600 val loss 6.3562 +15600 val perplexity 576.0654 +15600 train 7.168634 (lr=8.0383e-05) (hash(x)=62165678) +15000 val loss 6.2295 +15000 val perplexity 507.4941 +16300 val loss 6.0916 +16300 val perplexity 442.1240 +16300 train 6.156555 (lr=3.9359e-05) (hash(x)=49455544) +15000 train 6.251557 (lr=5.7240e-05) (hash(x)=49783972) +15700 val loss 6.3457 +15700 val perplexity 570.0641 +15700 train 6.253168 (lr=8.0148e-05) (hash(x)=47311753) +16400 val loss 6.0867 +16400 val perplexity 439.9786 +16400 train 6.085169 (lr=3.9238e-05) (hash(x)=49391900) +15100 val loss 6.2190 +15100 val perplexity 502.1859 +15100 train 6.238817 (lr=5.7079e-05) (hash(x)=52480637) +15800 val loss 6.3490 +15800 val perplexity 571.9074 +15800 train 6.390619 (lr=7.9912e-05) (hash(x)=52207302) +16500 val loss 6.0904 +16500 val perplexity 441.5922 +16500 train 6.312325 (lr=3.9117e-05) (hash(x)=51821231) +15200 val loss 6.2161 +15200 val perplexity 500.7220 +15200 train 6.154900 (lr=5.6919e-05) (hash(x)=52067533) +15900 val loss 6.3594 +15900 val perplexity 577.8961 +15900 train 6.214854 (lr=7.9675e-05) (hash(x)=51909196) +16600 val loss 6.0691 +16600 val perplexity 432.3025 +16600 train 6.317935 (lr=3.8995e-05) (hash(x)=59206850) +15300 val loss 6.2343 +15300 val perplexity 509.9185 +15300 train 6.093091 (lr=5.6757e-05) (hash(x)=49883403) +16000 val loss 6.3335 +16000 val perplexity 563.1077 +16000 train 6.116721 (lr=7.9437e-05) (hash(x)=41044250) +16700 val loss 6.0614 +16700 val perplexity 428.9936 +16700 train 6.026721 (lr=3.8873e-05) (hash(x)=48167967) +15400 val loss 6.2245 +15400 val perplexity 504.9682 +15400 train 6.078164 (lr=5.6595e-05) (hash(x)=46270791) +16100 val loss 6.3476 +16100 val perplexity 571.1027 +16100 train 6.303515 (lr=7.9198e-05) (hash(x)=51378388) +16800 val loss 6.0700 +16800 val perplexity 432.6975 +16800 train 6.066960 (lr=3.8750e-05) (hash(x)=54230562) +15500 val loss 6.2074 +15500 val perplexity 496.4067 +15500 train 6.258642 (lr=5.6432e-05) (hash(x)=51703939) +16200 val loss 6.3274 +16200 val perplexity 559.7178 +16900 val loss 6.0694 +16900 val perplexity 432.4252 +16900 train 5.839090 (lr=3.8627e-05) (hash(x)=40802028) +16200 train 6.105833 (lr=7.8959e-05) (hash(x)=46846254) +15600 val loss 6.2045 +15600 val perplexity 494.9795 +15600 train 7.082390 (lr=5.6268e-05) (hash(x)=62165678) +17000 val loss 6.0672 +17000 val perplexity 431.4821 +17000 train 6.077165 (lr=3.8503e-05) (hash(x)=48123845) +16300 val loss 6.3070 +16300 val perplexity 548.3869 +16300 train 6.353926 (lr=7.8718e-05) (hash(x)=49455544) +15700 val loss 6.2035 +15700 val perplexity 494.4829 +15700 train 6.097520 (lr=5.6104e-05) (hash(x)=47311753) +17100 val loss 6.0621 +17100 val perplexity 429.2716 +17100 train 5.944340 (lr=3.8379e-05) (hash(x)=52646709) +16400 val loss 6.3199 +16400 val perplexity 555.5233 +16400 train 6.348472 (lr=7.8476e-05) (hash(x)=49391900) +15800 val loss 6.1923 +15800 val perplexity 488.9628 +15800 train 6.168169 (lr=5.5938e-05) (hash(x)=52207302) +17200 val loss 6.0591 +17200 val perplexity 427.9767 +17200 train 6.129535 (lr=3.8255e-05) (hash(x)=49699031) +16500 val loss 6.2945 +16500 val perplexity 541.6105 +16500 train 6.519775 (lr=7.8233e-05) (hash(x)=51821231) +15900 val loss 6.2330 +15900 val perplexity 509.2916 +15900 train 6.081164 (lr=5.5773e-05) (hash(x)=51909196) +17300 val loss 6.0577 +17300 val perplexity 427.3804 +17300 train 6.194827 (lr=3.8130e-05) (hash(x)=54509180) +16600 val loss 6.2831 +16600 val perplexity 535.4581 +16600 train 6.537582 (lr=7.7990e-05) (hash(x)=59206850) +16000 val loss 6.1885 +16000 val perplexity 487.1093 +16000 train 5.973365 (lr=5.5606e-05) (hash(x)=41044250) +17400 val loss 6.0483 +17400 val perplexity 423.3843 +17400 train 5.886956 (lr=3.8005e-05) (hash(x)=48132725) +16700 val loss 6.2704 +16700 val perplexity 528.7017 +16700 train 6.234813 (lr=7.7745e-05) (hash(x)=48167967) +17500 val loss 6.0530 +17500 val perplexity 425.3731 +16100 val loss 6.1873 +16100 val perplexity 486.5457 +16100 train 6.127850 (lr=5.5439e-05) (hash(x)=51378388) +17500 train 5.526296 (lr=3.7879e-05) (hash(x)=42062605) +16800 val loss 6.2620 +16800 val perplexity 524.2465 +16800 train 6.270666 (lr=7.7500e-05) (hash(x)=54230562) +17600 val loss 6.0502 +17600 val perplexity 424.2157 +17600 train 5.994176 (lr=3.7753e-05) (hash(x)=51292000) +16200 val loss 6.1866 +16200 val perplexity 486.2010 +16200 train 5.959226 (lr=5.5271e-05) (hash(x)=46846254) +16900 val loss 6.2755 +16900 val perplexity 531.4180 +16900 train 6.064984 (lr=7.7254e-05) (hash(x)=40802028) +17700 val loss 6.0492 +17700 val perplexity 423.7743 +17700 train 5.899381 (lr=3.7626e-05) (hash(x)=51626643) +16300 val loss 6.1859 +16300 val perplexity 485.8703 +16300 train 6.235335 (lr=5.5102e-05) (hash(x)=49455544) +17000 val loss 6.2593 +17000 val perplexity 522.8427 +17000 train 6.272185 (lr=7.7007e-05) (hash(x)=48123845) +17800 val loss 6.0482 +17800 val perplexity 423.3625 +17800 train 5.885615 (lr=3.7499e-05) (hash(x)=53093658) +16400 val loss 6.1909 +16400 val perplexity 488.2653 +16400 train 6.190922 (lr=5.4933e-05) (hash(x)=49391900) +17100 val loss 6.2607 +17100 val perplexity 523.5630 +17100 train 6.147341 (lr=7.6758e-05) (hash(x)=52646709) +17900 val loss 6.0543 +17900 val perplexity 425.9440 +17900 train 6.150290 (lr=3.7372e-05) (hash(x)=51370179) +16500 val loss 6.1825 +16500 val perplexity 484.2194 +16500 train 6.411568 (lr=5.4763e-05) (hash(x)=51821231) +17200 val loss 6.3020 +17200 val perplexity 545.6510 +17200 train 6.353559 (lr=7.6510e-05) (hash(x)=49699031) +18000 val loss 6.0449 +18000 val perplexity 421.9582 +18000 train 5.881778 (lr=3.7244e-05) (hash(x)=49974890) +16600 val loss 6.1647 +16600 val perplexity 475.6709 +16600 train 6.416338 (lr=5.4593e-05) (hash(x)=59206850) +17300 val loss 6.2697 +17300 val perplexity 528.3310 +17300 train 6.407465 (lr=7.6260e-05) (hash(x)=54509180) +18100 val loss 6.0312 +18100 val perplexity 416.2074 +18100 train 6.033624 (lr=3.7116e-05) (hash(x)=55831634) +16700 val loss 6.1782 +16700 val perplexity 482.1354 +16700 train 6.127187 (lr=5.4422e-05) (hash(x)=48167967) +17400 val loss 6.2856 +17400 val perplexity 536.7748 +17400 train 6.125854 (lr=7.6009e-05) (hash(x)=48132725) +18200 val loss 6.0412 +18200 val perplexity 420.3889 +18200 train 5.999671 (lr=3.6987e-05) (hash(x)=49919405) +16800 val loss 6.1510 +16800 val perplexity 469.1843 +16800 train 6.134031 (lr=5.4250e-05) (hash(x)=54230562) +17500 val loss 6.2892 +17500 val perplexity 538.7270 +17500 train 5.753527 (lr=7.5758e-05) (hash(x)=42062605) +18300 val loss 6.0296 +18300 val perplexity 415.5354 +18300 train 6.147791 (lr=3.6859e-05) (hash(x)=53822543) +16900 val loss 6.1572 +16900 val perplexity 472.0813 +16900 train 5.908586 (lr=5.4078e-05) (hash(x)=40802028) +17600 val loss 6.2549 +17600 val perplexity 520.5685 +17600 train 6.215703 (lr=7.5505e-05) (hash(x)=51292000) +18400 val loss 6.0273 +18400 val perplexity 414.5735 +18400 train 6.317899 (lr=3.6729e-05) (hash(x)=61904736) +17000 val loss 6.1520 +17000 val perplexity 469.6705 +17000 train 6.150666 (lr=5.3905e-05) (hash(x)=48123845) +17700 val loss 6.2620 +17700 val perplexity 524.2450 +17700 train 6.110244 (lr=7.5252e-05) (hash(x)=51626643) +18500 val loss 6.0197 +18500 val perplexity 411.4693 +18500 train 5.855900 (lr=3.6600e-05) (hash(x)=47570884) +17100 val loss 6.1610 +17100 val perplexity 473.9044 +17100 train 6.058381 (lr=5.3731e-05) (hash(x)=52646709) +18600 val loss 6.0132 +18600 val perplexity 408.7953 +18600 train 5.962462 (lr=3.6470e-05) (hash(x)=49104519) +17800 val loss 6.2728 +17800 val perplexity 529.9855 +17800 train 6.082724 (lr=7.4998e-05) (hash(x)=53093658) +17200 val loss 6.1592 +17200 val perplexity 473.0566 +17200 train 6.214075 (lr=5.3557e-05) (hash(x)=49699031) +18700 val loss 6.0078 +18700 val perplexity 406.5821 +18700 train 5.875597 (lr=3.6339e-05) (hash(x)=54160283) +17900 val loss 6.2879 +17900 val perplexity 538.0276 +17900 train 6.380679 (lr=7.4744e-05) (hash(x)=51370179) +17300 val loss 6.1540 +17300 val perplexity 470.5836 +17300 train 6.289455 (lr=5.3382e-05) (hash(x)=54509180) +18800 val loss 6.0054 +18800 val perplexity 405.6289 +18800 train 5.983706 (lr=3.6209e-05) (hash(x)=47840786) +18000 val loss 6.2677 +18000 val perplexity 527.2629 +18000 train 6.117117 (lr=7.4488e-05) (hash(x)=49974890) +17400 val loss 6.1564 +17400 val perplexity 471.7471 +17400 train 6.004965 (lr=5.3206e-05) (hash(x)=48132725) +18900 val loss 6.0071 +18900 val perplexity 406.3125 +18900 train 6.111996 (lr=3.6078e-05) (hash(x)=48397694) +18100 val loss 6.2656 +18100 val perplexity 526.1430 +18100 train 6.265298 (lr=7.4232e-05) (hash(x)=55831634) +17500 val loss 6.1593 +17500 val perplexity 473.0981 +19000 val loss 6.0100 +19000 val perplexity 407.4875 +19000 train 5.932898 (lr=3.5946e-05) (hash(x)=50089065) +17500 train 5.632998 (lr=5.3030e-05) (hash(x)=42062605) +18200 val loss 6.2462 +18200 val perplexity 516.0393 +18200 train 6.178269 (lr=7.3975e-05) (hash(x)=49919405) +19100 val loss 6.0074 +19100 val perplexity 406.4352 +19100 train 5.790021 (lr=3.5814e-05) (hash(x)=46007516) +17600 val loss 6.1553 +17600 val perplexity 471.1975 +17600 train 6.103945 (lr=5.2854e-05) (hash(x)=51292000) +18300 val loss 6.2526 +18300 val perplexity 519.3813 +18300 train 6.379425 (lr=7.3717e-05) (hash(x)=53822543) +19200 val loss 6.0074 +19200 val perplexity 406.4200 +19200 train 5.984462 (lr=3.5682e-05) (hash(x)=42677001) +17700 val loss 6.1588 +17700 val perplexity 472.8784 +17700 train 6.000956 (lr=5.2677e-05) (hash(x)=51626643) +18400 val loss 6.2400 +18400 val perplexity 512.8633 +18400 train 6.668130 (lr=7.3459e-05) (hash(x)=61904736) +19300 val loss 6.0009 +19300 val perplexity 403.8041 +19300 train 5.915502 (lr=3.5550e-05) (hash(x)=46979320) +17800 val loss 6.1551 +17800 val perplexity 471.0979 +17800 train 5.971088 (lr=5.2499e-05) (hash(x)=53093658) +18500 val loss 6.2535 +18500 val perplexity 519.8528 +18500 train 6.079978 (lr=7.3199e-05) (hash(x)=47570884) +19400 val loss 6.0130 +19400 val perplexity 408.7057 +19400 train 5.692779 (lr=3.5417e-05) (hash(x)=47572038) +17900 val loss 6.1734 +17900 val perplexity 479.8302 +17900 train 6.259157 (lr=5.2321e-05) (hash(x)=51370179) +18600 val loss 6.2356 +18600 val perplexity 510.6247 +18600 train 6.162380 (lr=7.2939e-05) (hash(x)=49104519) +19500 val loss 6.0002 +19500 val perplexity 403.4933 +19500 train 5.983724 (lr=3.5284e-05) (hash(x)=52774451) +18000 val loss 6.1458 +18000 val perplexity 466.7636 +18000 train 5.975090 (lr=5.2142e-05) (hash(x)=49974890) +18700 val loss 6.2460 +18700 val perplexity 515.9503 +18700 train 6.150711 (lr=7.2679e-05) (hash(x)=54160283) +19600 val loss 5.9971 +19600 val perplexity 402.2668 +19600 train 6.078898 (lr=3.5151e-05) (hash(x)=53716950) +18100 val loss 6.1396 +18100 val perplexity 463.8814 +18100 train 6.112491 (lr=5.1962e-05) (hash(x)=55831634) +18800 val loss 6.2392 +18800 val perplexity 512.4291 +18800 train 6.228754 (lr=7.2417e-05) (hash(x)=47840786) +19700 val loss 5.9894 +19700 val perplexity 399.1634 +19700 train 5.716701 (lr=3.5017e-05) (hash(x)=45863098) +18900 val loss 6.2321 +18900 val perplexity 508.8236 +18900 train 6.298053 (lr=7.2155e-05) (hash(x)=48397694) +18200 val loss 6.1366 +18200 val perplexity 462.4560 +18200 train 6.085211 (lr=5.1782e-05) (hash(x)=49919405) +19800 val loss 5.9890 +19800 val perplexity 399.0199 +19800 train 6.078104 (lr=3.4883e-05) (hash(x)=51188683) +19000 val loss 6.2494 +19000 val perplexity 517.7143 +19000 train 6.162954 (lr=7.1892e-05) (hash(x)=50089065) +18300 val loss 6.1506 +18300 val perplexity 469.0029 +18300 train 6.275788 (lr=5.1602e-05) (hash(x)=53822543) +19900 val loss 5.9872 +19900 val perplexity 398.2954 +19900 train 5.915298 (lr=3.4749e-05) (hash(x)=49990053) +19100 val loss 6.2420 +19100 val perplexity 513.8819 +19100 train 6.015706 (lr=7.1629e-05) (hash(x)=46007516) +18400 val loss 6.1381 +18400 val perplexity 463.1878 +18400 train 6.458289 (lr=5.1421e-05) (hash(x)=61904736) +20000 val loss 6.0057 +20000 val perplexity 405.7185 +20000 train 5.767802 (lr=3.4615e-05) (hash(x)=44849004) +19200 val loss 6.2443 +19200 val perplexity 515.0599 +19200 train 6.232844 (lr=7.1365e-05) (hash(x)=42677001) +18500 val loss 6.1294 +18500 val perplexity 459.1550 +18500 train 5.958077 (lr=5.1240e-05) (hash(x)=47570884) +20100 val loss 5.9916 +20100 val perplexity 400.0643 +20100 train 5.817237 (lr=3.4480e-05) (hash(x)=50132035) +19300 val loss 6.2301 +19300 val perplexity 507.8056 +19300 train 6.120657 (lr=7.1100e-05) (hash(x)=46979320) +18600 val loss 6.1238 +18600 val perplexity 456.5844 +18600 train 6.058788 (lr=5.1058e-05) (hash(x)=49104519) +20200 val loss 5.9848 +20200 val perplexity 397.3479 +20200 train 6.173896 (lr=3.4345e-05) (hash(x)=54258143) +18700 val loss 6.1485 +18700 val perplexity 468.0246 +19400 val loss 6.2339 +19400 val perplexity 509.7632 +18700 train 6.020862 (lr=5.0875e-05) (hash(x)=54160283) +19400 train 5.940428 (lr=7.0835e-05) (hash(x)=47572038) +20300 val loss 5.9733 +20300 val perplexity 392.8120 +20300 train 6.450908 (lr=3.4209e-05) (hash(x)=54953105) +18800 val loss 6.1149 +18800 val perplexity 452.5690 +18800 train 6.083082 (lr=5.0692e-05) (hash(x)=47840786) +20400 val loss 5.9680 +20400 val perplexity 390.7113 +19500 val loss 6.2651 +19500 val perplexity 525.8947 +20400 train 5.998998 (lr=3.4074e-05) (hash(x)=51574049) +19500 train 6.264404 (lr=7.0569e-05) (hash(x)=52774451) +20500 val loss 5.9693 +20500 val perplexity 391.2180 +20500 train 6.160251 (lr=3.3938e-05) (hash(x)=51616142) +18900 val loss 6.1216 +18900 val perplexity 455.6027 +18900 train 6.208717 (lr=5.0509e-05) (hash(x)=48397694) +19600 val loss 6.2428 +19600 val perplexity 514.2851 +19600 train 6.305923 (lr=7.0302e-05) (hash(x)=53716950) +20600 val loss 5.9681 +20600 val perplexity 390.7540 +20600 train 6.277696 (lr=3.3802e-05) (hash(x)=52549976) +19000 val loss 6.1211 +19000 val perplexity 455.3540 +19000 train 6.024971 (lr=5.0325e-05) (hash(x)=50089065) +19700 val loss 6.2501 +19700 val perplexity 518.0897 +19700 train 5.945279 (lr=7.0035e-05) (hash(x)=45863098) +20700 val loss 5.9611 +20700 val perplexity 388.0362 +20700 train 5.856840 (lr=3.3665e-05) (hash(x)=60058155) +19100 val loss 6.1189 +19100 val perplexity 454.3557 +19100 train 5.908583 (lr=5.0140e-05) (hash(x)=46007516) +19800 val loss 6.2348 +19800 val perplexity 510.2109 +19800 train 6.383919 (lr=6.9767e-05) (hash(x)=51188683) +20800 val loss 5.9639 +20800 val perplexity 389.1317 +20800 train 5.929707 (lr=3.3529e-05) (hash(x)=51348881) +19200 val loss 6.1132 +19200 val perplexity 451.7994 +19200 train 6.168391 (lr=4.9955e-05) (hash(x)=42677001) +19900 val loss 6.2550 +19900 val perplexity 520.5988 +19900 train 6.150089 (lr=6.9498e-05) (hash(x)=49990053) +20900 val loss 5.9608 +20900 val perplexity 387.9117 +20900 train 5.906507 (lr=3.3392e-05) (hash(x)=56417746) +19300 val loss 6.1125 +19300 val perplexity 451.4869 +19300 train 6.013178 (lr=4.9770e-05) (hash(x)=46979320) +20000 val loss 6.2304 +20000 val perplexity 507.9495 +20000 train 5.975549 (lr=6.9229e-05) (hash(x)=44849004) +21000 val loss 5.9593 +21000 val perplexity 387.3271 +21000 train 5.878503 (lr=3.3255e-05) (hash(x)=50984540) +19400 val loss 6.1205 +19400 val perplexity 455.1135 +19400 train 5.794230 (lr=4.9584e-05) (hash(x)=47572038) +20100 val loss 6.2245 +20100 val perplexity 504.9496 +20100 train 6.090897 (lr=6.8960e-05) (hash(x)=50132035) +21100 val loss 5.9580 +21100 val perplexity 386.8212 +21100 train 5.978691 (lr=3.3118e-05) (hash(x)=47991688) +19500 val loss 6.1236 +19500 val perplexity 456.4956 +19500 train 6.107743 (lr=4.9398e-05) (hash(x)=52774451) +20200 val loss 6.2142 +20200 val perplexity 499.7829 +20200 train 6.408185 (lr=6.8690e-05) (hash(x)=54258143) +21200 val loss 5.9499 +21200 val perplexity 383.7145 +21200 train 5.412036 (lr=3.2980e-05) (hash(x)=38120795) +19600 val loss 6.1251 +19600 val perplexity 457.1824 +19600 train 6.181188 (lr=4.9211e-05) (hash(x)=53716950) +20300 val loss 6.2136 +20300 val perplexity 499.5070 +20300 train 6.663392 (lr=6.8419e-05) (hash(x)=54953105) +21300 val loss 5.9511 +21300 val perplexity 384.1758 +21300 train 5.882217 (lr=3.2842e-05) (hash(x)=43713373) +19700 val loss 6.1195 +19700 val perplexity 454.6588 +19700 train 5.817235 (lr=4.9024e-05) (hash(x)=45863098) +20400 val loss 6.2063 +20400 val perplexity 495.8862 +20400 train 6.259881 (lr=6.8148e-05) (hash(x)=51574049) +21400 val loss 5.9569 +21400 val perplexity 386.4132 +21400 train 5.928698 (lr=3.2704e-05) (hash(x)=50567003) +19800 val loss 6.1133 +19800 val perplexity 451.8149 +20500 val loss 6.2199 +20500 val perplexity 502.6557 +19800 train 6.198086 (lr=4.8837e-05) (hash(x)=51188683) +20500 train 6.389875 (lr=6.7876e-05) (hash(x)=51616142) +21500 val loss 5.9456 +21500 val perplexity 382.0707 +21500 train 5.827958 (lr=3.2566e-05) (hash(x)=49430102) +19900 val loss 6.1071 +19900 val perplexity 449.0136 +19900 train 6.012293 (lr=4.8649e-05) (hash(x)=49990053) +20600 val loss 6.2127 +20600 val perplexity 499.0347 +20600 train 6.480650 (lr=6.7604e-05) (hash(x)=52549976) +21600 val loss 5.9551 +21600 val perplexity 385.7319 +21600 train 5.918579 (lr=3.2428e-05) (hash(x)=49016562) +20000 val loss 6.1184 +20000 val perplexity 454.1289 +20000 train 5.877104 (lr=4.8461e-05) (hash(x)=44849004) +20700 val loss 6.2234 +20700 val perplexity 504.4055 +20700 train 6.290070 (lr=6.7331e-05) (hash(x)=60058155) +21700 val loss 5.9467 +21700 val perplexity 382.4763 +21700 train 5.928431 (lr=3.2289e-05) (hash(x)=48939247) +20100 val loss 6.0977 +20100 val perplexity 444.8395 +20100 train 5.936666 (lr=4.8272e-05) (hash(x)=50132035) +20800 val loss 6.2128 +20800 val perplexity 499.0815 +20800 train 6.155025 (lr=6.7058e-05) (hash(x)=51348881) +21800 val loss 5.9442 +21800 val perplexity 381.5269 +21800 train 6.012375 (lr=3.2150e-05) (hash(x)=55200591) +20200 val loss 6.0916 +20200 val perplexity 442.1118 +20900 val loss 6.2260 +20900 val perplexity 505.7193 +20200 train 6.274688 (lr=4.8083e-05) (hash(x)=54258143) +20900 train 6.213724 (lr=6.6784e-05) (hash(x)=56417746) +21900 val loss 5.9415 +21900 val perplexity 380.4950 +21900 train 5.711638 (lr=3.2011e-05) (hash(x)=45779836) +22000 val loss 5.9409 +22000 val perplexity 380.2645 +20300 val loss 6.0977 +20300 val perplexity 444.8543 +22000 train 5.720763 (lr=3.1872e-05) (hash(x)=54998738) +20300 train 6.534375 (lr=4.7893e-05) (hash(x)=54953105) +21000 val loss 6.2215 +21000 val perplexity 503.4751 +21000 train 6.132468 (lr=6.6510e-05) (hash(x)=50984540) +22100 val loss 5.9419 +22100 val perplexity 380.6431 +22100 train 6.043688 (lr=3.1733e-05) (hash(x)=52133006) +21100 val loss 6.2427 +21100 val perplexity 514.2265 +20400 val loss 6.0859 +20400 val perplexity 439.6098 +21100 train 6.222150 (lr=6.6235e-05) (hash(x)=47991688) +20400 train 6.120447 (lr=4.7703e-05) (hash(x)=51574049) +22200 val loss 5.9431 +22200 val perplexity 381.0980 +22200 train 5.936491 (lr=3.1593e-05) (hash(x)=51835734) +20500 val loss 6.0838 +20500 val perplexity 438.6838 +20500 train 6.223388 (lr=4.7513e-05) (hash(x)=51616142) +21200 val loss 6.2294 +21200 val perplexity 507.4331 +21200 train 5.723592 (lr=6.5960e-05) (hash(x)=38120795) +22300 val loss 5.9294 +22300 val perplexity 375.9305 +22300 train 5.769768 (lr=3.1454e-05) (hash(x)=47416361) +20600 val loss 6.0763 +20600 val perplexity 435.4090 +20600 train 6.355922 (lr=4.7323e-05) (hash(x)=52549976) +21300 val loss 6.2232 +21300 val perplexity 504.3024 +21300 train 6.165063 (lr=6.5684e-05) (hash(x)=43713373) +22400 val loss 5.9226 +22400 val perplexity 373.3639 +22400 train 6.032010 (lr=3.1314e-05) (hash(x)=53442231) +20700 val loss 6.0861 +20700 val perplexity 439.7098 +20700 train 6.018911 (lr=4.7132e-05) (hash(x)=60058155) +21400 val loss 6.2192 +21400 val perplexity 502.3200 +21400 train 6.165868 (lr=6.5408e-05) (hash(x)=50567003) +22500 val loss 5.9203 +22500 val perplexity 372.5185 +22500 train 5.979662 (lr=3.1174e-05) (hash(x)=51470764) +20800 val loss 6.0740 +20800 val perplexity 434.4306 +20800 train 6.033575 (lr=4.6940e-05) (hash(x)=51348881) +21500 val loss 6.2185 +21500 val perplexity 501.9359 +21500 train 6.113496 (lr=6.5132e-05) (hash(x)=49430102) +22600 val loss 5.9211 +22600 val perplexity 372.8309 +22600 train 6.141892 (lr=3.1034e-05) (hash(x)=57017795) +20900 val loss 6.0879 +20900 val perplexity 440.5034 +20900 train 6.030206 (lr=4.6749e-05) (hash(x)=56417746) +21600 val loss 6.2193 +21600 val perplexity 502.3547 +21600 train 6.202714 (lr=6.4855e-05) (hash(x)=49016562) +22700 val loss 5.9203 +22700 val perplexity 372.5279 +22700 train 5.891856 (lr=3.0894e-05) (hash(x)=51579140) +21000 val loss 6.0791 +21000 val perplexity 436.6230 +21000 train 5.987165 (lr=4.6557e-05) (hash(x)=50984540) +21700 val loss 6.2155 +21700 val perplexity 500.4315 +21700 train 6.199688 (lr=6.4578e-05) (hash(x)=48939247) +22800 val loss 5.9217 +22800 val perplexity 373.0493 +22800 train 5.953135 (lr=3.0753e-05) (hash(x)=54012893) +21100 val loss 6.0836 +21100 val perplexity 438.5913 +21100 train 6.084665 (lr=4.6365e-05) (hash(x)=47991688) +21800 val loss 6.2134 +21800 val perplexity 499.4082 +21800 train 6.312807 (lr=6.4300e-05) (hash(x)=55200591) +22900 val loss 5.9217 +22900 val perplexity 373.0420 +22900 train 5.932428 (lr=3.0613e-05) (hash(x)=47758345) +21200 val loss 6.0768 +21200 val perplexity 435.6449 +21200 train 5.553420 (lr=4.6172e-05) (hash(x)=38120795) +21900 val loss 6.2131 +21900 val perplexity 499.2491 +21900 train 5.992251 (lr=6.4023e-05) (hash(x)=45779836) +23000 val loss 5.9157 +23000 val perplexity 370.8046 +23000 train 6.283643 (lr=3.0472e-05) (hash(x)=57909888) +21300 val loss 6.0674 +21300 val perplexity 431.5724 +21300 train 5.992763 (lr=4.5979e-05) (hash(x)=43713373) +22000 val loss 6.1938 +22000 val perplexity 489.7144 +22000 train 5.988573 (lr=6.3744e-05) (hash(x)=54998738) +23100 val loss 5.9194 +23100 val perplexity 372.2000 +23100 train 6.056535 (lr=3.0331e-05) (hash(x)=53979657) +21400 val loss 6.0733 +21400 val perplexity 434.0937 +21400 train 6.019704 (lr=4.5786e-05) (hash(x)=50567003) +22100 val loss 6.2140 +22100 val perplexity 499.7169 +22100 train 6.298937 (lr=6.3466e-05) (hash(x)=52133006) +23200 val loss 5.9188 +23200 val perplexity 371.9645 +23200 train 5.730975 (lr=3.0190e-05) (hash(x)=52104721) +21500 val loss 6.0703 +21500 val perplexity 432.8320 +21500 train 5.941805 (lr=4.5592e-05) (hash(x)=49430102) +22200 val loss 6.2037 +22200 val perplexity 494.5787 +22200 train 6.198413 (lr=6.3187e-05) (hash(x)=51835734) +23300 val loss 5.9066 +23300 val perplexity 367.4701 +23300 train 5.796109 (lr=3.0049e-05) (hash(x)=51364855) +21600 val loss 6.0872 +21600 val perplexity 440.1834 +21600 train 6.034882 (lr=4.5399e-05) (hash(x)=49016562) +22300 val loss 6.1873 +22300 val perplexity 486.5480 +22300 train 6.054706 (lr=6.2907e-05) (hash(x)=47416361) +23400 val loss 5.9119 +23400 val perplexity 369.3900 +23400 train 5.844833 (lr=2.9908e-05) (hash(x)=49542635) +21700 val loss 6.0626 +21700 val perplexity 429.4995 +21700 train 6.028594 (lr=4.5205e-05) (hash(x)=48939247) +22400 val loss 6.2030 +22400 val perplexity 494.2534 +22400 train 6.271864 (lr=6.2628e-05) (hash(x)=53442231) +23500 val loss 5.9147 +23500 val perplexity 370.4508 +23500 train 5.701004 (lr=2.9767e-05) (hash(x)=46017969) +21800 val loss 6.0706 +21800 val perplexity 432.9532 +21800 train 6.154336 (lr=4.5010e-05) (hash(x)=55200591) +22500 val loss 6.1860 +22500 val perplexity 485.8766 +22500 train 6.229110 (lr=6.2348e-05) (hash(x)=51470764) +23600 val loss 5.9117 +23600 val perplexity 369.3452 +23600 train 5.973455 (lr=2.9626e-05) (hash(x)=51565375) +21900 val loss 6.0616 +21900 val perplexity 429.0813 +21900 train 5.836717 (lr=4.4816e-05) (hash(x)=45779836) +22600 val loss 6.2025 +22600 val perplexity 493.9838 +22600 train 6.401227 (lr=6.2068e-05) (hash(x)=57017795) +23700 val loss 5.9109 +23700 val perplexity 369.0289 +23700 train 5.802050 (lr=2.9485e-05) (hash(x)=48511656) +22700 val loss 6.1990 +22700 val perplexity 492.2722 +22700 train 6.172696 (lr=6.1787e-05) (hash(x)=51579140) +22000 val loss 6.0435 +22000 val perplexity 421.3542 +22000 train 5.829783 (lr=4.4621e-05) (hash(x)=54998738) +23800 val loss 5.9063 +23800 val perplexity 367.3433 +23800 train 5.865274 (lr=2.9343e-05) (hash(x)=51503144) +22800 val loss 6.1950 +22800 val perplexity 490.3025 +22800 train 6.224982 (lr=6.1506e-05) (hash(x)=54012893) +22100 val loss 6.0509 +22100 val perplexity 424.4917 +22100 train 6.119735 (lr=4.4426e-05) (hash(x)=52133006) +23900 val loss 5.8996 +23900 val perplexity 364.9087 +23900 train 5.995259 (lr=2.9202e-05) (hash(x)=53271580) +22900 val loss 6.1987 +22900 val perplexity 492.0844 +22900 train 6.162778 (lr=6.1225e-05) (hash(x)=47758345) +22200 val loss 6.0561 +22200 val perplexity 426.7114 +22200 train 6.057048 (lr=4.4231e-05) (hash(x)=51835734) +24000 val loss 5.8941 +24000 val perplexity 362.9014 +24000 train 5.927412 (lr=2.9060e-05) (hash(x)=45797109) +23000 val loss 6.1946 +23000 val perplexity 490.0963 +23000 train 6.547087 (lr=6.0944e-05) (hash(x)=57909888) +22300 val loss 6.0420 +22300 val perplexity 420.7131 +22300 train 5.921842 (lr=4.4035e-05) (hash(x)=47416361) +24100 val loss 5.8990 +24100 val perplexity 364.6831 +24100 train 6.216515 (lr=2.8918e-05) (hash(x)=54215536) +23100 val loss 6.1815 +23100 val perplexity 483.7172 +23100 train 6.291510 (lr=6.0663e-05) (hash(x)=53979657) +22400 val loss 6.0605 +22400 val perplexity 428.5763 +22400 train 6.139305 (lr=4.3839e-05) (hash(x)=53442231) +24200 val loss 5.9018 +24200 val perplexity 365.7102 +24200 train 5.877811 (lr=2.8777e-05) (hash(x)=50087698) +23200 val loss 6.2041 +23200 val perplexity 494.7962 +23200 train 6.016724 (lr=6.0381e-05) (hash(x)=52104721) +22500 val loss 6.0420 +22500 val perplexity 420.7296 +22500 train 6.084447 (lr=4.3643e-05) (hash(x)=51470764) +24300 val loss 5.9014 +24300 val perplexity 365.5587 +24300 train 6.113760 (lr=2.8635e-05) (hash(x)=52070789) +23300 val loss 6.1875 +23300 val perplexity 486.6410 +23300 train 6.103590 (lr=6.0099e-05) (hash(x)=51364855) +24400 val loss 5.8864 +24400 val perplexity 360.1159 +24400 train 6.090665 (lr=2.8493e-05) (hash(x)=51743588) +22600 val loss 6.0496 +22600 val perplexity 423.9293 +22600 train 6.265945 (lr=4.3447e-05) (hash(x)=57017795) +23400 val loss 6.1867 +23400 val perplexity 486.2425 +23400 train 6.107842 (lr=5.9817e-05) (hash(x)=49542635) +24500 val loss 5.8855 +24500 val perplexity 359.7711 +24500 train 5.730194 (lr=2.8351e-05) (hash(x)=48574502) +22700 val loss 6.0421 +22700 val perplexity 420.7745 +22700 train 5.991201 (lr=4.3251e-05) (hash(x)=51579140) +23500 val loss 6.1882 +23500 val perplexity 486.9904 +23500 train 5.967282 (lr=5.9534e-05) (hash(x)=46017969) +24600 val loss 5.8805 +24600 val perplexity 357.9956 +24600 train 5.622158 (lr=2.8210e-05) (hash(x)=42950372) +22800 val loss 6.0392 +22800 val perplexity 419.5506 +22800 train 6.062196 (lr=4.3054e-05) (hash(x)=54012893) +24700 val loss 5.8887 +24700 val perplexity 360.9211 +24700 train 5.868454 (lr=2.8068e-05) (hash(x)=48073103) +23600 val loss 6.1972 +23600 val perplexity 491.3576 +23600 train 6.227380 (lr=5.9252e-05) (hash(x)=51565375) +22900 val loss 6.0563 +22900 val perplexity 426.8128 +22900 train 6.071026 (lr=4.2858e-05) (hash(x)=47758345) +24800 val loss 5.8762 +24800 val perplexity 356.4490 +24800 train 5.798516 (lr=2.7926e-05) (hash(x)=55420175) +23700 val loss 6.1780 +23700 val perplexity 482.0095 +23700 train 6.050842 (lr=5.8969e-05) (hash(x)=48511656) +23000 val loss 6.0423 +23000 val perplexity 420.8524 +23000 train 6.401070 (lr=4.2661e-05) (hash(x)=57909888) +24900 val loss 5.8740 +24900 val perplexity 355.6705 +24900 train 5.924924 (lr=2.7784e-05) (hash(x)=50911918) +23800 val loss 6.1682 +23800 val perplexity 477.3205 +23800 train 6.117439 (lr=5.8686e-05) (hash(x)=51503144) +23100 val loss 6.0392 +23100 val perplexity 419.5592 +23100 train 6.180597 (lr=4.2464e-05) (hash(x)=53979657) +25000 val loss 5.8810 +25000 val perplexity 358.1824 +25000 train 5.824456 (lr=2.7642e-05) (hash(x)=56577519) +23900 val loss 6.1597 +23900 val perplexity 473.3016 +23900 train 6.237184 (lr=5.8403e-05) (hash(x)=53271580) +23200 val loss 6.0448 +23200 val perplexity 421.9272 +23200 train 5.862872 (lr=4.2267e-05) (hash(x)=52104721) +25100 val loss 5.8804 +25100 val perplexity 357.9426 +25100 train 5.931868 (lr=2.7500e-05) (hash(x)=54225130) +24000 val loss 6.1483 +24000 val perplexity 467.9082 +24000 train 6.197769 (lr=5.8120e-05) (hash(x)=45797109) +23300 val loss 6.0382 +23300 val perplexity 419.1197 +23300 train 5.936008 (lr=4.2069e-05) (hash(x)=51364855) +25200 val loss 5.8807 +25200 val perplexity 358.0593 +25200 train 5.971258 (lr=2.7358e-05) (hash(x)=50144293) +24100 val loss 6.1478 +24100 val perplexity 467.6976 +24100 train 6.452353 (lr=5.7837e-05) (hash(x)=54215536) +23400 val loss 6.0360 +23400 val perplexity 418.2171 +23400 train 5.960160 (lr=4.1872e-05) (hash(x)=49542635) +25300 val loss 5.8771 +25300 val perplexity 356.7566 +25300 train 5.680949 (lr=2.7216e-05) (hash(x)=43198841) +24200 val loss 6.1528 +24200 val perplexity 470.0422 +24200 train 6.112811 (lr=5.7554e-05) (hash(x)=50087698) +23500 val loss 6.0421 +23500 val perplexity 420.7655 +23500 train 5.817683 (lr=4.1674e-05) (hash(x)=46017969) +25400 val loss 5.8784 +25400 val perplexity 357.2245 +25400 train 5.593025 (lr=2.7074e-05) (hash(x)=48956461) +24300 val loss 6.1707 +24300 val perplexity 478.5392 +24300 train 6.311932 (lr=5.7270e-05) (hash(x)=52070789) +23600 val loss 6.0507 +23600 val perplexity 424.3927 +23600 train 6.097725 (lr=4.1476e-05) (hash(x)=51565375) +25500 val loss 5.8795 +25500 val perplexity 357.6462 +25500 train 6.056159 (lr=2.6932e-05) (hash(x)=57573293) +24400 val loss 6.1458 +24400 val perplexity 466.7447 +24400 train 6.376659 (lr=5.6987e-05) (hash(x)=51743588) +23700 val loss 6.0428 +23700 val perplexity 421.0812 +23700 train 5.920084 (lr=4.1278e-05) (hash(x)=48511656) +25600 val loss 5.8664 +25600 val perplexity 352.9792 +25600 train 5.892700 (lr=2.6790e-05) (hash(x)=47499056) +24500 val loss 6.1357 +24500 val perplexity 462.0636 +24500 train 5.982192 (lr=5.6703e-05) (hash(x)=48574502) +23800 val loss 6.0505 +23800 val perplexity 424.3332 +23800 train 6.011724 (lr=4.1080e-05) (hash(x)=51503144) +25700 val loss 5.8713 +25700 val perplexity 354.6936 +25700 train 7.015054 (lr=2.6649e-05) (hash(x)=42134899) +24600 val loss 6.1423 +24600 val perplexity 465.1430 +24600 train 5.835693 (lr=5.6419e-05) (hash(x)=42950372) +23900 val loss 6.0307 +23900 val perplexity 416.0201 +23900 train 6.098989 (lr=4.0882e-05) (hash(x)=53271580) +25800 val loss 5.8694 +25800 val perplexity 354.0520 +25800 train 5.842076 (lr=2.6507e-05) (hash(x)=52870010) +24700 val loss 6.1337 +24700 val perplexity 461.1335 +24700 train 6.110519 (lr=5.6135e-05) (hash(x)=48073103) +24000 val loss 6.0209 +24000 val perplexity 411.9489 +24000 train 6.072887 (lr=4.0684e-05) (hash(x)=45797109) +25900 val loss 5.8589 +25900 val perplexity 350.3326 +25900 train 5.803767 (lr=2.6365e-05) (hash(x)=51558271) +24800 val loss 6.1421 +24800 val perplexity 465.0069 +24800 train 6.094350 (lr=5.5852e-05) (hash(x)=55420175) +24100 val loss 6.0258 +24100 val perplexity 413.9785 +24100 train 6.323084 (lr=4.0486e-05) (hash(x)=54215536) +26000 val loss 5.8514 +26000 val perplexity 347.7357 +26000 train 5.865356 (lr=2.6223e-05) (hash(x)=49663062) +24900 val loss 6.1350 +24900 val perplexity 461.7368 +24900 train 6.177255 (lr=5.5568e-05) (hash(x)=50911918) +24200 val loss 6.0186 +24200 val perplexity 411.0032 +24200 train 5.987100 (lr=4.0287e-05) (hash(x)=50087698) +26100 val loss 5.8569 +26100 val perplexity 349.6288 +26100 train 6.131372 (lr=2.6082e-05) (hash(x)=48349329) +25000 val loss 6.1568 +25000 val perplexity 471.8988 +25000 train 6.089927 (lr=5.5284e-05) (hash(x)=56577519) +24300 val loss 6.0397 +24300 val perplexity 419.7695 +24300 train 6.198836 (lr=4.0089e-05) (hash(x)=52070789) +26200 val loss 5.8533 +26200 val perplexity 348.3943 +26200 train 5.802189 (lr=2.5940e-05) (hash(x)=48588410) +25100 val loss 6.1364 +25100 val perplexity 462.3987 +25100 train 6.160192 (lr=5.5000e-05) (hash(x)=54225130) +24400 val loss 6.0067 +24400 val perplexity 406.1368 +24400 train 6.220952 (lr=3.9891e-05) (hash(x)=51743588) +26300 val loss 5.8507 +26300 val perplexity 347.4743 +26300 train 5.850303 (lr=2.5798e-05) (hash(x)=52286930) +25200 val loss 6.1326 +25200 val perplexity 460.6421 +25200 train 6.233682 (lr=5.4716e-05) (hash(x)=50144293) +24500 val loss 6.0212 +24500 val perplexity 412.0756 +24500 train 5.869260 (lr=3.9692e-05) (hash(x)=48574502) +26400 val loss 5.8438 +26400 val perplexity 345.0953 +26400 train 5.681284 (lr=2.5657e-05) (hash(x)=43797796) +25300 val loss 6.1382 +25300 val perplexity 463.2344 +25300 train 5.935916 (lr=5.4432e-05) (hash(x)=43198841) +24600 val loss 6.0065 +24600 val perplexity 406.0529 +24600 train 5.731123 (lr=3.9493e-05) (hash(x)=42950372) +26500 val loss 5.8486 +26500 val perplexity 346.7411 +26500 train 5.773521 (lr=2.5515e-05) (hash(x)=50812654) +25400 val loss 6.1475 +25400 val perplexity 467.5678 +25400 train 5.855129 (lr=5.4148e-05) (hash(x)=48956461) +24700 val loss 6.0032 +24700 val perplexity 404.7025 +24700 train 5.976134 (lr=3.9295e-05) (hash(x)=48073103) +26600 val loss 5.8444 +26600 val perplexity 345.3091 +26600 train 5.620793 (lr=2.5374e-05) (hash(x)=45633368) +25500 val loss 6.1428 +25500 val perplexity 465.3540 +25500 train 6.353780 (lr=5.3865e-05) (hash(x)=57573293) +24800 val loss 6.0039 +24800 val perplexity 405.0201 +24800 train 5.944275 (lr=3.9096e-05) (hash(x)=55420175) +26700 val loss 5.8438 +26700 val perplexity 345.0931 +26700 train 6.138330 (lr=2.5233e-05) (hash(x)=58836955) +25600 val loss 6.1246 +25600 val perplexity 456.9649 +25600 train 6.121487 (lr=5.3581e-05) (hash(x)=47499056) +24900 val loss 5.9983 +24900 val perplexity 402.7609 +24900 train 6.057400 (lr=3.8897e-05) (hash(x)=50911918) +26800 val loss 5.8402 +26800 val perplexity 343.8605 +26800 train 5.980484 (lr=2.5092e-05) (hash(x)=53665925) +25700 val loss 6.1268 +25700 val perplexity 457.9825 +25700 train 7.349618 (lr=5.3297e-05) (hash(x)=42134899) +25000 val loss 5.9969 +25000 val perplexity 402.1859 +25000 train 5.952920 (lr=3.8699e-05) (hash(x)=56577519) +26900 val loss 5.8513 +26900 val perplexity 347.6948 +26900 train 5.992325 (lr=2.4951e-05) (hash(x)=49535575) +25800 val loss 6.1291 +25800 val perplexity 459.0154 +25800 train 6.099176 (lr=5.3013e-05) (hash(x)=52870010) +25100 val loss 5.9924 +25100 val perplexity 400.3882 +25100 train 6.019009 (lr=3.8500e-05) (hash(x)=54225130) +27000 val loss 5.8463 +27000 val perplexity 345.9527 +27000 train 5.462827 (lr=2.4810e-05) (hash(x)=46799308) +25900 val loss 6.1129 +25900 val perplexity 451.6441 +25900 train 6.043187 (lr=5.2730e-05) (hash(x)=51558271) +25200 val loss 5.9974 +25200 val perplexity 402.3685 +25200 train 6.107521 (lr=3.8301e-05) (hash(x)=50144293) +27100 val loss 5.8450 +27100 val perplexity 345.4908 +27100 train 5.520372 (lr=2.4669e-05) (hash(x)=42191940) +26000 val loss 6.1084 +26000 val perplexity 449.6107 +26000 train 6.107673 (lr=5.2446e-05) (hash(x)=49663062) +25300 val loss 5.9931 +25300 val perplexity 400.6670 +25300 train 5.794105 (lr=3.8103e-05) (hash(x)=43198841) +27200 val loss 5.8506 +27200 val perplexity 347.4589 +27200 train 5.592172 (lr=2.4528e-05) (hash(x)=42152270) +26100 val loss 6.1057 +26100 val perplexity 448.4022 +26100 train 6.334844 (lr=5.2163e-05) (hash(x)=48349329) +25400 val loss 5.9994 +25400 val perplexity 403.1780 +25400 train 5.703325 (lr=3.7904e-05) (hash(x)=48956461) +27300 val loss 5.8425 +27300 val perplexity 344.6436 +27300 train 5.746789 (lr=2.4387e-05) (hash(x)=44970208) +26200 val loss 6.1179 +26200 val perplexity 453.8964 +26200 train 6.072689 (lr=5.1880e-05) (hash(x)=48588410) +25500 val loss 6.0098 +25500 val perplexity 407.4160 +25500 train 6.189918 (lr=3.7705e-05) (hash(x)=57573293) +27400 val loss 5.8325 +27400 val perplexity 341.2069 +27400 train 5.909132 (lr=2.4247e-05) (hash(x)=52832889) +26300 val loss 6.1199 +26300 val perplexity 454.8365 +26300 train 6.118000 (lr=5.1597e-05) (hash(x)=52286930) +25600 val loss 5.9873 +25600 val perplexity 398.3460 +25600 train 5.948598 (lr=3.7507e-05) (hash(x)=47499056) +27500 val loss 5.8393 +27500 val perplexity 343.5270 +27500 train 5.792419 (lr=2.4106e-05) (hash(x)=49318466) +26400 val loss 6.1056 +26400 val perplexity 448.3712 +26400 train 5.914580 (lr=5.1314e-05) (hash(x)=43797796) +25700 val loss 5.9842 +25700 val perplexity 397.1021 +25700 train 7.133686 (lr=3.7308e-05) (hash(x)=42134899) +27600 val loss 5.8346 +27600 val perplexity 341.9251 +27600 train 5.893499 (lr=2.3966e-05) (hash(x)=50216713) +26500 val loss 6.1088 +26500 val perplexity 449.8074 +26500 train 6.024493 (lr=5.1031e-05) (hash(x)=50812654) +25800 val loss 5.9678 +25800 val perplexity 390.6590 +25800 train 5.935536 (lr=3.7109e-05) (hash(x)=52870010) +27700 val loss 5.8274 +27700 val perplexity 339.4763 +27700 train 5.872871 (lr=2.3826e-05) (hash(x)=44599482) +26600 val loss 6.1054 +26600 val perplexity 448.2546 +26600 train 5.894306 (lr=5.0748e-05) (hash(x)=45633368) +25900 val loss 5.9873 +25900 val perplexity 398.3391 +25900 train 5.919422 (lr=3.6911e-05) (hash(x)=51558271) +27800 val loss 5.8225 +27800 val perplexity 337.8256 +27800 train 5.953266 (lr=2.3686e-05) (hash(x)=52853588) +26700 val loss 6.1036 +26700 val perplexity 447.4879 +26700 train 6.393814 (lr=5.0466e-05) (hash(x)=58836955) +26000 val loss 5.9674 +26000 val perplexity 390.4783 +26000 train 5.970323 (lr=3.6713e-05) (hash(x)=49663062) +27900 val loss 5.8642 +27900 val perplexity 352.2107 +27900 train 5.774900 (lr=2.3546e-05) (hash(x)=45297678) +26800 val loss 6.1097 +26800 val perplexity 450.2209 +26800 train 6.211730 (lr=5.0183e-05) (hash(x)=53665925) +26100 val loss 5.9745 +26100 val perplexity 393.2737 +26100 train 6.220833 (lr=3.6514e-05) (hash(x)=48349329) +28000 val loss 5.8237 +28000 val perplexity 338.2316 +28000 train 5.664844 (lr=2.3407e-05) (hash(x)=45648934) +26900 val loss 6.1039 +26900 val perplexity 447.6089 +26900 train 6.210480 (lr=4.9901e-05) (hash(x)=49535575) +26200 val loss 5.9713 +26200 val perplexity 392.0186 +26200 train 5.909633 (lr=3.6316e-05) (hash(x)=48588410) +28100 val loss 5.8271 +28100 val perplexity 339.3576 +28100 train 5.821928 (lr=2.3267e-05) (hash(x)=46498270) +27000 val loss 6.1055 +27000 val perplexity 448.2944 +27000 train 5.710536 (lr=4.9619e-05) (hash(x)=46799308) +26300 val loss 5.9655 +26300 val perplexity 389.7474 +26300 train 5.966847 (lr=3.6118e-05) (hash(x)=52286930) +28200 val loss 5.8222 +28200 val perplexity 337.7288 +28200 train 5.718663 (lr=2.3128e-05) (hash(x)=45066849) +27100 val loss 6.1067 +27100 val perplexity 448.8629 +27100 train 5.758008 (lr=4.9337e-05) (hash(x)=42191940) +26400 val loss 5.9651 +26400 val perplexity 389.5895 +26400 train 5.789912 (lr=3.5920e-05) (hash(x)=43797796) +28300 val loss 5.8203 +28300 val perplexity 337.0790 +28300 train 5.763829 (lr=2.2989e-05) (hash(x)=46542887) +27200 val loss 6.1067 +27200 val perplexity 448.8764 +27200 train 5.819967 (lr=4.9056e-05) (hash(x)=42152270) +26500 val loss 5.9683 +26500 val perplexity 390.8509 +26500 train 5.895777 (lr=3.5722e-05) (hash(x)=50812654) +28400 val loss 5.8205 +28400 val perplexity 337.1486 +28400 train 5.834213 (lr=2.2850e-05) (hash(x)=50854598) +27300 val loss 6.1232 +27300 val perplexity 456.3432 +27300 train 5.997438 (lr=4.8775e-05) (hash(x)=44970208) +26600 val loss 5.9610 +26600 val perplexity 387.9818 +26600 train 5.736207 (lr=3.5524e-05) (hash(x)=45633368) +28500 val loss 5.8156 +28500 val perplexity 335.4853 +27400 val loss 6.1202 +27400 val perplexity 454.9430 +28500 train 5.912088 (lr=2.2711e-05) (hash(x)=50005909) +27400 train 6.197363 (lr=4.8494e-05) (hash(x)=52832889) +26700 val loss 5.9621 +26700 val perplexity 388.4183 +26700 train 6.242162 (lr=3.5326e-05) (hash(x)=58836955) +27500 val loss 6.1112 +27500 val perplexity 450.8747 +28600 val loss 5.8201 +28600 val perplexity 337.0226 +28600 train 5.857492 (lr=2.2572e-05) (hash(x)=50777665) +27500 train 6.060279 (lr=4.8213e-05) (hash(x)=49318466) +26800 val loss 5.9600 +26800 val perplexity 387.5974 +26800 train 6.071075 (lr=3.5128e-05) (hash(x)=53665925) +28700 val loss 5.8187 +28700 val perplexity 336.5232 +28700 train 6.215127 (lr=2.2434e-05) (hash(x)=60124649) +27600 val loss 6.1163 +27600 val perplexity 453.2019 +27600 train 6.198575 (lr=4.7932e-05) (hash(x)=50216713) +26900 val loss 5.9667 +26900 val perplexity 390.2296 +26900 train 6.092587 (lr=3.4931e-05) (hash(x)=49535575) +28800 val loss 5.8199 +28800 val perplexity 336.9409 +28800 train 5.722304 (lr=2.2296e-05) (hash(x)=49592362) +27700 val loss 6.1006 +27700 val perplexity 446.1080 +27700 train 6.168913 (lr=4.7652e-05) (hash(x)=44599482) +27000 val loss 5.9668 +27000 val perplexity 390.2402 +27000 train 5.578745 (lr=3.4733e-05) (hash(x)=46799308) +27800 val loss 6.0896 +27800 val perplexity 441.2434 +27800 train 6.216833 (lr=4.7372e-05) (hash(x)=52853588) +28900 val loss 5.8178 +28900 val perplexity 336.2186 +28900 train 5.762272 (lr=2.2158e-05) (hash(x)=48094168) +27100 val loss 5.9668 +27100 val perplexity 390.2596 +27100 train 5.635358 (lr=3.4536e-05) (hash(x)=42191940) +27900 val loss 6.1372 +27900 val perplexity 462.7791 +27900 train 6.010057 (lr=4.7093e-05) (hash(x)=45297678) +29000 val loss 5.8197 +29000 val perplexity 336.8595 +29000 train 5.640748 (lr=2.2020e-05) (hash(x)=49283091) +28000 val loss 6.1135 +28000 val perplexity 451.9227 +27200 val loss 5.9627 +27200 val perplexity 388.6427 +28000 train 5.931136 (lr=4.6813e-05) (hash(x)=45648934) +27200 train 5.683019 (lr=3.4339e-05) (hash(x)=42152270) +29100 val loss 5.8153 +29100 val perplexity 335.4080 +29100 train 5.584193 (lr=2.1882e-05) (hash(x)=49704892) +27300 val loss 5.9609 +27300 val perplexity 387.9694 +28100 val loss 6.0997 +28100 val perplexity 445.7223 +28100 train 6.104679 (lr=4.6534e-05) (hash(x)=46498270) +27300 train 5.837162 (lr=3.4142e-05) (hash(x)=44970208) +29200 val loss 5.8079 +29200 val perplexity 332.9180 +29200 train 5.840348 (lr=2.1745e-05) (hash(x)=53661448) +28200 val loss 6.1072 +28200 val perplexity 449.0687 +28200 train 6.004508 (lr=4.6256e-05) (hash(x)=45066849) +27400 val loss 5.9565 +27400 val perplexity 386.2557 +27400 train 6.030290 (lr=3.3946e-05) (hash(x)=52832889) +29300 val loss 5.8136 +29300 val perplexity 334.8057 +29300 train 5.883146 (lr=2.1608e-05) (hash(x)=52935872) +28300 val loss 6.1031 +28300 val perplexity 447.2243 +28300 train 6.026671 (lr=4.5977e-05) (hash(x)=46542887) +27500 val loss 5.9421 +27500 val perplexity 380.7266 +27500 train 5.885997 (lr=3.3749e-05) (hash(x)=49318466) +29400 val loss 5.8014 +29400 val perplexity 330.7589 +29400 train 5.537606 (lr=2.1471e-05) (hash(x)=47385579) +28400 val loss 6.1005 +28400 val perplexity 446.0780 +28400 train 6.104301 (lr=4.5700e-05) (hash(x)=50854598) +27600 val loss 5.9504 +27600 val perplexity 383.9207 +27600 train 6.012819 (lr=3.3553e-05) (hash(x)=50216713) +29500 val loss 5.8096 +29500 val perplexity 333.4997 +29500 train 5.728515 (lr=2.1335e-05) (hash(x)=48758515) +28500 val loss 6.0993 +28500 val perplexity 445.5493 +28500 train 6.181650 (lr=4.5422e-05) (hash(x)=50005909) +27700 val loss 5.9420 +27700 val perplexity 380.6776 +27700 train 5.978776 (lr=3.3357e-05) (hash(x)=44599482) +29600 val loss 5.7934 +29600 val perplexity 328.1339 +29600 train 5.942144 (lr=2.1198e-05) (hash(x)=53373488) +28600 val loss 6.1124 +28600 val perplexity 451.4079 +28600 train 6.161959 (lr=4.5145e-05) (hash(x)=50777665) +27800 val loss 5.9526 +27800 val perplexity 384.7445 +27800 train 6.067351 (lr=3.3161e-05) (hash(x)=52853588) +29700 val loss 5.7938 +29700 val perplexity 328.2430 +29700 train 5.612563 (lr=2.1062e-05) (hash(x)=46924542) +28700 val loss 6.1053 +28700 val perplexity 448.2437 +28700 train 6.512455 (lr=4.4868e-05) (hash(x)=60124649) +27900 val loss 5.9805 +27900 val perplexity 395.6448 +27900 train 5.913355 (lr=3.2965e-05) (hash(x)=45297678) +29800 val loss 5.7944 +29800 val perplexity 328.4390 +29800 train 5.632945 (lr=2.0926e-05) (hash(x)=43935095) +28800 val loss 6.1208 +28800 val perplexity 455.2211 +28800 train 6.028340 (lr=4.4592e-05) (hash(x)=49592362) +28000 val loss 5.9514 +28000 val perplexity 384.2845 +28000 train 5.780007 (lr=3.2769e-05) (hash(x)=45648934) +29900 val loss 5.7910 +29900 val perplexity 327.3405 +29900 train 5.592191 (lr=2.0791e-05) (hash(x)=47165031) +28900 val loss 6.1055 +28900 val perplexity 448.3115 +28900 train 6.056837 (lr=4.4316e-05) (hash(x)=48094168) +30000 val loss 5.7980 +30000 val perplexity 329.6251 +30000 train 6.037576 (lr=2.0655e-05) (hash(x)=55187448) +28100 val loss 5.9459 +28100 val perplexity 382.1905 +28100 train 5.922847 (lr=3.2574e-05) (hash(x)=46498270) +29000 val loss 6.1115 +29000 val perplexity 450.9929 +29000 train 5.923835 (lr=4.4040e-05) (hash(x)=49283091) +30100 val loss 5.7866 +30100 val perplexity 325.8972 +30100 train 5.691603 (lr=2.0520e-05) (hash(x)=49106182) +28200 val loss 5.9379 +28200 val perplexity 379.1294 +28200 train 5.839457 (lr=3.2379e-05) (hash(x)=45066849) +29100 val loss 6.1068 +29100 val perplexity 448.9193 +29100 train 5.878161 (lr=4.3765e-05) (hash(x)=49704892) +30200 val loss 5.7931 +30200 val perplexity 328.0275 +30200 train 5.591982 (lr=2.0385e-05) (hash(x)=44482250) +28300 val loss 5.9433 +28300 val perplexity 381.1796 +28300 train 5.882786 (lr=3.2184e-05) (hash(x)=46542887) +30300 val loss 5.7977 +30300 val perplexity 329.5324 +30300 train 5.783575 (lr=2.0251e-05) (hash(x)=47186431) +29200 val loss 6.0913 +29200 val perplexity 442.0104 +29200 train 6.110059 (lr=4.3490e-05) (hash(x)=53661448) +28400 val loss 5.9440 +28400 val perplexity 381.4474 +28400 train 5.941051 (lr=3.1990e-05) (hash(x)=50854598) +30400 val loss 5.7889 +30400 val perplexity 326.6508 +30400 train 5.608795 (lr=2.0117e-05) (hash(x)=45796670) +29300 val loss 6.0815 +29300 val perplexity 437.6796 +29300 train 6.161415 (lr=4.3216e-05) (hash(x)=52935872) +28500 val loss 5.9358 +28500 val perplexity 378.3388 +28500 train 6.027782 (lr=3.1795e-05) (hash(x)=50005909) +30500 val loss 5.7883 +30500 val perplexity 326.4546 +30500 train 5.465854 (lr=1.9983e-05) (hash(x)=48580072) +28600 val loss 5.9435 +28600 val perplexity 381.2661 +29400 val loss 6.0871 +29400 val perplexity 440.1505 +28600 train 5.982423 (lr=3.1601e-05) (hash(x)=50777665) +29400 train 5.857463 (lr=4.2942e-05) (hash(x)=47385579) +30600 val loss 5.7908 +30600 val perplexity 327.2632 +30600 train 5.801043 (lr=1.9849e-05) (hash(x)=53974898) +28700 val loss 5.9410 +28700 val perplexity 380.3170 +28700 train 6.358287 (lr=3.1408e-05) (hash(x)=60124649) +29500 val loss 6.0790 +29500 val perplexity 436.6113 +29500 train 6.023903 (lr=4.2669e-05) (hash(x)=48758515) +30700 val loss 5.7839 +30700 val perplexity 325.0244 +30700 train 6.010131 (lr=1.9716e-05) (hash(x)=57628887) +28800 val loss 5.9463 +28800 val perplexity 382.3244 +28800 train 5.833020 (lr=3.1214e-05) (hash(x)=49592362) +29600 val loss 6.0758 +29600 val perplexity 435.1965 +29600 train 6.212215 (lr=4.2396e-05) (hash(x)=53373488) +30800 val loss 5.7843 +30800 val perplexity 325.1573 +30800 train 5.423889 (lr=1.9583e-05) (hash(x)=46526494) +28900 val loss 5.9458 +28900 val perplexity 382.1573 +28900 train 5.893951 (lr=3.1021e-05) (hash(x)=48094168) +29700 val loss 6.0843 +29700 val perplexity 438.9209 +29700 train 5.901107 (lr=4.2124e-05) (hash(x)=46924542) +30900 val loss 5.7837 +30900 val perplexity 324.9485 +30900 train 5.974986 (lr=1.9450e-05) (hash(x)=47535221) +29000 val loss 5.9423 +29000 val perplexity 380.8088 +29000 train 5.767596 (lr=3.0828e-05) (hash(x)=49283091) +29800 val loss 6.0750 +29800 val perplexity 434.8368 +29800 train 5.894272 (lr=4.1852e-05) (hash(x)=43935095) +31000 val loss 5.7835 +31000 val perplexity 324.9079 +31000 train 5.542758 (lr=1.9318e-05) (hash(x)=47825023) +29100 val loss 5.9430 +29100 val perplexity 381.0584 +29100 train 5.694561 (lr=3.0635e-05) (hash(x)=49704892) +29900 val loss 6.0700 +29900 val perplexity 432.6927 +29900 train 5.853664 (lr=4.1581e-05) (hash(x)=47165031) +31100 val loss 5.7873 +31100 val perplexity 326.1184 +31100 train 5.412718 (lr=1.9186e-05) (hash(x)=45348268) +29200 val loss 5.9348 +29200 val perplexity 377.9549 +29200 train 5.963836 (lr=3.0443e-05) (hash(x)=53661448) +30000 val loss 6.0776 +30000 val perplexity 435.9884 +30000 train 6.327174 (lr=4.1310e-05) (hash(x)=55187448) +31200 val loss 5.7808 +31200 val perplexity 324.0116 +31200 train 5.340343 (lr=1.9054e-05) (hash(x)=47201868) +29300 val loss 5.9331 +29300 val perplexity 377.3043 +29300 train 6.022016 (lr=3.0251e-05) (hash(x)=52935872) +30100 val loss 6.0809 +30100 val perplexity 437.4088 +30100 train 5.974068 (lr=4.1040e-05) (hash(x)=49106182) +31300 val loss 5.7842 +31300 val perplexity 325.1253 +31300 train 5.962461 (lr=1.8922e-05) (hash(x)=54166284) +29400 val loss 5.9202 +29400 val perplexity 372.4895 +29400 train 5.672938 (lr=3.0060e-05) (hash(x)=47385579) +30200 val loss 6.0790 +30200 val perplexity 436.5880 +30200 train 5.847426 (lr=4.0771e-05) (hash(x)=44482250) +31400 val loss 5.7838 +31400 val perplexity 324.9988 +31400 train 5.704097 (lr=1.8791e-05) (hash(x)=47387646) +29500 val loss 5.9310 +29500 val perplexity 376.5484 +29500 train 5.856774 (lr=2.9868e-05) (hash(x)=48758515) +30300 val loss 6.0680 +30300 val perplexity 431.8087 +30300 train 6.030759 (lr=4.0502e-05) (hash(x)=47186431) +31500 val loss 5.7905 +31500 val perplexity 327.1862 +31500 train 5.823154 (lr=1.8661e-05) (hash(x)=58341520) +29600 val loss 5.9085 +29600 val perplexity 368.1467 +29600 train 6.042802 (lr=2.9677e-05) (hash(x)=53373488) +30400 val loss 6.0689 +30400 val perplexity 432.2168 +30400 train 5.897361 (lr=4.0233e-05) (hash(x)=45796670) +31600 val loss 5.7827 +31600 val perplexity 324.6190 +31600 train 5.496343 (lr=1.8530e-05) (hash(x)=49020919) +29700 val loss 5.9117 +29700 val perplexity 369.3507 +29700 train 5.734521 (lr=2.9487e-05) (hash(x)=46924542) +30500 val loss 6.0713 +30500 val perplexity 433.2252 +30500 train 5.777969 (lr=3.9965e-05) (hash(x)=48580072) +31700 val loss 5.7790 +31700 val perplexity 323.4226 +31700 train 5.510296 (lr=1.8400e-05) (hash(x)=47216075) +29800 val loss 5.9095 +29800 val perplexity 368.5323 +29800 train 5.741061 (lr=2.9297e-05) (hash(x)=43935095) +31800 val loss 5.7751 +31800 val perplexity 322.1896 +30600 val loss 6.0823 +30600 val perplexity 438.0421 +31800 train 5.506766 (lr=1.8271e-05) (hash(x)=53041188) +30600 train 6.070498 (lr=3.9698e-05) (hash(x)=53974898) +29900 val loss 5.9207 +29900 val perplexity 372.6667 +29900 train 5.698563 (lr=2.9107e-05) (hash(x)=47165031) +31900 val loss 5.7722 +31900 val perplexity 321.2306 +31900 train 5.800081 (lr=1.8141e-05) (hash(x)=53464466) +30700 val loss 6.0601 +30700 val perplexity 428.4169 +30700 train 6.245838 (lr=3.9431e-05) (hash(x)=57628887) +32000 val loss 5.7645 +32000 val perplexity 318.7831 +32000 train 5.618022 (lr=1.8013e-05) (hash(x)=43954421) +30000 val loss 5.9199 +30000 val perplexity 372.3885 +30000 train 6.165928 (lr=2.8917e-05) (hash(x)=55187448) +30800 val loss 6.0645 +30800 val perplexity 430.3249 +30800 train 5.745758 (lr=3.9165e-05) (hash(x)=46526494) +32100 val loss 5.7660 +32100 val perplexity 319.2547 +32100 train 5.633451 (lr=1.7884e-05) (hash(x)=54546093) +30100 val loss 5.9046 +30100 val perplexity 366.7384 +30100 train 5.811330 (lr=2.8728e-05) (hash(x)=49106182) +30900 val loss 6.0672 +30900 val perplexity 431.4677 +30900 train 6.233388 (lr=3.8900e-05) (hash(x)=47535221) +32200 val loss 5.7644 +32200 val perplexity 318.7423 +32200 train 5.654344 (lr=1.7756e-05) (hash(x)=46553608) +30200 val loss 5.9075 +30200 val perplexity 367.7706 +30200 train 5.700475 (lr=2.8539e-05) (hash(x)=44482250) +31000 val loss 6.0666 +31000 val perplexity 431.1908 +31000 train 5.798377 (lr=3.8635e-05) (hash(x)=47825023) +32300 val loss 5.7609 +32300 val perplexity 317.6444 +32300 train 5.835842 (lr=1.7628e-05) (hash(x)=49034740) +30300 val loss 5.9116 +30300 val perplexity 369.2802 +30300 train 5.892492 (lr=2.8351e-05) (hash(x)=47186431) +31100 val loss 6.0703 +31100 val perplexity 432.8275 +31100 train 5.714082 (lr=3.8371e-05) (hash(x)=45348268) +32400 val loss 5.7616 +32400 val perplexity 317.8466 +32400 train 5.442878 (lr=1.7501e-05) (hash(x)=44275249) +30400 val loss 5.9140 +30400 val perplexity 370.1986 +30400 train 5.732464 (lr=2.8163e-05) (hash(x)=45796670) +31200 val loss 6.0650 +31200 val perplexity 430.5118 +31200 train 5.660306 (lr=3.8108e-05) (hash(x)=47201868) +32500 val loss 5.7570 +32500 val perplexity 316.4041 +32500 train 6.085079 (lr=1.7374e-05) (hash(x)=57029816) +30500 val loss 5.9074 +30500 val perplexity 367.7353 +30500 train 5.608353 (lr=2.7976e-05) (hash(x)=48580072) +31300 val loss 6.0670 +31300 val perplexity 431.3753 +31300 train 6.226044 (lr=3.7845e-05) (hash(x)=54166284) +32600 val loss 5.7566 +32600 val perplexity 316.2609 +32600 train 5.701710 (lr=1.7247e-05) (hash(x)=50987489) +30600 val loss 5.9076 +30600 val perplexity 367.8395 +30600 train 5.908784 (lr=2.7789e-05) (hash(x)=53974898) +31400 val loss 6.0652 +31400 val perplexity 430.5921 +31400 train 5.989585 (lr=3.7583e-05) (hash(x)=47387646) +32700 val loss 5.7607 +32700 val perplexity 317.5847 +32700 train 5.696806 (lr=1.7121e-05) (hash(x)=48694296) +30700 val loss 5.9026 +30700 val perplexity 365.9965 +30700 train 6.088913 (lr=2.7602e-05) (hash(x)=57628887) +31500 val loss 6.0739 +31500 val perplexity 434.3668 +31500 train 6.105442 (lr=3.7321e-05) (hash(x)=58341520) +32800 val loss 5.7582 +32800 val perplexity 316.7799 +32800 train 5.825755 (lr=1.6995e-05) (hash(x)=51471528) +31600 val loss 6.0705 +31600 val perplexity 432.8824 +30800 val loss 5.9085 +30800 val perplexity 368.1676 +31600 train 5.777446 (lr=3.7061e-05) (hash(x)=49020919) +30800 train 5.573091 (lr=2.7416e-05) (hash(x)=46526494) +32900 val loss 5.7545 +32900 val perplexity 315.6232 +32900 train 5.903592 (lr=1.6870e-05) (hash(x)=56096057) +30900 val loss 5.9050 +30900 val perplexity 366.8774 +30900 train 6.089088 (lr=2.7230e-05) (hash(x)=47535221) +31700 val loss 6.0572 +31700 val perplexity 427.1823 +31700 train 5.782055 (lr=3.6801e-05) (hash(x)=47216075) +33000 val loss 5.7548 +33000 val perplexity 315.7084 +33000 train 5.598426 (lr=1.6745e-05) (hash(x)=47079017) +31000 val loss 5.9047 +31000 val perplexity 366.7734 +31000 train 5.651431 (lr=2.7045e-05) (hash(x)=47825023) +31800 val loss 6.0569 +31800 val perplexity 427.0561 +31800 train 5.811709 (lr=3.6541e-05) (hash(x)=53041188) +33100 val loss 5.7576 +33100 val perplexity 316.5944 +33100 train 5.903429 (lr=1.6621e-05) (hash(x)=47759875) +31900 val loss 6.0612 +31900 val perplexity 428.8980 +31900 train 6.081325 (lr=3.6283e-05) (hash(x)=53464466) +31100 val loss 5.9147 +31100 val perplexity 370.4598 +31100 train 5.522404 (lr=2.6860e-05) (hash(x)=45348268) +33200 val loss 5.7594 +33200 val perplexity 317.1484 +33200 train 5.954252 (lr=1.6497e-05) (hash(x)=53977891) +32000 val loss 6.0573 +32000 val perplexity 427.2416 +32000 train 5.902425 (lr=3.6025e-05) (hash(x)=43954421) +31200 val loss 5.9041 +31200 val perplexity 366.5522 +31200 train 5.467555 (lr=2.6675e-05) (hash(x)=47201868) +33300 val loss 5.7537 +33300 val perplexity 315.3483 +33300 train 5.549120 (lr=1.6373e-05) (hash(x)=48728077) +32100 val loss 6.0489 +32100 val perplexity 423.6535 +32100 train 5.978281 (lr=3.5768e-05) (hash(x)=54546093) +31300 val loss 5.9034 +31300 val perplexity 366.2973 +31300 train 6.043652 (lr=2.6491e-05) (hash(x)=54166284) +33400 val loss 5.7525 +33400 val perplexity 314.9785 +33400 train 5.755599 (lr=1.6250e-05) (hash(x)=54923913) +32200 val loss 6.0469 +32200 val perplexity 422.7990 +32200 train 5.922348 (lr=3.5512e-05) (hash(x)=46553608) +31400 val loss 5.9069 +31400 val perplexity 367.5768 +31400 train 5.841915 (lr=2.6308e-05) (hash(x)=47387646) +33500 val loss 5.7579 +33500 val perplexity 316.6836 +33500 train 5.470273 (lr=1.6127e-05) (hash(x)=46623422) +32300 val loss 6.0445 +32300 val perplexity 421.7757 +32300 train 6.102075 (lr=3.5256e-05) (hash(x)=49034740) +31500 val loss 5.9135 +31500 val perplexity 369.9953 +31500 train 5.938449 (lr=2.6125e-05) (hash(x)=58341520) +33600 val loss 5.7599 +33600 val perplexity 317.3321 +33600 train 5.810010 (lr=1.6005e-05) (hash(x)=53759672) +32400 val loss 6.0455 +32400 val perplexity 422.1995 +32400 train 5.771219 (lr=3.5002e-05) (hash(x)=44275249) +31600 val loss 5.9048 +31600 val perplexity 366.7888 +31600 train 5.597604 (lr=2.5942e-05) (hash(x)=49020919) +33700 val loss 5.7563 +33700 val perplexity 316.1742 +33700 train 5.988208 (lr=1.5883e-05) (hash(x)=50177499) +32500 val loss 6.0467 +32500 val perplexity 422.7168 +32500 train 6.399490 (lr=3.4748e-05) (hash(x)=57029816) +33800 val loss 5.7571 +33800 val perplexity 316.4152 +33800 train 5.599567 (lr=1.5762e-05) (hash(x)=49208351) +31700 val loss 5.8965 +31700 val perplexity 363.7442 +31700 train 5.593674 (lr=2.5760e-05) (hash(x)=47216075) +32600 val loss 6.0418 +32600 val perplexity 420.6439 +32600 train 6.008672 (lr=3.4495e-05) (hash(x)=50987489) +33900 val loss 5.7592 +33900 val perplexity 317.0901 +33900 train 5.583519 (lr=1.5641e-05) (hash(x)=51136014) +31800 val loss 5.8934 +31800 val perplexity 362.6540 +31800 train 5.631096 (lr=2.5579e-05) (hash(x)=53041188) +32700 val loss 6.0434 +32700 val perplexity 421.3045 +32700 train 5.934851 (lr=3.4242e-05) (hash(x)=48694296) +34000 val loss 5.7571 +34000 val perplexity 316.4316 +34000 train 5.535407 (lr=1.5521e-05) (hash(x)=55323465) +31900 val loss 5.8887 +31900 val perplexity 360.9532 +31900 train 5.909057 (lr=2.5398e-05) (hash(x)=53464466) +34100 val loss 5.7509 +34100 val perplexity 314.4702 +34100 train 5.845364 (lr=1.5401e-05) (hash(x)=46512090) +32800 val loss 6.0448 +32800 val perplexity 421.9050 +32800 train 6.125890 (lr=3.3991e-05) (hash(x)=51471528) +32000 val loss 5.8852 +32000 val perplexity 359.6706 +32000 train 5.730884 (lr=2.5218e-05) (hash(x)=43954421) +34200 val loss 5.7617 +34200 val perplexity 317.8948 +34200 train 5.809556 (lr=1.5281e-05) (hash(x)=52513966) +32900 val loss 6.0441 +32900 val perplexity 421.6351 +32900 train 6.160576 (lr=3.3740e-05) (hash(x)=56096057) +32100 val loss 5.8864 +32100 val perplexity 360.1190 +32100 train 5.828067 (lr=2.5038e-05) (hash(x)=54546093) +34300 val loss 5.7451 +34300 val perplexity 312.6570 +34300 train 5.754789 (lr=1.5162e-05) (hash(x)=51181125) +33000 val loss 6.0489 +33000 val perplexity 423.6525 +33000 train 5.880287 (lr=3.3490e-05) (hash(x)=47079017) +32200 val loss 5.8855 +32200 val perplexity 359.8004 +32200 train 5.762052 (lr=2.4858e-05) (hash(x)=46553608) +34400 val loss 5.7408 +34400 val perplexity 311.3093 +34400 train 5.899054 (lr=1.5044e-05) (hash(x)=50979117) +33100 val loss 6.0489 +33100 val perplexity 423.6296 +33100 train 6.102413 (lr=3.3242e-05) (hash(x)=47759875) +32300 val loss 5.8819 +32300 val perplexity 358.4811 +32300 train 5.963989 (lr=2.4679e-05) (hash(x)=49034740) +34500 val loss 5.7630 +34500 val perplexity 318.2933 +34500 train 5.833666 (lr=1.4926e-05) (hash(x)=50394150) +33200 val loss 6.0480 +33200 val perplexity 423.2547 +33200 train 6.237544 (lr=3.2993e-05) (hash(x)=53977891) +32400 val loss 5.8768 +32400 val perplexity 356.6489 +32400 train 5.569736 (lr=2.4501e-05) (hash(x)=44275249) +34600 val loss 5.7422 +34600 val perplexity 311.7411 +34600 train 5.651293 (lr=1.4809e-05) (hash(x)=52666346) +33300 val loss 6.0471 +33300 val perplexity 422.8942 +33300 train 5.815996 (lr=3.2746e-05) (hash(x)=48728077) +32500 val loss 5.8774 +32500 val perplexity 356.8799 +32500 train 6.200098 (lr=2.4323e-05) (hash(x)=57029816) +34700 val loss 5.7335 +34700 val perplexity 309.0387 +34700 train 5.980616 (lr=1.4692e-05) (hash(x)=48275110) +33400 val loss 6.0473 +33400 val perplexity 422.9884 +33400 train 6.104011 (lr=3.2500e-05) (hash(x)=54923913) +32600 val loss 5.8767 +32600 val perplexity 356.6389 +32600 train 5.829865 (lr=2.4146e-05) (hash(x)=50987489) +34800 val loss 5.7371 +34800 val perplexity 310.1674 +34800 train 5.574396 (lr=1.4575e-05) (hash(x)=43282226) +33500 val loss 6.0501 +33500 val perplexity 424.1439 +33500 train 5.772273 (lr=3.2255e-05) (hash(x)=46623422) +32700 val loss 5.8768 +32700 val perplexity 356.6607 +32700 train 5.794640 (lr=2.3970e-05) (hash(x)=48694296) +34900 val loss 5.7307 +34900 val perplexity 308.1846 +34900 train 5.910062 (lr=1.4459e-05) (hash(x)=55252392) +33600 val loss 6.0508 +33600 val perplexity 424.4607 +33600 train 6.120157 (lr=3.2010e-05) (hash(x)=53759672) +32800 val loss 5.8773 +32800 val perplexity 356.8321 +32800 train 5.957132 (lr=2.3794e-05) (hash(x)=51471528) +35000 val loss 5.7318 +35000 val perplexity 308.5099 +35000 train 5.753193 (lr=1.4344e-05) (hash(x)=51882979) +33700 val loss 6.0524 +33700 val perplexity 425.1133 +33700 train 6.298470 (lr=3.1767e-05) (hash(x)=50177499) +35100 val loss 5.7297 +35100 val perplexity 307.8743 +35100 train 5.446522 (lr=1.4229e-05) (hash(x)=56053572) +32900 val loss 5.8730 +32900 val perplexity 355.3115 +32900 train 6.000703 (lr=2.3618e-05) (hash(x)=56096057) +33800 val loss 6.0522 +33800 val perplexity 425.0387 +33800 train 5.890966 (lr=3.1524e-05) (hash(x)=49208351) +35200 val loss 5.7285 +35200 val perplexity 307.4984 +35200 train 5.782646 (lr=1.4115e-05) (hash(x)=51979525) +33000 val loss 5.8687 +33000 val perplexity 353.7780 +33000 train 5.680752 (lr=2.3443e-05) (hash(x)=47079017) +33900 val loss 6.0534 +33900 val perplexity 425.5526 +33900 train 5.890080 (lr=3.1282e-05) (hash(x)=51136014) +35300 val loss 5.7321 +35300 val perplexity 308.6095 +35300 train 5.789152 (lr=1.4001e-05) (hash(x)=50873666) +33100 val loss 5.8743 +33100 val perplexity 355.7642 +33100 train 5.968645 (lr=2.3269e-05) (hash(x)=47759875) +34000 val loss 6.0532 +34000 val perplexity 425.4844 +34000 train 5.851697 (lr=3.1041e-05) (hash(x)=55323465) +35400 val loss 5.7339 +35400 val perplexity 309.1577 +35400 train 6.330473 (lr=1.3887e-05) (hash(x)=69087970) +33200 val loss 5.8698 +33200 val perplexity 354.1841 +33200 train 6.056199 (lr=2.3095e-05) (hash(x)=53977891) +34100 val loss 6.0402 +34100 val perplexity 419.9815 +34100 train 6.179260 (lr=3.0802e-05) (hash(x)=46512090) +35500 val loss 5.7362 +35500 val perplexity 309.8897 +35500 train 5.458553 (lr=1.3775e-05) (hash(x)=48654918) +33300 val loss 5.8780 +33300 val perplexity 357.0811 +33300 train 5.647558 (lr=2.2922e-05) (hash(x)=48728077) +34200 val loss 6.0535 +34200 val perplexity 425.5835 +34200 train 6.109689 (lr=3.0563e-05) (hash(x)=52513966) +35600 val loss 5.7308 +35600 val perplexity 308.2239 +35600 train 5.433198 (lr=1.3663e-05) (hash(x)=49987504) +33400 val loss 5.8739 +33400 val perplexity 355.6239 +33400 train 5.910737 (lr=2.2750e-05) (hash(x)=54923913) +35700 val loss 5.7313 +35700 val perplexity 308.3802 +35700 train 5.448093 (lr=1.3551e-05) (hash(x)=46372898) +34300 val loss 6.0343 +34300 val perplexity 417.5106 +34300 train 6.052635 (lr=3.0325e-05) (hash(x)=51181125) +33500 val loss 5.8779 +33500 val perplexity 357.0762 +33500 train 5.586398 (lr=2.2578e-05) (hash(x)=46623422) +35800 val loss 5.7301 +35800 val perplexity 307.9938 +35800 train 5.737034 (lr=1.3440e-05) (hash(x)=49996578) +34400 val loss 6.0310 +34400 val perplexity 416.1405 +34400 train 6.186016 (lr=3.0088e-05) (hash(x)=50979117) +33600 val loss 5.8823 +33600 val perplexity 358.6203 +33600 train 5.940199 (lr=2.2407e-05) (hash(x)=53759672) +35900 val loss 5.7358 +35900 val perplexity 309.7492 +35900 train 5.789247 (lr=1.3329e-05) (hash(x)=49932960) +34500 val loss 6.0742 +34500 val perplexity 434.4923 +34500 train 6.157396 (lr=2.9852e-05) (hash(x)=50394150) +33700 val loss 5.8789 +33700 val perplexity 357.4198 +33700 train 6.136014 (lr=2.2237e-05) (hash(x)=50177499) +36000 val loss 5.7349 +36000 val perplexity 309.4847 +36000 train 5.515670 (lr=1.3219e-05) (hash(x)=41502642) +34600 val loss 6.0374 +34600 val perplexity 418.7870 +34600 train 5.968144 (lr=2.9617e-05) (hash(x)=52666346) +33800 val loss 5.8753 +33800 val perplexity 356.1367 +36100 val loss 5.7337 +36100 val perplexity 309.1030 +33800 train 5.696282 (lr=2.2067e-05) (hash(x)=49208351) +36100 train 5.603767 (lr=1.3110e-05) (hash(x)=55380312) +34700 val loss 6.0314 +34700 val perplexity 416.3164 +34700 train 6.262363 (lr=2.9383e-05) (hash(x)=48275110) +36200 val loss 5.7326 +36200 val perplexity 308.7657 +36200 train 5.439537 (lr=1.3001e-05) (hash(x)=51323559) +33900 val loss 5.8779 +33900 val perplexity 357.0672 +33900 train 5.694364 (lr=2.1898e-05) (hash(x)=51136014) +34800 val loss 6.0343 +34800 val perplexity 417.5126 +34800 train 5.855494 (lr=2.9150e-05) (hash(x)=43282226) +36300 val loss 5.7351 +36300 val perplexity 309.5447 +36300 train 5.651711 (lr=1.2893e-05) (hash(x)=47058693) +34000 val loss 5.8763 +34000 val perplexity 356.5022 +34000 train 5.661440 (lr=2.1729e-05) (hash(x)=55323465) +34900 val loss 6.0206 +34900 val perplexity 411.8460 +34900 train 6.199155 (lr=2.8918e-05) (hash(x)=55252392) +36400 val loss 5.7337 +36400 val perplexity 309.1139 +36400 train 5.309684 (lr=1.2785e-05) (hash(x)=45928242) +34100 val loss 5.8652 +34100 val perplexity 352.5658 +34100 train 5.995674 (lr=2.1561e-05) (hash(x)=46512090) +35000 val loss 6.0253 +35000 val perplexity 413.7551 +35000 train 6.039495 (lr=2.8688e-05) (hash(x)=51882979) +36500 val loss 5.7250 +36500 val perplexity 306.4465 +36500 train 5.824481 (lr=1.2678e-05) (hash(x)=52834624) +34200 val loss 5.8971 +34200 val perplexity 363.9693 +34200 train 5.958440 (lr=2.1394e-05) (hash(x)=52513966) +35100 val loss 6.0235 +35100 val perplexity 413.0061 +35100 train 5.811615 (lr=2.8458e-05) (hash(x)=56053572) +36600 val loss 5.7227 +36600 val perplexity 305.7209 +36600 train 5.909156 (lr=1.2572e-05) (hash(x)=47195270) +34300 val loss 5.8547 +34300 val perplexity 348.8528 +34300 train 5.872749 (lr=2.1227e-05) (hash(x)=51181125) +35200 val loss 6.0204 +35200 val perplexity 411.7578 +35200 train 6.077020 (lr=2.8229e-05) (hash(x)=51979525) +36700 val loss 5.7224 +36700 val perplexity 305.6363 +36700 train 5.817258 (lr=1.2466e-05) (hash(x)=52501734) +34400 val loss 5.8551 +34400 val perplexity 349.0004 +34400 train 6.006017 (lr=2.1062e-05) (hash(x)=50979117) +35300 val loss 6.0335 +35300 val perplexity 417.1536 +35300 train 6.077876 (lr=2.8002e-05) (hash(x)=50873666) +36800 val loss 5.7198 +36800 val perplexity 304.8547 +36800 train 5.745171 (lr=1.2360e-05) (hash(x)=50333721) +34500 val loss 5.9069 +34500 val perplexity 367.5803 +34500 train 5.971160 (lr=2.0896e-05) (hash(x)=50394150) +35400 val loss 6.0551 +35400 val perplexity 426.2675 +35400 train 6.741091 (lr=2.7775e-05) (hash(x)=69087970) +36900 val loss 5.7185 +36900 val perplexity 304.4420 +36900 train 5.637016 (lr=1.2256e-05) (hash(x)=46841481) +34600 val loss 5.8603 +34600 val perplexity 350.8386 +34600 train 5.770261 (lr=2.0732e-05) (hash(x)=52666346) +35500 val loss 6.0342 +35500 val perplexity 417.4473 +35500 train 5.741636 (lr=2.7549e-05) (hash(x)=48654918) +37000 val loss 5.7159 +37000 val perplexity 303.6544 +37000 train 5.698927 (lr=1.2152e-05) (hash(x)=61838433) +34700 val loss 5.8457 +34700 val perplexity 345.7498 +34700 train 6.085506 (lr=2.0568e-05) (hash(x)=48275110) +35600 val loss 6.0332 +35600 val perplexity 417.0508 +35600 train 5.784684 (lr=2.7325e-05) (hash(x)=49987504) +37100 val loss 5.7148 +37100 val perplexity 303.3307 +37100 train 5.676449 (lr=1.2048e-05) (hash(x)=47537252) +34800 val loss 5.8484 +34800 val perplexity 346.6727 +34800 train 5.721792 (lr=2.0405e-05) (hash(x)=43282226) +35700 val loss 6.0401 +35700 val perplexity 419.9275 +35700 train 5.738249 (lr=2.7102e-05) (hash(x)=46372898) +37200 val loss 5.7158 +37200 val perplexity 303.6374 +37200 train 5.887646 (lr=1.1945e-05) (hash(x)=51840280) +34900 val loss 5.8453 +34900 val perplexity 345.6132 +34900 train 6.016756 (lr=2.0243e-05) (hash(x)=55252392) +37300 val loss 5.7120 +37300 val perplexity 302.4794 +37300 train 5.533561 (lr=1.1843e-05) (hash(x)=48870226) +35800 val loss 6.0475 +35800 val perplexity 423.0491 +35800 train 6.050315 (lr=2.6880e-05) (hash(x)=49996578) +35000 val loss 5.8462 +35000 val perplexity 345.9202 +35000 train 5.856221 (lr=2.0081e-05) (hash(x)=51882979) +37400 val loss 5.7126 +37400 val perplexity 302.6512 +37400 train 5.682428 (lr=1.1741e-05) (hash(x)=45398282) +35900 val loss 6.0469 +35900 val perplexity 422.7807 +35900 train 6.080755 (lr=2.6659e-05) (hash(x)=49932960) +37500 val loss 5.7131 +37500 val perplexity 302.7963 +35100 val loss 5.8464 +35100 val perplexity 345.9776 +35100 train 5.595922 (lr=1.9921e-05) (hash(x)=56053572) +37500 train 5.740012 (lr=1.1640e-05) (hash(x)=46278516) +36000 val loss 6.0317 +36000 val perplexity 416.4264 +36000 train 5.782988 (lr=2.6439e-05) (hash(x)=41502642) +37600 val loss 5.7090 +37600 val perplexity 301.5764 +37600 train 5.667379 (lr=1.1540e-05) (hash(x)=49553278) +35200 val loss 5.8413 +35200 val perplexity 344.2349 +35200 train 5.884139 (lr=1.9760e-05) (hash(x)=51979525) +36100 val loss 6.0308 +36100 val perplexity 416.0473 +36100 train 5.934444 (lr=2.6220e-05) (hash(x)=55380312) +37700 val loss 5.7108 +37700 val perplexity 302.1149 +37700 train 5.944287 (lr=1.1440e-05) (hash(x)=61653685) +35300 val loss 5.8495 +35300 val perplexity 347.0741 +35300 train 5.897538 (lr=1.9601e-05) (hash(x)=50873666) +36200 val loss 6.0332 +36200 val perplexity 417.0444 +36200 train 5.796605 (lr=2.6002e-05) (hash(x)=51323559) +37800 val loss 5.7079 +37800 val perplexity 301.2398 +37800 train 5.788130 (lr=1.1341e-05) (hash(x)=58942903) +35400 val loss 5.8641 +35400 val perplexity 352.1585 +35400 train 6.520232 (lr=1.9442e-05) (hash(x)=69087970) +36300 val loss 6.0331 +36300 val perplexity 417.0058 +36300 train 5.918619 (lr=2.5786e-05) (hash(x)=47058693) +37900 val loss 5.7116 +37900 val perplexity 302.3538 +37900 train 5.697130 (lr=1.1243e-05) (hash(x)=51529837) +35500 val loss 5.8522 +35500 val perplexity 348.0034 +35500 train 5.568428 (lr=1.9285e-05) (hash(x)=48654918) +36400 val loss 6.0355 +36400 val perplexity 417.9948 +36400 train 5.603760 (lr=2.5570e-05) (hash(x)=45928242) +38000 val loss 5.7145 +38000 val perplexity 303.2283 +38000 train 5.424836 (lr=1.1145e-05) (hash(x)=51047484) +35600 val loss 5.8436 +35600 val perplexity 345.0071 +35600 train 5.579630 (lr=1.9128e-05) (hash(x)=49987504) +36500 val loss 6.0244 +36500 val perplexity 413.3806 +36500 train 6.122868 (lr=2.5356e-05) (hash(x)=52834624) +38100 val loss 5.7134 +38100 val perplexity 302.8935 +38100 train 5.599649 (lr=1.1048e-05) (hash(x)=49117671) +35700 val loss 5.8467 +35700 val perplexity 346.1030 +35700 train 5.547647 (lr=1.8971e-05) (hash(x)=46372898) +36600 val loss 6.0116 +36600 val perplexity 408.1541 +36600 train 6.224730 (lr=2.5143e-05) (hash(x)=47195270) +38200 val loss 5.7150 +38200 val perplexity 303.3857 +38200 train 5.642801 (lr=1.0951e-05) (hash(x)=49501130) +35800 val loss 5.8534 +35800 val perplexity 348.4005 +35800 train 5.851306 (lr=1.8816e-05) (hash(x)=49996578) +36700 val loss 6.0131 +36700 val perplexity 408.7386 +36700 train 6.118310 (lr=2.4931e-05) (hash(x)=52501734) +38300 val loss 5.7133 +38300 val perplexity 302.8613 +38300 train 5.425329 (lr=1.0855e-05) (hash(x)=49878964) +35900 val loss 5.8584 +35900 val perplexity 350.1510 +36800 val loss 6.0043 +36800 val perplexity 405.1619 +36800 train 6.069719 (lr=2.4721e-05) (hash(x)=50333721) +35900 train 5.889972 (lr=1.8661e-05) (hash(x)=49932960) +38400 val loss 5.7123 +38400 val perplexity 302.5759 +38400 train 5.657566 (lr=1.0760e-05) (hash(x)=52855313) +36900 val loss 6.0013 +36900 val perplexity 403.9736 +36900 train 5.927393 (lr=2.4511e-05) (hash(x)=46841481) +36000 val loss 5.8615 +36000 val perplexity 351.2392 +36000 train 5.622486 (lr=1.8507e-05) (hash(x)=41502642) +38500 val loss 5.7121 +38500 val perplexity 302.5028 +38500 train 5.307865 (lr=1.0666e-05) (hash(x)=47213252) +37000 val loss 5.9966 +37000 val perplexity 402.0534 +37000 train 6.119756 (lr=2.4303e-05) (hash(x)=61838433) +36100 val loss 5.8528 +36100 val perplexity 348.1932 +36100 train 5.717245 (lr=1.8354e-05) (hash(x)=55380312) +38600 val loss 5.7120 +38600 val perplexity 302.4777 +38600 train 5.878788 (lr=1.0572e-05) (hash(x)=50001498) +37100 val loss 5.9973 +37100 val perplexity 402.3244 +37100 train 5.942426 (lr=2.4096e-05) (hash(x)=47537252) +36200 val loss 5.8507 +36200 val perplexity 347.4787 +36200 train 5.580647 (lr=1.8201e-05) (hash(x)=51323559) +38700 val loss 5.7122 +38700 val perplexity 302.5380 +38700 train 5.441878 (lr=1.0479e-05) (hash(x)=49541736) +37200 val loss 5.9990 +37200 val perplexity 403.0089 +37200 train 6.159816 (lr=2.3890e-05) (hash(x)=51840280) +38800 val loss 5.7084 +38800 val perplexity 301.4013 +38800 train 5.624380 (lr=1.0386e-05) (hash(x)=49712820) +36300 val loss 5.8548 +36300 val perplexity 348.9044 +36300 train 5.751373 (lr=1.8050e-05) (hash(x)=47058693) +37300 val loss 5.9960 +37300 val perplexity 401.8144 +37300 train 5.806243 (lr=2.3686e-05) (hash(x)=48870226) +38900 val loss 5.7028 +38900 val perplexity 299.6950 +38900 train 5.647665 (lr=1.0294e-05) (hash(x)=50913051) +36400 val loss 5.8504 +36400 val perplexity 347.3871 +36400 train 5.421002 (lr=1.7899e-05) (hash(x)=45928242) +37400 val loss 5.9940 +37400 val perplexity 400.9990 +37400 train 5.947510 (lr=2.3483e-05) (hash(x)=45398282) +39000 val loss 5.7012 +39000 val perplexity 299.2183 +39000 train 5.663111 (lr=1.0203e-05) (hash(x)=46110703) +36500 val loss 5.8348 +36500 val perplexity 341.9937 +36500 train 5.920135 (lr=1.7749e-05) (hash(x)=52834624) +37500 val loss 5.9938 +37500 val perplexity 400.9417 +37500 train 6.013215 (lr=2.3281e-05) (hash(x)=46278516) +39100 val loss 5.6991 +39100 val perplexity 298.6089 +39100 train 5.523420 (lr=1.0113e-05) (hash(x)=43887045) +36600 val loss 5.8325 +36600 val perplexity 341.2057 +36600 train 6.044613 (lr=1.7600e-05) (hash(x)=47195270) +39200 val loss 5.6971 +39200 val perplexity 298.0023 +39200 train 5.942028 (lr=1.0023e-05) (hash(x)=53102477) +37600 val loss 5.9891 +37600 val perplexity 399.0745 +37600 train 5.916512 (lr=2.3080e-05) (hash(x)=49553278) +36700 val loss 5.8314 +36700 val perplexity 340.8327 +36700 train 5.931088 (lr=1.7452e-05) (hash(x)=52501734) +39300 val loss 5.6985 +39300 val perplexity 298.4218 +39300 train 5.543431 (lr=9.9341e-06) (hash(x)=44607212) +37700 val loss 5.9967 +37700 val perplexity 402.1080 +37700 train 6.205086 (lr=2.2881e-05) (hash(x)=61653685) +36800 val loss 5.8264 +36800 val perplexity 339.1412 +36800 train 5.880772 (lr=1.7305e-05) (hash(x)=50333721) +39400 val loss 5.6985 +39400 val perplexity 298.4128 +39400 train 5.883617 (lr=9.8458e-06) (hash(x)=55054109) +37800 val loss 5.9911 +37800 val perplexity 399.8488 +37800 train 6.101978 (lr=2.2682e-05) (hash(x)=58942903) +36900 val loss 5.8249 +36900 val perplexity 338.6270 +36900 train 5.741447 (lr=1.7158e-05) (hash(x)=46841481) +39500 val loss 5.6971 +39500 val perplexity 297.9993 +39500 train 5.545697 (lr=9.7581e-06) (hash(x)=47529853) +37900 val loss 5.9967 +37900 val perplexity 402.0835 +37900 train 5.968445 (lr=2.2485e-05) (hash(x)=51529837) +37000 val loss 5.8232 +37000 val perplexity 338.0477 +37000 train 5.784673 (lr=1.7012e-05) (hash(x)=61838433) +39600 val loss 5.6978 +39600 val perplexity 298.2125 +39600 train 5.469454 (lr=9.6712e-06) (hash(x)=50783698) +38000 val loss 5.9944 +38000 val perplexity 401.1801 +38000 train 5.733158 (lr=2.2290e-05) (hash(x)=51047484) +37100 val loss 5.8231 +37100 val perplexity 338.0135 +37100 train 5.770815 (lr=1.6867e-05) (hash(x)=47537252) +39700 val loss 5.6990 +39700 val perplexity 298.5551 +39700 train 5.473249 (lr=9.5849e-06) (hash(x)=47104160) +38100 val loss 5.9905 +38100 val perplexity 399.6018 +38100 train 5.876994 (lr=2.2096e-05) (hash(x)=49117671) +37200 val loss 5.8214 +37200 val perplexity 337.4463 +37200 train 5.959623 (lr=1.6723e-05) (hash(x)=51840280) +39800 val loss 5.7006 +39800 val perplexity 299.0361 +39800 train 5.632528 (lr=9.4994e-06) (hash(x)=50941478) +38200 val loss 5.9989 +38200 val perplexity 402.9718 +38200 train 5.948269 (lr=2.1903e-05) (hash(x)=49501130) +37300 val loss 5.8199 +37300 val perplexity 336.9491 +37300 train 5.637849 (lr=1.6580e-05) (hash(x)=48870226) +39900 val loss 5.7002 +39900 val perplexity 298.9415 +39900 train 5.441486 (lr=9.4146e-06) (hash(x)=47374830) +38300 val loss 6.0049 +38300 val perplexity 405.4156 +38300 train 5.728223 (lr=2.1711e-05) (hash(x)=49878964) +37400 val loss 5.8221 +37400 val perplexity 337.6863 +37400 train 5.787810 (lr=1.6438e-05) (hash(x)=45398282) +40000 val loss 5.7050 +40000 val perplexity 300.3579 +40000 train 5.835725 (lr=9.3305e-06) (hash(x)=56749414) +38400 val loss 6.0012 +38400 val perplexity 403.9121 +38400 train 5.944952 (lr=2.1521e-05) (hash(x)=52855313) +40100 val loss 5.7014 +40100 val perplexity 299.2915 +37500 val loss 5.8248 +37500 val perplexity 338.6047 +40100 train 5.647171 (lr=9.2472e-06) (hash(x)=49256359) +37500 train 5.849154 (lr=1.6297e-05) (hash(x)=46278516) +38500 val loss 6.0038 +38500 val perplexity 404.9452 +38500 train 5.583987 (lr=2.1332e-05) (hash(x)=47213252) +40200 val loss 5.6969 +40200 val perplexity 297.9366 +40200 train 5.626110 (lr=9.1646e-06) (hash(x)=47932192) +37600 val loss 5.8178 +37600 val perplexity 336.2471 +37600 train 5.750561 (lr=1.6156e-05) (hash(x)=49553278) +38600 val loss 5.9952 +38600 val perplexity 401.5104 +38600 train 6.087283 (lr=2.1144e-05) (hash(x)=50001498) +40300 val loss 5.7002 +40300 val perplexity 298.9420 +40300 train 5.501065 (lr=9.0827e-06) (hash(x)=42473499) +37700 val loss 5.8217 +37700 val perplexity 337.5312 +37700 train 6.045236 (lr=1.6016e-05) (hash(x)=61653685) +38700 val loss 6.0030 +38700 val perplexity 404.6603 +38700 train 5.736640 (lr=2.0957e-05) (hash(x)=49541736) +40400 val loss 5.7001 +40400 val perplexity 298.9070 +40400 train 5.492098 (lr=9.0015e-06) (hash(x)=50469946) +37800 val loss 5.8208 +37800 val perplexity 337.2463 +37800 train 5.907042 (lr=1.5878e-05) (hash(x)=58942903) +38800 val loss 5.9985 +38800 val perplexity 402.8217 +38800 train 5.915122 (lr=2.0773e-05) (hash(x)=49712820) +40500 val loss 5.6936 +40500 val perplexity 296.9536 +40500 train 5.797897 (lr=8.9211e-06) (hash(x)=48818656) +37900 val loss 5.8202 +37900 val perplexity 337.0476 +37900 train 5.792669 (lr=1.5740e-05) (hash(x)=51529837) +38900 val loss 5.9877 +38900 val perplexity 398.5088 +38900 train 5.949051 (lr=2.0589e-05) (hash(x)=50913051) +40600 val loss 5.6919 +40600 val perplexity 296.4660 +40600 train 5.551157 (lr=8.8414e-06) (hash(x)=45215748) +38000 val loss 5.8326 +38000 val perplexity 341.2430 +38000 train 5.560158 (lr=1.5603e-05) (hash(x)=51047484) +39000 val loss 5.9927 +39000 val perplexity 400.4775 +39000 train 5.949429 (lr=2.0407e-05) (hash(x)=46110703) +40700 val loss 5.6892 +40700 val perplexity 295.6530 +40700 train 5.825824 (lr=8.7624e-06) (hash(x)=48057577) +38100 val loss 5.8185 +38100 val perplexity 336.4795 +38100 train 5.710571 (lr=1.5467e-05) (hash(x)=49117671) +40800 val loss 5.6876 +40800 val perplexity 295.1978 +40800 train 5.637198 (lr=8.6842e-06) (hash(x)=50103136) +39100 val loss 5.9835 +39100 val perplexity 396.8333 +39100 train 5.849606 (lr=2.0226e-05) (hash(x)=43887045) +38200 val loss 5.8202 +38200 val perplexity 337.0504 +38200 train 5.784738 (lr=1.5332e-05) (hash(x)=49501130) +40900 val loss 5.6859 +40900 val perplexity 294.6921 +40900 train 5.525743 (lr=8.6068e-06) (hash(x)=45764202) +39200 val loss 5.9848 +39200 val perplexity 397.3331 +39200 train 6.238296 (lr=2.0046e-05) (hash(x)=53102477) +38300 val loss 5.8303 +38300 val perplexity 340.4497 +38300 train 5.545831 (lr=1.5198e-05) (hash(x)=49878964) +41000 val loss 5.6855 +41000 val perplexity 294.5566 +41000 train 5.789423 (lr=8.5301e-06) (hash(x)=50724130) +39300 val loss 5.9889 +39300 val perplexity 398.9841 +39300 train 5.828202 (lr=1.9868e-05) (hash(x)=44607212) +38400 val loss 5.8281 +38400 val perplexity 339.7107 +38400 train 5.785296 (lr=1.5064e-05) (hash(x)=52855313) +41100 val loss 5.6822 +41100 val perplexity 293.6031 +41100 train 5.595158 (lr=8.4541e-06) (hash(x)=48739125) +39400 val loss 5.9906 +39400 val perplexity 399.6391 +39400 train 6.167635 (lr=1.9692e-05) (hash(x)=55054109) +38500 val loss 5.8269 +38500 val perplexity 339.3130 +38500 train 5.388604 (lr=1.4932e-05) (hash(x)=47213252) +41200 val loss 5.6818 +41200 val perplexity 293.4859 +41200 train 5.626306 (lr=8.3789e-06) (hash(x)=47473129) +39500 val loss 5.9860 +39500 val perplexity 397.8399 +39500 train 5.824685 (lr=1.9516e-05) (hash(x)=47529853) +38600 val loss 5.8208 +38600 val perplexity 337.2390 +38600 train 5.951091 (lr=1.4801e-05) (hash(x)=50001498) +41300 val loss 5.6821 +41300 val perplexity 293.5674 +41300 train 5.622810 (lr=8.3045e-06) (hash(x)=51032823) +39600 val loss 5.9816 +39600 val perplexity 396.0548 +39600 train 5.780418 (lr=1.9342e-05) (hash(x)=50783698) +41400 val loss 5.6890 +41400 val perplexity 295.6012 +41400 train 5.727981 (lr=8.2308e-06) (hash(x)=44411781) +38700 val loss 5.8224 +38700 val perplexity 337.7910 +38700 train 5.530736 (lr=1.4670e-05) (hash(x)=49541736) +39700 val loss 5.9860 +39700 val perplexity 397.8219 +39700 train 5.760863 (lr=1.9170e-05) (hash(x)=47104160) +41500 val loss 5.6826 +41500 val perplexity 293.7117 +41500 train 5.709755 (lr=8.1579e-06) (hash(x)=48630357) +38800 val loss 5.8194 +38800 val perplexity 336.7745 +38800 train 5.736711 (lr=1.4541e-05) (hash(x)=49712820) +39800 val loss 5.9934 +39800 val perplexity 400.7880 +39800 train 5.914539 (lr=1.8999e-05) (hash(x)=50941478) +41600 val loss 5.6816 +41600 val perplexity 293.4307 +41600 train 5.493839 (lr=8.0858e-06) (hash(x)=47836869) +38900 val loss 5.8146 +38900 val perplexity 335.1730 +38900 train 5.752806 (lr=1.4412e-05) (hash(x)=50913051) +39900 val loss 5.9922 +39900 val perplexity 400.2914 +39900 train 5.753880 (lr=1.8829e-05) (hash(x)=47374830) +41700 val loss 5.6808 +41700 val perplexity 293.1702 +41700 train 5.556981 (lr=8.0144e-06) (hash(x)=49143658) +39000 val loss 5.8109 +39000 val perplexity 333.9320 +39000 train 5.752901 (lr=1.4285e-05) (hash(x)=46110703) +40000 val loss 5.9871 +40000 val perplexity 398.2521 +40000 train 6.150303 (lr=1.8661e-05) (hash(x)=56749414) +41800 val loss 5.6824 +41800 val perplexity 293.6628 +41800 train 5.561540 (lr=7.9438e-06) (hash(x)=50426446) +39100 val loss 5.8074 +39100 val perplexity 332.7551 +39100 train 5.652875 (lr=1.4158e-05) (hash(x)=43887045) +40100 val loss 5.9916 +40100 val perplexity 400.0676 +40100 train 5.931597 (lr=1.8494e-05) (hash(x)=49256359) +41900 val loss 5.6812 +41900 val perplexity 293.2867 +41900 train 5.571254 (lr=7.8740e-06) (hash(x)=48134303) +39200 val loss 5.8108 +39200 val perplexity 333.8914 +39200 train 6.051419 (lr=1.4032e-05) (hash(x)=53102477) +40200 val loss 5.9882 +40200 val perplexity 398.7082 +40200 train 5.907793 (lr=1.8329e-05) (hash(x)=47932192) +42000 val loss 5.6812 +42000 val perplexity 293.3020 +42000 train 5.477006 (lr=7.8050e-06) (hash(x)=48151525) +39300 val loss 5.8101 +39300 val perplexity 333.6398 +39300 train 5.653918 (lr=1.3908e-05) (hash(x)=44607212) +40300 val loss 5.9972 +40300 val perplexity 402.3111 +40300 train 5.784026 (lr=1.8165e-05) (hash(x)=42473499) +42100 val loss 5.6873 +42100 val perplexity 295.0871 +42100 train 5.527502 (lr=7.7368e-06) (hash(x)=49984437) +39400 val loss 5.8140 +39400 val perplexity 334.9546 +39400 train 5.970944 (lr=1.3784e-05) (hash(x)=55054109) +40400 val loss 5.9904 +40400 val perplexity 399.5568 +40400 train 5.766722 (lr=1.8003e-05) (hash(x)=50469946) +42200 val loss 5.6825 +42200 val perplexity 293.6876 +42200 train 5.578340 (lr=7.6693e-06) (hash(x)=49103965) +39500 val loss 5.8093 +39500 val perplexity 333.3748 +39500 train 5.658933 (lr=1.3661e-05) (hash(x)=47529853) +40500 val loss 5.9817 +40500 val perplexity 396.1183 +40500 train 6.098699 (lr=1.7842e-05) (hash(x)=48818656) +42300 val loss 5.6826 +42300 val perplexity 293.7138 +42300 train 5.388159 (lr=7.6027e-06) (hash(x)=52134795) +40600 val loss 5.9743 +40600 val perplexity 393.1989 +39600 val loss 5.8081 +39600 val perplexity 332.9996 +40600 train 5.865192 (lr=1.7683e-05) (hash(x)=45215748) +39600 train 5.579896 (lr=1.3540e-05) (hash(x)=50783698) +42400 val loss 5.6849 +42400 val perplexity 294.3798 +42400 train 5.497135 (lr=7.5368e-06) (hash(x)=47289074) +42500 val loss 5.6817 +42500 val perplexity 293.4420 +42500 train 6.089974 (lr=7.4717e-06) (hash(x)=54213272) +40700 val loss 5.9730 +40700 val perplexity 392.6797 +40700 train 6.130781 (lr=1.7525e-05) (hash(x)=48057577) +39700 val loss 5.8128 +39700 val perplexity 334.5605 +39700 train 5.609338 (lr=1.3419e-05) (hash(x)=47104160) +42600 val loss 5.6804 +42600 val perplexity 293.0627 +42600 train 5.387930 (lr=7.4074e-06) (hash(x)=49347326) +40800 val loss 5.9731 +40800 val perplexity 392.7374 +39800 val loss 5.8115 +39800 val perplexity 334.1109 +40800 train 5.924250 (lr=1.7368e-05) (hash(x)=50103136) +39800 train 5.755116 (lr=1.3299e-05) (hash(x)=50941478) +42700 val loss 5.6819 +42700 val perplexity 293.4980 +42700 train 5.446971 (lr=7.3440e-06) (hash(x)=45762383) +39900 val loss 5.8155 +39900 val perplexity 335.4691 +39900 train 5.549471 (lr=1.3180e-05) (hash(x)=47374830) +40900 val loss 5.9706 +40900 val perplexity 391.7424 +40900 train 5.843431 (lr=1.7214e-05) (hash(x)=45764202) +42800 val loss 5.6801 +42800 val perplexity 292.9767 +42800 train 5.763570 (lr=7.2813e-06) (hash(x)=50377506) +41000 val loss 5.9694 +41000 val perplexity 391.2844 +41000 train 6.078058 (lr=1.7060e-05) (hash(x)=50724130) +40000 val loss 5.8120 +40000 val perplexity 334.2712 +40000 train 5.966955 (lr=1.3063e-05) (hash(x)=56749414) +42900 val loss 5.6759 +42900 val perplexity 291.7648 +42900 train 6.045265 (lr=7.2194e-06) (hash(x)=57332431) +41100 val loss 5.9657 +41100 val perplexity 389.8272 +41100 train 5.913898 (lr=1.6908e-05) (hash(x)=48739125) +40100 val loss 5.8132 +40100 val perplexity 334.6787 +40100 train 5.746479 (lr=1.2946e-05) (hash(x)=49256359) +43000 val loss 5.6743 +43000 val perplexity 291.2978 +43000 train 5.731861 (lr=7.1583e-06) (hash(x)=50599951) +40200 val loss 5.8061 +40200 val perplexity 332.3086 +40200 train 5.728039 (lr=1.2830e-05) (hash(x)=47932192) +41200 val loss 5.9700 +41200 val perplexity 391.5218 +41200 train 5.908168 (lr=1.6758e-05) (hash(x)=47473129) +43100 val loss 5.6724 +43100 val perplexity 290.7371 +43100 train 5.637295 (lr=7.0981e-06) (hash(x)=51139362) +41300 val loss 5.9688 +41300 val perplexity 391.0380 +40300 val loss 5.8119 +40300 val perplexity 334.2409 +41300 train 5.885458 (lr=1.6609e-05) (hash(x)=51032823) +40300 train 5.628065 (lr=1.2716e-05) (hash(x)=42473499) +43200 val loss 5.6720 +43200 val perplexity 290.6013 +43200 train 5.517743 (lr=7.0386e-06) (hash(x)=45759314) +40400 val loss 5.8130 +40400 val perplexity 334.6235 +40400 train 5.593667 (lr=1.2602e-05) (hash(x)=50469946) +41400 val loss 5.9787 +41400 val perplexity 394.9352 +41400 train 5.970953 (lr=1.6462e-05) (hash(x)=44411781) +43300 val loss 5.6719 +43300 val perplexity 290.5804 +43300 train 5.719936 (lr=6.9800e-06) (hash(x)=48251106) +40500 val loss 5.8038 +40500 val perplexity 331.5617 +40500 train 5.902449 (lr=1.2489e-05) (hash(x)=48818656) +41500 val loss 5.9659 +41500 val perplexity 389.9201 +41500 train 5.958271 (lr=1.6316e-05) (hash(x)=48630357) +43400 val loss 5.6698 +43400 val perplexity 289.9633 +43400 train 5.998192 (lr=6.9222e-06) (hash(x)=49941825) +40600 val loss 5.7985 +40600 val perplexity 329.7891 +41600 val loss 5.9676 +41600 val perplexity 390.5535 +40600 train 5.647307 (lr=1.2378e-05) (hash(x)=45215748) +41600 train 5.794570 (lr=1.6172e-05) (hash(x)=47836869) +43500 val loss 5.6713 +43500 val perplexity 290.4133 +43500 train 6.060657 (lr=6.8652e-06) (hash(x)=47282597) +41700 val loss 5.9657 +41700 val perplexity 389.8231 +40700 val loss 5.7992 +40700 val perplexity 330.0334 +41700 train 5.840422 (lr=1.6029e-05) (hash(x)=49143658) +40700 train 5.934358 (lr=1.2267e-05) (hash(x)=48057577) +43600 val loss 5.6685 +43600 val perplexity 289.6115 +43600 train 5.783797 (lr=6.8090e-06) (hash(x)=47311813) +41800 val loss 5.9673 +41800 val perplexity 390.4526 +41800 train 5.861011 (lr=1.5888e-05) (hash(x)=50426446) +40800 val loss 5.7953 +40800 val perplexity 328.7346 +40800 train 5.744502 (lr=1.2158e-05) (hash(x)=50103136) +43700 val loss 5.6694 +43700 val perplexity 289.8531 +43700 train 5.744710 (lr=6.7537e-06) (hash(x)=51739445) +41900 val loss 5.9666 +41900 val perplexity 390.1734 +41900 train 5.840520 (lr=1.5748e-05) (hash(x)=48134303) +43800 val loss 5.6675 +43800 val perplexity 289.3014 +43800 train 5.717494 (lr=6.6992e-06) (hash(x)=58597156) +40900 val loss 5.7931 +40900 val perplexity 328.0425 +40900 train 5.639379 (lr=1.2049e-05) (hash(x)=45764202) +42000 val loss 5.9664 +42000 val perplexity 390.1174 +42000 train 5.787454 (lr=1.5610e-05) (hash(x)=48151525) +43900 val loss 5.6677 +43900 val perplexity 289.3651 +43900 train 5.394792 (lr=6.6455e-06) (hash(x)=45351304) +41000 val loss 5.7924 +41000 val perplexity 327.8011 +41000 train 5.923404 (lr=1.1942e-05) (hash(x)=50724130) +42100 val loss 5.9650 +42100 val perplexity 389.5488 +42100 train 5.867509 (lr=1.5474e-05) (hash(x)=49984437) +44000 val loss 5.6693 +44000 val perplexity 289.8399 +44000 train 5.441494 (lr=6.5926e-06) (hash(x)=46603503) +41100 val loss 5.7934 +41100 val perplexity 328.1278 +41100 train 5.721740 (lr=1.1836e-05) (hash(x)=48739125) +44100 val loss 5.6659 +44100 val perplexity 288.8588 +44100 train 5.644786 (lr=6.5406e-06) (hash(x)=52189096) +42200 val loss 5.9716 +42200 val perplexity 392.1365 +42200 train 5.883844 (lr=1.5339e-05) (hash(x)=49103965) +41200 val loss 5.7911 +41200 val perplexity 327.3667 +41200 train 5.731915 (lr=1.1730e-05) (hash(x)=47473129) +44200 val loss 5.6683 +44200 val perplexity 289.5338 +44200 train 5.841959 (lr=6.4894e-06) (hash(x)=51527792) +42300 val loss 5.9707 +42300 val perplexity 391.7895 +42300 train 5.701947 (lr=1.5205e-05) (hash(x)=52134795) +41300 val loss 5.7886 +41300 val perplexity 326.5566 +41300 train 5.698506 (lr=1.1626e-05) (hash(x)=51032823) +44300 val loss 5.6667 +44300 val perplexity 289.0701 +44300 train 5.518201 (lr=6.4390e-06) (hash(x)=47982628) +42400 val loss 5.9766 +42400 val perplexity 394.1097 +42400 train 5.804103 (lr=1.5074e-05) (hash(x)=47289074) +41400 val loss 5.8091 +41400 val perplexity 333.3033 +41400 train 5.838019 (lr=1.1523e-05) (hash(x)=44411781) +44400 val loss 5.6695 +44400 val perplexity 289.8885 +44400 train 5.676157 (lr=6.3895e-06) (hash(x)=54902968) +42500 val loss 5.9734 +42500 val perplexity 392.8230 +42500 train 6.306897 (lr=1.4943e-05) (hash(x)=54213272) +41500 val loss 5.7889 +41500 val perplexity 326.6471 +41500 train 5.798408 (lr=1.1421e-05) (hash(x)=48630357) +44500 val loss 5.6663 +44500 val perplexity 288.9597 +44500 train 5.594099 (lr=6.3408e-06) (hash(x)=49008493) +42600 val loss 5.9771 +42600 val perplexity 394.2832 +42600 train 5.728152 (lr=1.4815e-05) (hash(x)=49347326) +41600 val loss 5.7887 +41600 val perplexity 326.6022 +41600 train 5.614422 (lr=1.1320e-05) (hash(x)=47836869) +44600 val loss 5.6690 +44600 val perplexity 289.7509 +44600 train 5.401289 (lr=6.2929e-06) (hash(x)=43643824) +42700 val loss 5.9713 +42700 val perplexity 391.9958 +42700 train 5.749424 (lr=1.4688e-05) (hash(x)=45762383) +41700 val loss 5.7904 +41700 val perplexity 327.1425 +41700 train 5.664684 (lr=1.1220e-05) (hash(x)=49143658) +44700 val loss 5.6688 +44700 val perplexity 289.6859 +44700 train 5.511226 (lr=6.2459e-06) (hash(x)=50465872) +42800 val loss 5.9680 +42800 val perplexity 390.7340 +42800 train 6.032648 (lr=1.4563e-05) (hash(x)=50377506) +41800 val loss 5.7917 +41800 val perplexity 327.5564 +41800 train 5.658336 (lr=1.1121e-05) (hash(x)=50426446) +44800 val loss 5.6694 +44800 val perplexity 289.8656 +44800 train 5.428413 (lr=6.1998e-06) (hash(x)=51962160) +42900 val loss 5.9601 +42900 val perplexity 387.6460 +42900 train 6.334489 (lr=1.4439e-05) (hash(x)=57332431) +44900 val loss 5.6698 +44900 val perplexity 289.9770 +44900 train 5.804595 (lr=6.1545e-06) (hash(x)=57370039) +41900 val loss 5.7906 +41900 val perplexity 327.2231 +41900 train 5.654394 (lr=1.1024e-05) (hash(x)=48134303) +43000 val loss 5.9571 +43000 val perplexity 386.4911 +43000 train 6.005164 (lr=1.4317e-05) (hash(x)=50599951) +45000 val loss 5.6708 +45000 val perplexity 290.2657 +45000 train 5.514421 (lr=6.1100e-06) (hash(x)=52648130) +42000 val loss 5.7894 +42000 val perplexity 326.8289 +42000 train 5.598606 (lr=1.0927e-05) (hash(x)=48151525) +43100 val loss 5.9594 +43100 val perplexity 387.3624 +43100 train 5.926691 (lr=1.4196e-05) (hash(x)=51139362) +45100 val loss 5.6921 +45100 val perplexity 296.5020 +45100 train 5.363546 (lr=6.0664e-06) (hash(x)=45155459) +42100 val loss 5.7953 +42100 val perplexity 328.7434 +42100 train 5.678817 (lr=1.0831e-05) (hash(x)=49984437) +43200 val loss 5.9546 +43200 val perplexity 385.5143 +43200 train 5.830906 (lr=1.4077e-05) (hash(x)=45759314) +45200 val loss 5.6633 +45200 val perplexity 288.0978 +45200 train 5.669035 (lr=6.0237e-06) (hash(x)=52304992) +42200 val loss 5.7949 +42200 val perplexity 328.6353 +42200 train 5.716239 (lr=1.0737e-05) (hash(x)=49103965) +43300 val loss 5.9537 +43300 val perplexity 385.1729 +43300 train 6.009885 (lr=1.3960e-05) (hash(x)=48251106) +45300 val loss 5.6632 +45300 val perplexity 288.0673 +45300 train 5.508833 (lr=5.9818e-06) (hash(x)=46456407) +42300 val loss 5.7949 +42300 val perplexity 328.6093 +42300 train 5.507755 (lr=1.0644e-05) (hash(x)=52134795) +43400 val loss 5.9544 +43400 val perplexity 385.4469 +43400 train 6.247155 (lr=1.3844e-05) (hash(x)=49941825) +45400 val loss 5.6606 +45400 val perplexity 287.3250 +45400 train 5.678850 (lr=5.9407e-06) (hash(x)=48325493) +42400 val loss 5.7968 +42400 val perplexity 329.2362 +42400 train 5.613380 (lr=1.0552e-05) (hash(x)=47289074) +43500 val loss 5.9512 +43500 val perplexity 384.2051 +43500 train 6.320533 (lr=1.3730e-05) (hash(x)=47282597) +45500 val loss 5.6609 +45500 val perplexity 287.4078 +45500 train 5.645679 (lr=5.9005e-06) (hash(x)=48869064) +42500 val loss 5.7937 +42500 val perplexity 328.2267 +42500 train 6.148101 (lr=1.0460e-05) (hash(x)=54213272) +43600 val loss 5.9536 +43600 val perplexity 385.1219 +43600 train 6.040003 (lr=1.3618e-05) (hash(x)=47311813) +45600 val loss 5.6587 +45600 val perplexity 286.7633 +45600 train 5.715800 (lr=5.8612e-06) (hash(x)=46077805) +42600 val loss 5.7952 +42600 val perplexity 328.7027 +42600 train 5.527590 (lr=1.0370e-05) (hash(x)=49347326) +43700 val loss 5.9533 +43700 val perplexity 385.0056 +43700 train 6.002683 (lr=1.3507e-05) (hash(x)=51739445) +45700 val loss 5.6578 +45700 val perplexity 286.5095 +45700 train 5.899162 (lr=5.8227e-06) (hash(x)=56831165) +42700 val loss 5.7930 +42700 val perplexity 328.0038 +42700 train 5.554371 (lr=1.0282e-05) (hash(x)=45762383) +43800 val loss 5.9494 +43800 val perplexity 383.5334 +43800 train 5.986272 (lr=1.3398e-05) (hash(x)=58597156) +45800 val loss 5.6593 +45800 val perplexity 286.9525 +45800 train 5.518417 (lr=5.7851e-06) (hash(x)=50124921) +45900 val loss 5.6568 +45900 val perplexity 286.2190 +42800 val loss 5.7907 +42800 val perplexity 327.2475 +45900 train 5.271795 (lr=5.7484e-06) (hash(x)=43227400) +42800 train 5.854580 (lr=1.0194e-05) (hash(x)=50377506) +43900 val loss 5.9497 +43900 val perplexity 383.6561 +43900 train 5.671648 (lr=1.3291e-05) (hash(x)=45351304) +46000 val loss 5.6598 +46000 val perplexity 287.0796 +46000 train 5.681318 (lr=5.7125e-06) (hash(x)=58308887) +42900 val loss 5.7824 +42900 val perplexity 324.5236 +42900 train 6.166087 (lr=1.0107e-05) (hash(x)=57332431) +44000 val loss 5.9501 +44000 val perplexity 383.7866 +44000 train 5.698932 (lr=1.3185e-05) (hash(x)=46603503) +46100 val loss 5.6564 +46100 val perplexity 286.1296 +46100 train 5.729013 (lr=5.6775e-06) (hash(x)=48571632) +43000 val loss 5.7788 +43000 val perplexity 323.3703 +43000 train 5.814094 (lr=1.0022e-05) (hash(x)=50599951) +44100 val loss 5.9503 +44100 val perplexity 383.8850 +44100 train 5.942808 (lr=1.3081e-05) (hash(x)=52189096) +46200 val loss 5.6581 +46200 val perplexity 286.6016 +46200 train 5.917659 (lr=5.6434e-06) (hash(x)=46368862) +44200 val loss 5.9524 +44200 val perplexity 384.6645 +44200 train 6.133305 (lr=1.2979e-05) (hash(x)=51527792) +43100 val loss 5.7799 +43100 val perplexity 323.7279 +43100 train 5.745881 (lr=9.9373e-06) (hash(x)=51139362) +46300 val loss 5.6581 +46300 val perplexity 286.6173 +46300 train 5.408631 (lr=5.6101e-06) (hash(x)=52820317) +44300 val loss 5.9522 +44300 val perplexity 384.5846 +44300 train 5.801945 (lr=1.2878e-05) (hash(x)=47982628) +43200 val loss 5.7784 +43200 val perplexity 323.2374 +43200 train 5.630578 (lr=9.8541e-06) (hash(x)=45759314) +46400 val loss 5.6575 +46400 val perplexity 286.4451 +46400 train 5.450654 (lr=5.5777e-06) (hash(x)=51871487) +44400 val loss 5.9516 +44400 val perplexity 384.3706 +44400 train 5.973132 (lr=1.2779e-05) (hash(x)=54902968) +43300 val loss 5.7787 +43300 val perplexity 323.3299 +43300 train 5.835676 (lr=9.7720e-06) (hash(x)=48251106) +46500 val loss 5.6576 +46500 val perplexity 286.4626 +46500 train 5.616755 (lr=5.5462e-06) (hash(x)=47867973) +44500 val loss 5.9517 +44500 val perplexity 384.3900 +44500 train 5.869427 (lr=1.2682e-05) (hash(x)=49008493) +43400 val loss 5.7791 +43400 val perplexity 323.4810 +43400 train 6.143980 (lr=9.6911e-06) (hash(x)=49941825) +46600 val loss 5.6555 +46600 val perplexity 285.8474 +46600 train 5.312100 (lr=5.5156e-06) (hash(x)=50685713) +44600 val loss 5.9565 +44600 val perplexity 386.2570 +44600 train 5.689929 (lr=1.2586e-05) (hash(x)=43643824) +43500 val loss 5.7759 +43500 val perplexity 322.4240 +43500 train 6.149743 (lr=9.6113e-06) (hash(x)=47282597) +46700 val loss 5.6565 +46700 val perplexity 286.1541 +46700 train 5.766818 (lr=5.4858e-06) (hash(x)=50975526) +44700 val loss 5.9580 +44700 val perplexity 386.8501 +44700 train 5.792778 (lr=1.2492e-05) (hash(x)=50465872) +43600 val loss 5.7761 +43600 val perplexity 322.4915 +43600 train 5.872808 (lr=9.5326e-06) (hash(x)=47311813) +46800 val loss 5.6586 +46800 val perplexity 286.7573 +46800 train 5.809611 (lr=5.4569e-06) (hash(x)=51012129) +44800 val loss 5.9566 +44800 val perplexity 386.2924 +44800 train 5.739930 (lr=1.2400e-05) (hash(x)=51962160) +43700 val loss 5.7768 +43700 val perplexity 322.7124 +43700 train 5.847607 (lr=9.4552e-06) (hash(x)=51739445) +46900 val loss 5.6603 +46900 val perplexity 287.2206 +46900 train 5.364366 (lr=5.4289e-06) (hash(x)=51205353) +44900 val loss 5.9594 +44900 val perplexity 387.3823 +44900 train 6.100443 (lr=1.2309e-05) (hash(x)=57370039) +43800 val loss 5.7734 +43800 val perplexity 321.6403 +43800 train 5.810861 (lr=9.3788e-06) (hash(x)=58597156) +47000 val loss 5.6605 +47000 val perplexity 287.3062 +47000 train 5.801267 (lr=5.4017e-06) (hash(x)=48939661) +45000 val loss 5.9533 +45000 val perplexity 385.0308 +45000 train 5.796408 (lr=1.2220e-05) (hash(x)=52648130) +47100 val loss 5.6576 +47100 val perplexity 286.4546 +47100 train 5.584507 (lr=5.3755e-06) (hash(x)=51002557) +43900 val loss 5.7749 +43900 val perplexity 322.0998 +43900 train 5.511091 (lr=9.3036e-06) (hash(x)=45351304) +45100 val loss 5.9621 +45100 val perplexity 388.4366 +45100 train 5.610552 (lr=1.2133e-05) (hash(x)=45155459) +47200 val loss 5.6592 +47200 val perplexity 286.9117 +47200 train 5.511639 (lr=5.3501e-06) (hash(x)=43422209) +44000 val loss 5.7750 +44000 val perplexity 322.1348 +44000 train 5.542221 (lr=9.2296e-06) (hash(x)=46603503) +47300 val loss 5.6587 +47300 val perplexity 286.7806 +47300 train 5.433320 (lr=5.3256e-06) (hash(x)=45870552) +45200 val loss 5.9429 +45200 val perplexity 381.0319 +45200 train 5.934372 (lr=1.2047e-05) (hash(x)=52304992) +44100 val loss 5.7745 +44100 val perplexity 321.9746 +44100 train 5.767216 (lr=9.1568e-06) (hash(x)=52189096) +47400 val loss 5.6616 +47400 val perplexity 287.6050 +47400 train 5.505679 (lr=5.3020e-06) (hash(x)=47001810) +45300 val loss 5.9421 +45300 val perplexity 380.7375 +45300 train 5.788478 (lr=1.1964e-05) (hash(x)=46456407) +44200 val loss 5.7782 +44200 val perplexity 323.1830 +44200 train 5.955333 (lr=9.0851e-06) (hash(x)=51527792) +47500 val loss 5.6553 +47500 val perplexity 285.8004 +47500 train 5.707728 (lr=5.2792e-06) (hash(x)=49146619) +45400 val loss 5.9395 +45400 val perplexity 379.7390 +45400 train 5.965747 (lr=1.1881e-05) (hash(x)=48325493) +44300 val loss 5.7750 +44300 val perplexity 322.1468 +44300 train 5.617612 (lr=9.0146e-06) (hash(x)=47982628) +47600 val loss 5.6534 +47600 val perplexity 285.2548 +47600 train 5.722886 (lr=5.2574e-06) (hash(x)=53742853) +45500 val loss 5.9379 +45500 val perplexity 379.1255 +45500 train 5.897460 (lr=1.1801e-05) (hash(x)=48869064) +44400 val loss 5.7788 +44400 val perplexity 323.3816 +44400 train 5.747175 (lr=8.9453e-06) (hash(x)=54902968) +47700 val loss 5.6527 +47700 val perplexity 285.0697 +47700 train 5.882221 (lr=5.2364e-06) (hash(x)=53019458) +45600 val loss 5.9383 +45600 val perplexity 379.2873 +45600 train 5.986554 (lr=1.1722e-05) (hash(x)=46077805) +44500 val loss 5.7747 +44500 val perplexity 322.0545 +44500 train 5.693212 (lr=8.8771e-06) (hash(x)=49008493) +47800 val loss 5.6515 +47800 val perplexity 284.7323 +47800 train 5.745243 (lr=5.2163e-06) (hash(x)=54497914) +45700 val loss 5.9358 +45700 val perplexity 378.3559 +45700 train 6.167429 (lr=1.1645e-05) (hash(x)=56831165) +44600 val loss 5.7787 +44600 val perplexity 323.3544 +44600 train 5.510222 (lr=8.8101e-06) (hash(x)=43643824) +47900 val loss 5.6533 +47900 val perplexity 285.2190 +47900 train 5.561633 (lr=5.1972e-06) (hash(x)=47109012) +45800 val loss 5.9389 +45800 val perplexity 379.5174 +45800 train 5.792811 (lr=1.1570e-05) (hash(x)=50124921) +44700 val loss 5.7763 +44700 val perplexity 322.5546 +44700 train 5.600916 (lr=8.7443e-06) (hash(x)=50465872) +48000 val loss 5.6498 +48000 val perplexity 284.2386 +48000 train 5.628646 (lr=5.1788e-06) (hash(x)=48429816) +45900 val loss 5.9394 +45900 val perplexity 379.7004 +45900 train 5.544888 (lr=1.1497e-05) (hash(x)=43227400) +44800 val loss 5.7765 +44800 val perplexity 322.6223 +44800 train 5.524536 (lr=8.6797e-06) (hash(x)=51962160) +48100 val loss 5.6498 +48100 val perplexity 284.2397 +48100 train 5.601079 (lr=5.1614e-06) (hash(x)=46500465) +46000 val loss 5.9362 +46000 val perplexity 378.5059 +46000 train 5.994588 (lr=1.1425e-05) (hash(x)=58308887) +44900 val loss 5.7783 +44900 val perplexity 323.2074 +44900 train 5.909980 (lr=8.6163e-06) (hash(x)=57370039) +48200 val loss 5.6477 +48200 val perplexity 283.6242 +48200 train 5.805573 (lr=5.1449e-06) (hash(x)=50888195) +46100 val loss 5.9403 +46100 val perplexity 380.0424 +46100 train 6.014759 (lr=1.1355e-05) (hash(x)=48571632) +45000 val loss 5.7785 +45000 val perplexity 323.2732 +45000 train 5.612806 (lr=8.5540e-06) (hash(x)=52648130) +48300 val loss 5.6466 +48300 val perplexity 283.3195 +48300 train 5.738935 (lr=5.1293e-06) (hash(x)=56406586) +46200 val loss 5.9401 +46200 val perplexity 379.9839 +46200 train 6.136560 (lr=1.1287e-05) (hash(x)=46368862) +48400 val loss 5.6482 +48400 val perplexity 283.7928 +48400 train 5.795070 (lr=5.1145e-06) (hash(x)=48713969) +45100 val loss 5.7983 +45100 val perplexity 329.7279 +45100 train 5.465753 (lr=8.4930e-06) (hash(x)=45155459) +46300 val loss 5.9413 +46300 val perplexity 380.4251 +46300 train 5.727779 (lr=1.1220e-05) (hash(x)=52820317) +48500 val loss 5.6467 +48500 val perplexity 283.3447 +48500 train 5.069698 (lr=5.1007e-06) (hash(x)=37691696) +45200 val loss 5.7693 +45200 val perplexity 320.3024 +45200 train 5.753223 (lr=8.4331e-06) (hash(x)=52304992) +46400 val loss 5.9405 +46400 val perplexity 380.1421 +46400 train 5.747329 (lr=1.1155e-05) (hash(x)=51871487) +48600 val loss 5.6464 +48600 val perplexity 283.2689 +48600 train 5.927204 (lr=5.0877e-06) (hash(x)=48712321) +45300 val loss 5.7679 +45300 val perplexity 319.8522 +45300 train 5.609525 (lr=8.3745e-06) (hash(x)=46456407) +46500 val loss 5.9401 +46500 val perplexity 379.9685 +46500 train 5.852152 (lr=1.1092e-05) (hash(x)=47867973) +48700 val loss 5.6474 +48700 val perplexity 283.5566 +48700 train 6.110847 (lr=5.0756e-06) (hash(x)=63513096) +45400 val loss 5.7669 +45400 val perplexity 319.5305 +45400 train 5.797219 (lr=8.3170e-06) (hash(x)=48325493) +46600 val loss 5.9380 +46600 val perplexity 379.1804 +46600 train 5.637182 (lr=1.1031e-05) (hash(x)=50685713) +48800 val loss 5.6492 +48800 val perplexity 284.0618 +48800 train 5.278167 (lr=5.0644e-06) (hash(x)=44917721) +45500 val loss 5.7654 +45500 val perplexity 319.0598 +45500 train 5.707479 (lr=8.2607e-06) (hash(x)=48869064) +46700 val loss 5.9392 +46700 val perplexity 379.6454 +46700 train 6.003179 (lr=1.0972e-05) (hash(x)=50975526) +48900 val loss 5.6466 +48900 val perplexity 283.3356 +48900 train 5.843465 (lr=5.0542e-06) (hash(x)=51426559) +45600 val loss 5.7651 +45600 val perplexity 318.9851 +45600 train 5.814144 (lr=8.2057e-06) (hash(x)=46077805) +46800 val loss 5.9401 +46800 val perplexity 379.9679 +46800 train 6.097919 (lr=1.0914e-05) (hash(x)=51012129) +49000 val loss 5.6482 +49000 val perplexity 283.7913 +49000 train 5.569285 (lr=5.0448e-06) (hash(x)=47909690) +45700 val loss 5.7646 +45700 val perplexity 318.8216 +45700 train 6.004718 (lr=8.1518e-06) (hash(x)=56831165) +49100 val loss 5.6483 +49100 val perplexity 283.7994 +49100 train 5.552581 (lr=5.0363e-06) (hash(x)=48329226) +46900 val loss 5.9402 +46900 val perplexity 379.9924 +46900 train 5.683467 (lr=1.0858e-05) (hash(x)=51205353) +45800 val loss 5.7632 +45800 val perplexity 318.3785 +45800 train 5.622146 (lr=8.0992e-06) (hash(x)=50124921) +49200 val loss 5.6466 +49200 val perplexity 283.3242 +49200 train 5.642118 (lr=5.0286e-06) (hash(x)=49512472) +47000 val loss 5.9412 +47000 val perplexity 380.4094 +47000 train 6.105960 (lr=1.0803e-05) (hash(x)=48939661) +45900 val loss 5.7642 +45900 val perplexity 318.6901 +45900 train 5.378597 (lr=8.0478e-06) (hash(x)=43227400) +49300 val loss 5.6480 +49300 val perplexity 283.7257 +49300 train 5.494261 (lr=5.0219e-06) (hash(x)=46541146) +47100 val loss 5.9431 +47100 val perplexity 381.1316 +47100 train 5.854105 (lr=1.0751e-05) (hash(x)=51002557) +46000 val loss 5.7633 +46000 val perplexity 318.3893 +46000 train 5.783821 (lr=7.9976e-06) (hash(x)=58308887) +49400 val loss 5.6487 +49400 val perplexity 283.9217 +49400 train 5.598648 (lr=5.0161e-06) (hash(x)=52192792) +47200 val loss 5.9447 +47200 val perplexity 381.7328 +47200 train 5.774392 (lr=1.0700e-05) (hash(x)=43422209) +46100 val loss 5.7640 +46100 val perplexity 318.6330 +46100 train 5.840580 (lr=7.9485e-06) (hash(x)=48571632) +49500 val loss 5.6485 +49500 val perplexity 283.8550 +49500 train 5.445575 (lr=5.0112e-06) (hash(x)=48954331) +47300 val loss 5.9450 +47300 val perplexity 381.8438 +47300 train 5.744865 (lr=1.0651e-05) (hash(x)=45870552) +46200 val loss 5.7643 +46200 val perplexity 318.7055 +46200 train 5.995966 (lr=7.9008e-06) (hash(x)=46368862) +49600 val loss 5.6469 +49600 val perplexity 283.4171 +49600 train 5.788623 (lr=5.0072e-06) (hash(x)=50364098) +47400 val loss 5.9466 +47400 val perplexity 382.4550 +47400 train 5.783464 (lr=1.0604e-05) (hash(x)=47001810) +49700 val loss 5.6466 +49700 val perplexity 283.3178 +49700 train 5.585721 (lr=5.0040e-06) (hash(x)=52016774) +46300 val loss 5.7649 +46300 val perplexity 318.9004 +46300 train 5.536500 (lr=7.8542e-06) (hash(x)=52820317) +47500 val loss 5.9407 +47500 val perplexity 380.1974 +47500 train 5.977067 (lr=1.0558e-05) (hash(x)=49146619) +49800 val loss 5.6472 +49800 val perplexity 283.5085 +49800 train 5.868614 (lr=5.0018e-06) (hash(x)=54182957) +46400 val loss 5.7660 +46400 val perplexity 319.2433 +46400 train 5.581118 (lr=7.8088e-06) (hash(x)=51871487) +47600 val loss 5.9357 +47600 val perplexity 378.3157 +47600 train 6.003555 (lr=1.0515e-05) (hash(x)=53742853) +49900 val loss 5.6429 +49900 val perplexity 282.2744 +49900 train 5.382163 (lr=5.0004e-06) (hash(x)=48188126) +46500 val loss 5.7640 +46500 val perplexity 318.6158 +46500 train 5.695992 (lr=7.7647e-06) (hash(x)=47867973) +47700 val loss 5.9322 +47700 val perplexity 376.9758 +47700 train 6.159915 (lr=1.0473e-05) (hash(x)=53019458) +49999 val loss 5.6555 +49999 val perplexity 285.8727 +46600 val loss 5.7643 +46600 val perplexity 318.7117 +46600 train 5.425659 (lr=7.7218e-06) (hash(x)=50685713) +47800 val loss 5.9301 +47800 val perplexity 376.1934 +47800 train 6.034805 (lr=1.0433e-05) (hash(x)=54497914) +46700 val loss 5.7648 +46700 val perplexity 318.8823 +46700 train 5.842339 (lr=7.6801e-06) (hash(x)=50975526) +47900 val loss 5.9309 +47900 val perplexity 376.4895 +47900 train 5.879211 (lr=1.0394e-05) (hash(x)=47109012) +46800 val loss 5.7642 +46800 val perplexity 318.6724 +46800 train 5.917613 (lr=7.6397e-06) (hash(x)=51012129) +48000 val loss 5.9266 +48000 val perplexity 374.8613 +48000 train 5.889581 (lr=1.0358e-05) (hash(x)=48429816) +46900 val loss 5.7674 +46900 val perplexity 319.7078 +46900 train 5.489128 (lr=7.6004e-06) (hash(x)=51205353) +48100 val loss 5.9293 +48100 val perplexity 375.9086 +48100 train 5.853606 (lr=1.0323e-05) (hash(x)=46500465) +47000 val loss 5.7699 +47000 val perplexity 320.5186 +47000 train 5.939128 (lr=7.5624e-06) (hash(x)=48939661) +48200 val loss 5.9295 +48200 val perplexity 375.9542 +48200 train 6.055623 (lr=1.0290e-05) (hash(x)=50888195) +47100 val loss 5.7678 +47100 val perplexity 319.8304 +47100 train 5.677411 (lr=7.5257e-06) (hash(x)=51002557) +48300 val loss 5.9283 +48300 val perplexity 375.4982 +48300 train 6.020633 (lr=1.0259e-05) (hash(x)=56406586) +47200 val loss 5.7671 +47200 val perplexity 319.5965 +47200 train 5.593568 (lr=7.4901e-06) (hash(x)=43422209) +48400 val loss 5.9298 +48400 val perplexity 376.0822 +48400 train 6.088981 (lr=1.0229e-05) (hash(x)=48713969) +47300 val loss 5.7669 +47300 val perplexity 319.5301 +47300 train 5.541608 (lr=7.4558e-06) (hash(x)=45870552) +48500 val loss 5.9281 +48500 val perplexity 375.4431 +48500 train 5.326134 (lr=1.0201e-05) (hash(x)=37691696) +47400 val loss 5.7710 +47400 val perplexity 320.8619 +47400 train 5.604455 (lr=7.4228e-06) (hash(x)=47001810) +48600 val loss 5.9278 +48600 val perplexity 375.3410 +48600 train 6.172936 (lr=1.0175e-05) (hash(x)=48712321) +47500 val loss 5.7651 +47500 val perplexity 318.9667 +47500 train 5.814198 (lr=7.3909e-06) (hash(x)=49146619) +48700 val loss 5.9286 +48700 val perplexity 375.6250 +48700 train 6.377697 (lr=1.0151e-05) (hash(x)=63513096) +47600 val loss 5.7612 +47600 val perplexity 317.7333 +47600 train 5.818495 (lr=7.3603e-06) (hash(x)=53742853) +48800 val loss 5.9311 +48800 val perplexity 376.5676 +48800 train 5.570303 (lr=1.0129e-05) (hash(x)=44917721) +47700 val loss 5.7605 +47700 val perplexity 317.5016 +47700 train 6.006634 (lr=7.3310e-06) (hash(x)=53019458) +48900 val loss 5.9293 +48900 val perplexity 375.8988 +48900 train 6.126920 (lr=1.0108e-05) (hash(x)=51426559) +47800 val loss 5.7595 +47800 val perplexity 317.1902 +47800 train 5.871322 (lr=7.3029e-06) (hash(x)=54497914) +49000 val loss 5.9285 +49000 val perplexity 375.5811 +49000 train 5.768394 (lr=1.0090e-05) (hash(x)=47909690) +47900 val loss 5.7629 +47900 val perplexity 318.2778 +47900 train 5.701595 (lr=7.2760e-06) (hash(x)=47109012) +49100 val loss 5.9258 +49100 val perplexity 374.5785 +49100 train 5.823383 (lr=1.0073e-05) (hash(x)=48329226) +48000 val loss 5.7561 +48000 val perplexity 316.1074 +48000 train 5.713926 (lr=7.2504e-06) (hash(x)=48429816) +48100 val loss 5.7567 +48100 val perplexity 316.3069 +48100 train 5.693079 (lr=7.2260e-06) (hash(x)=46500465) +49200 val loss 5.9261 +49200 val perplexity 374.6937 +49200 train 5.971701 (lr=1.0057e-05) (hash(x)=49512472) +48200 val loss 5.7536 +48200 val perplexity 315.3319 +48200 train 5.890446 (lr=7.2029e-06) (hash(x)=50888195) +49300 val loss 5.9344 +49300 val perplexity 377.7967 +49300 train 5.797986 (lr=1.0044e-05) (hash(x)=46541146) +48300 val loss 5.7533 +48300 val perplexity 315.2209 +48300 train 5.808914 (lr=7.1810e-06) (hash(x)=56406586) +49400 val loss 5.9355 +49400 val perplexity 378.2379 +49400 train 5.881021 (lr=1.0032e-05) (hash(x)=52192792) +48400 val loss 5.7552 +48400 val perplexity 315.8377 +48400 train 5.928476 (lr=7.1603e-06) (hash(x)=48713969) +49500 val loss 5.9340 +49500 val perplexity 377.6542 +49500 train 5.735154 (lr=1.0022e-05) (hash(x)=48954331) +48500 val loss 5.7553 +48500 val perplexity 315.8630 +48500 train 5.155114 (lr=7.1409e-06) (hash(x)=37691696) +49600 val loss 5.9327 +49600 val perplexity 377.1858 +49600 train 6.038009 (lr=1.0014e-05) (hash(x)=50364098) +48600 val loss 5.7542 +48600 val perplexity 315.5130 +49700 val loss 5.9341 +49700 val perplexity 377.6830 +48600 train 6.018201 (lr=7.1228e-06) (hash(x)=48712321) +49700 train 5.892002 (lr=1.0008e-05) (hash(x)=52016774) +49800 val loss 5.9311 +49800 val perplexity 376.5593 +49800 train 6.161837 (lr=1.0004e-05) (hash(x)=54182957) +48700 val loss 5.7538 +48700 val perplexity 315.3769 +48700 train 6.203762 (lr=7.1059e-06) (hash(x)=63513096) +49900 val loss 5.9245 +49900 val perplexity 374.1044 +49900 train 5.682785 (lr=1.0001e-05) (hash(x)=48188126) +48800 val loss 5.7556 +48800 val perplexity 315.9489 +48800 train 5.374390 (lr=7.0902e-06) (hash(x)=44917721) +49999 val loss 5.9311 +49999 val perplexity 376.5791 +48900 val loss 5.7547 +48900 val perplexity 315.6626 +48900 train 5.954336 (lr=7.0758e-06) (hash(x)=51426559) +49000 val loss 5.7534 +49000 val perplexity 315.2458 +49000 train 5.633802 (lr=7.0627e-06) (hash(x)=47909690) +49100 val loss 5.7549 +49100 val perplexity 315.7364 +49100 train 5.643294 (lr=7.0508e-06) (hash(x)=48329226) diff --git a/attention_kindselective_n_heads2_seed1339/model_02500.pt b/attention_kindselective_n_heads2_seed1339/model_02500.pt index 65a3de3f435128f32413196ee9dac2cc58fda69f..2687680560f797e52a4e47bec5102b37aa354782 100644 --- a/attention_kindselective_n_heads2_seed1339/model_02500.pt +++ b/attention_kindselective_n_heads2_seed1339/model_02500.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae274553779f4440ebea69b957bc31051896ebee2d3258e12a570a1b866cbeeb +oid sha256:ec939bef019e61f342e17ba3503c4bf65bfe6303a9e2d62dfd86a7108da33b00 size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_05000.pt b/attention_kindselective_n_heads2_seed1339/model_05000.pt index b08dd9bd2418667495c5ba67500135e7b5771c07..0208b3c7e9c8abb3e4000c6744c0c47654a76851 100644 --- a/attention_kindselective_n_heads2_seed1339/model_05000.pt +++ b/attention_kindselective_n_heads2_seed1339/model_05000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26b8ade9b792c88be8ed413ee2fb84a1876a3caa9a0ff9735161613f62be9b07 +oid sha256:f451876f7ed5e17c6216a25fec544d0e23461840ad67e246c232446e66453118 size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_07500.pt b/attention_kindselective_n_heads2_seed1339/model_07500.pt index 814a1a0019092dd39a4c5f3f587580c87e067792..ebe9ac9e8375a1e5d71e243ecba95d8351b4ec82 100644 --- a/attention_kindselective_n_heads2_seed1339/model_07500.pt +++ b/attention_kindselective_n_heads2_seed1339/model_07500.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:108b45aea49deba3ee65399b9c6563aa2a3e5b3c9f4ff861fe0933f553ca579a +oid sha256:52af93b438949f7aa1c0defcb678b4a9e60b699ccda4ebe02a5bf5863592b8fb size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_10000.pt b/attention_kindselective_n_heads2_seed1339/model_10000.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a02b04c1e67ecce2acae381f2819f3b084809e6 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/model_10000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72715c48c5979bc815e2911f650ad1df34c891b95320414c05fa2f8c62f4892e +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_12500.pt b/attention_kindselective_n_heads2_seed1339/model_12500.pt new file mode 100644 index 0000000000000000000000000000000000000000..125985cbf44341771763caecdb54ccad637ef659 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/model_12500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8daee871fa259b17e7bb507dbca1a4a0c46050d36e2d22fcddcf051ba237c05b +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_15000.pt b/attention_kindselective_n_heads2_seed1339/model_15000.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3362797747e4f3c63a83338381b83213b2299de --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/model_15000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:831538c825a36765051bc3e92044dfe489e974bf1b611305d268563bf3da0e4b +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_17500.pt b/attention_kindselective_n_heads2_seed1339/model_17500.pt new file mode 100644 index 0000000000000000000000000000000000000000..058958c465de534b6fe86eedca8ce41261e3cbdc --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/model_17500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15dde448197e2e5be9d84b499f1c2c7ef8729c56441466846e4ec4cc582c9747 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_20000.pt b/attention_kindselective_n_heads2_seed1339/model_20000.pt new file mode 100644 index 0000000000000000000000000000000000000000..94432a4ca45d67bdd5f61521bd6bdaeb31d01fe4 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/model_20000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d1ce7d0cadb9a74ddc34b1b5e8d31e73d3acf19f55f2f683378cef3305b94ea +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_22500.pt b/attention_kindselective_n_heads2_seed1339/model_22500.pt new file mode 100644 index 0000000000000000000000000000000000000000..97ee914b653eea4a489068c5641915bdb89471a8 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/model_22500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4300eea058c78a0ea7587e8579ccf305b8e83a642fbab60906608679a9a96337 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_25000.pt b/attention_kindselective_n_heads2_seed1339/model_25000.pt new file mode 100644 index 0000000000000000000000000000000000000000..62a71e3b910fba12db4db95dd3d49fd1c0fd9285 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/model_25000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46ec7afae3f954aba8fe80ef16053fc9e81d36bf11367f92fca68f1306ec91c4 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_27500.pt b/attention_kindselective_n_heads2_seed1339/model_27500.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cd59238eb7d62c75966c0e541ecabd16b284606 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/model_27500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4eb0798f9a08e2eaa208930c2f5d6a247d5a22977cccf3854fa6d924aa5233d8 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_30000.pt b/attention_kindselective_n_heads2_seed1339/model_30000.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b143ea3ec207f9c54a1acb230b0e8b4fc7275a9 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/model_30000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bf2773cb4a8d6cf15d347949ab6e74030cc2a303a0ce9060a7ec15fb28cccd3 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_32500.pt b/attention_kindselective_n_heads2_seed1339/model_32500.pt new file mode 100644 index 0000000000000000000000000000000000000000..27e5750fdebde4b93a57e6dbb03a1b0618cc8f79 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/model_32500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:600a25cf308a06ac34d3fa59589719a046e1a9fff9b4e38b4dc8acd257277d6f +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_35000.pt b/attention_kindselective_n_heads2_seed1339/model_35000.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f2a51d6363a7bf81f28a7c49af6eacd1b503657 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/model_35000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcb186ed3757e433175187791cbb6c0f72d167aeee05149212810f52b9404614 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_37500.pt b/attention_kindselective_n_heads2_seed1339/model_37500.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c9a6d3ababd780d22fc4342ef58ef1e023001ec --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/model_37500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2670a6d5cf36b2c82baee292d84c90f6d4d003ecca3b63b39f709c8fe145d4a8 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_40000.pt b/attention_kindselective_n_heads2_seed1339/model_40000.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c79ced66fd7499e300ef7bac905141c6f8085c8 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/model_40000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77ecd814874df2cf2985ac6a4b912373d8fef7527b0532a32bb1e35f769b29ba +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_42500.pt b/attention_kindselective_n_heads2_seed1339/model_42500.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b64b7cd5e0058da024e828b84a7b24556804220 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/model_42500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cff43b924e124c5ac366b235d7ac6e68f5947481e0aecf7c392347e3f191f25d +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_45000.pt b/attention_kindselective_n_heads2_seed1339/model_45000.pt new file mode 100644 index 0000000000000000000000000000000000000000..b14ed8c18cc905d853feb2f9c592a9e60b0f6520 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/model_45000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dced948ffec2e10bf6d76fc439e31dd706d03eea8a67796d223af4f129a9a629 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_47500.pt b/attention_kindselective_n_heads2_seed1339/model_47500.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e2cde3513acd1855746d8e7c55127ded927f1fe --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/model_47500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cf20eeb2525dd12e58ccec8138c6fd4986598cd0f29c6dd7c6872ed0de456e7 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/model_49999.pt b/attention_kindselective_n_heads2_seed1339/model_49999.pt new file mode 100644 index 0000000000000000000000000000000000000000..da13a56364469f6ac8d982cadc96587f0d5c5f3d --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/model_49999.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3fd87ea6f915a7bb79bcc5c9c118ea2e3765ef92588d83288f7cf387299b8b1 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_02500.pt b/attention_kindselective_n_heads2_seed1339/optimizer_02500.pt index 1d7e4997dc494494858af2a0437133efeb8e5e90..5a8c8b78fae72b391ae30e2ef56380d5cf5b69f8 100644 --- a/attention_kindselective_n_heads2_seed1339/optimizer_02500.pt +++ b/attention_kindselective_n_heads2_seed1339/optimizer_02500.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ad72c0567bd0d1a62805941203c2f524d3cfdbbf3ba0188f4072436dcfbd8f9 +oid sha256:d26186ccd460cced82ce7ca326f311f33278bcbac26f41b2fdc161e3fb59abb0 size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_05000.pt b/attention_kindselective_n_heads2_seed1339/optimizer_05000.pt index 57a1ba14957f36da9cc0ea4ad028b1a2ee7a4e9d..58032a00a601ff00b60f483ad8470ecff19ab111 100644 --- a/attention_kindselective_n_heads2_seed1339/optimizer_05000.pt +++ b/attention_kindselective_n_heads2_seed1339/optimizer_05000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:226f99cb1aed7be3f245b40195adb0bb7cb364f7e3d93a2695839d21bf5a5a6c +oid sha256:ed3fc54c8ee18e4f474381576347fc57055812599bc0ff5cc5994b52476840ea size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_07500.pt b/attention_kindselective_n_heads2_seed1339/optimizer_07500.pt index fc24c76dbd928038afa0b056b88dd69b43aba78a..e1dfef0312bfbad7afe88309b0459ddcfe5117c2 100644 --- a/attention_kindselective_n_heads2_seed1339/optimizer_07500.pt +++ b/attention_kindselective_n_heads2_seed1339/optimizer_07500.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a228f94ffeda43d25053c8d30e2a4cc11e7c0c35911468617171f4ab849dfcc4 +oid sha256:f5de7032a5d8199a7be79d245bd8db93fe14474a812d8e1a7b8061dda18600d6 size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_10000.pt b/attention_kindselective_n_heads2_seed1339/optimizer_10000.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9d62346d9c7887799d16c2fe3498afec90ea8d4 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/optimizer_10000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13f03e2d0cae806dafc5545daaa600d77bc6149c18fbea1018ca2d7472ec524b +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_12500.pt b/attention_kindselective_n_heads2_seed1339/optimizer_12500.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a7f11dae9c1335e492946cf8609f33dbd001835 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/optimizer_12500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:211f9baaf5875e9b559aaf12bb2d9be4bd0cc0ee30381f61c79367ca2f08918a +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_15000.pt b/attention_kindselective_n_heads2_seed1339/optimizer_15000.pt new file mode 100644 index 0000000000000000000000000000000000000000..a36be912983c248606b47d7a1e5961bb673053a4 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/optimizer_15000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eefdf254022cc44e21811d089c21859a469ada307a3bebeb84f13399164eaebb +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_17500.pt b/attention_kindselective_n_heads2_seed1339/optimizer_17500.pt new file mode 100644 index 0000000000000000000000000000000000000000..c28b715aba327c7feab8f8f77d0827949a18e7e6 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/optimizer_17500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:351304ca8328a9d20fe1e5617b337ec9cae336b55b8243b087d15f79f1c6de64 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_20000.pt b/attention_kindselective_n_heads2_seed1339/optimizer_20000.pt new file mode 100644 index 0000000000000000000000000000000000000000..57688e6292bf36aae0e167ee434755925f779ff9 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/optimizer_20000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cad69a5ef12b38529983dc078bb197eb7a39ee0fbc4922b58d4a49f3a8d5059 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_22500.pt b/attention_kindselective_n_heads2_seed1339/optimizer_22500.pt new file mode 100644 index 0000000000000000000000000000000000000000..838c46d3eee50df04044ac62b8d79318fe2ab510 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/optimizer_22500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82b5dba4d34fe2c2239cdcb1fecad8798fad01451efea2433a6e1d7796262e96 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_25000.pt b/attention_kindselective_n_heads2_seed1339/optimizer_25000.pt new file mode 100644 index 0000000000000000000000000000000000000000..87a0cbf239b07c1b1345bfc5754da42d99f1bbf9 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/optimizer_25000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a595a3786c36ed2d4976997304490c37dc43eeb0bb09d39ab22ad53ab7e33314 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_27500.pt b/attention_kindselective_n_heads2_seed1339/optimizer_27500.pt new file mode 100644 index 0000000000000000000000000000000000000000..db2abc2aa785b2d01773c89009e8202b63900891 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/optimizer_27500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:228fc10bafb10de026310098f3892bfc05d55585b15f9470297401b6447886de +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_30000.pt b/attention_kindselective_n_heads2_seed1339/optimizer_30000.pt new file mode 100644 index 0000000000000000000000000000000000000000..3bc50516ed8e5765ea789a82079374aaa6095b62 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/optimizer_30000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8c1a20164b7f3e5e23a06aea4b645b22fdec3dc99886fc7672cb628ef59372b +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_32500.pt b/attention_kindselective_n_heads2_seed1339/optimizer_32500.pt new file mode 100644 index 0000000000000000000000000000000000000000..921467ea8fa88ef45048c6299902e9a4330c7c36 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/optimizer_32500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6294b74a3eb743b145f33f28d222e25f58463c04f326d1983db2a40c8a7ad173 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_35000.pt b/attention_kindselective_n_heads2_seed1339/optimizer_35000.pt new file mode 100644 index 0000000000000000000000000000000000000000..db10d7846883feee3766f5f1363ab6181a4a3319 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/optimizer_35000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca014fa68a69966023e3b3a8a81d7b4a41308d389b9505d21e052d448acd6ba2 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_37500.pt b/attention_kindselective_n_heads2_seed1339/optimizer_37500.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b86110ec5d4484b354a085eab2f430120b55df5 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/optimizer_37500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:014efadb4e481bd571d611637a27e975d87b17168cfac61d383808a05bef9ce1 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_40000.pt b/attention_kindselective_n_heads2_seed1339/optimizer_40000.pt new file mode 100644 index 0000000000000000000000000000000000000000..83a2d00703ece51c9689b8a34812077735865812 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/optimizer_40000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18890c7cccd4c5634713563bb2fbfffbc558d3eee3ecfd6a29325816ea150cf2 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_42500.pt b/attention_kindselective_n_heads2_seed1339/optimizer_42500.pt new file mode 100644 index 0000000000000000000000000000000000000000..3754179355ecd11e11f197084d9f1ce85dab495e --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/optimizer_42500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de5c24505e2f876ea88b1023a6a634bedf6c3659079f3c8cd338dc0ccde83637 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_45000.pt b/attention_kindselective_n_heads2_seed1339/optimizer_45000.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2fb83a685eb3d180ff5dc53a2592e8342bc35bd --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/optimizer_45000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afc534e4d9915a68cde314513e45180a2948362f08c742a3f901667801706b2d +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_47500.pt b/attention_kindselective_n_heads2_seed1339/optimizer_47500.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f6dc716b4dff13403861c1bed88bf7df7695f24 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/optimizer_47500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee5f4d36f23b8b39196b54245a851153fffc3634a3bb26a39af5ce8747a247b4 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1339/optimizer_49999.pt b/attention_kindselective_n_heads2_seed1339/optimizer_49999.pt new file mode 100644 index 0000000000000000000000000000000000000000..f02b8dc3797ccc4264871d502ca3487b18ea2be4 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1339/optimizer_49999.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56f98c4af7f7b7aa3e8ffa079a048a8b5cb5acd698a54e633db9f1ac1f0d15ca +size 70895430