diff --git a/attention_kindselective_n_heads2_seed1340/args.json b/attention_kindselective_n_heads2_seed1340/args.json index bedd11d4fc6aa6d3ca8a5a66f58869ea46e39631..486439fdcb6d55a60fded08488769c0112891b42 100644 --- a/attention_kindselective_n_heads2_seed1340/args.json +++ b/attention_kindselective_n_heads2_seed1340/args.json @@ -1 +1 @@ -{"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_6/attention_kindselective_n_heads2_seed1340", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 2, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_6", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1340, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 0.00015, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "15e-5_10240_2_1340", "n_embd": 128} \ No newline at end of file +{"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_7/attention_kindselective_n_heads2_seed1340", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 2, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 50000, "warmup_steps": 200, "group": "wider_is_better_7", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1340, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 0.0001, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "10e-5_10240_2_1340", "n_embd": 128} \ No newline at end of file diff --git a/attention_kindselective_n_heads2_seed1340/dataloader_10000.pt b/attention_kindselective_n_heads2_seed1340/dataloader_10000.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1452fd777edd0ccb65d6b47710c50208c14b312 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/dataloader_10000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3858f6c832feea78a674d8c5c384061cc7d4f22cddbd0a2be6de33bc91e2c72 +size 964 diff --git a/attention_kindselective_n_heads2_seed1340/dataloader_12500.pt b/attention_kindselective_n_heads2_seed1340/dataloader_12500.pt new file mode 100644 index 0000000000000000000000000000000000000000..f63806424a0177a7f2d678c2c63138219ed021f3 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/dataloader_12500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab3779d33c2e0a7873fcd8c39402e44260740665950323ad1445480ec339965a +size 964 diff --git a/attention_kindselective_n_heads2_seed1340/dataloader_15000.pt b/attention_kindselective_n_heads2_seed1340/dataloader_15000.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ea0f5a1bfab75667c4ebb0ca01b358cdc836a54 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/dataloader_15000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:450020c7c306c73e5c07c463518ab937102d657515ea5a38da6f2e7291f20324 +size 964 diff --git a/attention_kindselective_n_heads2_seed1340/dataloader_17500.pt b/attention_kindselective_n_heads2_seed1340/dataloader_17500.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb9392348b1209c827e3e376b05eeda80e779aa8 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/dataloader_17500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0696a655d7c8a9d7d275c7489f74a5a948ee029ac3941b045d6abaf12544a5b1 +size 964 diff --git a/attention_kindselective_n_heads2_seed1340/dataloader_20000.pt b/attention_kindselective_n_heads2_seed1340/dataloader_20000.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba32fca059aec40e4f758de96bdac6df23b9d9f5 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/dataloader_20000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4eb226bdcee777fd1ca493533704dae226c077ef79c842fc9dc59a534d5381c1 +size 964 diff --git a/attention_kindselective_n_heads2_seed1340/dataloader_22500.pt b/attention_kindselective_n_heads2_seed1340/dataloader_22500.pt new file mode 100644 index 0000000000000000000000000000000000000000..498dc444f528d893090328a5bd1e2f37da46dc12 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/dataloader_22500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88833bfd821adac4edd1dd0772083ae007c7b8d33041f66e53a679e1fa8993e0 +size 964 diff --git a/attention_kindselective_n_heads2_seed1340/dataloader_25000.pt b/attention_kindselective_n_heads2_seed1340/dataloader_25000.pt new file mode 100644 index 0000000000000000000000000000000000000000..b657bc134192d0ea956f984c289d0c682979a1f4 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/dataloader_25000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:554555a425bac43d626c36f1c81c2b0aba51eda3281dab27a9cb56b61f413354 +size 964 diff --git a/attention_kindselective_n_heads2_seed1340/dataloader_27500.pt b/attention_kindselective_n_heads2_seed1340/dataloader_27500.pt new file mode 100644 index 0000000000000000000000000000000000000000..d92d43d89390714f43db4f0782e49af0145b4a90 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/dataloader_27500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a52940b3b45414e6cdbec0dbaeba848f52d681c2daf78c269027057332d7fbd +size 964 diff --git a/attention_kindselective_n_heads2_seed1340/dataloader_30000.pt b/attention_kindselective_n_heads2_seed1340/dataloader_30000.pt new file mode 100644 index 0000000000000000000000000000000000000000..06856f53253b6c8cbefd7d595d9b9b7266b22621 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/dataloader_30000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:775519ea26122cb70d533c496bcdbbc19f759f3d096e83e98cca1dc10275fe8e +size 964 diff --git a/attention_kindselective_n_heads2_seed1340/dataloader_32500.pt b/attention_kindselective_n_heads2_seed1340/dataloader_32500.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc0b4b40e57f41aa1046b3bd2697256635160c09 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/dataloader_32500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab3422c8205fe45210246fed3dd6c317b75df02228cd8b75fba669574ce3b2d9 +size 964 diff --git a/attention_kindselective_n_heads2_seed1340/dataloader_35000.pt b/attention_kindselective_n_heads2_seed1340/dataloader_35000.pt new file mode 100644 index 0000000000000000000000000000000000000000..62647b540fa7925361626b9f8dfa3959eebb7608 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/dataloader_35000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:199ed3be67b88981f686112c97a2261729a37e0be3d4b0f4a289985a95d3cdf1 +size 964 diff --git a/attention_kindselective_n_heads2_seed1340/dataloader_37500.pt b/attention_kindselective_n_heads2_seed1340/dataloader_37500.pt new file mode 100644 index 0000000000000000000000000000000000000000..475d8b538138a8e39b76a4cf04c8eaeac074d295 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/dataloader_37500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f77178b3be9dd3f8cd03c935236251f73fde6da7948ba9feda0c888fb8912dfe +size 964 diff --git a/attention_kindselective_n_heads2_seed1340/dataloader_40000.pt b/attention_kindselective_n_heads2_seed1340/dataloader_40000.pt new file mode 100644 index 0000000000000000000000000000000000000000..50ec00ef6be330bbdb4cdf88e9a1097345da0d4d --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/dataloader_40000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84f58ba3b4a1f9be7da4e697e06782f0e1ce4d3aca49f1997087fc83aa466dd9 +size 964 diff --git a/attention_kindselective_n_heads2_seed1340/dataloader_42500.pt b/attention_kindselective_n_heads2_seed1340/dataloader_42500.pt new file mode 100644 index 0000000000000000000000000000000000000000..74e133f73a7293d5f4d6407784703c91f705d6e3 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/dataloader_42500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf6d24c78d89100d146bce9f26be940db3d71092473d9b55db97d6b35531eac2 +size 964 diff --git a/attention_kindselective_n_heads2_seed1340/dataloader_45000.pt b/attention_kindselective_n_heads2_seed1340/dataloader_45000.pt new file mode 100644 index 0000000000000000000000000000000000000000..718bc149d695b1e9498bdd0693053d7417207818 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/dataloader_45000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22bb90b43d81f3da5454f91a70e1ed29aeb2f470a727ce38390ff8a5c4924889 +size 964 diff --git a/attention_kindselective_n_heads2_seed1340/dataloader_47500.pt b/attention_kindselective_n_heads2_seed1340/dataloader_47500.pt new file mode 100644 index 0000000000000000000000000000000000000000..3eab97c7c6d4b86d90405bf1c4f3435727495da4 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/dataloader_47500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55507725e6988f190e4963078652fafa6b68e8d4f79221387612612babf3e1c1 +size 964 diff --git a/attention_kindselective_n_heads2_seed1340/dataloader_49999.pt b/attention_kindselective_n_heads2_seed1340/dataloader_49999.pt new file mode 100644 index 0000000000000000000000000000000000000000..d87f88b62a343a49411f8a6feee8f527879fcd1f --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/dataloader_49999.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47776cddb8021172f048a950b83f25b692cb340214b800ce3837c15ceb58907c +size 964 diff --git a/attention_kindselective_n_heads2_seed1340/log2.txt b/attention_kindselective_n_heads2_seed1340/log2.txt index 480b785bf77977a380b294b46e4dc53ece89b256..67909bd01780a3fac2463f518e7d2703a81257c4 100644 --- a/attention_kindselective_n_heads2_seed1340/log2.txt +++ b/attention_kindselective_n_heads2_seed1340/log2.txt @@ -1,515 +1,4469 @@ -max_steps: 10000 -3000 val loss 6.8497 -3000 val perplexity 943.5969 -3000 train 6.557367 (lr=8.3057e-05) (hash(x)=44336167) +max_steps: 50000 0 val loss 11.8201 0 val perplexity 135955.4688 -3100 val loss 6.8108 -3100 val perplexity 907.6118 -3100 train 7.108453 (lr=8.1915e-05) (hash(x)=44479330) -3200 val loss 6.8168 -3200 val perplexity 913.0430 -3200 train 6.861060 (lr=8.0745e-05) (hash(x)=54593096) -3300 val loss 6.7666 -3300 val perplexity 868.3719 -3300 train 6.689936 (lr=7.9549e-05) (hash(x)=45347643) -0 train 11.815376 (lr=7.5000e-07) (hash(x)=57791809) -3400 val loss 6.7480 -3400 val perplexity 852.3517 -3400 train 6.868625 (lr=7.8328e-05) (hash(x)=47797247) -100 val loss 9.4103 -100 val perplexity 12214.1318 -100 train 9.435302 (lr=7.5750e-05) (hash(x)=48211824) -3500 val loss 6.7277 -3500 val perplexity 835.2067 -3500 train 6.586505 (lr=7.7082e-05) (hash(x)=46115683) -200 val loss 7.8489 -200 val perplexity 2562.9167 -200 train 7.782820 (lr=1.5000e-04) (hash(x)=50375849) -3600 val loss 6.7276 -3600 val perplexity 835.1287 -3600 train 6.583769 (lr=7.5814e-05) (hash(x)=44502074) -300 val loss 7.7186 -300 val perplexity 2249.8511 -300 train 8.037295 (lr=1.4997e-04) (hash(x)=57250808) -3700 val loss 6.6958 -3700 val perplexity 808.9891 -3700 train 6.847288 (lr=7.4525e-05) (hash(x)=55388443) -3800 val loss 6.6831 -3800 val perplexity 798.8195 -3800 train 6.478742 (lr=7.3215e-05) (hash(x)=43790341) -400 val loss 7.6896 -400 val perplexity 2185.4397 -400 train 8.281481 (lr=1.4986e-04) (hash(x)=62519858) -3900 val loss 6.6852 -3900 val perplexity 800.4603 -3900 train 6.680488 (lr=7.1887e-05) (hash(x)=50013318) -500 val loss 7.6498 -500 val perplexity 2100.2070 -500 train 7.555541 (lr=1.4969e-04) (hash(x)=47226806) -600 val loss 7.6013 -600 val perplexity 2000.8110 -600 train 7.611244 (lr=1.4945e-04) (hash(x)=51149322) -4000 val loss 6.6633 -4000 val perplexity 783.1157 -4000 train 6.724937 (lr=7.0541e-05) (hash(x)=51704787) -700 val loss 7.5505 -700 val perplexity 1901.7231 -700 train 7.570621 (lr=1.4913e-04) (hash(x)=51564551) -4100 val loss 6.6502 -4100 val perplexity 772.9760 -4100 train 6.690805 (lr=6.9180e-05) (hash(x)=50821964) -800 val loss 7.4868 -800 val perplexity 1784.4075 -800 train 7.224887 (lr=1.4876e-04) (hash(x)=45093459) -4200 val loss 6.6401 -4200 val perplexity 765.1970 -4200 train 6.676926 (lr=6.7804e-05) (hash(x)=49675080) -900 val loss 7.4101 -900 val perplexity 1652.5939 -900 train 7.676486 (lr=1.4831e-04) (hash(x)=54988361) -4300 val loss 6.6300 -4300 val perplexity 757.5169 -4300 train 6.297640 (lr=6.6414e-05) (hash(x)=43239281) -1000 val loss 7.3711 -1000 val perplexity 1589.3545 -1000 train 7.263645 (lr=1.4779e-04) (hash(x)=47588648) -4400 val loss 6.6187 -4400 val perplexity 748.9337 -4400 train 6.297700 (lr=6.5013e-05) (hash(x)=45076737) -1100 val loss 7.3459 -1100 val perplexity 1549.8197 -1100 train 6.974550 (lr=1.4721e-04) (hash(x)=37984588) -4500 val loss 6.6069 -4500 val perplexity 740.1642 -4500 train 6.723946 (lr=6.3602e-05) (hash(x)=57930262) -1200 val loss 7.2713 -1200 val perplexity 1438.3854 -1200 train 7.423686 (lr=1.4656e-04) (hash(x)=56333817) -4600 val loss 6.5956 -4600 val perplexity 731.8596 -4600 train 6.346147 (lr=6.2182e-05) (hash(x)=46721614) -1300 val loss 7.2478 -1300 val perplexity 1405.0525 -1300 train 7.408738 (lr=1.4585e-04) (hash(x)=53454056) -4700 val loss 6.5840 -4700 val perplexity 723.4270 -4700 train 6.413177 (lr=6.0754e-05) (hash(x)=49837920) -1400 val loss 7.1724 -1400 val perplexity 1302.9978 -1400 train 7.307054 (lr=1.4507e-04) (hash(x)=55284163) -4800 val loss 6.5830 -4800 val perplexity 722.7350 -4800 train 6.747669 (lr=5.9321e-05) (hash(x)=48380045) -1500 val loss 7.1053 -1500 val perplexity 1218.3500 -1500 train 6.960086 (lr=1.4422e-04) (hash(x)=48162598) -4900 val loss 6.5705 -4900 val perplexity 713.7289 -4900 train 6.455478 (lr=5.7883e-05) (hash(x)=44202577) -1600 val loss 7.0936 -1600 val perplexity 1204.2488 -1600 train 7.176346 (lr=1.4332e-04) (hash(x)=54214535) -5000 val loss 6.5548 -5000 val perplexity 702.6354 -5000 train 6.531763 (lr=5.6442e-05) (hash(x)=52038024) -1700 val loss 7.0477 -1700 val perplexity 1150.2354 -1700 train 7.288129 (lr=1.4235e-04) (hash(x)=53525003) -5100 val loss 6.5472 -5100 val perplexity 697.2672 -5100 train 6.682254 (lr=5.5000e-05) (hash(x)=53700038) -1800 val loss 6.9886 -1800 val perplexity 1084.1967 -1800 train 7.055763 (lr=1.4131e-04) (hash(x)=51848994) -5200 val loss 6.5449 -5200 val perplexity 695.6867 -5200 train 6.514779 (lr=5.3558e-05) (hash(x)=48137625) -1900 val loss 6.9741 -1900 val perplexity 1068.6400 -1900 train 6.808705 (lr=1.4022e-04) (hash(x)=48405987) -5300 val loss 6.5362 -5300 val perplexity 689.6800 -5300 train 6.367496 (lr=5.2117e-05) (hash(x)=43161573) -2000 val loss 6.9172 -2000 val perplexity 1009.4669 -2000 train 7.280006 (lr=1.3907e-04) (hash(x)=58592291) -2100 val loss 6.9460 -2100 val perplexity 1038.9731 -2100 train 7.032774 (lr=1.3786e-04) (hash(x)=51167081) -5400 val loss 6.5369 -5400 val perplexity 690.1711 -5400 train 6.705731 (lr=5.0679e-05) (hash(x)=56673322) -2200 val loss 6.8788 -2200 val perplexity 971.4705 -2200 train 6.855141 (lr=1.3660e-04) (hash(x)=47994988) -5500 val loss 6.5188 -5500 val perplexity 677.7745 -5500 train 6.729006 (lr=4.9246e-05) (hash(x)=53468295) -2300 val loss 6.9196 -2300 val perplexity 1011.8920 -2300 train 6.912843 (lr=1.3527e-04) (hash(x)=47377604) -5600 val loss 6.5010 -5600 val perplexity 665.8105 -5600 train 6.796459 (lr=4.7818e-05) (hash(x)=59287280) -2400 val loss 6.8704 -2400 val perplexity 963.3668 -2400 train 6.928603 (lr=1.3390e-04) (hash(x)=53554323) -5700 val loss 6.5016 -5700 val perplexity 666.2183 -5700 train 6.710692 (lr=4.6398e-05) (hash(x)=57575806) -2500 val loss 6.8641 -2500 val perplexity 957.2761 -2500 train 6.931783 (lr=1.3247e-04) (hash(x)=50780417) -5800 val loss 6.4968 -5800 val perplexity 663.0387 -5800 train 6.534732 (lr=4.4987e-05) (hash(x)=46897279) -2600 val loss 6.8701 -2600 val perplexity 963.0816 -2600 train 6.758279 (lr=1.3099e-04) (hash(x)=46453562) -5900 val loss 6.4894 -5900 val perplexity 658.1490 -5900 train 6.375528 (lr=4.3586e-05) (hash(x)=47565679) -2700 val loss 6.8107 -2700 val perplexity 907.5153 -2700 train 6.781026 (lr=1.2946e-04) (hash(x)=54404221) -6000 val loss 6.4841 -6000 val perplexity 654.6342 -6000 train 6.333650 (lr=4.2196e-05) (hash(x)=51590090) -2800 val loss 6.7924 -2800 val perplexity 891.0690 -2800 train 7.439435 (lr=1.2788e-04) (hash(x)=59318895) -6100 val loss 6.4780 -6100 val perplexity 650.6495 -6100 train 6.915969 (lr=4.0820e-05) (hash(x)=59732271) -2900 val loss 6.7875 -2900 val perplexity 886.6631 -2900 train 6.780449 (lr=1.2626e-04) (hash(x)=47845760) -6200 val loss 6.4670 -6200 val perplexity 643.5385 -6200 train 6.530252 (lr=3.9459e-05) (hash(x)=46394422) -3000 val loss 6.7641 -3000 val perplexity 866.1562 -3000 train 6.461198 (lr=1.2459e-04) (hash(x)=44336167) -6300 val loss 6.4697 -6300 val perplexity 645.2667 -6300 train 6.481519 (lr=3.8113e-05) (hash(x)=53748145) -3100 val loss 6.7478 -3100 val perplexity 852.1684 -3100 train 7.055303 (lr=1.2287e-04) (hash(x)=44479330) -6400 val loss 6.4581 -6400 val perplexity 637.8579 -6400 train 6.312898 (lr=3.6785e-05) (hash(x)=46054751) -3200 val loss 6.7569 -3200 val perplexity 859.9979 -3200 train 6.806598 (lr=1.2112e-04) (hash(x)=54593096) -6500 val loss 6.4519 -6500 val perplexity 633.8904 -6500 train 6.736262 (lr=3.5475e-05) (hash(x)=51816809) -3300 val loss 6.7278 -3300 val perplexity 835.3075 -3300 train 6.684790 (lr=1.1932e-04) (hash(x)=45347643) -6600 val loss 6.4588 -6600 val perplexity 638.2906 -6600 train 6.274892 (lr=3.4186e-05) (hash(x)=52453336) -3400 val loss 6.7000 -3400 val perplexity 812.3824 -3400 train 6.819838 (lr=1.1749e-04) (hash(x)=47797247) -6700 val loss 6.4472 -6700 val perplexity 630.9039 -6700 train 6.439846 (lr=3.2918e-05) (hash(x)=49108775) -3500 val loss 6.6888 -3500 val perplexity 803.3629 -3500 train 6.543953 (lr=1.1562e-04) (hash(x)=46115683) -6800 val loss 6.4345 -6800 val perplexity 622.9869 -6800 train 6.340245 (lr=3.1672e-05) (hash(x)=46745396) -3600 val loss 6.6714 -3600 val perplexity 789.5205 -3600 train 6.545329 (lr=1.1372e-04) (hash(x)=44502074) -6900 val loss 6.4267 -6900 val perplexity 618.1205 -6900 train 6.602197 (lr=3.0451e-05) (hash(x)=46534986) -3700 val loss 6.6690 -3700 val perplexity 787.6163 -3700 train 6.809144 (lr=1.1179e-04) (hash(x)=55388443) -7000 val loss 6.4191 -7000 val perplexity 613.4761 -7000 train 6.785058 (lr=2.9255e-05) (hash(x)=49317888) -3800 val loss 6.6656 -3800 val perplexity 784.8956 -3800 train 6.453290 (lr=1.0982e-04) (hash(x)=43790341) -7100 val loss 6.4132 -7100 val perplexity 609.8173 -7100 train 6.441988 (lr=2.8085e-05) (hash(x)=50360484) -3900 val loss 6.6528 -3900 val perplexity 774.9131 -3900 train 6.637487 (lr=1.0783e-04) (hash(x)=50013318) -7200 val loss 6.4056 -7200 val perplexity 605.1969 -7200 train 6.250135 (lr=2.6943e-05) (hash(x)=49515094) -4000 val loss 6.6511 -4000 val perplexity 773.6136 -4000 train 6.725606 (lr=1.0581e-04) (hash(x)=51704787) -7300 val loss 6.4160 -7300 val perplexity 611.5779 -7300 train 6.397116 (lr=2.5830e-05) (hash(x)=51546861) -4100 val loss 6.6422 -4100 val perplexity 766.7709 -4100 train 6.687906 (lr=1.0377e-04) (hash(x)=50821964) -7400 val loss 6.4147 -7400 val perplexity 610.7546 -7400 train 6.393087 (lr=2.4746e-05) (hash(x)=48320948) -4200 val loss 6.6313 -4200 val perplexity 758.4469 -4200 train 6.676696 (lr=1.0171e-04) (hash(x)=49675080) -7500 val loss 6.4144 -7500 val perplexity 610.5697 -7500 train 6.344778 (lr=2.3694e-05) (hash(x)=40167457) -4300 val loss 6.6157 -4300 val perplexity 746.7346 -4300 train 6.275900 (lr=9.9622e-05) (hash(x)=43239281) -7600 val loss 6.3956 -7600 val perplexity 599.2078 -7600 train 6.340053 (lr=2.2674e-05) (hash(x)=49942165) -4400 val loss 6.6233 -4400 val perplexity 752.4334 -4400 train 6.317001 (lr=9.7520e-05) (hash(x)=45076737) -7700 val loss 6.3912 -7700 val perplexity 596.5556 -7700 train 6.109773 (lr=2.1687e-05) (hash(x)=48853311) -4500 val loss 6.6173 -4500 val perplexity 747.8881 -4500 train 6.730391 (lr=9.5403e-05) (hash(x)=57930262) -7800 val loss 6.3869 -7800 val perplexity 594.0394 -7800 train 6.283017 (lr=2.0735e-05) (hash(x)=48510117) -4600 val loss 6.5940 -4600 val perplexity 730.6702 -4600 train 6.354017 (lr=9.3273e-05) (hash(x)=46721614) -7900 val loss 6.3907 -7900 val perplexity 596.3019 -7900 train 6.387419 (lr=1.9818e-05) (hash(x)=48339781) -4700 val loss 6.5861 -4700 val perplexity 724.9485 -4700 train 6.410762 (lr=9.1132e-05) (hash(x)=49837920) -8000 val loss 6.3852 -8000 val perplexity 593.0266 -8000 train 6.477322 (lr=1.8936e-05) (hash(x)=54927320) -4800 val loss 6.5926 -4800 val perplexity 729.6981 -4800 train 6.733562 (lr=8.8982e-05) (hash(x)=48380045) -8100 val loss 6.3803 -8100 val perplexity 590.0946 -8100 train 6.103539 (lr=1.8092e-05) (hash(x)=46461786) -4900 val loss 6.5911 -4900 val perplexity 728.6015 -4900 train 6.470277 (lr=8.6825e-05) (hash(x)=44202577) -8200 val loss 6.3766 -8200 val perplexity 587.9359 -8200 train 6.291387 (lr=1.7286e-05) (hash(x)=51536260) -5000 val loss 6.5647 -5000 val perplexity 709.6234 -5000 train 6.568296 (lr=8.4663e-05) (hash(x)=52038024) -8300 val loss 6.3755 -8300 val perplexity 587.3049 -8300 train 6.118826 (lr=1.6519e-05) (hash(x)=44770722) -5100 val loss 6.5647 -5100 val perplexity 709.6227 -5100 train 6.695176 (lr=8.2500e-05) (hash(x)=53700038) -8400 val loss 6.3739 -8400 val perplexity 586.3613 -8400 train 6.378792 (lr=1.5791e-05) (hash(x)=50104957) -5200 val loss 6.5524 -5200 val perplexity 700.9124 -5200 train 6.519544 (lr=8.0337e-05) (hash(x)=48137625) -8500 val loss 6.3685 -8500 val perplexity 583.1942 -8500 train 6.459326 (lr=1.5103e-05) (hash(x)=50132971) -5300 val loss 6.5438 -5300 val perplexity 694.8887 -5300 train 6.366642 (lr=7.8175e-05) (hash(x)=43161573) -8600 val loss 6.3651 -8600 val perplexity 581.2270 -8600 train 6.323270 (lr=1.4456e-05) (hash(x)=52193699) -5400 val loss 6.5453 -5400 val perplexity 695.9648 -5400 train 6.720807 (lr=7.6018e-05) (hash(x)=56673322) -8700 val loss 6.3602 -8700 val perplexity 578.3399 -8700 train 6.344991 (lr=1.3851e-05) (hash(x)=47902319) -5500 val loss 6.5354 -5500 val perplexity 689.1136 -5500 train 6.723743 (lr=7.3868e-05) (hash(x)=53468295) -8800 val loss 6.3528 -8800 val perplexity 574.0699 -8800 train 6.642093 (lr=1.3289e-05) (hash(x)=54904230) -5600 val loss 6.5390 -5600 val perplexity 691.6022 -5600 train 6.854773 (lr=7.1727e-05) (hash(x)=59287280) -8900 val loss 6.3538 -8900 val perplexity 574.6966 -8900 train 6.230078 (lr=1.2769e-05) (hash(x)=46311615) -5700 val loss 6.5331 -5700 val perplexity 687.5286 -5700 train 6.746007 (lr=6.9597e-05) (hash(x)=57575806) -9000 val loss 6.3527 -9000 val perplexity 574.0275 -9000 train 6.200503 (lr=1.2292e-05) (hash(x)=48535188) -5800 val loss 6.5352 -5800 val perplexity 688.9601 -5800 train 6.542769 (lr=6.7480e-05) (hash(x)=46897279) -9100 val loss 6.3477 -9100 val perplexity 571.1558 -9100 train 6.375886 (lr=1.1860e-05) (hash(x)=51757372) -5900 val loss 6.5230 -5900 val perplexity 680.6183 -5900 train 6.415868 (lr=6.5378e-05) (hash(x)=47565679) -9200 val loss 6.3460 -9200 val perplexity 570.2044 -9200 train 6.207469 (lr=1.1472e-05) (hash(x)=51131708) -6000 val loss 6.5250 -6000 val perplexity 682.0081 -6000 train 6.372841 (lr=6.3294e-05) (hash(x)=51590090) -9300 val loss 6.3423 -9300 val perplexity 568.0868 -9300 train 6.299280 (lr=1.1128e-05) (hash(x)=44784276) -6100 val loss 6.5130 -6100 val perplexity 673.8539 -6100 train 6.930476 (lr=6.1230e-05) (hash(x)=59732271) -9400 val loss 6.3432 -9400 val perplexity 568.6069 -9400 train 6.450517 (lr=1.0830e-05) (hash(x)=51981169) -6200 val loss 6.5090 -6200 val perplexity 671.1452 -9500 val loss 6.3407 -9500 val perplexity 567.1857 -6200 train 6.571561 (lr=5.9188e-05) (hash(x)=46394422) -9500 train 6.312754 (lr=1.0577e-05) (hash(x)=47232936) -9600 val loss 6.3410 -9600 val perplexity 567.3699 -9600 train 6.355495 (lr=1.0369e-05) (hash(x)=53800450) -6300 val loss 6.5214 -6300 val perplexity 679.5251 -6300 train 6.526929 (lr=5.7169e-05) (hash(x)=53748145) -9700 val loss 6.3406 -9700 val perplexity 567.1387 -9700 train 6.445551 (lr=1.0208e-05) (hash(x)=55768123) -6400 val loss 6.5170 -6400 val perplexity 676.5420 -6400 train 6.375550 (lr=5.5177e-05) (hash(x)=46054751) -9800 val loss 6.3420 -9800 val perplexity 567.9086 -9800 train 6.281539 (lr=1.0092e-05) (hash(x)=47745177) -6500 val loss 6.5060 -6500 val perplexity 669.1499 -6500 train 6.733078 (lr=5.3213e-05) (hash(x)=51816809) -9900 val loss 6.3431 -9900 val perplexity 568.5347 -9900 train 6.613006 (lr=1.0023e-05) (hash(x)=56592246) -6600 val loss 6.4993 -6600 val perplexity 664.6857 -6600 train 6.381551 (lr=5.1279e-05) (hash(x)=52453336) -9999 val loss 6.3466 -9999 val perplexity 570.5632 -6700 val loss 6.5058 -6700 val perplexity 669.0009 -6700 train 6.497029 (lr=4.9377e-05) (hash(x)=49108775) -6800 val loss 6.4920 -6800 val perplexity 659.8412 -6800 train 6.398888 (lr=4.7509e-05) (hash(x)=46745396) -6900 val loss 6.4899 -6900 val perplexity 658.4554 -6900 train 6.663884 (lr=4.5676e-05) (hash(x)=46534986) -7000 val loss 6.4768 -7000 val perplexity 649.8740 -7000 train 6.840258 (lr=4.3882e-05) (hash(x)=49317888) -7100 val loss 6.4738 -7100 val perplexity 647.9160 -7100 train 6.493526 (lr=4.2128e-05) (hash(x)=50360484) -7200 val loss 6.4698 -7200 val perplexity 645.3282 -7200 train 6.339218 (lr=4.0414e-05) (hash(x)=49515094) -7300 val loss 6.4730 -7300 val perplexity 647.4099 -7300 train 6.455276 (lr=3.8745e-05) (hash(x)=51546861) -7400 val loss 6.4622 -7400 val perplexity 640.4369 -7400 train 6.428791 (lr=3.7120e-05) (hash(x)=48320948) -7500 val loss 6.4678 -7500 val perplexity 644.0776 -7500 train 6.414234 (lr=3.5541e-05) (hash(x)=40167457) -7600 val loss 6.4528 -7600 val perplexity 634.4956 -7600 train 6.400621 (lr=3.4011e-05) (hash(x)=49942165) -7700 val loss 6.4515 -7700 val perplexity 633.6224 -7700 train 6.178792 (lr=3.2531e-05) (hash(x)=48853311) -7800 val loss 6.4469 -7800 val perplexity 630.7529 -7800 train 6.332662 (lr=3.1102e-05) (hash(x)=48510117) -7900 val loss 6.4452 -7900 val perplexity 629.6969 -7900 train 6.454893 (lr=2.9726e-05) (hash(x)=48339781) -8000 val loss 6.4485 -8000 val perplexity 631.7830 -8000 train 6.561584 (lr=2.8405e-05) (hash(x)=54927320) -8100 val loss 6.4404 -8100 val perplexity 626.6378 -8100 train 6.167403 (lr=2.7138e-05) (hash(x)=46461786) -8200 val loss 6.4388 -8200 val perplexity 625.6752 -8200 train 6.347755 (lr=2.5929e-05) (hash(x)=51536260) -8300 val loss 6.4365 -8300 val perplexity 624.2048 -8300 train 6.190253 (lr=2.4778e-05) (hash(x)=44770722) -8400 val loss 6.4290 -8400 val perplexity 619.5564 -8400 train 6.434593 (lr=2.3686e-05) (hash(x)=50104957) -8500 val loss 6.4245 -8500 val perplexity 616.7615 -8500 train 6.533162 (lr=2.2655e-05) (hash(x)=50132971) -8600 val loss 6.4212 -8600 val perplexity 614.7573 -8600 train 6.378406 (lr=2.1685e-05) (hash(x)=52193699) -8700 val loss 6.4183 -8700 val perplexity 612.9755 -8700 train 6.427931 (lr=2.0777e-05) (hash(x)=47902319) -8800 val loss 6.4139 -8800 val perplexity 610.2626 -8800 train 6.709031 (lr=1.9933e-05) (hash(x)=54904230) -8900 val loss 6.4051 -8900 val perplexity 604.9390 -8900 train 6.288378 (lr=1.9153e-05) (hash(x)=46311615) -9000 val loss 6.4113 -9000 val perplexity 608.7029 -9000 train 6.270270 (lr=1.8439e-05) (hash(x)=48535188) -9100 val loss 6.4026 -9100 val perplexity 603.3902 -9100 train 6.419470 (lr=1.7790e-05) (hash(x)=51757372) -9200 val loss 6.4024 -9200 val perplexity 603.2843 -9200 train 6.288178 (lr=1.7208e-05) (hash(x)=51131708) -9300 val loss 6.4021 -9300 val perplexity 603.1321 -9300 train 6.361725 (lr=1.6692e-05) (hash(x)=44784276) -9400 val loss 6.4006 -9400 val perplexity 602.2206 -9400 train 6.529187 (lr=1.6245e-05) (hash(x)=51981169) -9500 val loss 6.3956 -9500 val perplexity 599.1955 -9500 train 6.358074 (lr=1.5865e-05) (hash(x)=47232936) -9600 val loss 6.3954 -9600 val perplexity 599.1069 -9600 train 6.410747 (lr=1.5554e-05) (hash(x)=53800450) -9700 val loss 6.3937 -9700 val perplexity 598.0549 -9700 train 6.500488 (lr=1.5312e-05) (hash(x)=55768123) -9800 val loss 6.3977 -9800 val perplexity 600.4852 -9800 train 6.321241 (lr=1.5139e-05) (hash(x)=47745177) -9900 val loss 6.3977 -9900 val perplexity 600.4746 -9900 train 6.664391 (lr=1.5035e-05) (hash(x)=56592246) -9999 val loss 6.4009 -9999 val perplexity 602.3967 +0 val loss 11.8201 +0 val perplexity 135955.4688 +0 val loss 11.8201 +0 val perplexity 135955.4688 +0 train 11.815376 (lr=3.5000e-07) (hash(x)=57791809) +0 train 11.815376 (lr=2.5000e-07) (hash(x)=57791809) +0 train 11.815376 (lr=5.0000e-07) (hash(x)=57791809) +100 val loss 10.0061 +100 val perplexity 22160.5547 +100 train 10.031336 (lr=3.5350e-05) (hash(x)=48211824) +100 val loss 10.1395 +100 val perplexity 25324.7344 +100 train 10.163652 (lr=2.5250e-05) (hash(x)=48211824) +100 val loss 9.7609 +100 val perplexity 17342.8867 +100 train 9.788915 (lr=5.0500e-05) (hash(x)=48211824) +200 val loss 8.7280 +200 val perplexity 6173.5259 +200 train 8.680338 (lr=7.0000e-05) (hash(x)=50375849) +200 val loss 8.9196 +200 val perplexity 7477.1943 +200 train 8.881001 (lr=5.0000e-05) (hash(x)=50375849) +200 val loss 7.9945 +200 val perplexity 2964.6406 +200 train 7.936388 (lr=1.0000e-04) (hash(x)=50375849) +300 val loss 7.9588 +300 val perplexity 2860.6704 +300 train 8.242803 (lr=6.9999e-05) (hash(x)=57250808) +300 val loss 8.0427 +300 val perplexity 3111.0901 +300 train 8.321117 (lr=5.0000e-05) (hash(x)=57250808) +300 val loss 7.6873 +300 val perplexity 2180.5630 +300 train 8.001595 (lr=9.9999e-05) (hash(x)=57250808) +400 val loss 7.7226 +400 val perplexity 2258.8765 +400 train 8.318707 (lr=6.9997e-05) (hash(x)=62519858) +400 val loss 7.7574 +400 val perplexity 2338.7456 +400 train 8.329361 (lr=4.9998e-05) (hash(x)=62519858) +400 val loss 7.6750 +400 val perplexity 2153.8877 +400 train 8.293846 (lr=9.9996e-05) (hash(x)=62519858) +500 val loss 7.6695 +500 val perplexity 2142.0701 +500 train 7.568311 (lr=6.9994e-05) (hash(x)=47226806) +500 val loss 7.6585 +500 val perplexity 2118.6331 +500 train 7.559607 (lr=4.9996e-05) (hash(x)=47226806) +500 val loss 7.6635 +500 val perplexity 2129.2957 +500 train 7.589068 (lr=9.9992e-05) (hash(x)=47226806) +600 val loss 7.6428 +600 val perplexity 2085.4954 +600 train 7.671701 (lr=6.9990e-05) (hash(x)=51149322) +600 val loss 7.6290 +600 val perplexity 2057.0649 +600 train 7.657920 (lr=4.9993e-05) (hash(x)=51149322) +600 val loss 7.6835 +600 val perplexity 2172.2947 +600 train 7.712746 (lr=9.9986e-05) (hash(x)=51149322) +700 val loss 7.6430 +700 val perplexity 2085.8882 +700 train 7.653198 (lr=6.9984e-05) (hash(x)=51564551) +700 val loss 7.6192 +700 val perplexity 2036.8857 +700 train 7.630872 (lr=4.9989e-05) (hash(x)=51564551) +700 val loss 7.6578 +700 val perplexity 2117.1748 +700 train 7.676233 (lr=9.9978e-05) (hash(x)=51564551) +800 val loss 7.6202 +800 val perplexity 2039.0431 +800 train 7.373461 (lr=6.9977e-05) (hash(x)=45093459) +800 val loss 7.6100 +800 val perplexity 2018.2389 +800 train 7.380928 (lr=4.9984e-05) (hash(x)=45093459) +800 val loss 7.6409 +800 val perplexity 2081.5652 +800 train 7.411017 (lr=9.9968e-05) (hash(x)=45093459) +900 val loss 7.6041 +900 val perplexity 2006.4135 +900 train 7.915620 (lr=6.9969e-05) (hash(x)=54988361) +900 val loss 7.5990 +900 val perplexity 1996.2872 +900 train 7.901796 (lr=4.9978e-05) (hash(x)=54988361) +900 val loss 7.6248 +900 val perplexity 2048.3447 +900 train 7.913666 (lr=9.9956e-05) (hash(x)=54988361) +1000 val loss 7.6032 +1000 val perplexity 2004.5659 +1000 train 7.487446 (lr=6.9960e-05) (hash(x)=47588648) +1000 val loss 7.6414 +1000 val perplexity 2082.6899 +1000 train 7.527693 (lr=9.9943e-05) (hash(x)=47588648) +1000 val loss 7.5895 +1000 val perplexity 1977.3726 +1000 train 7.466284 (lr=4.9971e-05) (hash(x)=47588648) +1100 val loss 7.5861 +1100 val perplexity 1970.5438 +1100 train 7.207222 (lr=6.9949e-05) (hash(x)=37984588) +1100 val loss 7.6216 +1100 val perplexity 2041.8978 +1100 train 7.275620 (lr=9.9927e-05) (hash(x)=37984588) +1100 val loss 7.5907 +1100 val perplexity 1979.6897 +1100 train 7.223131 (lr=4.9964e-05) (hash(x)=37984588) +1200 val loss 7.5671 +1200 val perplexity 1933.5526 +1200 train 7.731050 (lr=6.9937e-05) (hash(x)=56333817) +1200 val loss 7.5809 +1200 val perplexity 1960.4640 +1200 train 7.750664 (lr=9.9910e-05) (hash(x)=56333817) +1200 val loss 7.5857 +1200 val perplexity 1969.8665 +1200 train 7.749602 (lr=4.9955e-05) (hash(x)=56333817) +1300 val loss 7.5572 +1300 val perplexity 1914.3943 +1300 train 7.702303 (lr=6.9924e-05) (hash(x)=53454056) +1300 val loss 7.5620 +1300 val perplexity 1923.5961 +1300 train 7.703107 (lr=9.9892e-05) (hash(x)=53454056) +1400 val loss 7.5422 +1400 val perplexity 1885.9550 +1400 train 7.643173 (lr=6.9910e-05) (hash(x)=55284163) +1300 val loss 7.5975 +1300 val perplexity 1993.1512 +1300 train 7.718252 (lr=4.9946e-05) (hash(x)=53454056) +1400 val loss 7.5226 +1400 val perplexity 1849.4447 +1400 train 7.625996 (lr=9.9871e-05) (hash(x)=55284163) +1500 val loss 7.6245 +1500 val perplexity 2047.8584 +1500 train 7.548000 (lr=6.9894e-05) (hash(x)=48162598) +1400 val loss 7.5680 +1400 val perplexity 1935.2434 +1400 train 7.663744 (lr=4.9936e-05) (hash(x)=55284163) +1600 val loss 7.5211 +1600 val perplexity 1846.5297 +1600 train 7.567415 (lr=6.9877e-05) (hash(x)=54214535) +1500 val loss 7.4830 +1500 val perplexity 1777.5719 +1500 train 7.413181 (lr=9.9849e-05) (hash(x)=48162598) +1500 val loss 7.5403 +1500 val perplexity 1882.3864 +1500 train 7.457651 (lr=4.9924e-05) (hash(x)=48162598) +1700 val loss 7.4984 +1700 val perplexity 1805.1927 +1700 train 7.708467 (lr=6.9859e-05) (hash(x)=53525003) +1600 val loss 7.4618 +1600 val perplexity 1740.1932 +1600 train 7.523397 (lr=9.9825e-05) (hash(x)=54214535) +1600 val loss 7.5234 +1600 val perplexity 1850.8969 +1600 train 7.582265 (lr=4.9912e-05) (hash(x)=54214535) +1800 val loss 7.4766 +1800 val perplexity 1766.2341 +1800 train 7.561733 (lr=6.9840e-05) (hash(x)=51848994) +1700 val loss 7.4384 +1700 val perplexity 1700.0876 +1700 train 7.670763 (lr=9.9799e-05) (hash(x)=53525003) +1700 val loss 7.5229 +1700 val perplexity 1849.8566 +1700 train 7.723702 (lr=4.9899e-05) (hash(x)=53525003) +1900 val loss 7.4697 +1900 val perplexity 1754.1061 +1900 train 7.271915 (lr=6.9819e-05) (hash(x)=48405987) +1800 val loss 7.4013 +1800 val perplexity 1638.1406 +1800 train 7.474629 (lr=9.9771e-05) (hash(x)=51848994) +1800 val loss 7.5197 +1800 val perplexity 1843.9948 +1800 train 7.608754 (lr=4.9885e-05) (hash(x)=51848994) +2000 val loss 7.4516 +2000 val perplexity 1722.6387 +2000 train 7.840192 (lr=6.9797e-05) (hash(x)=58592291) +1900 val loss 7.3939 +1900 val perplexity 1626.0569 +1900 train 7.207140 (lr=9.9741e-05) (hash(x)=48405987) +1900 val loss 7.5030 +1900 val perplexity 1813.5634 +1900 train 7.312572 (lr=4.9871e-05) (hash(x)=48405987) +2100 val loss 7.4566 +2100 val perplexity 1731.1819 +2100 train 7.538568 (lr=6.9774e-05) (hash(x)=51167081) +2000 val loss 7.3727 +2000 val perplexity 1591.9402 +2000 train 7.721725 (lr=9.9710e-05) (hash(x)=58592291) +2000 val loss 7.5025 +2000 val perplexity 1812.5009 +2000 train 7.861420 (lr=4.9855e-05) (hash(x)=58592291) +2200 val loss 7.4264 +2200 val perplexity 1679.8331 +2200 train 7.429912 (lr=6.9750e-05) (hash(x)=47994988) +2100 val loss 7.3837 +2100 val perplexity 1609.5760 +2100 train 7.464706 (lr=9.9677e-05) (hash(x)=51167081) +2100 val loss 7.4977 +2100 val perplexity 1803.8229 +2100 train 7.601411 (lr=4.9839e-05) (hash(x)=51167081) +2300 val loss 7.4027 +2300 val perplexity 1640.4669 +2300 train 7.390494 (lr=6.9724e-05) (hash(x)=47377604) +2200 val loss 7.4287 +2200 val perplexity 1683.6567 +2200 train 7.427091 (lr=9.9642e-05) (hash(x)=47994988) +2400 val loss 7.3508 +2400 val perplexity 1557.4562 +2400 train 7.401909 (lr=6.9697e-05) (hash(x)=53554323) +2200 val loss 7.4798 +2200 val perplexity 1771.9459 +2200 train 7.475770 (lr=4.9821e-05) (hash(x)=47994988) +2300 val loss 7.3702 +2300 val perplexity 1587.9591 +2300 train 7.350887 (lr=9.9606e-05) (hash(x)=47377604) +2500 val loss 7.3187 +2500 val perplexity 1508.2208 +2500 train 7.323335 (lr=6.9669e-05) (hash(x)=50780417) +2300 val loss 7.4838 +2300 val perplexity 1778.9948 +2300 train 7.455084 (lr=4.9803e-05) (hash(x)=47377604) +2400 val loss 7.3521 +2400 val perplexity 1559.5281 +2400 train 7.410884 (lr=9.9567e-05) (hash(x)=53554323) +2600 val loss 7.2853 +2600 val perplexity 1458.7247 +2600 train 7.181382 (lr=6.9640e-05) (hash(x)=46453562) +2400 val loss 7.4971 +2400 val perplexity 1802.8408 +2400 train 7.552158 (lr=4.9784e-05) (hash(x)=53554323) +2500 val loss 7.3333 +2500 val perplexity 1530.3920 +2500 train 7.320897 (lr=9.9527e-05) (hash(x)=50780417) +2700 val loss 7.2758 +2700 val perplexity 1444.8698 +2700 train 7.258449 (lr=6.9609e-05) (hash(x)=54404221) +2500 val loss 7.4722 +2500 val perplexity 1758.5254 +2500 train 7.460775 (lr=4.9764e-05) (hash(x)=50780417) +2600 val loss 7.3253 +2600 val perplexity 1518.1760 +2600 train 7.212368 (lr=9.9485e-05) (hash(x)=46453562) +2800 val loss 7.2339 +2800 val perplexity 1385.6417 +2800 train 7.944000 (lr=6.9577e-05) (hash(x)=59318895) +2600 val loss 7.4753 +2600 val perplexity 1763.9120 +2600 train 7.385237 (lr=4.9743e-05) (hash(x)=46453562) +2900 val loss 7.2038 +2900 val perplexity 1344.5312 +2900 train 7.183782 (lr=6.9544e-05) (hash(x)=47845760) +2700 val loss 7.3421 +2700 val perplexity 1543.9019 +2700 train 7.332971 (lr=9.9442e-05) (hash(x)=54404221) +2700 val loss 7.4581 +2700 val perplexity 1733.8107 +2700 train 7.486804 (lr=4.9721e-05) (hash(x)=54404221) +3000 val loss 7.1747 +3000 val perplexity 1306.0240 +3000 train 6.879607 (lr=6.9510e-05) (hash(x)=44336167) +2800 val loss 7.3176 +2800 val perplexity 1506.5914 +2800 train 8.045705 (lr=9.9396e-05) (hash(x)=59318895) +2800 val loss 7.4447 +2800 val perplexity 1710.8124 +2800 train 8.284585 (lr=4.9698e-05) (hash(x)=59318895) +3100 val loss 7.1436 +3100 val perplexity 1265.9867 +3100 train 7.444237 (lr=6.9474e-05) (hash(x)=44479330) +2900 val loss 7.2990 +2900 val perplexity 1478.8462 +2900 train 7.271435 (lr=9.9349e-05) (hash(x)=47845760) +2900 val loss 7.4506 +2900 val perplexity 1720.9794 +2900 train 7.431831 (lr=4.9674e-05) (hash(x)=47845760) +3200 val loss 7.1320 +3200 val perplexity 1251.3336 +3200 train 7.191150 (lr=6.9438e-05) (hash(x)=54593096) +3000 val loss 7.2992 +3000 val perplexity 1479.0916 +3000 train 6.994728 (lr=9.9300e-05) (hash(x)=44336167) +3300 val loss 7.0997 +3300 val perplexity 1211.6255 +3300 train 7.011525 (lr=6.9400e-05) (hash(x)=45347643) +3000 val loss 7.4237 +3000 val perplexity 1675.1554 +3000 train 7.138683 (lr=4.9650e-05) (hash(x)=44336167) +3100 val loss 7.2724 +3100 val perplexity 1440.0378 +3100 train 7.530640 (lr=9.9249e-05) (hash(x)=44479330) +3400 val loss 7.0938 +3400 val perplexity 1204.4785 +3400 train 7.215239 (lr=6.9360e-05) (hash(x)=47797247) +3100 val loss 7.4045 +3100 val perplexity 1643.3512 +3100 train 7.655307 (lr=4.9625e-05) (hash(x)=44479330) +3200 val loss 7.2596 +3200 val perplexity 1421.6920 +3200 train 7.307819 (lr=9.9197e-05) (hash(x)=54593096) +3500 val loss 7.0682 +3500 val perplexity 1174.0056 +3500 train 6.962046 (lr=6.9320e-05) (hash(x)=46115683) +3200 val loss 7.4067 +3200 val perplexity 1647.0352 +3200 train 7.450519 (lr=4.9598e-05) (hash(x)=54593096) +3300 val loss 7.2790 +3300 val perplexity 1449.4850 +3300 train 7.129585 (lr=9.9142e-05) (hash(x)=45347643) +3600 val loss 7.0492 +3600 val perplexity 1151.9397 +3600 train 6.904495 (lr=6.9278e-05) (hash(x)=44502074) +3300 val loss 7.3963 +3300 val perplexity 1629.9904 +3300 train 7.239784 (lr=4.9571e-05) (hash(x)=45347643) +3400 val loss 7.2493 +3400 val perplexity 1407.1310 +3400 train 7.375264 (lr=9.9086e-05) (hash(x)=47797247) +3700 val loss 7.0325 +3700 val perplexity 1132.8109 +3700 train 7.175766 (lr=6.9235e-05) (hash(x)=55388443) +3400 val loss 7.3565 +3400 val perplexity 1566.4130 +3400 train 7.464308 (lr=4.9543e-05) (hash(x)=47797247) +3500 val loss 7.2425 +3500 val perplexity 1397.5653 +3500 train 7.154598 (lr=9.9028e-05) (hash(x)=46115683) +3800 val loss 7.0013 +3800 val perplexity 1098.0376 +3800 train 6.810500 (lr=6.9191e-05) (hash(x)=43790341) +3500 val loss 7.3431 +3500 val perplexity 1545.4553 +3500 train 7.265201 (lr=4.9514e-05) (hash(x)=46115683) +3600 val loss 7.2292 +3600 val perplexity 1379.1554 +3600 train 7.053658 (lr=9.8969e-05) (hash(x)=44502074) +3900 val loss 6.9691 +3900 val perplexity 1063.2583 +3900 train 6.973560 (lr=6.9146e-05) (hash(x)=50013318) +3600 val loss 7.3240 +3600 val perplexity 1516.3240 +3600 train 7.149223 (lr=4.9484e-05) (hash(x)=44502074) +3700 val loss 7.2192 +3700 val perplexity 1365.4557 +3700 train 7.358182 (lr=9.8908e-05) (hash(x)=55388443) +4000 val loss 6.9317 +4000 val perplexity 1024.2280 +4000 train 7.021192 (lr=6.9099e-05) (hash(x)=51704787) +3700 val loss 7.3146 +3700 val perplexity 1502.0967 +3700 train 7.437807 (lr=4.9454e-05) (hash(x)=55388443) +3800 val loss 7.2244 +3800 val perplexity 1372.5228 +3800 train 7.005445 (lr=9.8845e-05) (hash(x)=43790341) +4100 val loss 6.9325 +4100 val perplexity 1025.0312 +4100 train 7.015868 (lr=6.9051e-05) (hash(x)=50821964) +3800 val loss 7.2936 +3800 val perplexity 1470.9061 +3800 train 7.100006 (lr=4.9422e-05) (hash(x)=43790341) +3900 val loss 7.1979 +3900 val perplexity 1336.5901 +3900 train 7.187948 (lr=9.8780e-05) (hash(x)=50013318) +4200 val loss 6.9185 +4200 val perplexity 1010.7939 +4200 train 6.944781 (lr=6.9002e-05) (hash(x)=49675080) +3900 val loss 7.2722 +3900 val perplexity 1439.6870 +3900 train 7.297838 (lr=4.9390e-05) (hash(x)=50013318) +4000 val loss 7.1747 +4000 val perplexity 1305.9094 +4000 train 7.256429 (lr=9.8713e-05) (hash(x)=51704787) +4300 val loss 6.9167 +4300 val perplexity 1008.9371 +4300 train 6.592006 (lr=6.8952e-05) (hash(x)=43239281) +4400 val loss 6.8773 +4400 val perplexity 969.9754 +4400 train 6.568204 (lr=6.8901e-05) (hash(x)=45076737) +4000 val loss 7.2549 +4000 val perplexity 1414.9628 +4000 train 7.339958 (lr=4.9357e-05) (hash(x)=51704787) +4100 val loss 7.1546 +4100 val perplexity 1280.0354 +4100 train 7.218249 (lr=9.8645e-05) (hash(x)=50821964) +4500 val loss 6.8609 +4500 val perplexity 954.2654 +4500 train 6.980698 (lr=6.8848e-05) (hash(x)=57930262) +4200 val loss 7.1594 +4200 val perplexity 1286.0985 +4200 train 7.182823 (lr=9.8575e-05) (hash(x)=49675080) +4100 val loss 7.2445 +4100 val perplexity 1400.3203 +4100 train 7.328505 (lr=4.9322e-05) (hash(x)=50821964) +4600 val loss 6.8439 +4600 val perplexity 938.1647 +4600 train 6.625656 (lr=6.8794e-05) (hash(x)=46721614) +4300 val loss 7.1313 +4300 val perplexity 1250.5415 +4300 train 6.837461 (lr=9.8503e-05) (hash(x)=43239281) +4200 val loss 7.2199 +4200 val perplexity 1366.3474 +4200 train 7.257685 (lr=4.9287e-05) (hash(x)=49675080) +4400 val loss 7.1284 +4400 val perplexity 1246.8333 +4400 train 6.826270 (lr=9.8430e-05) (hash(x)=45076737) +4700 val loss 6.8368 +4700 val perplexity 931.5139 +4700 train 6.672508 (lr=6.8739e-05) (hash(x)=49837920) +4300 val loss 7.2085 +4300 val perplexity 1350.9222 +4300 train 6.875511 (lr=4.9252e-05) (hash(x)=43239281) +4500 val loss 7.1382 +4500 val perplexity 1259.2046 +4500 train 7.242622 (lr=9.8355e-05) (hash(x)=57930262) +4800 val loss 6.8120 +4800 val perplexity 908.7191 +4800 train 6.964200 (lr=6.8683e-05) (hash(x)=48380045) +4400 val loss 7.1996 +4400 val perplexity 1338.9216 +4400 train 6.902917 (lr=4.9215e-05) (hash(x)=45076737) +4600 val loss 7.1429 +4600 val perplexity 1265.0376 +4600 train 6.952327 (lr=9.8278e-05) (hash(x)=46721614) +4900 val loss 6.7899 +4900 val perplexity 888.8326 +4900 train 6.666987 (lr=6.8626e-05) (hash(x)=44202577) +4500 val loss 7.1818 +4500 val perplexity 1315.2502 +4500 train 7.273689 (lr=4.9177e-05) (hash(x)=57930262) +5000 val loss 6.7547 +5000 val perplexity 858.0734 +5000 train 6.760885 (lr=6.8567e-05) (hash(x)=52038024) +4700 val loss 7.1145 +4700 val perplexity 1229.6857 +4700 train 6.983832 (lr=9.8199e-05) (hash(x)=49837920) +4600 val loss 7.1764 +4600 val perplexity 1308.2018 +4600 train 7.008025 (lr=4.9139e-05) (hash(x)=46721614) +5100 val loss 6.7286 +5100 val perplexity 835.9825 +5100 train 6.895877 (lr=6.8507e-05) (hash(x)=53700038) +4800 val loss 7.0932 +4800 val perplexity 1203.7006 +4800 train 7.200828 (lr=9.8119e-05) (hash(x)=48380045) +4700 val loss 7.1608 +4700 val perplexity 1287.9175 +4700 train 7.031097 (lr=4.9099e-05) (hash(x)=49837920) +5200 val loss 6.7278 +5200 val perplexity 835.3342 +5200 train 6.687095 (lr=6.8446e-05) (hash(x)=48137625) +4900 val loss 7.1155 +4900 val perplexity 1230.9294 +4900 train 6.997181 (lr=9.8036e-05) (hash(x)=44202577) +4800 val loss 7.1430 +4800 val perplexity 1265.2040 +4800 train 7.249168 (lr=4.9059e-05) (hash(x)=48380045) +5300 val loss 6.7074 +5300 val perplexity 818.4043 +5300 train 6.550315 (lr=6.8384e-05) (hash(x)=43161573) +5000 val loss 7.1163 +5000 val perplexity 1231.9117 +5000 train 7.120258 (lr=9.7953e-05) (hash(x)=52038024) +4900 val loss 7.1146 +4900 val perplexity 1229.7953 +4900 train 6.998655 (lr=4.9018e-05) (hash(x)=44202577) +5400 val loss 6.7170 +5400 val perplexity 826.3092 +5400 train 6.898795 (lr=6.8320e-05) (hash(x)=56673322) +5100 val loss 7.0771 +5100 val perplexity 1184.5468 +5100 train 7.241188 (lr=9.7867e-05) (hash(x)=53700038) +5000 val loss 7.0868 +5000 val perplexity 1196.0487 +5000 train 7.105583 (lr=4.8976e-05) (hash(x)=52038024) +5500 val loss 6.6953 +5500 val perplexity 808.5854 +5500 train 6.883098 (lr=6.8256e-05) (hash(x)=53468295) +5200 val loss 7.0188 +5200 val perplexity 1117.4941 +5200 train 6.979750 (lr=9.7780e-05) (hash(x)=48137625) +5100 val loss 7.0759 +5100 val perplexity 1183.0616 +5100 train 7.257845 (lr=4.8934e-05) (hash(x)=53700038) +5600 val loss 6.6755 +5600 val perplexity 792.7690 +5600 train 6.960256 (lr=6.8190e-05) (hash(x)=59287280) +5300 val loss 6.9958 +5300 val perplexity 1092.0437 +5300 train 6.847336 (lr=9.7691e-05) (hash(x)=43161573) +5200 val loss 7.0647 +5200 val perplexity 1169.9166 +5200 train 7.022595 (lr=4.8890e-05) (hash(x)=48137625) +5700 val loss 6.6592 +5700 val perplexity 779.9288 +5700 train 6.861547 (lr=6.8123e-05) (hash(x)=57575806) +5400 val loss 7.0182 +5400 val perplexity 1116.8011 +5400 train 7.199425 (lr=9.7600e-05) (hash(x)=56673322) +5300 val loss 7.0409 +5300 val perplexity 1142.4619 +5300 train 6.914318 (lr=4.8846e-05) (hash(x)=43161573) +5800 val loss 6.6521 +5800 val perplexity 774.4315 +5800 train 6.634932 (lr=6.8055e-05) (hash(x)=46897279) +5500 val loss 7.0327 +5500 val perplexity 1133.1058 +5500 train 7.204011 (lr=9.7508e-05) (hash(x)=53468295) +5400 val loss 7.0017 +5400 val perplexity 1098.5042 +5400 train 7.188062 (lr=4.8800e-05) (hash(x)=56673322) +5900 val loss 6.6694 +5900 val perplexity 787.9207 +5900 train 6.553205 (lr=6.7985e-05) (hash(x)=47565679) +5600 val loss 6.9888 +5600 val perplexity 1084.4437 +5600 train 7.272110 (lr=9.7414e-05) (hash(x)=59287280) +5500 val loss 6.9708 +5500 val perplexity 1065.1246 +5500 train 7.154151 (lr=4.8754e-05) (hash(x)=53468295) +6000 val loss 6.6493 +6000 val perplexity 772.2234 +6000 train 6.505241 (lr=6.7915e-05) (hash(x)=51590090) +5700 val loss 6.9836 +5700 val perplexity 1078.7675 +5700 train 7.219868 (lr=9.7318e-05) (hash(x)=57575806) +5600 val loss 6.9361 +5600 val perplexity 1028.7036 +5600 train 7.260452 (lr=4.8707e-05) (hash(x)=59287280) +6100 val loss 6.6288 +6100 val perplexity 756.5986 +6100 train 7.037403 (lr=6.7843e-05) (hash(x)=59732271) +5800 val loss 6.9847 +5800 val perplexity 1079.9718 +5800 train 6.926145 (lr=9.7221e-05) (hash(x)=46897279) +5700 val loss 6.9153 +5700 val perplexity 1007.5467 +5700 train 7.155314 (lr=4.8659e-05) (hash(x)=57575806) +6200 val loss 6.6045 +6200 val perplexity 738.4030 +6200 train 6.632282 (lr=6.7770e-05) (hash(x)=46394422) +5900 val loss 6.9803 +5900 val perplexity 1075.2649 +5900 train 6.846642 (lr=9.7122e-05) (hash(x)=47565679) +6300 val loss 6.6123 +6300 val perplexity 744.2033 +6300 train 6.649691 (lr=6.7696e-05) (hash(x)=53748145) +5800 val loss 6.8961 +5800 val perplexity 988.3776 +5800 train 6.821358 (lr=4.8611e-05) (hash(x)=46897279) +6000 val loss 7.0038 +6000 val perplexity 1100.8066 +6000 train 6.874036 (lr=9.7021e-05) (hash(x)=51590090) +6400 val loss 6.5895 +6400 val perplexity 727.4420 +6400 train 6.455684 (lr=6.7621e-05) (hash(x)=46054751) +6100 val loss 6.9770 +6100 val perplexity 1071.6895 +5900 val loss 6.8721 +5900 val perplexity 964.9728 +6100 train 7.369457 (lr=9.6919e-05) (hash(x)=59732271) +5900 train 6.744565 (lr=4.8561e-05) (hash(x)=47565679) +6500 val loss 6.5751 +6500 val perplexity 716.9923 +6500 train 6.855925 (lr=6.7545e-05) (hash(x)=51816809) +6200 val loss 6.9668 +6200 val perplexity 1060.8143 +6200 train 6.925545 (lr=9.6815e-05) (hash(x)=46394422) +6000 val loss 6.8457 +6000 val perplexity 939.8616 +6000 train 6.720833 (lr=4.8511e-05) (hash(x)=51590090) +6600 val loss 6.5757 +6600 val perplexity 717.4735 +6600 train 6.422357 (lr=6.7467e-05) (hash(x)=52453336) +6300 val loss 6.9703 +6300 val perplexity 1064.5681 +6300 train 6.989490 (lr=9.6709e-05) (hash(x)=53748145) +6100 val loss 6.8210 +6100 val perplexity 916.9286 +6100 train 7.224485 (lr=4.8459e-05) (hash(x)=59732271) +6700 val loss 6.5693 +6700 val perplexity 712.8660 +6700 train 6.581459 (lr=6.7389e-05) (hash(x)=49108775) +6400 val loss 6.9472 +6400 val perplexity 1040.2695 +6400 train 6.806530 (lr=9.6602e-05) (hash(x)=46054751) +6200 val loss 6.8051 +6200 val perplexity 902.4440 +6200 train 6.774230 (lr=4.8407e-05) (hash(x)=46394422) +6800 val loss 6.5485 +6800 val perplexity 698.1708 +6800 train 6.429181 (lr=6.7309e-05) (hash(x)=46745396) +6500 val loss 6.9401 +6500 val perplexity 1032.8649 +6500 train 7.142802 (lr=9.6493e-05) (hash(x)=51816809) +6300 val loss 6.7819 +6300 val perplexity 881.7365 +6300 train 6.804878 (lr=4.8355e-05) (hash(x)=53748145) +6900 val loss 6.5446 +6900 val perplexity 695.5093 +6900 train 6.687532 (lr=6.7228e-05) (hash(x)=46534986) +6600 val loss 6.9536 +6600 val perplexity 1046.8857 +6600 train 6.907654 (lr=9.6382e-05) (hash(x)=52453336) +6400 val loss 6.7609 +6400 val perplexity 863.4091 +6400 train 6.635312 (lr=4.8301e-05) (hash(x)=46054751) +7000 val loss 6.5358 +7000 val perplexity 689.4045 +7000 train 6.884590 (lr=6.7146e-05) (hash(x)=49317888) +6700 val loss 6.9180 +6700 val perplexity 1010.2961 +6700 train 6.927352 (lr=9.6270e-05) (hash(x)=49108775) +6500 val loss 6.7379 +6500 val perplexity 843.8121 +6500 train 6.975233 (lr=4.8246e-05) (hash(x)=51816809) +6800 val loss 6.9378 +6800 val perplexity 1030.5094 +6800 train 6.820474 (lr=9.6156e-05) (hash(x)=46745396) +7100 val loss 6.5310 +7100 val perplexity 686.0746 +7100 train 6.542171 (lr=6.7063e-05) (hash(x)=50360484) +6600 val loss 6.7466 +6600 val perplexity 851.1754 +6600 train 6.636971 (lr=4.8191e-05) (hash(x)=52453336) +7200 val loss 6.5334 +7200 val perplexity 687.7217 +7200 train 6.403098 (lr=6.6978e-05) (hash(x)=49515094) +6900 val loss 6.9012 +6900 val perplexity 993.4229 +6900 train 7.024705 (lr=9.6040e-05) (hash(x)=46534986) +6700 val loss 6.7190 +6700 val perplexity 828.0261 +6700 train 6.735272 (lr=4.8135e-05) (hash(x)=49108775) +7300 val loss 6.5433 +7300 val perplexity 694.5876 +7300 train 6.540457 (lr=6.6893e-05) (hash(x)=51546861) +7000 val loss 6.8997 +7000 val perplexity 991.9900 +7000 train 7.183020 (lr=9.5923e-05) (hash(x)=49317888) +6800 val loss 6.7086 +6800 val perplexity 819.4630 +6800 train 6.608204 (lr=4.8078e-05) (hash(x)=46745396) +7100 val loss 6.8746 +7100 val perplexity 967.3943 +7400 val loss 6.5318 +7400 val perplexity 686.6266 +7100 train 6.887483 (lr=9.5804e-05) (hash(x)=50360484) +7400 train 6.516788 (lr=6.6806e-05) (hash(x)=48320948) +6900 val loss 6.6809 +6900 val perplexity 797.0172 +6900 train 6.816370 (lr=4.8020e-05) (hash(x)=46534986) +7200 val loss 6.8591 +7200 val perplexity 952.5170 +7200 train 6.759748 (lr=9.5683e-05) (hash(x)=49515094) +7500 val loss 6.5409 +7500 val perplexity 692.9114 +7500 train 6.417346 (lr=6.6718e-05) (hash(x)=40167457) +7300 val loss 6.8558 +7300 val perplexity 949.3570 +7300 train 6.855737 (lr=9.5561e-05) (hash(x)=51546861) +7000 val loss 6.6739 +7000 val perplexity 791.4651 +7000 train 7.034169 (lr=4.7961e-05) (hash(x)=49317888) +7600 val loss 6.5199 +7600 val perplexity 678.4926 +7600 train 6.449652 (lr=6.6630e-05) (hash(x)=49942165) +7400 val loss 6.8292 +7400 val perplexity 924.4711 +7400 train 6.817543 (lr=9.5437e-05) (hash(x)=48320948) +7100 val loss 6.6571 +7100 val perplexity 778.2993 +7100 train 6.667387 (lr=4.7902e-05) (hash(x)=50360484) +7700 val loss 6.5375 +7700 val perplexity 690.5319 +7700 train 6.272810 (lr=6.6540e-05) (hash(x)=48853311) +7500 val loss 6.8651 +7500 val perplexity 958.2407 +7200 val loss 6.6463 +7200 val perplexity 769.8997 +7500 train 6.620294 (lr=9.5312e-05) (hash(x)=40167457) +7200 train 6.539092 (lr=4.7842e-05) (hash(x)=49515094) +7800 val loss 6.5229 +7800 val perplexity 680.5576 +7800 train 6.419036 (lr=6.6448e-05) (hash(x)=48510117) +7600 val loss 6.7919 +7600 val perplexity 890.5652 +7600 train 6.704808 (lr=9.5185e-05) (hash(x)=49942165) +7300 val loss 6.6627 +7300 val perplexity 782.6371 +7300 train 6.658652 (lr=4.7781e-05) (hash(x)=51546861) +7900 val loss 6.5182 +7900 val perplexity 677.3441 +7900 train 6.517859 (lr=6.6356e-05) (hash(x)=48339781) +7700 val loss 6.7696 +7700 val perplexity 870.9767 +7700 train 6.502357 (lr=9.5057e-05) (hash(x)=48853311) +7400 val loss 6.6366 +7400 val perplexity 762.4875 +7400 train 6.618514 (lr=4.7719e-05) (hash(x)=48320948) +8000 val loss 6.5207 +8000 val perplexity 679.0684 +8000 train 6.626083 (lr=6.6263e-05) (hash(x)=54927320) +7800 val loss 6.7588 +7800 val perplexity 861.5703 +7800 train 6.644112 (lr=9.4926e-05) (hash(x)=48510117) +7500 val loss 6.6470 +7500 val perplexity 770.4976 +7500 train 6.465994 (lr=4.7656e-05) (hash(x)=40167457) +8100 val loss 6.5098 +8100 val perplexity 671.6619 +8100 train 6.218547 (lr=6.6169e-05) (hash(x)=46461786) +7900 val loss 6.7503 +7900 val perplexity 854.2759 +7900 train 6.755628 (lr=9.4795e-05) (hash(x)=48339781) +7600 val loss 6.6118 +7600 val perplexity 743.8340 +7600 train 6.546916 (lr=4.7593e-05) (hash(x)=49942165) +8200 val loss 6.4979 +8200 val perplexity 663.7219 +8200 train 6.427443 (lr=6.6073e-05) (hash(x)=51536260) +8000 val loss 6.7511 +8000 val perplexity 855.0164 +8000 train 6.862554 (lr=9.4661e-05) (hash(x)=54927320) +7700 val loss 6.5963 +7700 val perplexity 732.3832 +7700 train 6.328202 (lr=4.7528e-05) (hash(x)=48853311) +8300 val loss 6.5058 +8300 val perplexity 669.0105 +8300 train 6.244885 (lr=6.5976e-05) (hash(x)=44770722) +8100 val loss 6.7385 +8100 val perplexity 844.2999 +8100 train 6.479411 (lr=9.4526e-05) (hash(x)=46461786) +7800 val loss 6.5929 +7800 val perplexity 729.8585 +7800 train 6.481411 (lr=4.7463e-05) (hash(x)=48510117) +8400 val loss 6.4937 +8400 val perplexity 660.9323 +8400 train 6.498044 (lr=6.5879e-05) (hash(x)=50104957) +8200 val loss 6.7219 +8200 val perplexity 830.3803 +8200 train 6.637498 (lr=9.4390e-05) (hash(x)=51536260) +7900 val loss 6.5861 +7900 val perplexity 724.9585 +7900 train 6.601991 (lr=4.7397e-05) (hash(x)=48339781) +8500 val loss 6.4829 +8500 val perplexity 653.8620 +8500 train 6.593787 (lr=6.5780e-05) (hash(x)=50132971) +8300 val loss 6.7466 +8300 val perplexity 851.1957 +8300 train 6.485762 (lr=9.4252e-05) (hash(x)=44770722) +8000 val loss 6.5696 +8000 val perplexity 713.0958 +8000 train 6.676756 (lr=4.7331e-05) (hash(x)=54927320) +8600 val loss 6.4764 +8600 val perplexity 649.6110 +8600 train 6.432298 (lr=6.5680e-05) (hash(x)=52193699) +8400 val loss 6.7239 +8400 val perplexity 832.0628 +8400 train 6.711411 (lr=9.4112e-05) (hash(x)=50104957) +8100 val loss 6.5660 +8100 val perplexity 710.5417 +8100 train 6.273369 (lr=4.7263e-05) (hash(x)=46461786) +8700 val loss 6.4616 +8700 val perplexity 640.1130 +8700 train 6.460426 (lr=6.5579e-05) (hash(x)=47902319) +8500 val loss 6.7088 +8500 val perplexity 819.5598 +8500 train 6.815518 (lr=9.3971e-05) (hash(x)=50132971) +8800 val loss 6.4731 +8800 val perplexity 647.5099 +8800 train 6.770748 (lr=6.5477e-05) (hash(x)=54904230) +8200 val loss 6.5484 +8200 val perplexity 698.1472 +8200 train 6.469982 (lr=4.7195e-05) (hash(x)=51536260) +8600 val loss 6.6916 +8600 val perplexity 805.6112 +8600 train 6.626624 (lr=9.3828e-05) (hash(x)=52193699) +8900 val loss 6.4588 +8900 val perplexity 638.2678 +8900 train 6.358573 (lr=6.5374e-05) (hash(x)=46311615) +8300 val loss 6.5529 +8300 val perplexity 701.2932 +8300 train 6.286871 (lr=4.7126e-05) (hash(x)=44770722) +8700 val loss 6.6874 +8700 val perplexity 802.2026 +8700 train 6.685539 (lr=9.3684e-05) (hash(x)=47902319) +9000 val loss 6.4628 +9000 val perplexity 640.8239 +9000 train 6.315720 (lr=6.5270e-05) (hash(x)=48535188) +8400 val loss 6.5408 +8400 val perplexity 692.8417 +8400 train 6.534705 (lr=4.7056e-05) (hash(x)=50104957) +8800 val loss 6.6797 +8800 val perplexity 796.0612 +8800 train 6.978171 (lr=9.3538e-05) (hash(x)=54904230) +9100 val loss 6.4444 +9100 val perplexity 629.1714 +9100 train 6.496108 (lr=6.5164e-05) (hash(x)=51757372) +8500 val loss 6.5281 +8500 val perplexity 684.1109 +8500 train 6.627559 (lr=4.6986e-05) (hash(x)=50132971) +8900 val loss 6.6656 +8900 val perplexity 784.9064 +8900 train 6.556992 (lr=9.3391e-05) (hash(x)=46311615) +9200 val loss 6.4398 +9200 val perplexity 626.2506 +9200 train 6.312794 (lr=6.5058e-05) (hash(x)=51131708) +8600 val loss 6.5169 +8600 val perplexity 676.5113 +8600 train 6.478631 (lr=4.6914e-05) (hash(x)=52193699) +9000 val loss 6.6869 +9000 val perplexity 801.8539 +9000 train 6.547362 (lr=9.3242e-05) (hash(x)=48535188) +9300 val loss 6.4419 +9300 val perplexity 627.5950 +9300 train 6.396623 (lr=6.4951e-05) (hash(x)=44784276) +8700 val loss 6.5056 +8700 val perplexity 668.9062 +8700 train 6.499852 (lr=4.6842e-05) (hash(x)=47902319) +9100 val loss 6.6641 +9100 val perplexity 783.7721 +9100 train 6.704998 (lr=9.3092e-05) (hash(x)=51757372) +9400 val loss 6.4374 +9400 val perplexity 624.7706 +9400 train 6.555745 (lr=6.4842e-05) (hash(x)=51981169) +8800 val loss 6.4953 +8800 val perplexity 662.0366 +8800 train 6.775081 (lr=4.6769e-05) (hash(x)=54904230) +9200 val loss 6.6609 +9200 val perplexity 781.2751 +9200 train 6.553735 (lr=9.2940e-05) (hash(x)=51131708) +9500 val loss 6.4396 +9500 val perplexity 626.1757 +9500 train 6.402099 (lr=6.4733e-05) (hash(x)=47232936) +8900 val loss 6.4968 +8900 val perplexity 663.0422 +8900 train 6.389380 (lr=4.6696e-05) (hash(x)=46311615) +9300 val loss 6.6413 +9300 val perplexity 766.1156 +9300 train 6.591171 (lr=9.2786e-05) (hash(x)=44784276) +9600 val loss 6.4422 +9600 val perplexity 627.8006 +9600 train 6.452951 (lr=6.4622e-05) (hash(x)=53800450) +9000 val loss 6.4916 +9000 val perplexity 659.5980 +9000 train 6.363636 (lr=4.6621e-05) (hash(x)=48535188) +9400 val loss 6.6425 +9400 val perplexity 767.0316 +9400 train 6.770110 (lr=9.2632e-05) (hash(x)=51981169) +9700 val loss 6.4194 +9700 val perplexity 613.6654 +9700 train 6.515686 (lr=6.4511e-05) (hash(x)=55768123) +9100 val loss 6.4601 +9100 val perplexity 639.1089 +9100 train 6.511439 (lr=4.6546e-05) (hash(x)=51757372) +9500 val loss 6.6338 +9500 val perplexity 760.3599 +9500 train 6.565683 (lr=9.2475e-05) (hash(x)=47232936) +9800 val loss 6.4353 +9800 val perplexity 623.4942 +9800 train 6.359432 (lr=6.4398e-05) (hash(x)=47745177) +9200 val loss 6.4557 +9200 val perplexity 636.2913 +9600 val loss 6.6392 +9600 val perplexity 764.5008 +9200 train 6.332733 (lr=4.6470e-05) (hash(x)=51131708) +9600 train 6.650306 (lr=9.2317e-05) (hash(x)=53800450) +9900 val loss 6.4333 +9900 val perplexity 622.2385 +9900 train 6.698323 (lr=6.4284e-05) (hash(x)=56592246) +9700 val loss 6.6322 +9700 val perplexity 759.1641 +9700 train 6.719718 (lr=9.2158e-05) (hash(x)=55768123) +9300 val loss 6.4501 +9300 val perplexity 632.7852 +9300 train 6.408199 (lr=4.6393e-05) (hash(x)=44784276) +10000 val loss 6.4301 +10000 val perplexity 620.2114 +10000 train 6.434453 (lr=6.4170e-05) (hash(x)=51655963) +9800 val loss 6.6505 +9800 val perplexity 773.1475 +9800 train 6.584711 (lr=9.1997e-05) (hash(x)=47745177) +9400 val loss 6.4461 +9400 val perplexity 630.2448 +9400 train 6.568119 (lr=4.6316e-05) (hash(x)=51981169) +10100 val loss 6.4364 +10100 val perplexity 624.1843 +10100 train 6.337934 (lr=6.4054e-05) (hash(x)=49809511) +9900 val loss 6.6443 +9900 val perplexity 768.3766 +9900 train 6.882045 (lr=9.1835e-05) (hash(x)=56592246) +9500 val loss 6.4431 +9500 val perplexity 628.3265 +9500 train 6.391317 (lr=4.6238e-05) (hash(x)=47232936) +10200 val loss 6.4607 +10200 val perplexity 639.5052 +10200 train 6.072261 (lr=6.3937e-05) (hash(x)=42297812) +10000 val loss 6.6411 +10000 val perplexity 765.9586 +10000 train 6.647980 (lr=9.1671e-05) (hash(x)=51655963) +9600 val loss 6.4569 +9600 val perplexity 637.1080 +9600 train 6.479267 (lr=4.6159e-05) (hash(x)=53800450) +10300 val loss 6.4193 +10300 val perplexity 613.5688 +10300 train 6.110044 (lr=6.3820e-05) (hash(x)=55529820) +10100 val loss 6.6212 +10100 val perplexity 750.8585 +10100 train 6.521398 (lr=9.1506e-05) (hash(x)=49809511) +10400 val loss 6.4131 +10400 val perplexity 609.7966 +10400 train 6.424881 (lr=6.3701e-05) (hash(x)=53255684) +9700 val loss 6.4372 +9700 val perplexity 624.6452 +9700 train 6.544666 (lr=4.6079e-05) (hash(x)=55768123) +10200 val loss 6.6227 +10200 val perplexity 752.0069 +10200 train 6.219979 (lr=9.1339e-05) (hash(x)=42297812) +10500 val loss 6.3948 +10500 val perplexity 598.7203 +10500 train 6.502378 (lr=6.3581e-05) (hash(x)=54306191) +9800 val loss 6.4376 +9800 val perplexity 624.9339 +9800 train 6.367727 (lr=4.5999e-05) (hash(x)=47745177) +10300 val loss 6.6043 +10300 val perplexity 738.2425 +10300 train 6.311920 (lr=9.1171e-05) (hash(x)=55529820) +10600 val loss 6.3874 +10600 val perplexity 594.2958 +10600 train 6.597528 (lr=6.3460e-05) (hash(x)=60130567) +9900 val loss 6.4565 +9900 val perplexity 636.8025 +9900 train 6.694003 (lr=4.5917e-05) (hash(x)=56592246) +10400 val loss 6.5965 +10400 val perplexity 732.5495 +10400 train 6.600775 (lr=9.1001e-05) (hash(x)=53255684) +10700 val loss 6.3920 +10700 val perplexity 597.0769 +10700 train 6.317720 (lr=6.3339e-05) (hash(x)=50074737) +10000 val loss 6.4274 +10000 val perplexity 618.5698 +10000 train 6.419580 (lr=4.5835e-05) (hash(x)=51655963) +10500 val loss 6.5842 +10500 val perplexity 723.6005 +10500 train 6.704099 (lr=9.0830e-05) (hash(x)=54306191) +10800 val loss 6.3791 +10800 val perplexity 589.3733 +10800 train 6.496333 (lr=6.3216e-05) (hash(x)=51547220) +10100 val loss 6.4270 +10100 val perplexity 618.3342 +10100 train 6.324362 (lr=4.5753e-05) (hash(x)=49809511) +10600 val loss 6.5758 +10600 val perplexity 717.4858 +10600 train 6.799292 (lr=9.0658e-05) (hash(x)=60130567) +10900 val loss 6.3685 +10900 val perplexity 583.1586 +10900 train 6.451928 (lr=6.3092e-05) (hash(x)=55943981) +10200 val loss 6.4205 +10200 val perplexity 614.3162 +10200 train 6.012050 (lr=4.5669e-05) (hash(x)=42297812) +10700 val loss 6.5728 +10700 val perplexity 715.3784 +10700 train 6.510283 (lr=9.0484e-05) (hash(x)=50074737) +11000 val loss 6.3550 +11000 val perplexity 575.3365 +11000 train 6.410864 (lr=6.2968e-05) (hash(x)=46444570) +10300 val loss 6.4132 +10300 val perplexity 609.8723 +10300 train 6.100938 (lr=4.5585e-05) (hash(x)=55529820) +10800 val loss 6.5693 +10800 val perplexity 712.9027 +10800 train 6.683527 (lr=9.0308e-05) (hash(x)=51547220) +11100 val loss 6.4048 +11100 val perplexity 604.7125 +11100 train 6.424164 (lr=6.2842e-05) (hash(x)=49589063) +10400 val loss 6.3961 +10400 val perplexity 599.5079 +10400 train 6.398651 (lr=4.5501e-05) (hash(x)=53255684) +10900 val loss 6.5989 +10900 val perplexity 734.3118 +10900 train 6.665658 (lr=9.0132e-05) (hash(x)=55943981) +11200 val loss 6.3603 +11200 val perplexity 578.4042 +11200 train 6.354611 (lr=6.2715e-05) (hash(x)=51392283) +10500 val loss 6.3786 +10500 val perplexity 589.1190 +10500 train 6.479599 (lr=4.5415e-05) (hash(x)=54306191) +11000 val loss 6.6051 +11000 val perplexity 738.8577 +11000 train 6.646755 (lr=8.9954e-05) (hash(x)=46444570) +11300 val loss 6.4280 +11300 val perplexity 618.9520 +11300 train 6.434875 (lr=6.2588e-05) (hash(x)=45081133) +10600 val loss 6.3785 +10600 val perplexity 589.0452 +10600 train 6.581085 (lr=4.5329e-05) (hash(x)=60130567) +11100 val loss 6.6082 +11100 val perplexity 741.1121 +11100 train 6.600307 (lr=8.9774e-05) (hash(x)=49589063) +11400 val loss 6.3718 +11400 val perplexity 585.1299 +11400 train 6.458348 (lr=6.2459e-05) (hash(x)=53700397) +11200 val loss 6.5779 +11200 val perplexity 719.0346 +10700 val loss 6.3648 +10700 val perplexity 581.0134 +11200 train 6.580002 (lr=8.9593e-05) (hash(x)=51392283) +10700 train 6.285759 (lr=4.5242e-05) (hash(x)=50074737) +11500 val loss 6.3571 +11500 val perplexity 576.5493 +11500 train 6.090324 (lr=6.2330e-05) (hash(x)=43839088) +11300 val loss 6.5766 +11300 val perplexity 718.0677 +11300 train 6.538782 (lr=8.9411e-05) (hash(x)=45081133) +10800 val loss 6.3522 +10800 val perplexity 573.7385 +10800 train 6.471161 (lr=4.5154e-05) (hash(x)=51547220) +11600 val loss 6.3560 +11600 val perplexity 575.9155 +11600 train 6.365404 (lr=6.2199e-05) (hash(x)=48088111) +11400 val loss 6.5712 +11400 val perplexity 714.2205 +11400 train 6.654538 (lr=8.9227e-05) (hash(x)=53700397) +10900 val loss 6.3555 +10900 val perplexity 575.6359 +10900 train 6.447626 (lr=4.5066e-05) (hash(x)=55943981) +11700 val loss 6.3418 +11700 val perplexity 567.8203 +11700 train 6.648592 (lr=6.2068e-05) (hash(x)=55108226) +11500 val loss 6.5605 +11500 val perplexity 706.6102 +11500 train 6.303116 (lr=8.9043e-05) (hash(x)=43839088) +11000 val loss 6.3386 +11000 val perplexity 565.9975 +11000 train 6.404186 (lr=4.4977e-05) (hash(x)=46444570) +11800 val loss 6.3481 +11800 val perplexity 571.4037 +11800 train 6.587350 (lr=6.1936e-05) (hash(x)=58524839) +11600 val loss 6.5759 +11600 val perplexity 717.5840 +11600 train 6.604741 (lr=8.8856e-05) (hash(x)=48088111) +11100 val loss 6.3403 +11100 val perplexity 566.9902 +11100 train 6.350871 (lr=4.4887e-05) (hash(x)=49589063) +11900 val loss 6.3405 +11900 val perplexity 567.0935 +11900 train 5.980791 (lr=6.1802e-05) (hash(x)=43864078) +11700 val loss 6.5846 +11700 val perplexity 723.8970 +11700 train 6.892133 (lr=8.8668e-05) (hash(x)=55108226) +12000 val loss 6.3421 +12000 val perplexity 567.9741 +12000 train 6.039177 (lr=6.1668e-05) (hash(x)=43448544) +11200 val loss 6.3322 +11200 val perplexity 562.4065 +11200 train 6.340265 (lr=4.4797e-05) (hash(x)=51392283) +11800 val loss 6.5801 +11800 val perplexity 720.6069 +11800 train 6.812759 (lr=8.8479e-05) (hash(x)=58524839) +12100 val loss 6.3541 +12100 val perplexity 574.8290 +12100 train 6.301033 (lr=6.1533e-05) (hash(x)=55200399) +11300 val loss 6.3448 +11300 val perplexity 569.5289 +11300 train 6.311532 (lr=4.4706e-05) (hash(x)=45081133) +11900 val loss 6.5679 +11900 val perplexity 711.8446 +11900 train 6.241637 (lr=8.8289e-05) (hash(x)=43864078) +12200 val loss 6.3503 +12200 val perplexity 572.6906 +12200 train 6.491292 (lr=6.1397e-05) (hash(x)=57627314) +11400 val loss 6.3166 +11400 val perplexity 553.6919 +11400 train 6.405518 (lr=4.4614e-05) (hash(x)=53700397) +12000 val loss 6.5646 +12000 val perplexity 709.5612 +12000 train 6.279223 (lr=8.8097e-05) (hash(x)=43448544) +12300 val loss 6.3520 +12300 val perplexity 573.6589 +12300 train 6.656027 (lr=6.1260e-05) (hash(x)=53617087) +11500 val loss 6.3162 +11500 val perplexity 553.4821 +11500 train 6.044001 (lr=4.4521e-05) (hash(x)=43839088) +12100 val loss 6.5575 +12100 val perplexity 704.4769 +12100 train 6.543485 (lr=8.7904e-05) (hash(x)=55200399) +12400 val loss 6.3432 +12400 val perplexity 568.5938 +12400 train 6.196439 (lr=6.1122e-05) (hash(x)=51135678) +11600 val loss 6.3103 +11600 val perplexity 550.1926 +11600 train 6.319879 (lr=4.4428e-05) (hash(x)=48088111) +12200 val loss 6.5644 +12200 val perplexity 709.3788 +12200 train 6.671797 (lr=8.7710e-05) (hash(x)=57627314) +12500 val loss 6.3827 +12500 val perplexity 591.5469 +12500 train 6.429889 (lr=6.0984e-05) (hash(x)=48025130) +11700 val loss 6.3133 +11700 val perplexity 551.8868 +11700 train 6.645210 (lr=4.4334e-05) (hash(x)=55108226) +12300 val loss 6.5624 +12300 val perplexity 707.9727 +12300 train 6.810684 (lr=8.7515e-05) (hash(x)=53617087) +12600 val loss 6.3387 +12600 val perplexity 566.0334 +12600 train 6.383214 (lr=6.0844e-05) (hash(x)=52135695) +12400 val loss 6.5626 +12400 val perplexity 708.1314 +12400 train 6.393559 (lr=8.7318e-05) (hash(x)=51135678) +11800 val loss 6.3220 +11800 val perplexity 556.7070 +11800 train 6.576732 (lr=4.4240e-05) (hash(x)=58524839) +12700 val loss 6.3426 +12700 val perplexity 568.2723 +12700 train 6.201974 (lr=6.0703e-05) (hash(x)=51888613) +12500 val loss 6.6048 +12500 val perplexity 738.6020 +12500 train 6.596214 (lr=8.7119e-05) (hash(x)=48025130) +11900 val loss 6.3032 +11900 val perplexity 546.3365 +11900 train 5.970202 (lr=4.4145e-05) (hash(x)=43864078) +12800 val loss 6.3370 +12800 val perplexity 565.0873 +12800 train 6.260093 (lr=6.0562e-05) (hash(x)=50418818) +12600 val loss 6.5513 +12600 val perplexity 700.1442 +12600 train 6.610987 (lr=8.6920e-05) (hash(x)=52135695) +12000 val loss 6.3029 +12000 val perplexity 546.1674 +12000 train 5.999082 (lr=4.4049e-05) (hash(x)=43448544) +12900 val loss 6.3169 +12900 val perplexity 553.8445 +12900 train 7.201547 (lr=6.0420e-05) (hash(x)=58649585) +12700 val loss 6.5578 +12700 val perplexity 704.6970 +12700 train 6.436542 (lr=8.6719e-05) (hash(x)=51888613) +12100 val loss 6.3017 +12100 val perplexity 545.4907 +12100 train 6.246994 (lr=4.3952e-05) (hash(x)=55200399) +13000 val loss 6.3034 +13000 val perplexity 546.4482 +13000 train 6.579797 (lr=6.0277e-05) (hash(x)=54567307) +12800 val loss 6.5742 +12800 val perplexity 716.3731 +12800 train 6.467278 (lr=8.6517e-05) (hash(x)=50418818) +12200 val loss 6.2922 +12200 val perplexity 540.3362 +12200 train 6.435254 (lr=4.3855e-05) (hash(x)=57627314) +13100 val loss 6.3046 +13100 val perplexity 547.0768 +13100 train 6.428601 (lr=6.0133e-05) (hash(x)=52071473) +12900 val loss 6.5427 +12900 val perplexity 694.1595 +12900 train 7.406726 (lr=8.6314e-05) (hash(x)=58649585) +12300 val loss 6.2971 +12300 val perplexity 543.0134 +12300 train 6.599758 (lr=4.3757e-05) (hash(x)=53617087) +13200 val loss 6.2888 +13200 val perplexity 538.5305 +13200 train 6.058242 (lr=5.9988e-05) (hash(x)=46293092) +13000 val loss 6.5517 +13000 val perplexity 700.4313 +13000 train 6.792434 (lr=8.6110e-05) (hash(x)=54567307) +12400 val loss 6.3048 +12400 val perplexity 547.1718 +12400 train 6.146738 (lr=4.3659e-05) (hash(x)=51135678) +13300 val loss 6.2830 +13300 val perplexity 535.3733 +13300 train 6.390278 (lr=5.9842e-05) (hash(x)=56511467) +13100 val loss 6.5425 +13100 val perplexity 694.0284 +13100 train 6.697961 (lr=8.5904e-05) (hash(x)=52071473) +13400 val loss 6.2832 +13400 val perplexity 535.4987 +13400 train 6.380155 (lr=5.9695e-05) (hash(x)=54753763) +12500 val loss 6.3133 +12500 val perplexity 551.8790 +12500 train 6.356315 (lr=4.3560e-05) (hash(x)=48025130) +13200 val loss 6.5249 +13200 val perplexity 681.9034 +13200 train 6.271211 (lr=8.5697e-05) (hash(x)=46293092) +13500 val loss 6.2965 +13500 val perplexity 542.6864 +13500 train 6.437078 (lr=5.9548e-05) (hash(x)=53610247) +12600 val loss 6.3031 +12600 val perplexity 546.2406 +12600 train 6.349621 (lr=4.3460e-05) (hash(x)=52135695) +13300 val loss 6.5190 +13300 val perplexity 677.9108 +13300 train 6.633325 (lr=8.5489e-05) (hash(x)=56511467) +13600 val loss 6.2793 +13600 val perplexity 533.3911 +13600 train 6.178974 (lr=5.9400e-05) (hash(x)=47526249) +12700 val loss 6.2954 +12700 val perplexity 542.0657 +12700 train 6.138735 (lr=4.3360e-05) (hash(x)=51888613) +13400 val loss 6.5053 +13400 val perplexity 668.6843 +13400 train 6.609445 (lr=8.5279e-05) (hash(x)=54753763) +13700 val loss 6.2769 +13700 val perplexity 532.1181 +13700 train 6.125827 (lr=5.9251e-05) (hash(x)=51185517) +12800 val loss 6.2961 +12800 val perplexity 542.4781 +12800 train 6.221097 (lr=4.3259e-05) (hash(x)=50418818) +13500 val loss 6.5038 +13500 val perplexity 667.7010 +13500 train 6.635751 (lr=8.5069e-05) (hash(x)=53610247) +13800 val loss 6.2660 +13800 val perplexity 526.3919 +13800 train 5.934208 (lr=5.9101e-05) (hash(x)=45953529) +12900 val loss 6.2687 +12900 val perplexity 527.8148 +12900 train 7.175869 (lr=4.3157e-05) (hash(x)=58649585) +13600 val loss 6.4959 +13600 val perplexity 662.4256 +13600 train 6.370335 (lr=8.4857e-05) (hash(x)=47526249) +13900 val loss 6.2591 +13900 val perplexity 522.7527 +13900 train 6.160434 (lr=5.8950e-05) (hash(x)=47238157) +13000 val loss 6.2670 +13000 val perplexity 526.8743 +13000 train 6.563784 (lr=4.3055e-05) (hash(x)=54567307) +13700 val loss 6.5016 +13700 val perplexity 666.2260 +13700 train 6.336977 (lr=8.4644e-05) (hash(x)=51185517) +14000 val loss 6.2591 +14000 val perplexity 522.7397 +14000 train 6.215769 (lr=5.8799e-05) (hash(x)=54250750) +13100 val loss 6.2628 +13100 val perplexity 524.6711 +13100 train 6.379539 (lr=4.2952e-05) (hash(x)=52071473) +13800 val loss 6.4955 +13800 val perplexity 662.1556 +13800 train 6.183954 (lr=8.4430e-05) (hash(x)=45953529) +14100 val loss 6.2618 +14100 val perplexity 524.1670 +14100 train 6.260184 (lr=5.8646e-05) (hash(x)=48198552) +13200 val loss 6.2525 +13200 val perplexity 519.2887 +13200 train 6.040390 (lr=4.2848e-05) (hash(x)=46293092) +13900 val loss 6.4803 +13900 val perplexity 652.1401 +13900 train 6.410282 (lr=8.4214e-05) (hash(x)=47238157) +14200 val loss 6.2610 +14200 val perplexity 523.7622 +14200 train 6.367507 (lr=5.8493e-05) (hash(x)=52020690) +13300 val loss 6.2446 +13300 val perplexity 515.2338 +13300 train 6.354766 (lr=4.2744e-05) (hash(x)=56511467) +14000 val loss 6.4987 +14000 val perplexity 664.2668 +14000 train 6.451422 (lr=8.3998e-05) (hash(x)=54250750) +14300 val loss 6.2597 +14300 val perplexity 523.0881 +14300 train 6.066946 (lr=5.8339e-05) (hash(x)=45165483) +13400 val loss 6.2296 +13400 val perplexity 507.5316 +13400 train 6.346925 (lr=4.2640e-05) (hash(x)=54753763) +14100 val loss 6.5067 +14100 val perplexity 669.6297 +14100 train 6.501920 (lr=8.3780e-05) (hash(x)=48198552) +14400 val loss 6.2628 +14400 val perplexity 524.6866 +14400 train 6.359657 (lr=5.8184e-05) (hash(x)=52184072) +13500 val loss 6.2320 +13500 val perplexity 508.7899 +13500 train 6.372458 (lr=4.2534e-05) (hash(x)=53610247) +14200 val loss 6.4944 +14200 val perplexity 661.4572 +14200 train 6.618190 (lr=8.3561e-05) (hash(x)=52020690) +14500 val loss 6.2733 +14500 val perplexity 530.2107 +14500 train 6.625431 (lr=5.8029e-05) (hash(x)=56046436) +13600 val loss 6.2320 +13600 val perplexity 508.7525 +14300 val loss 6.4961 +14300 val perplexity 662.5339 +13600 train 6.122686 (lr=4.2428e-05) (hash(x)=47526249) +14300 train 6.288409 (lr=8.3341e-05) (hash(x)=45165483) +14600 val loss 6.2666 +14600 val perplexity 526.6653 +14600 train 6.079505 (lr=5.7872e-05) (hash(x)=52029694) +14400 val loss 6.5160 +14400 val perplexity 675.9000 +14400 train 6.597093 (lr=8.3120e-05) (hash(x)=52184072) +13700 val loss 6.2232 +13700 val perplexity 504.3288 +13700 train 6.069735 (lr=4.2322e-05) (hash(x)=51185517) +14700 val loss 6.2799 +14700 val perplexity 533.7134 +14700 train 6.125349 (lr=5.7715e-05) (hash(x)=50258224) +14500 val loss 6.4984 +14500 val perplexity 664.1047 +14500 train 6.804925 (lr=8.2898e-05) (hash(x)=56046436) +13800 val loss 6.2141 +13800 val perplexity 499.7631 +13800 train 5.898561 (lr=4.2215e-05) (hash(x)=45953529) +14800 val loss 6.2789 +14800 val perplexity 533.1945 +14800 train 5.578360 (lr=5.7558e-05) (hash(x)=42112262) +14600 val loss 6.4967 +14600 val perplexity 662.9833 +14600 train 6.377328 (lr=8.2675e-05) (hash(x)=52029694) +13900 val loss 6.2199 +13900 val perplexity 502.6653 +13900 train 6.139126 (lr=4.2107e-05) (hash(x)=47238157) +14900 val loss 6.2827 +14900 val perplexity 535.2421 +14900 train 6.032803 (lr=5.7399e-05) (hash(x)=47219933) +14700 val loss 6.5066 +14700 val perplexity 669.5291 +14700 train 6.350065 (lr=8.2451e-05) (hash(x)=50258224) +14000 val loss 6.2114 +14000 val perplexity 498.4152 +14000 train 6.168877 (lr=4.1999e-05) (hash(x)=54250750) +15000 val loss 6.3072 +15000 val perplexity 548.5156 +15000 train 5.927101 (lr=5.7240e-05) (hash(x)=58309309) +14800 val loss 6.5008 +14800 val perplexity 665.7036 +14800 train 5.805090 (lr=8.2225e-05) (hash(x)=42112262) +14100 val loss 6.2063 +14100 val perplexity 495.8396 +14100 train 6.204028 (lr=4.1890e-05) (hash(x)=48198552) +15100 val loss 6.2989 +15100 val perplexity 543.9886 +15100 train 6.204672 (lr=5.7079e-05) (hash(x)=48756049) +14900 val loss 6.5060 +14900 val perplexity 669.1774 +14900 train 6.271698 (lr=8.1998e-05) (hash(x)=47219933) +15200 val loss 6.2903 +15200 val perplexity 539.2983 +15200 train 5.983079 (lr=5.6919e-05) (hash(x)=49791737) +14200 val loss 6.2062 +14200 val perplexity 495.8238 +14200 train 6.306804 (lr=4.1781e-05) (hash(x)=52020690) +15000 val loss 6.5320 +15000 val perplexity 686.7567 +15000 train 6.149031 (lr=8.1771e-05) (hash(x)=58309309) +15300 val loss 6.2821 +15300 val perplexity 534.8953 +15300 train 6.211565 (lr=5.6757e-05) (hash(x)=53084126) +14300 val loss 6.2119 +14300 val perplexity 498.6263 +14300 train 6.032381 (lr=4.1671e-05) (hash(x)=45165483) +15100 val loss 6.5226 +15100 val perplexity 680.3143 +15100 train 6.455632 (lr=8.1542e-05) (hash(x)=48756049) +15400 val loss 6.2561 +15400 val perplexity 521.1675 +15400 train 6.303825 (lr=5.6595e-05) (hash(x)=55041679) +14400 val loss 6.2042 +14400 val perplexity 494.8153 +14400 train 6.308162 (lr=4.1560e-05) (hash(x)=52184072) +15200 val loss 6.5137 +15200 val perplexity 674.2991 +15200 train 6.220746 (lr=8.1312e-05) (hash(x)=49791737) +15500 val loss 6.2417 +15500 val perplexity 513.7371 +15500 train 6.041740 (lr=5.6432e-05) (hash(x)=43249867) +14500 val loss 6.2019 +14500 val perplexity 493.6857 +14500 train 6.565360 (lr=4.1449e-05) (hash(x)=56046436) +15300 val loss 6.5161 +15300 val perplexity 675.9696 +15300 train 6.489110 (lr=8.1082e-05) (hash(x)=53084126) +15600 val loss 6.2454 +15600 val perplexity 515.6241 +15600 train 6.214768 (lr=5.6268e-05) (hash(x)=49006517) +14600 val loss 6.2042 +14600 val perplexity 494.8474 +14600 train 5.997983 (lr=4.1337e-05) (hash(x)=52029694) +15400 val loss 6.4838 +15400 val perplexity 654.4450 +15400 train 6.545664 (lr=8.0850e-05) (hash(x)=55041679) +15700 val loss 6.2311 +15700 val perplexity 508.2974 +15700 train 6.525909 (lr=5.6104e-05) (hash(x)=60986839) +14700 val loss 6.2143 +14700 val perplexity 499.8418 +14700 train 6.074037 (lr=4.1225e-05) (hash(x)=50258224) +15500 val loss 6.4487 +15500 val perplexity 631.9008 +15500 train 6.254567 (lr=8.0617e-05) (hash(x)=43249867) +15800 val loss 6.2335 +15800 val perplexity 509.5486 +15800 train 6.157606 (lr=5.5938e-05) (hash(x)=48354906) +14800 val loss 6.1960 +14800 val perplexity 490.7956 +14800 train 5.508065 (lr=4.1113e-05) (hash(x)=42112262) +15600 val loss 6.4573 +15600 val perplexity 637.3088 +15600 train 6.429033 (lr=8.0383e-05) (hash(x)=49006517) +15900 val loss 6.2353 +15900 val perplexity 510.4343 +15900 train 6.071015 (lr=5.5773e-05) (hash(x)=52679780) +14900 val loss 6.1977 +14900 val perplexity 491.5990 +14900 train 5.965704 (lr=4.0999e-05) (hash(x)=47219933) +15700 val loss 6.4370 +15700 val perplexity 624.5377 +15700 train 6.727993 (lr=8.0148e-05) (hash(x)=60986839) +16000 val loss 6.2307 +16000 val perplexity 508.1077 +16000 train 6.364089 (lr=5.5606e-05) (hash(x)=58049587) +15000 val loss 6.2242 +15000 val perplexity 504.8374 +15000 train 5.856767 (lr=4.0885e-05) (hash(x)=58309309) +15800 val loss 6.4396 +15800 val perplexity 626.1273 +15800 train 6.376960 (lr=7.9912e-05) (hash(x)=48354906) +16100 val loss 6.2193 +16100 val perplexity 502.3358 +16100 train 5.868877 (lr=5.5439e-05) (hash(x)=39795825) +15100 val loss 6.1894 +15100 val perplexity 487.5722 +15100 train 6.100931 (lr=4.0771e-05) (hash(x)=48756049) +15900 val loss 6.4217 +15900 val perplexity 615.0214 +15900 train 6.283855 (lr=7.9675e-05) (hash(x)=52679780) +16200 val loss 6.2147 +16200 val perplexity 500.0380 +16200 train 6.207601 (lr=5.5271e-05) (hash(x)=49498689) +16000 val loss 6.4304 +16000 val perplexity 620.4217 +16000 train 6.587129 (lr=7.9437e-05) (hash(x)=58049587) +16300 val loss 6.2421 +16300 val perplexity 513.9544 +16300 train 6.600447 (lr=5.5102e-05) (hash(x)=51235194) +15200 val loss 6.2033 +15200 val perplexity 494.3926 +15200 train 5.858490 (lr=4.0656e-05) (hash(x)=49791737) +16100 val loss 6.4319 +16100 val perplexity 621.3712 +16100 train 6.077530 (lr=7.9198e-05) (hash(x)=39795825) +16400 val loss 6.2190 +16400 val perplexity 502.1808 +16400 train 6.082575 (lr=5.4933e-05) (hash(x)=47789417) +15300 val loss 6.1788 +15300 val perplexity 482.4112 +15300 train 6.114874 (lr=4.0541e-05) (hash(x)=53084126) +16200 val loss 6.4436 +16200 val perplexity 628.6627 +16200 train 6.434575 (lr=7.8959e-05) (hash(x)=49498689) +16500 val loss 6.2251 +16500 val perplexity 505.2743 +16500 train 6.784148 (lr=5.4763e-05) (hash(x)=59665889) +15400 val loss 6.1750 +15400 val perplexity 480.5643 +15400 train 6.214546 (lr=4.0425e-05) (hash(x)=55041679) +16300 val loss 6.4347 +16300 val perplexity 623.1120 +16300 train 6.750012 (lr=7.8718e-05) (hash(x)=51235194) +16600 val loss 6.2364 +16600 val perplexity 511.0405 +16600 train 6.131698 (lr=5.4593e-05) (hash(x)=49034893) +15500 val loss 6.1506 +15500 val perplexity 468.9987 +15500 train 5.963602 (lr=4.0308e-05) (hash(x)=43249867) +16400 val loss 6.4408 +16400 val perplexity 626.8846 +16400 train 6.300445 (lr=7.8476e-05) (hash(x)=47789417) +16700 val loss 6.2236 +16700 val perplexity 504.5263 +16700 train 6.050609 (lr=5.4422e-05) (hash(x)=45685955) +15600 val loss 6.1598 +15600 val perplexity 473.3522 +15600 train 6.138355 (lr=4.0191e-05) (hash(x)=49006517) +16500 val loss 6.4328 +16500 val perplexity 621.8873 +16500 train 6.958402 (lr=7.8233e-05) (hash(x)=59665889) +16800 val loss 6.2321 +16800 val perplexity 508.8360 +16800 train 6.032184 (lr=5.4250e-05) (hash(x)=49589835) +15700 val loss 6.1445 +15700 val perplexity 466.1526 +15700 train 6.424474 (lr=4.0074e-05) (hash(x)=60986839) +16600 val loss 6.4360 +16600 val perplexity 623.9278 +16600 train 6.318515 (lr=7.7990e-05) (hash(x)=49034893) +16900 val loss 6.2381 +16900 val perplexity 511.8797 +16900 train 5.675816 (lr=5.4078e-05) (hash(x)=38655565) +15800 val loss 6.1471 +15800 val perplexity 467.3422 +15800 train 6.088002 (lr=3.9956e-05) (hash(x)=48354906) +16700 val loss 6.4353 +16700 val perplexity 623.4493 +16700 train 6.250768 (lr=7.7745e-05) (hash(x)=45685955) +17000 val loss 6.2390 +17000 val perplexity 512.3212 +17000 train 6.162674 (lr=5.3905e-05) (hash(x)=46730732) +15900 val loss 6.1403 +15900 val perplexity 464.1739 +15900 train 5.986931 (lr=3.9838e-05) (hash(x)=52679780) +16800 val loss 6.4277 +16800 val perplexity 618.7781 +16800 train 6.249182 (lr=7.7500e-05) (hash(x)=49589835) +17100 val loss 6.2363 +17100 val perplexity 510.9644 +17100 train 5.835392 (lr=5.3731e-05) (hash(x)=43843722) +16000 val loss 6.1446 +16000 val perplexity 466.1955 +16000 train 6.260038 (lr=3.9719e-05) (hash(x)=58049587) +16900 val loss 6.4309 +16900 val perplexity 620.7564 +16900 train 5.830621 (lr=7.7254e-05) (hash(x)=38655565) +17200 val loss 6.2360 +17200 val perplexity 510.8192 +17200 train 5.961731 (lr=5.3557e-05) (hash(x)=50198978) +16100 val loss 6.1321 +16100 val perplexity 460.3810 +16100 train 5.801347 (lr=3.9599e-05) (hash(x)=39795825) +17300 val loss 6.2276 +17300 val perplexity 506.5314 +17300 train 6.210217 (lr=5.3382e-05) (hash(x)=52976276) +17000 val loss 6.4156 +17000 val perplexity 611.3114 +17000 train 6.364450 (lr=7.7007e-05) (hash(x)=46730732) +16200 val loss 6.1447 +16200 val perplexity 466.2173 +16200 train 6.124951 (lr=3.9479e-05) (hash(x)=49498689) +17400 val loss 6.2305 +17400 val perplexity 507.9846 +17400 train 6.068756 (lr=5.3206e-05) (hash(x)=52224275) +17100 val loss 6.4381 +17100 val perplexity 625.2069 +17100 train 6.037139 (lr=7.6758e-05) (hash(x)=43843722) +16300 val loss 6.1461 +16300 val perplexity 466.8778 +16300 train 6.477264 (lr=3.9359e-05) (hash(x)=51235194) +17200 val loss 6.4236 +17200 val perplexity 616.2365 +17200 train 6.175350 (lr=7.6510e-05) (hash(x)=50198978) +17500 val loss 6.2296 +17500 val perplexity 507.5582 +17500 train 6.032399 (lr=5.3030e-05) (hash(x)=49055112) +16400 val loss 6.1407 +16400 val perplexity 464.3915 +16400 train 5.994458 (lr=3.9238e-05) (hash(x)=47789417) +17300 val loss 6.4258 +17300 val perplexity 617.5781 +17300 train 6.377968 (lr=7.6260e-05) (hash(x)=52976276) +17600 val loss 6.2388 +17600 val perplexity 512.2540 +17600 train 6.254847 (lr=5.2854e-05) (hash(x)=49996498) +16500 val loss 6.1271 +16500 val perplexity 458.0865 +16500 train 6.665648 (lr=3.9117e-05) (hash(x)=59665889) +17400 val loss 6.4322 +17400 val perplexity 621.5561 +17400 train 6.273139 (lr=7.6009e-05) (hash(x)=52224275) +17700 val loss 6.2319 +17700 val perplexity 508.6999 +17700 train 6.250253 (lr=5.2677e-05) (hash(x)=50526074) +16600 val loss 6.1285 +16600 val perplexity 458.7510 +16600 train 6.013433 (lr=3.8995e-05) (hash(x)=49034893) +17500 val loss 6.4342 +17500 val perplexity 622.7935 +17500 train 6.241991 (lr=7.5758e-05) (hash(x)=49055112) +17800 val loss 6.2203 +17800 val perplexity 502.8618 +17800 train 6.085866 (lr=5.2499e-05) (hash(x)=48410966) +16700 val loss 6.1134 +16700 val perplexity 451.8671 +16700 train 5.965143 (lr=3.8873e-05) (hash(x)=45685955) +17900 val loss 6.2088 +17900 val perplexity 497.1135 +17900 train 6.120636 (lr=5.2321e-05) (hash(x)=51654550) +17600 val loss 6.4394 +17600 val perplexity 626.0258 +17600 train 6.431095 (lr=7.5505e-05) (hash(x)=49996498) +16800 val loss 6.1151 +16800 val perplexity 452.6602 +16800 train 5.916518 (lr=3.8750e-05) (hash(x)=49589835) +18000 val loss 6.2110 +18000 val perplexity 498.2175 +18000 train 6.256108 (lr=5.2142e-05) (hash(x)=44227619) +17700 val loss 6.4051 +17700 val perplexity 604.9517 +17700 train 6.401339 (lr=7.5252e-05) (hash(x)=50526074) +16900 val loss 6.1184 +16900 val perplexity 454.1241 +16900 train 5.535720 (lr=3.8627e-05) (hash(x)=38655565) +18100 val loss 6.2499 +18100 val perplexity 517.9538 +18100 train 6.060187 (lr=5.1962e-05) (hash(x)=45771090) +17800 val loss 6.3837 +17800 val perplexity 592.0933 +17800 train 6.248812 (lr=7.4998e-05) (hash(x)=48410966) +18200 val loss 6.2090 +18200 val perplexity 497.1884 +18200 train 5.813533 (lr=5.1782e-05) (hash(x)=37914739) +17000 val loss 6.1139 +17000 val perplexity 452.0802 +17000 train 6.051076 (lr=3.8503e-05) (hash(x)=46730732) +17900 val loss 6.3942 +17900 val perplexity 598.3661 +17900 train 6.317099 (lr=7.4744e-05) (hash(x)=51654550) +18300 val loss 6.2202 +18300 val perplexity 502.8067 +18300 train 6.649755 (lr=5.1602e-05) (hash(x)=68784189) +17100 val loss 6.1165 +17100 val perplexity 453.2661 +17100 train 5.728023 (lr=3.8379e-05) (hash(x)=43843722) +18000 val loss 6.3821 +18000 val perplexity 591.1916 +18000 train 6.441657 (lr=7.4488e-05) (hash(x)=44227619) +18400 val loss 6.2116 +18400 val perplexity 498.4774 +18400 train 6.305532 (lr=5.1421e-05) (hash(x)=55006900) +18100 val loss 6.3686 +18100 val perplexity 583.2571 +17200 val loss 6.1154 +17200 val perplexity 452.7926 +18100 train 6.152564 (lr=7.4232e-05) (hash(x)=45771090) +17200 train 5.870935 (lr=3.8255e-05) (hash(x)=50198978) +18500 val loss 6.2078 +18500 val perplexity 496.5970 +18500 train 6.639764 (lr=5.1240e-05) (hash(x)=56001248) +18200 val loss 6.3627 +18200 val perplexity 579.8299 +18200 train 5.982582 (lr=7.3975e-05) (hash(x)=37914739) +17300 val loss 6.1174 +17300 val perplexity 453.6789 +17300 train 6.124028 (lr=3.8130e-05) (hash(x)=52976276) +18600 val loss 6.2151 +18600 val perplexity 500.2407 +18600 train 6.427946 (lr=5.1058e-05) (hash(x)=51339352) +18300 val loss 6.3540 +18300 val perplexity 574.8029 +18300 train 6.817049 (lr=7.3717e-05) (hash(x)=68784189) +17400 val loss 6.1143 +17400 val perplexity 452.2792 +17400 train 5.953983 (lr=3.8005e-05) (hash(x)=52224275) +18700 val loss 6.1958 +18700 val perplexity 490.6945 +18700 train 6.316683 (lr=5.0875e-05) (hash(x)=53256371) +18400 val loss 6.3656 +18400 val perplexity 581.4715 +18400 train 6.470822 (lr=7.3459e-05) (hash(x)=55006900) +17500 val loss 6.1301 +17500 val perplexity 459.4903 +17500 train 5.912059 (lr=3.7879e-05) (hash(x)=49055112) +18800 val loss 6.1895 +18800 val perplexity 487.5994 +18800 train 6.153018 (lr=5.0692e-05) (hash(x)=51580096) +18500 val loss 6.3750 +18500 val perplexity 586.9753 +18500 train 6.800819 (lr=7.3199e-05) (hash(x)=56001248) +17600 val loss 6.1130 +17600 val perplexity 451.6745 +17600 train 6.148704 (lr=3.7753e-05) (hash(x)=49996498) +18900 val loss 6.2135 +18900 val perplexity 499.4353 +18900 train 6.289693 (lr=5.0509e-05) (hash(x)=53001073) +18600 val loss 6.3839 +18600 val perplexity 592.2291 +18600 train 6.638814 (lr=7.2939e-05) (hash(x)=51339352) +17700 val loss 6.1098 +17700 val perplexity 450.2263 +17700 train 6.122671 (lr=3.7626e-05) (hash(x)=50526074) +19000 val loss 6.1946 +19000 val perplexity 490.0882 +19000 train 6.040896 (lr=5.0325e-05) (hash(x)=51531298) +18700 val loss 6.3737 +18700 val perplexity 586.2148 +18700 train 6.463876 (lr=7.2679e-05) (hash(x)=53256371) +17800 val loss 6.0852 +17800 val perplexity 439.3212 +17800 train 5.954217 (lr=3.7499e-05) (hash(x)=48410966) +19100 val loss 6.1863 +19100 val perplexity 486.0649 +19100 train 5.977464 (lr=5.0140e-05) (hash(x)=48047406) +18800 val loss 6.3755 +18800 val perplexity 587.2612 +18800 train 6.335759 (lr=7.2417e-05) (hash(x)=51580096) +17900 val loss 6.0942 +17900 val perplexity 443.2969 +17900 train 5.995052 (lr=3.7372e-05) (hash(x)=51654550) +19200 val loss 6.2016 +19200 val perplexity 493.5228 +19200 train 5.989591 (lr=4.9955e-05) (hash(x)=44939100) +18900 val loss 6.4042 +18900 val perplexity 604.3491 +18900 train 6.465156 (lr=7.2155e-05) (hash(x)=53001073) +18000 val loss 6.0769 +18000 val perplexity 435.6833 +18000 train 6.125412 (lr=3.7244e-05) (hash(x)=44227619) +19300 val loss 6.1938 +19300 val perplexity 489.6812 +19300 train 6.048463 (lr=4.9770e-05) (hash(x)=50157419) +19000 val loss 6.3922 +19000 val perplexity 597.1473 +19000 train 6.266483 (lr=7.1892e-05) (hash(x)=51531298) +18100 val loss 6.0814 +18100 val perplexity 437.6197 +18100 train 5.878476 (lr=3.7116e-05) (hash(x)=45771090) +19400 val loss 6.2012 +19400 val perplexity 493.3522 +19400 train 5.994554 (lr=4.9584e-05) (hash(x)=46973593) +19100 val loss 6.4022 +19100 val perplexity 603.1468 +19100 train 6.178774 (lr=7.1629e-05) (hash(x)=48047406) +18200 val loss 6.0721 +18200 val perplexity 433.5980 +18200 train 5.698674 (lr=3.6987e-05) (hash(x)=37914739) +19500 val loss 6.1906 +19500 val perplexity 488.1631 +19500 train 5.973404 (lr=4.9398e-05) (hash(x)=44056368) +19200 val loss 6.4015 +19200 val perplexity 602.7693 +19200 train 6.190560 (lr=7.1365e-05) (hash(x)=44939100) +18300 val loss 6.0712 +18300 val perplexity 433.1967 +18300 train 6.515245 (lr=3.6859e-05) (hash(x)=68784189) +19600 val loss 6.1805 +19600 val perplexity 483.2188 +19600 train 6.429302 (lr=4.9211e-05) (hash(x)=58496552) +19300 val loss 6.3932 +19300 val perplexity 597.7740 +19300 train 6.245660 (lr=7.1100e-05) (hash(x)=50157419) +18400 val loss 6.0754 +18400 val perplexity 435.0180 +18400 train 6.186811 (lr=3.6729e-05) (hash(x)=55006900) +19700 val loss 6.1768 +19700 val perplexity 481.4540 +19700 train 6.024097 (lr=4.9024e-05) (hash(x)=47753232) +19400 val loss 6.3897 +19400 val perplexity 595.6659 +19400 train 6.178835 (lr=7.0835e-05) (hash(x)=46973593) +18500 val loss 6.0686 +18500 val perplexity 432.0583 +18500 train 6.523821 (lr=3.6600e-05) (hash(x)=56001248) +19800 val loss 6.1627 +19800 val perplexity 474.7133 +19800 train 5.977198 (lr=4.8837e-05) (hash(x)=49031231) +19500 val loss 6.3898 +19500 val perplexity 595.7554 +19500 train 6.186742 (lr=7.0569e-05) (hash(x)=44056368) +19900 val loss 6.1755 +19900 val perplexity 480.8091 +19900 train 6.226699 (lr=4.8649e-05) (hash(x)=54352451) +18600 val loss 6.1130 +18600 val perplexity 451.6773 +18600 train 6.311148 (lr=3.6470e-05) (hash(x)=51339352) +19600 val loss 6.3913 +19600 val perplexity 596.6461 +19600 train 6.729703 (lr=7.0302e-05) (hash(x)=58496552) +20000 val loss 6.1642 +20000 val perplexity 475.4031 +20000 train 5.978202 (lr=4.8461e-05) (hash(x)=45789621) +18700 val loss 6.0599 +18700 val perplexity 428.3229 +18700 train 6.181108 (lr=3.6339e-05) (hash(x)=53256371) +19700 val loss 6.3835 +19700 val perplexity 592.0207 +19700 train 6.216898 (lr=7.0035e-05) (hash(x)=47753232) +20100 val loss 6.1786 +20100 val perplexity 482.3323 +20100 train 6.119834 (lr=4.8272e-05) (hash(x)=45096720) +18800 val loss 6.0531 +18800 val perplexity 425.4240 +18800 train 6.023508 (lr=3.6209e-05) (hash(x)=51580096) +19800 val loss 6.3565 +19800 val perplexity 576.2151 +19800 train 6.147184 (lr=6.9767e-05) (hash(x)=49031231) +20200 val loss 6.1726 +20200 val perplexity 479.4096 +20200 train 6.233849 (lr=4.8083e-05) (hash(x)=49377287) +18900 val loss 6.0609 +18900 val perplexity 428.7735 +18900 train 6.137412 (lr=3.6078e-05) (hash(x)=53001073) +19900 val loss 6.3602 +19900 val perplexity 578.3471 +19900 train 6.447223 (lr=6.9498e-05) (hash(x)=54352451) +20300 val loss 6.1640 +20300 val perplexity 475.3399 +20300 train 5.945217 (lr=4.7893e-05) (hash(x)=49454716) +19000 val loss 6.0583 +19000 val perplexity 427.6386 +19000 train 5.912615 (lr=3.5946e-05) (hash(x)=51531298) +20000 val loss 6.3695 +20000 val perplexity 583.7882 +20000 train 6.155598 (lr=6.9229e-05) (hash(x)=45789621) +20400 val loss 6.1713 +20400 val perplexity 478.8310 +20400 train 5.680020 (lr=4.7703e-05) (hash(x)=38699401) +19100 val loss 6.0597 +19100 val perplexity 428.2618 +19100 train 5.843977 (lr=3.5814e-05) (hash(x)=48047406) +20100 val loss 6.3631 +20100 val perplexity 580.0168 +20100 train 6.303806 (lr=6.8960e-05) (hash(x)=45096720) +20500 val loss 6.1714 +20500 val perplexity 478.8588 +20500 train 6.270490 (lr=4.7513e-05) (hash(x)=53266981) +20200 val loss 6.3630 +20200 val perplexity 579.9930 +19200 val loss 6.0528 +19200 val perplexity 425.3007 +19200 train 5.875459 (lr=3.5682e-05) (hash(x)=44939100) +20200 train 6.376566 (lr=6.8690e-05) (hash(x)=49377287) +20600 val loss 6.1660 +20600 val perplexity 476.2712 +20600 train 6.099114 (lr=4.7323e-05) (hash(x)=51783804) +20300 val loss 6.3610 +20300 val perplexity 578.8163 +20300 train 6.128880 (lr=6.8419e-05) (hash(x)=49454716) +19300 val loss 6.0601 +19300 val perplexity 428.3989 +19300 train 5.926717 (lr=3.5550e-05) (hash(x)=50157419) +20700 val loss 6.1819 +20700 val perplexity 483.8918 +20700 train 6.071718 (lr=4.7132e-05) (hash(x)=51388467) +20400 val loss 6.3674 +20400 val perplexity 582.5652 +20400 train 5.859035 (lr=6.8148e-05) (hash(x)=38699401) +19400 val loss 6.0600 +19400 val perplexity 428.3746 +19400 train 5.875482 (lr=3.5417e-05) (hash(x)=46973593) +20800 val loss 6.1892 +20800 val perplexity 487.4608 +20800 train 5.933630 (lr=4.6940e-05) (hash(x)=45324903) +20500 val loss 6.3694 +20500 val perplexity 583.7231 +20500 train 6.425434 (lr=6.7876e-05) (hash(x)=53266981) +19500 val loss 6.0467 +19500 val perplexity 422.7293 +19500 train 5.862154 (lr=3.5284e-05) (hash(x)=44056368) +20900 val loss 6.1733 +20900 val perplexity 479.7522 +20900 train 5.804732 (lr=4.6749e-05) (hash(x)=45972467) +20600 val loss 6.3675 +20600 val perplexity 582.6213 +20600 train 6.303026 (lr=6.7604e-05) (hash(x)=51783804) +19600 val loss 6.0503 +19600 val perplexity 424.2211 +19600 train 6.313962 (lr=3.5151e-05) (hash(x)=58496552) +21000 val loss 6.1696 +21000 val perplexity 477.9921 +21000 train 6.117172 (lr=4.6557e-05) (hash(x)=46754507) +20700 val loss 6.3765 +20700 val perplexity 587.8462 +20700 train 6.280571 (lr=6.7331e-05) (hash(x)=51388467) +19700 val loss 6.0547 +19700 val perplexity 426.1084 +19700 train 5.911861 (lr=3.5017e-05) (hash(x)=47753232) +21100 val loss 6.1708 +21100 val perplexity 478.5657 +21100 train 6.458302 (lr=4.6365e-05) (hash(x)=59913667) +20800 val loss 6.3633 +20800 val perplexity 580.1413 +20800 train 6.099911 (lr=6.7058e-05) (hash(x)=45324903) +19800 val loss 6.0476 +19800 val perplexity 423.1147 +19800 train 5.875927 (lr=3.4883e-05) (hash(x)=49031231) +21200 val loss 6.1723 +21200 val perplexity 479.3056 +21200 train 6.133057 (lr=4.6172e-05) (hash(x)=46487759) +20900 val loss 6.3644 +20900 val perplexity 580.7993 +20900 train 5.990748 (lr=6.6784e-05) (hash(x)=45972467) +19900 val loss 6.0356 +19900 val perplexity 418.0502 +19900 train 6.095958 (lr=3.4749e-05) (hash(x)=54352451) +21300 val loss 6.1538 +21300 val perplexity 470.5030 +21300 train 5.945532 (lr=4.5979e-05) (hash(x)=45535495) +21000 val loss 6.3754 +21000 val perplexity 587.2021 +21000 train 6.294639 (lr=6.6510e-05) (hash(x)=46754507) +20000 val loss 6.0344 +20000 val perplexity 417.5580 +21400 val loss 6.1425 +21400 val perplexity 465.2253 +20000 train 5.854521 (lr=3.4615e-05) (hash(x)=45789621) +21400 train 6.150859 (lr=4.5786e-05) (hash(x)=54559895) +21100 val loss 6.3533 +21100 val perplexity 574.3936 +21100 train 6.633968 (lr=6.6235e-05) (hash(x)=59913667) +21500 val loss 6.1391 +21500 val perplexity 463.6258 +21500 train 5.982634 (lr=4.5592e-05) (hash(x)=48474176) +20100 val loss 6.0352 +20100 val perplexity 417.8784 +20100 train 6.019393 (lr=3.4480e-05) (hash(x)=45096720) +21200 val loss 6.3447 +21200 val perplexity 569.4678 +21200 train 6.289260 (lr=6.5960e-05) (hash(x)=46487759) +21600 val loss 6.1398 +21600 val perplexity 463.9677 +21600 train 6.109609 (lr=4.5399e-05) (hash(x)=47915584) +20200 val loss 6.0307 +20200 val perplexity 416.0139 +20200 train 6.110758 (lr=3.4345e-05) (hash(x)=49377287) +21300 val loss 6.3526 +21300 val perplexity 573.9709 +21300 train 6.203497 (lr=6.5684e-05) (hash(x)=45535495) +21700 val loss 6.1441 +21700 val perplexity 465.9380 +21700 train 5.896945 (lr=4.5205e-05) (hash(x)=45846696) +20300 val loss 6.0276 +20300 val perplexity 414.7291 +20300 train 5.837848 (lr=3.4209e-05) (hash(x)=49454716) +21400 val loss 6.3514 +21400 val perplexity 573.3196 +21400 train 6.349491 (lr=6.5408e-05) (hash(x)=54559895) +21800 val loss 6.1466 +21800 val perplexity 467.1121 +21800 train 5.892045 (lr=4.5010e-05) (hash(x)=47201143) +20400 val loss 6.0263 +20400 val perplexity 414.1902 +20400 train 5.551824 (lr=3.4074e-05) (hash(x)=38699401) +21500 val loss 6.3461 +21500 val perplexity 570.2754 +21500 train 6.196745 (lr=6.5132e-05) (hash(x)=48474176) +21900 val loss 6.1458 +21900 val perplexity 466.7358 +21900 train 6.086146 (lr=4.4816e-05) (hash(x)=50028395) +20500 val loss 6.0176 +20500 val perplexity 410.5836 +20500 train 6.118743 (lr=3.3938e-05) (hash(x)=53266981) +21600 val loss 6.3470 +21600 val perplexity 570.7966 +21600 train 6.301847 (lr=6.4855e-05) (hash(x)=47915584) +22000 val loss 6.1393 +22000 val perplexity 463.7465 +22000 train 6.101057 (lr=4.4621e-05) (hash(x)=50273781) +20600 val loss 6.0144 +20600 val perplexity 409.2816 +20600 train 5.957831 (lr=3.3802e-05) (hash(x)=51783804) +21700 val loss 6.3332 +21700 val perplexity 562.9777 +21700 train 6.098959 (lr=6.4578e-05) (hash(x)=45846696) +22100 val loss 6.1502 +22100 val perplexity 468.8317 +22100 train 6.074613 (lr=4.4426e-05) (hash(x)=47859152) +20700 val loss 6.0155 +20700 val perplexity 409.7299 +20700 train 5.928106 (lr=3.3665e-05) (hash(x)=51388467) +21800 val loss 6.3397 +21800 val perplexity 566.6172 +21800 train 6.066625 (lr=6.4300e-05) (hash(x)=47201143) +22200 val loss 6.1422 +22200 val perplexity 465.0601 +22200 train 6.002366 (lr=4.4231e-05) (hash(x)=47820296) +20800 val loss 6.0164 +20800 val perplexity 410.0995 +20800 train 5.793630 (lr=3.3529e-05) (hash(x)=45324903) +21900 val loss 6.3431 +21900 val perplexity 568.5334 +21900 train 6.304872 (lr=6.4023e-05) (hash(x)=50028395) +22300 val loss 6.1341 +22300 val perplexity 461.3391 +22300 train 5.852176 (lr=4.4035e-05) (hash(x)=51741508) +20900 val loss 6.0135 +20900 val perplexity 408.9259 +20900 train 5.664894 (lr=3.3392e-05) (hash(x)=45972467) +22000 val loss 6.3380 +22000 val perplexity 565.6362 +22000 train 6.293690 (lr=6.3744e-05) (hash(x)=50273781) +22400 val loss 6.1399 +22400 val perplexity 464.0199 +22400 train 5.707395 (lr=4.3839e-05) (hash(x)=44585679) +21000 val loss 6.0136 +21000 val perplexity 408.9552 +21000 train 5.986984 (lr=3.3255e-05) (hash(x)=46754507) +22100 val loss 6.3400 +22100 val perplexity 566.7731 +22100 train 6.252201 (lr=6.3466e-05) (hash(x)=47859152) +22500 val loss 6.1268 +22500 val perplexity 457.9565 +22500 train 6.158222 (lr=4.3643e-05) (hash(x)=48753202) +21100 val loss 6.0039 +21100 val perplexity 405.0070 +21100 train 6.288526 (lr=3.3118e-05) (hash(x)=59913667) +22200 val loss 6.3427 +22200 val perplexity 568.3276 +22200 train 6.203479 (lr=6.3187e-05) (hash(x)=47820296) +22600 val loss 6.1390 +22600 val perplexity 463.5871 +22600 train 5.701408 (lr=4.3447e-05) (hash(x)=45351902) +21200 val loss 6.0067 +21200 val perplexity 406.1277 +21200 train 6.005379 (lr=3.2980e-05) (hash(x)=46487759) +22300 val loss 6.3471 +22300 val perplexity 570.8563 +22300 train 6.079469 (lr=6.2907e-05) (hash(x)=51741508) +22700 val loss 6.1328 +22700 val perplexity 460.7227 +22700 train 6.620811 (lr=4.3251e-05) (hash(x)=59263323) +21300 val loss 6.0122 +21300 val perplexity 408.3809 +21300 train 5.797719 (lr=3.2842e-05) (hash(x)=45535495) +22400 val loss 6.3355 +22400 val perplexity 564.2387 +22400 train 5.933209 (lr=6.2628e-05) (hash(x)=44585679) +22800 val loss 6.1419 +22800 val perplexity 464.9315 +22800 train 5.694124 (lr=4.3054e-05) (hash(x)=44873637) +22500 val loss 6.3426 +22500 val perplexity 568.2452 +21400 val loss 6.0007 +21400 val perplexity 403.6951 +21400 train 6.002113 (lr=3.2704e-05) (hash(x)=54559895) +22500 train 6.350743 (lr=6.2348e-05) (hash(x)=48753202) +22900 val loss 6.1300 +22900 val perplexity 459.4432 +22900 train 6.244625 (lr=4.2858e-05) (hash(x)=58208930) +21500 val loss 5.9900 +21500 val perplexity 399.4275 +21500 train 5.842444 (lr=3.2566e-05) (hash(x)=48474176) +22600 val loss 6.3425 +22600 val perplexity 568.2057 +22600 train 5.930212 (lr=6.2068e-05) (hash(x)=45351902) +23000 val loss 6.1255 +23000 val perplexity 457.3562 +23000 train 6.102044 (lr=4.2661e-05) (hash(x)=50248612) +21600 val loss 5.9876 +21600 val perplexity 398.4431 +21600 train 5.968499 (lr=3.2428e-05) (hash(x)=47915584) +23100 val loss 6.1814 +23100 val perplexity 483.6907 +22700 val loss 6.3480 +22700 val perplexity 571.3630 +23100 train 6.055898 (lr=4.2464e-05) (hash(x)=47902220) +22700 train 6.802013 (lr=6.1787e-05) (hash(x)=59263323) +21700 val loss 5.9948 +21700 val perplexity 401.3510 +21700 train 5.751846 (lr=3.2289e-05) (hash(x)=45846696) +23200 val loss 6.1135 +23200 val perplexity 451.9276 +23200 train 5.991982 (lr=4.2267e-05) (hash(x)=45606218) +22800 val loss 6.3393 +22800 val perplexity 566.3903 +22800 train 5.898570 (lr=6.1506e-05) (hash(x)=44873637) +23300 val loss 6.1096 +23300 val perplexity 450.1411 +23300 train 6.233109 (lr=4.2069e-05) (hash(x)=55596334) +21800 val loss 5.9915 +21800 val perplexity 400.0187 +21800 train 5.753116 (lr=3.2150e-05) (hash(x)=47201143) +22900 val loss 6.3156 +22900 val perplexity 553.1541 +22900 train 6.436237 (lr=6.1225e-05) (hash(x)=58208930) +23400 val loss 6.1044 +23400 val perplexity 447.8400 +23400 train 6.133411 (lr=4.1872e-05) (hash(x)=59736985) +21900 val loss 5.9855 +21900 val perplexity 397.6227 +23000 val loss 6.3196 +23000 val perplexity 555.3450 +21900 train 5.923649 (lr=3.2011e-05) (hash(x)=50028395) +23000 train 6.307461 (lr=6.0944e-05) (hash(x)=50248612) +23500 val loss 6.1119 +23500 val perplexity 451.1934 +23500 train 6.046716 (lr=4.1674e-05) (hash(x)=48935694) +22000 val loss 5.9808 +22000 val perplexity 395.7760 +23100 val loss 6.3458 +23100 val perplexity 570.0873 +22000 train 5.972328 (lr=3.1872e-05) (hash(x)=50273781) +23100 train 6.207120 (lr=6.0663e-05) (hash(x)=47902220) +23600 val loss 6.1151 +23600 val perplexity 452.6223 +23600 train 5.924499 (lr=4.1476e-05) (hash(x)=53977113) +23200 val loss 6.3292 +23200 val perplexity 560.6997 +22100 val loss 5.9880 +22100 val perplexity 398.6314 +23200 train 6.211708 (lr=6.0381e-05) (hash(x)=45606218) +22100 train 5.931949 (lr=3.1733e-05) (hash(x)=47859152) +23700 val loss 6.1222 +23700 val perplexity 455.8661 +23700 train 5.764342 (lr=4.1278e-05) (hash(x)=45466992) +22200 val loss 5.9857 +22200 val perplexity 397.6931 +23300 val loss 6.3152 +23300 val perplexity 552.8865 +22200 train 5.832684 (lr=3.1593e-05) (hash(x)=47820296) +23300 train 6.430233 (lr=6.0099e-05) (hash(x)=55596334) +23800 val loss 6.1038 +23800 val perplexity 447.5618 +23800 train 5.909907 (lr=4.1080e-05) (hash(x)=45830878) +23400 val loss 6.3127 +23400 val perplexity 551.5491 +23400 train 6.347643 (lr=5.9817e-05) (hash(x)=59736985) +22300 val loss 5.9815 +22300 val perplexity 396.0442 +22300 train 5.725097 (lr=3.1454e-05) (hash(x)=51741508) +23900 val loss 6.1088 +23900 val perplexity 449.8033 +23900 train 6.000215 (lr=4.0882e-05) (hash(x)=48268066) +23500 val loss 6.3050 +23500 val perplexity 547.3044 +23500 train 6.209606 (lr=5.9534e-05) (hash(x)=48935694) +22400 val loss 5.9881 +22400 val perplexity 398.6481 +22400 train 5.545953 (lr=3.1314e-05) (hash(x)=44585679) +24000 val loss 6.1128 +24000 val perplexity 451.5821 +24000 train 5.937686 (lr=4.0684e-05) (hash(x)=51062942) +22500 val loss 5.9894 +22500 val perplexity 399.1792 +22500 train 6.055580 (lr=3.1174e-05) (hash(x)=48753202) +23600 val loss 6.2960 +23600 val perplexity 542.4075 +23600 train 6.142959 (lr=5.9252e-05) (hash(x)=53977113) +24100 val loss 6.1118 +24100 val perplexity 451.1725 +24100 train 5.957380 (lr=4.0486e-05) (hash(x)=51090246) +23700 val loss 6.3045 +23700 val perplexity 547.0236 +23700 train 5.952677 (lr=5.8969e-05) (hash(x)=45466992) +22600 val loss 5.9757 +22600 val perplexity 393.7350 +22600 train 5.555033 (lr=3.1034e-05) (hash(x)=45351902) +24200 val loss 6.1143 +24200 val perplexity 452.2881 +24200 train 5.828847 (lr=4.0287e-05) (hash(x)=48686225) +22700 val loss 5.9891 +22700 val perplexity 399.0443 +22700 train 6.472228 (lr=3.0894e-05) (hash(x)=59263323) +23800 val loss 6.3023 +23800 val perplexity 545.8222 +23800 train 6.130695 (lr=5.8686e-05) (hash(x)=45830878) +24300 val loss 6.1108 +24300 val perplexity 450.7126 +24300 train 5.765564 (lr=4.0089e-05) (hash(x)=43604814) +22800 val loss 5.9729 +22800 val perplexity 392.6522 +22800 train 5.568539 (lr=3.0753e-05) (hash(x)=44873637) +23900 val loss 6.2997 +23900 val perplexity 544.4222 +23900 train 6.194791 (lr=5.8403e-05) (hash(x)=48268066) +24400 val loss 6.1000 +24400 val perplexity 445.8465 +24400 train 5.904193 (lr=3.9891e-05) (hash(x)=48143655) +22900 val loss 5.9682 +22900 val perplexity 390.7981 +22900 train 6.094232 (lr=3.0613e-05) (hash(x)=58208930) +24000 val loss 6.3047 +24000 val perplexity 547.1176 +24000 train 6.130448 (lr=5.8120e-05) (hash(x)=51062942) +24500 val loss 6.1090 +24500 val perplexity 449.8874 +24500 train 6.140269 (lr=3.9692e-05) (hash(x)=58317397) +23000 val loss 5.9644 +23000 val perplexity 389.3314 +23000 train 5.966906 (lr=3.0472e-05) (hash(x)=50248612) +24100 val loss 6.3059 +24100 val perplexity 547.7991 +24100 train 6.162120 (lr=5.7837e-05) (hash(x)=51090246) +24600 val loss 6.1055 +24600 val perplexity 448.3269 +24600 train 6.242974 (lr=3.9493e-05) (hash(x)=53855425) +23100 val loss 5.9697 +23100 val perplexity 391.3837 +23100 train 5.868172 (lr=3.0331e-05) (hash(x)=47902220) +24200 val loss 6.3250 +24200 val perplexity 558.3798 +24200 train 6.033810 (lr=5.7554e-05) (hash(x)=48686225) +24700 val loss 6.1025 +24700 val perplexity 446.9740 +24700 train 5.891122 (lr=3.9295e-05) (hash(x)=49829785) +23200 val loss 5.9563 +23200 val perplexity 386.1695 +23200 train 5.812859 (lr=3.0190e-05) (hash(x)=45606218) +24800 val loss 6.0966 +24800 val perplexity 444.3436 +24800 train 6.130437 (lr=3.9096e-05) (hash(x)=51337298) +24300 val loss 6.3078 +24300 val perplexity 548.8285 +24300 train 5.955484 (lr=5.7270e-05) (hash(x)=43604814) +23300 val loss 5.9550 +23300 val perplexity 385.6635 +23300 train 6.090078 (lr=3.0049e-05) (hash(x)=55596334) +24900 val loss 6.0916 +24900 val perplexity 442.1438 +24900 train 6.120977 (lr=3.8897e-05) (hash(x)=47385279) +24400 val loss 6.2998 +24400 val perplexity 544.4456 +24400 train 6.106803 (lr=5.6987e-05) (hash(x)=48143655) +23400 val loss 5.9525 +23400 val perplexity 384.7060 +23400 train 5.954114 (lr=2.9908e-05) (hash(x)=59736985) +25000 val loss 6.0797 +25000 val perplexity 436.8879 +25000 train 6.504747 (lr=3.8699e-05) (hash(x)=53631975) +24500 val loss 6.2944 +24500 val perplexity 541.5116 +24500 train 6.330753 (lr=5.6703e-05) (hash(x)=58317397) +23500 val loss 5.9520 +23500 val perplexity 384.5178 +23500 train 5.869464 (lr=2.9767e-05) (hash(x)=48935694) +25100 val loss 6.0752 +25100 val perplexity 434.9232 +25100 train 6.107512 (lr=3.8500e-05) (hash(x)=52991852) +24600 val loss 6.2925 +24600 val perplexity 540.5165 +24600 train 6.439625 (lr=5.6419e-05) (hash(x)=53855425) +25200 val loss 6.0788 +25200 val perplexity 436.5062 +25200 train 6.457631 (lr=3.8301e-05) (hash(x)=51863290) +23600 val loss 5.9498 +23600 val perplexity 383.6585 +23600 train 5.757344 (lr=2.9626e-05) (hash(x)=53977113) +24700 val loss 6.3028 +24700 val perplexity 546.1091 +24700 train 6.100008 (lr=5.6135e-05) (hash(x)=49829785) +25300 val loss 6.0786 +25300 val perplexity 436.4076 +25300 train 5.847868 (lr=3.8103e-05) (hash(x)=46898810) +23700 val loss 5.9542 +23700 val perplexity 385.3510 +23700 train 5.572944 (lr=2.9485e-05) (hash(x)=45466992) +24800 val loss 6.2816 +24800 val perplexity 534.6320 +24800 train 6.316615 (lr=5.5852e-05) (hash(x)=51337298) +25400 val loss 6.0781 +25400 val perplexity 436.2205 +25400 train 6.139091 (lr=3.7904e-05) (hash(x)=50297951) +24900 val loss 6.2896 +24900 val perplexity 538.9402 +24900 train 6.324134 (lr=5.5568e-05) (hash(x)=47385279) +23800 val loss 5.9453 +23800 val perplexity 381.9656 +23800 train 5.733608 (lr=2.9343e-05) (hash(x)=45830878) +25500 val loss 6.0659 +25500 val perplexity 430.9047 +25500 train 5.887280 (lr=3.7705e-05) (hash(x)=49942718) +25000 val loss 6.2820 +25000 val perplexity 534.8416 +23900 val loss 5.9411 +23900 val perplexity 380.3693 +23900 train 5.839910 (lr=2.9202e-05) (hash(x)=48268066) +25000 train 6.650968 (lr=5.5284e-05) (hash(x)=53631975) +25600 val loss 6.0695 +25600 val perplexity 432.4829 +25600 train 5.939018 (lr=3.7507e-05) (hash(x)=46944979) +24000 val loss 5.9383 +24000 val perplexity 379.2958 +24000 train 5.761079 (lr=2.9060e-05) (hash(x)=51062942) +25100 val loss 6.2794 +25100 val perplexity 533.4435 +25100 train 6.317781 (lr=5.5000e-05) (hash(x)=52991852) +25700 val loss 6.0792 +25700 val perplexity 436.6661 +25700 train 6.337087 (lr=3.7308e-05) (hash(x)=48116500) +24100 val loss 5.9419 +24100 val perplexity 380.6670 +25200 val loss 6.2777 +25200 val perplexity 532.5540 +24100 train 5.817915 (lr=2.8918e-05) (hash(x)=51090246) +25200 train 6.661186 (lr=5.4716e-05) (hash(x)=51863290) +25800 val loss 6.0654 +25800 val perplexity 430.6810 +25800 train 6.344007 (lr=3.7109e-05) (hash(x)=59750893) +25300 val loss 6.2835 +25300 val perplexity 535.6470 +25300 train 6.038043 (lr=5.4432e-05) (hash(x)=46898810) +24200 val loss 5.9475 +24200 val perplexity 382.7947 +24200 train 5.666345 (lr=2.8777e-05) (hash(x)=48686225) +25900 val loss 6.0707 +25900 val perplexity 432.9895 +25900 train 5.797611 (lr=3.6911e-05) (hash(x)=49479938) +25400 val loss 6.2828 +25400 val perplexity 535.2632 +25400 train 6.348429 (lr=5.4148e-05) (hash(x)=50297951) +24300 val loss 5.9399 +24300 val perplexity 379.9096 +24300 train 5.606017 (lr=2.8635e-05) (hash(x)=43604814) +26000 val loss 6.0766 +26000 val perplexity 435.5415 +26000 train 6.209918 (lr=3.6713e-05) (hash(x)=51134577) +25500 val loss 6.2826 +25500 val perplexity 535.1839 +25500 train 6.111054 (lr=5.3865e-05) (hash(x)=49942718) +24400 val loss 5.9440 +24400 val perplexity 381.4422 +24400 train 5.755672 (lr=2.8493e-05) (hash(x)=48143655) +26100 val loss 6.0689 +26100 val perplexity 432.1848 +26100 train 5.996790 (lr=3.6514e-05) (hash(x)=53677080) +25600 val loss 6.2769 +25600 val perplexity 532.1389 +25600 train 6.146551 (lr=5.3581e-05) (hash(x)=46944979) +24500 val loss 5.9383 +24500 val perplexity 379.2902 +24500 train 5.992229 (lr=2.8351e-05) (hash(x)=58317397) +26200 val loss 6.0752 +26200 val perplexity 434.9546 +26200 train 5.937295 (lr=3.6316e-05) (hash(x)=48885091) +25700 val loss 6.2790 +25700 val perplexity 533.2624 +24600 val loss 5.9361 +24600 val perplexity 378.4429 +25700 train 6.553234 (lr=5.3297e-05) (hash(x)=48116500) +24600 train 6.060189 (lr=2.8210e-05) (hash(x)=53855425) +26300 val loss 6.0637 +26300 val perplexity 429.9825 +26300 train 6.134348 (lr=3.6118e-05) (hash(x)=57217953) +24700 val loss 5.9514 +24700 val perplexity 384.2784 +24700 train 5.756442 (lr=2.8068e-05) (hash(x)=49829785) +25800 val loss 6.2791 +25800 val perplexity 533.2939 +25800 train 6.653417 (lr=5.3013e-05) (hash(x)=59750893) +26400 val loss 6.0595 +26400 val perplexity 428.1722 +26400 train 5.841461 (lr=3.5920e-05) (hash(x)=48504109) +24800 val loss 5.9219 +24800 val perplexity 373.1087 +25900 val loss 6.2764 +25900 val perplexity 531.8708 +24800 train 5.968436 (lr=2.7926e-05) (hash(x)=51337298) +25900 train 6.000820 (lr=5.2730e-05) (hash(x)=49479938) +26500 val loss 6.0597 +26500 val perplexity 428.2265 +26500 train 6.635356 (lr=3.5722e-05) (hash(x)=55775918) +24900 val loss 5.9406 +24900 val perplexity 380.1450 +24900 train 5.984793 (lr=2.7784e-05) (hash(x)=47385279) +26000 val loss 6.2688 +26000 val perplexity 527.8687 +26000 train 6.396154 (lr=5.2446e-05) (hash(x)=51134577) +26600 val loss 6.0557 +26600 val perplexity 426.5464 +26600 train 6.022480 (lr=3.5524e-05) (hash(x)=51978817) +25000 val loss 5.9243 +25000 val perplexity 374.0070 +25000 train 6.377190 (lr=2.7642e-05) (hash(x)=53631975) +26100 val loss 6.2779 +26100 val perplexity 532.6611 +26100 train 6.178829 (lr=5.2163e-05) (hash(x)=53677080) +26700 val loss 6.0598 +26700 val perplexity 428.2859 +26700 train 5.925630 (lr=3.5326e-05) (hash(x)=50995814) +25100 val loss 5.9203 +25100 val perplexity 372.5377 +25100 train 5.939419 (lr=2.7500e-05) (hash(x)=52991852) +26200 val loss 6.2773 +26200 val perplexity 532.3508 +26800 val loss 6.0507 +26800 val perplexity 424.4202 +26200 train 6.118965 (lr=5.1880e-05) (hash(x)=48885091) +26800 train 5.966284 (lr=3.5128e-05) (hash(x)=49986643) +26900 val loss 6.0512 +26900 val perplexity 424.6243 +26900 train 6.157857 (lr=3.4931e-05) (hash(x)=51408788) +25200 val loss 5.9152 +25200 val perplexity 370.6218 +25200 train 6.300425 (lr=2.7358e-05) (hash(x)=51863290) +26300 val loss 6.2773 +26300 val perplexity 532.3270 +26300 train 6.354596 (lr=5.1597e-05) (hash(x)=57217953) +27000 val loss 6.0416 +27000 val perplexity 420.5643 +27000 train 5.814680 (lr=3.4733e-05) (hash(x)=49024980) +25300 val loss 5.9153 +25300 val perplexity 370.6690 +26400 val loss 6.2635 +26400 val perplexity 525.0340 +25300 train 5.684647 (lr=2.7216e-05) (hash(x)=46898810) +26400 train 6.056812 (lr=5.1314e-05) (hash(x)=48504109) +27100 val loss 6.0444 +27100 val perplexity 421.7439 +27100 train 6.281505 (lr=3.4536e-05) (hash(x)=45027677) +26500 val loss 6.2579 +26500 val perplexity 522.1194 +26500 train 6.795272 (lr=5.1031e-05) (hash(x)=55775918) +25400 val loss 5.9204 +25400 val perplexity 372.5515 +25400 train 6.009842 (lr=2.7074e-05) (hash(x)=50297951) +27200 val loss 6.0484 +27200 val perplexity 423.4525 +27200 train 6.103629 (lr=3.4339e-05) (hash(x)=49930937) +26600 val loss 6.2578 +26600 val perplexity 522.0460 +26600 train 6.216881 (lr=5.0748e-05) (hash(x)=51978817) +25500 val loss 5.9114 +25500 val perplexity 369.2381 +25500 train 5.744133 (lr=2.6932e-05) (hash(x)=49942718) +27300 val loss 6.0353 +27300 val perplexity 417.9277 +27300 train 5.947425 (lr=3.4142e-05) (hash(x)=51016030) +26700 val loss 6.2640 +26700 val perplexity 525.3312 +26700 train 6.130256 (lr=5.0466e-05) (hash(x)=50995814) +25600 val loss 5.9158 +25600 val perplexity 370.8578 +25600 train 5.784765 (lr=2.6790e-05) (hash(x)=46944979) +27400 val loss 6.0351 +27400 val perplexity 417.8452 +27400 train 6.264123 (lr=3.3946e-05) (hash(x)=48154914) +26800 val loss 6.2422 +26800 val perplexity 513.9755 +26800 train 6.156603 (lr=5.0183e-05) (hash(x)=49986643) +25700 val loss 5.9091 +25700 val perplexity 368.3661 +25700 train 6.181494 (lr=2.6649e-05) (hash(x)=48116500) +27500 val loss 6.0305 +27500 val perplexity 415.9289 +27500 train 6.128118 (lr=3.3749e-05) (hash(x)=53265478) +26900 val loss 6.2369 +26900 val perplexity 511.2696 +26900 train 6.337112 (lr=4.9901e-05) (hash(x)=51408788) +25800 val loss 5.9088 +25800 val perplexity 368.2530 +25800 train 6.189940 (lr=2.6507e-05) (hash(x)=59750893) +27600 val loss 6.0527 +27600 val perplexity 425.2629 +27600 train 5.856176 (lr=3.3553e-05) (hash(x)=39456920) +27000 val loss 6.2410 +27000 val perplexity 513.3475 +27000 train 6.025411 (lr=4.9619e-05) (hash(x)=49024980) +25900 val loss 5.9107 +25900 val perplexity 368.9599 +25900 train 5.638449 (lr=2.6365e-05) (hash(x)=49479938) +27700 val loss 6.0435 +27700 val perplexity 421.3598 +27700 train 6.216787 (lr=3.3357e-05) (hash(x)=55794844) +27100 val loss 6.2409 +27100 val perplexity 513.3365 +27100 train 6.460735 (lr=4.9337e-05) (hash(x)=45027677) +26000 val loss 5.9132 +26000 val perplexity 369.8796 +26000 train 6.063453 (lr=2.6223e-05) (hash(x)=51134577) +27800 val loss 6.0393 +27800 val perplexity 419.5840 +27800 train 5.958351 (lr=3.3161e-05) (hash(x)=49091362) +27200 val loss 6.2371 +27200 val perplexity 511.3715 +26100 val loss 5.9118 +26100 val perplexity 369.3658 +27200 train 6.283369 (lr=4.9056e-05) (hash(x)=49930937) +26100 train 5.850782 (lr=2.6082e-05) (hash(x)=53677080) +27900 val loss 6.0371 +27900 val perplexity 418.6732 +27900 train 5.852857 (lr=3.2965e-05) (hash(x)=49494590) +26200 val loss 5.9137 +26200 val perplexity 370.0862 +26200 train 5.798280 (lr=2.5940e-05) (hash(x)=48885091) +27300 val loss 6.2352 +27300 val perplexity 510.3822 +27300 train 6.146107 (lr=4.8775e-05) (hash(x)=51016030) +28000 val loss 6.0347 +28000 val perplexity 417.6547 +28000 train 5.417509 (lr=3.2769e-05) (hash(x)=42333356) +26300 val loss 5.9087 +26300 val perplexity 368.2252 +27400 val loss 6.2294 +27400 val perplexity 507.4462 +26300 train 5.979610 (lr=2.5798e-05) (hash(x)=57217953) +27400 train 6.446771 (lr=4.8494e-05) (hash(x)=48154914) +28100 val loss 6.0293 +28100 val perplexity 415.4191 +28100 train 6.210233 (lr=3.2574e-05) (hash(x)=49378958) +26400 val loss 5.9082 +26400 val perplexity 368.0288 +26400 train 5.696656 (lr=2.5657e-05) (hash(x)=48504109) +27500 val loss 6.2373 +27500 val perplexity 511.4503 +27500 train 6.334397 (lr=4.8213e-05) (hash(x)=53265478) +28200 val loss 6.0243 +28200 val perplexity 413.3595 +28200 train 6.198831 (lr=3.2379e-05) (hash(x)=56595308) +26500 val loss 5.8980 +26500 val perplexity 364.3212 +26500 train 6.510257 (lr=2.5515e-05) (hash(x)=55775918) +27600 val loss 6.2322 +27600 val perplexity 508.8639 +27600 train 6.047086 (lr=4.7932e-05) (hash(x)=39456920) +28300 val loss 6.0461 +28300 val perplexity 422.4748 +28300 train 6.702070 (lr=3.2184e-05) (hash(x)=52311024) +26600 val loss 5.8978 +26600 val perplexity 364.2238 +26600 train 5.865256 (lr=2.5374e-05) (hash(x)=51978817) +27700 val loss 6.2495 +27700 val perplexity 517.7338 +27700 train 6.436790 (lr=4.7652e-05) (hash(x)=55794844) +28400 val loss 6.0277 +28400 val perplexity 414.7752 +28400 train 6.030992 (lr=3.1990e-05) (hash(x)=47997766) +26700 val loss 5.9000 +26700 val perplexity 365.0312 +26700 train 5.699328 (lr=2.5233e-05) (hash(x)=50995814) +27800 val loss 6.2441 +27800 val perplexity 514.9801 +27800 train 6.175504 (lr=4.7372e-05) (hash(x)=49091362) +28500 val loss 6.0278 +28500 val perplexity 414.7870 +28500 train 5.981230 (lr=3.1795e-05) (hash(x)=46906589) +26800 val loss 5.8952 +26800 val perplexity 363.3040 +26800 train 5.803399 (lr=2.5092e-05) (hash(x)=49986643) +27900 val loss 6.2364 +27900 val perplexity 511.0261 +28600 val loss 6.0144 +28600 val perplexity 409.3001 +27900 train 6.035953 (lr=4.7093e-05) (hash(x)=49494590) +28600 train 5.802358 (lr=3.1601e-05) (hash(x)=44172312) +26900 val loss 5.8867 +26900 val perplexity 360.2006 +26900 train 6.010286 (lr=2.4951e-05) (hash(x)=51408788) +28700 val loss 6.0143 +28700 val perplexity 409.2209 +28700 train 6.113423 (lr=3.1408e-05) (hash(x)=50435687) +28000 val loss 6.2374 +28000 val perplexity 511.5454 +28000 train 5.639122 (lr=4.6813e-05) (hash(x)=42333356) +27000 val loss 5.8821 +27000 val perplexity 358.5791 +27000 train 5.639222 (lr=2.4810e-05) (hash(x)=49024980) +28800 val loss 6.0265 +28800 val perplexity 414.2587 +28800 train 5.955416 (lr=3.1214e-05) (hash(x)=53678477) +28100 val loss 6.2366 +28100 val perplexity 511.1124 +28100 train 6.408118 (lr=4.6534e-05) (hash(x)=49378958) +27100 val loss 5.8795 +27100 val perplexity 357.6401 +27100 train 6.145059 (lr=2.4669e-05) (hash(x)=45027677) +28900 val loss 6.0131 +28900 val perplexity 408.7520 +28900 train 5.830149 (lr=3.1021e-05) (hash(x)=47353092) +28200 val loss 6.2355 +28200 val perplexity 510.5514 +28200 train 6.413532 (lr=4.6256e-05) (hash(x)=56595308) +27200 val loss 5.8851 +27200 val perplexity 359.6531 +27200 train 5.970792 (lr=2.4528e-05) (hash(x)=49930937) +29000 val loss 6.0132 +29000 val perplexity 408.7924 +29000 train 6.236730 (lr=3.0828e-05) (hash(x)=56723899) +28300 val loss 6.2460 +28300 val perplexity 515.9637 +28300 train 6.775725 (lr=4.5977e-05) (hash(x)=52311024) +29100 val loss 6.0087 +29100 val perplexity 406.9632 +27300 val loss 5.8865 +27300 val perplexity 360.1448 +29100 train 6.015871 (lr=3.0635e-05) (hash(x)=56056600) +27300 train 5.807554 (lr=2.4387e-05) (hash(x)=51016030) +28400 val loss 6.2328 +28400 val perplexity 509.1544 +28400 train 6.245226 (lr=4.5700e-05) (hash(x)=47997766) +29200 val loss 6.0170 +29200 val perplexity 410.3560 +29200 train 5.952146 (lr=3.0443e-05) (hash(x)=50538322) +27400 val loss 5.8778 +27400 val perplexity 357.0374 +27400 train 6.118532 (lr=2.4247e-05) (hash(x)=48154914) +28500 val loss 6.2164 +28500 val perplexity 500.9097 +28500 train 6.170032 (lr=4.5422e-05) (hash(x)=46906589) +29300 val loss 6.0125 +29300 val perplexity 408.5009 +29300 train 6.051044 (lr=3.0251e-05) (hash(x)=57261305) +27500 val loss 5.8801 +27500 val perplexity 357.8611 +27500 train 5.987864 (lr=2.4106e-05) (hash(x)=53265478) +28600 val loss 6.2102 +28600 val perplexity 497.8000 +28600 train 6.002185 (lr=4.5145e-05) (hash(x)=44172312) +29400 val loss 6.0143 +29400 val perplexity 409.2464 +29400 train 5.828960 (lr=3.0060e-05) (hash(x)=49014077) +27600 val loss 5.9269 +27600 val perplexity 374.9927 +27600 train 5.704084 (lr=2.3966e-05) (hash(x)=39456920) +28700 val loss 6.2190 +28700 val perplexity 502.2180 +28700 train 6.295090 (lr=4.4868e-05) (hash(x)=50435687) +29500 val loss 6.0043 +29500 val perplexity 405.1839 +29500 train 5.747086 (lr=2.9868e-05) (hash(x)=48698820) +27700 val loss 5.8864 +27700 val perplexity 360.1145 +27700 train 6.045403 (lr=2.3826e-05) (hash(x)=55794844) +28800 val loss 6.2307 +28800 val perplexity 508.0989 +28800 train 6.177702 (lr=4.4592e-05) (hash(x)=53678477) +29600 val loss 6.0036 +29600 val perplexity 404.9002 +29600 train 5.857091 (lr=2.9677e-05) (hash(x)=48327445) +28900 val loss 6.2273 +28900 val perplexity 506.4051 +28900 train 6.015232 (lr=4.4316e-05) (hash(x)=47353092) +27800 val loss 5.8906 +27800 val perplexity 361.6364 +27800 train 5.769773 (lr=2.3686e-05) (hash(x)=49091362) +29700 val loss 6.0087 +29700 val perplexity 406.9378 +29700 train 6.164454 (lr=2.9487e-05) (hash(x)=52084503) +29000 val loss 6.2123 +29000 val perplexity 498.8557 +29000 train 6.440894 (lr=4.4040e-05) (hash(x)=56723899) +27900 val loss 5.8769 +27900 val perplexity 356.7052 +27900 train 5.697681 (lr=2.3546e-05) (hash(x)=49494590) +29800 val loss 6.0307 +29800 val perplexity 416.0104 +29800 train 5.802908 (lr=2.9297e-05) (hash(x)=44402411) +29100 val loss 6.2136 +29100 val perplexity 499.5199 +29100 train 6.206547 (lr=4.3765e-05) (hash(x)=56056600) +28000 val loss 5.8756 +28000 val perplexity 356.2436 +28000 train 5.260273 (lr=2.3407e-05) (hash(x)=42333356) +29900 val loss 6.0100 +29900 val perplexity 407.5032 +29900 train 5.680004 (lr=2.9107e-05) (hash(x)=43932550) +29200 val loss 6.2180 +29200 val perplexity 501.7069 +29200 train 6.145995 (lr=4.3490e-05) (hash(x)=50538322) +28100 val loss 5.8717 +28100 val perplexity 354.8508 +28100 train 6.080104 (lr=2.3267e-05) (hash(x)=49378958) +30000 val loss 5.9983 +30000 val perplexity 402.7516 +30000 train 5.870724 (lr=2.8917e-05) (hash(x)=48250169) +29300 val loss 6.2152 +29300 val perplexity 500.2998 +29300 train 6.333208 (lr=4.3216e-05) (hash(x)=57261305) +28200 val loss 5.8669 +28200 val perplexity 353.1412 +28200 train 6.020795 (lr=2.3128e-05) (hash(x)=56595308) +30100 val loss 5.9937 +30100 val perplexity 400.8860 +30100 train 6.050492 (lr=2.8728e-05) (hash(x)=46727253) +29400 val loss 6.2179 +29400 val perplexity 501.6275 +29400 train 6.051409 (lr=4.2942e-05) (hash(x)=49014077) +28300 val loss 5.8746 +28300 val perplexity 355.8778 +28300 train 6.543456 (lr=2.2989e-05) (hash(x)=52311024) +30200 val loss 5.9869 +30200 val perplexity 398.1633 +30200 train 6.059178 (lr=2.8539e-05) (hash(x)=52740773) +29500 val loss 6.2182 +29500 val perplexity 501.8105 +29500 train 5.973386 (lr=4.2669e-05) (hash(x)=48698820) +28400 val loss 5.8685 +28400 val perplexity 353.7059 +28400 train 5.846119 (lr=2.2850e-05) (hash(x)=47997766) +30300 val loss 5.9918 +30300 val perplexity 400.1261 +30300 train 6.092599 (lr=2.8351e-05) (hash(x)=52481310) +29600 val loss 6.2193 +29600 val perplexity 502.3401 +29600 train 6.098068 (lr=4.2396e-05) (hash(x)=48327445) +28500 val loss 5.8600 +28500 val perplexity 350.7257 +28500 train 5.825363 (lr=2.2711e-05) (hash(x)=46906589) +30400 val loss 6.0039 +30400 val perplexity 405.0247 +30400 train 5.917053 (lr=2.8163e-05) (hash(x)=44942827) +29700 val loss 6.2219 +29700 val perplexity 503.6554 +29700 train 6.411894 (lr=4.2124e-05) (hash(x)=52084503) +28600 val loss 5.8538 +28600 val perplexity 348.5419 +28600 train 5.667266 (lr=2.2572e-05) (hash(x)=44172312) +30500 val loss 5.9957 +30500 val perplexity 401.7155 +30500 train 5.870349 (lr=2.7976e-05) (hash(x)=53118706) +29800 val loss 6.2254 +29800 val perplexity 505.4112 +29800 train 5.972321 (lr=4.1852e-05) (hash(x)=44402411) +28700 val loss 5.8625 +28700 val perplexity 351.6016 +28700 train 5.945757 (lr=2.2434e-05) (hash(x)=50435687) +30600 val loss 5.9892 +30600 val perplexity 399.0977 +30600 train 5.704517 (lr=2.7789e-05) (hash(x)=46278699) +29900 val loss 6.2241 +29900 val perplexity 504.7782 +29900 train 5.888961 (lr=4.1581e-05) (hash(x)=43932550) +28800 val loss 5.8593 +28800 val perplexity 350.4829 +28800 train 5.783760 (lr=2.2296e-05) (hash(x)=53678477) +30700 val loss 5.9854 +30700 val perplexity 397.5801 +30700 train 5.937806 (lr=2.7602e-05) (hash(x)=51949086) +30000 val loss 6.2064 +30000 val perplexity 495.9354 +30000 train 6.103238 (lr=4.1310e-05) (hash(x)=48250169) +28900 val loss 5.8612 +28900 val perplexity 351.1393 +28900 train 5.676228 (lr=2.2158e-05) (hash(x)=47353092) +30800 val loss 5.9892 +30800 val perplexity 399.1051 +30800 train 5.973952 (lr=2.7416e-05) (hash(x)=50575709) +30100 val loss 6.2129 +30100 val perplexity 499.1432 +30100 train 6.235704 (lr=4.1040e-05) (hash(x)=46727253) +29000 val loss 5.8532 +29000 val perplexity 348.3626 +29000 train 6.064645 (lr=2.2020e-05) (hash(x)=56723899) +30900 val loss 5.9934 +30900 val perplexity 400.7560 +30900 train 5.830714 (lr=2.7230e-05) (hash(x)=53992116) +30200 val loss 6.2046 +30200 val perplexity 495.0435 +30200 train 6.227543 (lr=4.0771e-05) (hash(x)=52740773) +29100 val loss 5.8571 +29100 val perplexity 349.7240 +29100 train 5.874152 (lr=2.1882e-05) (hash(x)=56056600) +31000 val loss 5.9884 +31000 val perplexity 398.7793 +31000 train 5.784818 (lr=2.7045e-05) (hash(x)=44967737) +30300 val loss 6.1977 +30300 val perplexity 491.6121 +30300 train 6.350532 (lr=4.0502e-05) (hash(x)=52481310) +29200 val loss 5.8520 +29200 val perplexity 347.9445 +29200 train 5.807312 (lr=2.1745e-05) (hash(x)=50538322) +31100 val loss 5.9902 +31100 val perplexity 399.4781 +31100 train 5.687879 (lr=2.6860e-05) (hash(x)=41850245) +30400 val loss 6.2095 +30400 val perplexity 497.4491 +30400 train 6.109884 (lr=4.0233e-05) (hash(x)=44942827) +29300 val loss 5.8500 +29300 val perplexity 347.2274 +29300 train 5.855346 (lr=2.1608e-05) (hash(x)=57261305) +31200 val loss 5.9812 +31200 val perplexity 395.9185 +31200 train 5.836033 (lr=2.6675e-05) (hash(x)=49921656) +30500 val loss 6.1996 +30500 val perplexity 492.5657 +30500 train 6.063776 (lr=3.9965e-05) (hash(x)=53118706) +29400 val loss 5.8523 +29400 val perplexity 348.0262 +29400 train 5.677557 (lr=2.1471e-05) (hash(x)=49014077) +31300 val loss 5.9833 +31300 val perplexity 396.7625 +31300 train 5.862272 (lr=2.6491e-05) (hash(x)=47068006) +30600 val loss 6.1976 +30600 val perplexity 491.5683 +30600 train 5.907681 (lr=3.9698e-05) (hash(x)=46278699) +29500 val loss 5.8536 +29500 val perplexity 348.5013 +29500 train 5.582386 (lr=2.1335e-05) (hash(x)=48698820) +31400 val loss 5.9852 +31400 val perplexity 397.5016 +31400 train 6.159078 (lr=2.6308e-05) (hash(x)=52608679) +30700 val loss 6.1976 +30700 val perplexity 491.5591 +30700 train 6.130133 (lr=3.9431e-05) (hash(x)=51949086) +29600 val loss 5.8484 +29600 val perplexity 346.6938 +29600 train 5.701379 (lr=2.1198e-05) (hash(x)=48327445) +31500 val loss 5.9958 +31500 val perplexity 401.7412 +31500 train 5.899184 (lr=2.6125e-05) (hash(x)=51542050) +30800 val loss 6.1912 +30800 val perplexity 488.4259 +30800 train 6.164798 (lr=3.9165e-05) (hash(x)=50575709) +31600 val loss 5.9843 +31600 val perplexity 397.1341 +31600 train 5.896556 (lr=2.5942e-05) (hash(x)=49170260) +29700 val loss 5.8503 +29700 val perplexity 347.3433 +29700 train 6.005876 (lr=2.1062e-05) (hash(x)=52084503) +30900 val loss 6.1910 +30900 val perplexity 488.3528 +30900 train 6.037580 (lr=3.8900e-05) (hash(x)=53992116) +31700 val loss 5.9763 +31700 val perplexity 393.9861 +31700 train 6.023350 (lr=2.5760e-05) (hash(x)=57677051) +29800 val loss 5.8533 +29800 val perplexity 348.3767 +29800 train 5.676028 (lr=2.0926e-05) (hash(x)=44402411) +31000 val loss 6.1961 +31000 val perplexity 490.8546 +31000 train 6.038806 (lr=3.8635e-05) (hash(x)=44967737) +31800 val loss 5.9717 +31800 val perplexity 392.1629 +31800 train 6.152301 (lr=2.5579e-05) (hash(x)=52567943) +29900 val loss 5.8530 +29900 val perplexity 348.2892 +29900 train 5.540175 (lr=2.0791e-05) (hash(x)=43932550) +31100 val loss 6.1820 +31100 val perplexity 483.9361 +31100 train 5.881097 (lr=3.8371e-05) (hash(x)=41850245) +31900 val loss 5.9706 +31900 val perplexity 391.7267 +31900 train 6.161015 (lr=2.5398e-05) (hash(x)=51246750) +30000 val loss 5.8419 +30000 val perplexity 344.4402 +30000 train 5.702406 (lr=2.0655e-05) (hash(x)=48250169) +31200 val loss 6.1892 +31200 val perplexity 487.4655 +31200 train 6.056690 (lr=3.8108e-05) (hash(x)=49921656) +32000 val loss 5.9709 +32000 val perplexity 391.8543 +32000 train 6.447259 (lr=2.5218e-05) (hash(x)=63256210) +30100 val loss 5.8364 +30100 val perplexity 342.5384 +30100 train 5.934769 (lr=2.0520e-05) (hash(x)=46727253) +31300 val loss 6.1902 +31300 val perplexity 487.9301 +31300 train 6.051952 (lr=3.7845e-05) (hash(x)=47068006) +32100 val loss 5.9726 +32100 val perplexity 392.5269 +32100 train 6.162525 (lr=2.5038e-05) (hash(x)=52238178) +30200 val loss 5.8326 +30200 val perplexity 341.2585 +30200 train 5.895270 (lr=2.0385e-05) (hash(x)=52740773) +31400 val loss 6.1884 +31400 val perplexity 487.0724 +31400 train 6.287651 (lr=3.7583e-05) (hash(x)=52608679) +32200 val loss 5.9707 +32200 val perplexity 391.7810 +32200 train 6.205959 (lr=2.4858e-05) (hash(x)=56290278) +30300 val loss 5.8379 +30300 val perplexity 343.0556 +30300 train 5.890831 (lr=2.0251e-05) (hash(x)=52481310) +31500 val loss 6.2008 +31500 val perplexity 493.1667 +31500 train 6.076781 (lr=3.7321e-05) (hash(x)=51542050) +32300 val loss 5.9640 +32300 val perplexity 389.1691 +32300 train 6.014713 (lr=2.4679e-05) (hash(x)=48455511) +30400 val loss 5.8431 +30400 val perplexity 344.8558 +30400 train 5.780347 (lr=2.0117e-05) (hash(x)=44942827) +31600 val loss 6.1891 +31600 val perplexity 487.4055 +31600 train 6.056008 (lr=3.7061e-05) (hash(x)=49170260) +32400 val loss 5.9667 +32400 val perplexity 390.2041 +32400 train 5.776800 (lr=2.4501e-05) (hash(x)=42754343) +30500 val loss 5.8344 +30500 val perplexity 341.8611 +30500 train 5.713102 (lr=1.9983e-05) (hash(x)=53118706) +31700 val loss 6.1859 +31700 val perplexity 485.8332 +31700 train 6.241374 (lr=3.6801e-05) (hash(x)=57677051) +32500 val loss 5.9725 +32500 val perplexity 392.4980 +32500 train 5.962894 (lr=2.4323e-05) (hash(x)=51062421) +30600 val loss 5.8376 +30600 val perplexity 342.9691 +30600 train 5.548477 (lr=1.9849e-05) (hash(x)=46278699) +31800 val loss 6.1742 +31800 val perplexity 480.2129 +31800 train 6.333062 (lr=3.6541e-05) (hash(x)=52567943) +32600 val loss 5.9705 +32600 val perplexity 391.7160 +32600 train 5.763828 (lr=2.4146e-05) (hash(x)=43957725) +30700 val loss 5.8322 +30700 val perplexity 341.1252 +30700 train 5.787788 (lr=1.9716e-05) (hash(x)=51949086) +31900 val loss 6.1778 +31900 val perplexity 481.9407 +31900 train 6.360960 (lr=3.6283e-05) (hash(x)=51246750) +32700 val loss 5.9618 +32700 val perplexity 388.3044 +32700 train 5.987890 (lr=2.3970e-05) (hash(x)=55971557) +30800 val loss 5.8322 +30800 val perplexity 341.1127 +30800 train 5.843060 (lr=1.9583e-05) (hash(x)=50575709) +32000 val loss 6.1807 +32000 val perplexity 483.3308 +32000 train 6.709186 (lr=3.6025e-05) (hash(x)=63256210) +32800 val loss 5.9656 +32800 val perplexity 389.7902 +32800 train 5.970448 (lr=2.3794e-05) (hash(x)=49429775) +30900 val loss 5.8312 +30900 val perplexity 340.7760 +30900 train 5.671123 (lr=1.9450e-05) (hash(x)=53992116) +32100 val loss 6.1815 +32100 val perplexity 483.7410 +32100 train 6.340555 (lr=3.5768e-05) (hash(x)=52238178) +32900 val loss 5.9773 +32900 val perplexity 394.3556 +32900 train 5.824573 (lr=2.3618e-05) (hash(x)=49564803) +32200 val loss 6.1679 +32200 val perplexity 477.1887 +32200 train 6.374283 (lr=3.5512e-05) (hash(x)=56290278) +31000 val loss 5.8332 +31000 val perplexity 341.4471 +31000 train 5.659619 (lr=1.9318e-05) (hash(x)=44967737) +33000 val loss 5.9679 +33000 val perplexity 390.7012 +33000 train 5.847964 (lr=2.3443e-05) (hash(x)=46343736) +32300 val loss 6.1604 +32300 val perplexity 473.5974 +32300 train 6.191844 (lr=3.5256e-05) (hash(x)=48455511) +31100 val loss 5.8327 +31100 val perplexity 341.2876 +31100 train 5.544391 (lr=1.9186e-05) (hash(x)=41850245) +33100 val loss 5.9698 +33100 val perplexity 391.4238 +33100 train 5.662004 (lr=2.3269e-05) (hash(x)=45424347) +32400 val loss 6.1660 +32400 val perplexity 476.2751 +32400 train 5.952959 (lr=3.5002e-05) (hash(x)=42754343) +31200 val loss 5.8291 +31200 val perplexity 340.0558 +31200 train 5.683711 (lr=1.9054e-05) (hash(x)=49921656) +33200 val loss 5.9647 +33200 val perplexity 389.4277 +33200 train 6.005509 (lr=2.3095e-05) (hash(x)=50207861) +32500 val loss 6.1714 +32500 val perplexity 478.8346 +32500 train 6.157789 (lr=3.4748e-05) (hash(x)=51062421) +31300 val loss 5.8296 +31300 val perplexity 340.2083 +31300 train 5.687627 (lr=1.8922e-05) (hash(x)=47068006) +33300 val loss 5.9633 +33300 val perplexity 388.8740 +33300 train 5.886430 (lr=2.2922e-05) (hash(x)=45847922) +32600 val loss 6.1692 +32600 val perplexity 477.8271 +32600 train 5.951297 (lr=3.4495e-05) (hash(x)=43957725) +31400 val loss 5.8276 +31400 val perplexity 339.5481 +31400 train 5.998035 (lr=1.8791e-05) (hash(x)=52608679) +33400 val loss 5.9622 +33400 val perplexity 388.4505 +33400 train 5.791804 (lr=2.2750e-05) (hash(x)=44723617) +32700 val loss 6.1683 +32700 val perplexity 477.3633 +32700 train 6.196739 (lr=3.4242e-05) (hash(x)=55971557) +33500 val loss 5.9567 +33500 val perplexity 386.3194 +33500 train 6.093992 (lr=2.2578e-05) (hash(x)=54058855) +31500 val loss 5.8449 +31500 val perplexity 345.4549 +31500 train 5.749641 (lr=1.8661e-05) (hash(x)=51542050) +32800 val loss 6.1635 +32800 val perplexity 475.0682 +32800 train 6.165715 (lr=3.3991e-05) (hash(x)=49429775) +33600 val loss 5.9519 +33600 val perplexity 384.4791 +33600 train 5.929941 (lr=2.2407e-05) (hash(x)=53052866) +31600 val loss 5.8288 +31600 val perplexity 339.9622 +31600 train 5.715558 (lr=1.8530e-05) (hash(x)=49170260) +32900 val loss 6.1666 +32900 val perplexity 476.5743 +32900 train 6.011598 (lr=3.3740e-05) (hash(x)=49564803) +33700 val loss 5.9595 +33700 val perplexity 387.4097 +33700 train 5.876254 (lr=2.2237e-05) (hash(x)=50458295) +31700 val loss 5.8296 +31700 val perplexity 340.2090 +31700 train 5.814413 (lr=1.8400e-05) (hash(x)=57677051) +33000 val loss 6.1660 +33000 val perplexity 476.2631 +33000 train 6.041053 (lr=3.3490e-05) (hash(x)=46343736) +33800 val loss 5.9636 +33800 val perplexity 389.0148 +33800 train 6.121011 (lr=2.2067e-05) (hash(x)=58204533) +31800 val loss 5.8223 +31800 val perplexity 337.7436 +31800 train 5.994304 (lr=1.8271e-05) (hash(x)=52567943) +33900 val loss 5.9465 +33900 val perplexity 382.4198 +33900 train 6.083571 (lr=2.1898e-05) (hash(x)=52246445) +33100 val loss 6.1740 +33100 val perplexity 480.1154 +33100 train 5.879986 (lr=3.3242e-05) (hash(x)=45424347) +31900 val loss 5.8270 +31900 val perplexity 339.3340 +31900 train 6.048852 (lr=1.8141e-05) (hash(x)=51246750) +34000 val loss 5.9464 +34000 val perplexity 382.3709 +34000 train 5.890872 (lr=2.1729e-05) (hash(x)=49316883) +33200 val loss 6.1753 +33200 val perplexity 480.7034 +33200 train 6.173397 (lr=3.2993e-05) (hash(x)=50207861) +32000 val loss 5.8267 +32000 val perplexity 339.2486 +32000 train 6.286895 (lr=1.8013e-05) (hash(x)=63256210) +34100 val loss 5.9498 +34100 val perplexity 383.6810 +34100 train 5.636144 (lr=2.1561e-05) (hash(x)=42991129) +33300 val loss 6.1664 +33300 val perplexity 476.4754 +33300 train 6.078833 (lr=3.2746e-05) (hash(x)=45847922) +32100 val loss 5.8271 +32100 val perplexity 339.3733 +32100 train 6.022761 (lr=1.7884e-05) (hash(x)=52238178) +34200 val loss 5.9550 +34200 val perplexity 385.6611 +34200 train 5.614861 (lr=2.1394e-05) (hash(x)=45199853) +33400 val loss 6.1709 +33400 val perplexity 478.6107 +33400 train 5.986867 (lr=3.2500e-05) (hash(x)=44723617) +32200 val loss 5.8140 +32200 val perplexity 334.9713 +32200 train 6.064962 (lr=1.7756e-05) (hash(x)=56290278) +34300 val loss 5.9525 +34300 val perplexity 384.7282 +34300 train 5.832723 (lr=2.1227e-05) (hash(x)=48404386) +33500 val loss 6.1614 +33500 val perplexity 474.0743 +33500 train 6.271558 (lr=3.2255e-05) (hash(x)=54058855) +32300 val loss 5.8125 +32300 val perplexity 334.4654 +32300 train 5.886790 (lr=1.7628e-05) (hash(x)=48455511) +34400 val loss 5.9482 +34400 val perplexity 383.0553 +34400 train 5.740652 (lr=2.1062e-05) (hash(x)=45614997) +33600 val loss 6.1572 +33600 val perplexity 472.1153 +33600 train 6.153094 (lr=3.2010e-05) (hash(x)=53052866) +32400 val loss 5.8132 +32400 val perplexity 334.7010 +32400 train 5.625381 (lr=1.7501e-05) (hash(x)=42754343) +33700 val loss 6.1547 +33700 val perplexity 470.9484 +33700 train 6.073953 (lr=3.1767e-05) (hash(x)=50458295) +34500 val loss 5.9496 +34500 val perplexity 383.6016 +34500 train 5.980463 (lr=2.0896e-05) (hash(x)=53157684) +32500 val loss 5.8101 +32500 val perplexity 333.6473 +32500 train 5.810159 (lr=1.7374e-05) (hash(x)=51062421) +34600 val loss 5.9588 +34600 val perplexity 387.1570 +34600 train 6.175712 (lr=2.0732e-05) (hash(x)=57152287) +33800 val loss 6.1629 +33800 val perplexity 474.8112 +33800 train 6.277860 (lr=3.1524e-05) (hash(x)=58204533) +32600 val loss 5.8166 +32600 val perplexity 335.8415 +32600 train 5.610869 (lr=1.7247e-05) (hash(x)=43957725) +34700 val loss 5.9519 +34700 val perplexity 384.4753 +34700 train 5.406115 (lr=2.0568e-05) (hash(x)=46088213) +33900 val loss 6.1466 +33900 val perplexity 467.1330 +33900 train 6.256472 (lr=3.1282e-05) (hash(x)=52246445) +32700 val loss 5.8128 +32700 val perplexity 334.5435 +32700 train 5.838300 (lr=1.7121e-05) (hash(x)=55971557) +34800 val loss 5.9471 +34800 val perplexity 382.6555 +34800 train 5.693446 (lr=2.0405e-05) (hash(x)=47098930) +34000 val loss 6.1448 +34000 val perplexity 466.2920 +34000 train 6.076751 (lr=3.1041e-05) (hash(x)=49316883) +34900 val loss 5.9476 +34900 val perplexity 382.8204 +34900 train 5.945630 (lr=2.0243e-05) (hash(x)=51033119) +32800 val loss 5.8090 +32800 val perplexity 333.2708 +32800 train 5.821727 (lr=1.6995e-05) (hash(x)=49429775) +34100 val loss 6.1457 +34100 val perplexity 466.7285 +34100 train 5.839543 (lr=3.0802e-05) (hash(x)=42991129) +35000 val loss 5.9482 +35000 val perplexity 383.0675 +35000 train 5.622958 (lr=2.0081e-05) (hash(x)=47111881) +32900 val loss 5.8096 +32900 val perplexity 333.4849 +32900 train 5.661140 (lr=1.6870e-05) (hash(x)=49564803) +34200 val loss 6.1458 +34200 val perplexity 466.7634 +34200 train 5.827536 (lr=3.0563e-05) (hash(x)=45199853) +35100 val loss 5.9462 +35100 val perplexity 382.2891 +35100 train 5.967577 (lr=1.9921e-05) (hash(x)=52899306) +34300 val loss 6.1517 +34300 val perplexity 469.5308 +34300 train 6.048966 (lr=3.0325e-05) (hash(x)=48404386) +33000 val loss 5.8099 +33000 val perplexity 333.5746 +33000 train 5.681557 (lr=1.6745e-05) (hash(x)=46343736) +35200 val loss 5.9419 +35200 val perplexity 380.6734 +35200 train 5.869995 (lr=1.9760e-05) (hash(x)=49124834) +34400 val loss 6.1486 +34400 val perplexity 468.0514 +34400 train 5.962949 (lr=3.0088e-05) (hash(x)=45614997) +33100 val loss 5.8131 +33100 val perplexity 334.6570 +33100 train 5.508917 (lr=1.6621e-05) (hash(x)=45424347) +35300 val loss 5.9407 +35300 val perplexity 380.2195 +35300 train 6.108963 (lr=1.9601e-05) (hash(x)=54127282) +34500 val loss 6.1566 +34500 val perplexity 471.8180 +34500 train 6.178240 (lr=2.9852e-05) (hash(x)=53157684) +33200 val loss 5.8256 +33200 val perplexity 338.8622 +33200 train 5.892923 (lr=1.6497e-05) (hash(x)=50207861) +35400 val loss 5.9397 +35400 val perplexity 379.8055 +35400 train 5.801503 (lr=1.9442e-05) (hash(x)=51154858) +34600 val loss 6.1572 +34600 val perplexity 472.1272 +34600 train 6.362251 (lr=2.9617e-05) (hash(x)=57152287) +33300 val loss 5.8135 +33300 val perplexity 334.7842 +33300 train 5.733989 (lr=1.6373e-05) (hash(x)=45847922) +35500 val loss 5.9351 +35500 val perplexity 378.0600 +35500 train 5.985872 (lr=1.9285e-05) (hash(x)=51585534) +34700 val loss 6.1456 +34700 val perplexity 466.6777 +34700 train 5.622260 (lr=2.9383e-05) (hash(x)=46088213) +33400 val loss 5.8126 +33400 val perplexity 334.4757 +33400 train 5.656195 (lr=1.6250e-05) (hash(x)=44723617) +35600 val loss 5.9394 +35600 val perplexity 379.6975 +35600 train 5.750732 (lr=1.9128e-05) (hash(x)=46228695) +34800 val loss 6.1489 +34800 val perplexity 468.2012 +34800 train 5.881421 (lr=2.9150e-05) (hash(x)=47098930) +33500 val loss 5.8089 +33500 val perplexity 333.2426 +33500 train 5.919910 (lr=1.6127e-05) (hash(x)=54058855) +35700 val loss 5.9294 +35700 val perplexity 375.9344 +35700 train 5.940912 (lr=1.8971e-05) (hash(x)=49080481) +34900 val loss 6.1519 +34900 val perplexity 469.5917 +34900 train 6.178014 (lr=2.8918e-05) (hash(x)=51033119) +33600 val loss 5.8011 +33600 val perplexity 330.6728 +33600 train 5.768274 (lr=1.6005e-05) (hash(x)=53052866) +35800 val loss 5.9297 +35800 val perplexity 376.0268 +35800 train 6.078786 (lr=1.8816e-05) (hash(x)=56743084) +35000 val loss 6.1568 +35000 val perplexity 471.9262 +35000 train 5.839863 (lr=2.8688e-05) (hash(x)=47111881) +33700 val loss 5.8030 +33700 val perplexity 331.2847 +33700 train 5.727721 (lr=1.5883e-05) (hash(x)=50458295) +35900 val loss 5.9304 +35900 val perplexity 376.3233 +35900 train 6.033379 (lr=1.8661e-05) (hash(x)=53068229) +35100 val loss 6.1500 +35100 val perplexity 468.7335 +35100 train 6.190754 (lr=2.8458e-05) (hash(x)=52899306) +33800 val loss 5.8021 +33800 val perplexity 331.0087 +33800 train 5.948277 (lr=1.5762e-05) (hash(x)=58204533) +36000 val loss 5.9267 +36000 val perplexity 374.9042 +36000 train 6.096465 (lr=1.8507e-05) (hash(x)=54363049) +35200 val loss 6.1501 +35200 val perplexity 468.7583 +35200 train 6.098011 (lr=2.8229e-05) (hash(x)=49124834) +36100 val loss 5.9290 +36100 val perplexity 375.7826 +36100 train 5.854910 (lr=1.8354e-05) (hash(x)=50555767) +33900 val loss 5.7948 +33900 val perplexity 328.6017 +33900 train 5.935129 (lr=1.5641e-05) (hash(x)=52246445) +35300 val loss 6.1463 +35300 val perplexity 466.9794 +35300 train 6.313648 (lr=2.8002e-05) (hash(x)=54127282) +36200 val loss 5.9271 +36200 val perplexity 375.0605 +36200 train 6.072759 (lr=1.8201e-05) (hash(x)=52103710) +34000 val loss 5.7918 +34000 val perplexity 327.5931 +34000 train 5.721344 (lr=1.5521e-05) (hash(x)=49316883) +35400 val loss 6.1389 +35400 val perplexity 463.5280 +35400 train 5.998002 (lr=2.7775e-05) (hash(x)=51154858) +36300 val loss 5.9308 +36300 val perplexity 376.4374 +36300 train 5.712657 (lr=1.8050e-05) (hash(x)=43347374) +34100 val loss 5.7962 +34100 val perplexity 329.0480 +34100 train 5.451437 (lr=1.5401e-05) (hash(x)=42991129) +35500 val loss 6.1332 +35500 val perplexity 460.9086 +35500 train 6.169389 (lr=2.7549e-05) (hash(x)=51585534) +36400 val loss 5.9217 +36400 val perplexity 373.0446 +36400 train 5.978114 (lr=1.7899e-05) (hash(x)=52661268) +34200 val loss 5.7939 +34200 val perplexity 328.2848 +34200 train 5.440014 (lr=1.5281e-05) (hash(x)=45199853) +35600 val loss 6.1330 +35600 val perplexity 460.8136 +35600 train 5.942529 (lr=2.7325e-05) (hash(x)=46228695) +36500 val loss 5.9290 +36500 val perplexity 375.7711 +36500 train 5.829330 (lr=1.7749e-05) (hash(x)=47765383) +34300 val loss 5.7996 +34300 val perplexity 330.1776 +34300 train 5.690372 (lr=1.5162e-05) (hash(x)=48404386) +35700 val loss 6.1292 +35700 val perplexity 459.0685 +35700 train 6.098458 (lr=2.7102e-05) (hash(x)=49080481) +36600 val loss 5.9310 +36600 val perplexity 376.5367 +36600 train 5.769433 (lr=1.7600e-05) (hash(x)=49302411) +34400 val loss 5.7971 +34400 val perplexity 329.3420 +34400 train 5.594779 (lr=1.5044e-05) (hash(x)=45614997) +35800 val loss 6.1308 +35800 val perplexity 459.8204 +35800 train 6.276246 (lr=2.6880e-05) (hash(x)=56743084) +36700 val loss 5.9329 +36700 val perplexity 377.2295 +36700 train 5.456398 (lr=1.7452e-05) (hash(x)=44113066) +34500 val loss 5.8002 +34500 val perplexity 330.3516 +34500 train 5.815750 (lr=1.4926e-05) (hash(x)=53157684) +35900 val loss 6.1383 +35900 val perplexity 463.2589 +35900 train 6.221340 (lr=2.6659e-05) (hash(x)=53068229) +36800 val loss 5.9301 +36800 val perplexity 376.1776 +36800 train 5.837457 (lr=1.7305e-05) (hash(x)=55896927) +34600 val loss 5.8000 +34600 val perplexity 330.2973 +34600 train 6.033842 (lr=1.4809e-05) (hash(x)=57152287) +36000 val loss 6.1363 +36000 val perplexity 462.3354 +36000 train 6.289048 (lr=2.6439e-05) (hash(x)=54363049) +36900 val loss 5.9274 +36900 val perplexity 375.1843 +36900 train 5.742035 (lr=1.7158e-05) (hash(x)=49812554) +34700 val loss 5.7947 +34700 val perplexity 328.5693 +34700 train 5.238037 (lr=1.4692e-05) (hash(x)=46088213) +36100 val loss 6.1482 +36100 val perplexity 467.8798 +36100 train 6.052368 (lr=2.6220e-05) (hash(x)=50555767) +37000 val loss 5.9316 +37000 val perplexity 376.7454 +37000 train 5.750156 (lr=1.7012e-05) (hash(x)=49391771) +34800 val loss 5.7920 +34800 val perplexity 327.6733 +34800 train 5.528825 (lr=1.4575e-05) (hash(x)=47098930) +36200 val loss 6.1339 +36200 val perplexity 461.2412 +36200 train 6.267734 (lr=2.6002e-05) (hash(x)=52103710) +37100 val loss 5.9311 +37100 val perplexity 376.5597 +37100 train 5.879148 (lr=1.6867e-05) (hash(x)=50515328) +34900 val loss 5.7981 +34900 val perplexity 329.6606 +34900 train 5.806500 (lr=1.4459e-05) (hash(x)=51033119) +36300 val loss 6.1314 +36300 val perplexity 460.0860 +36300 train 5.908183 (lr=2.5786e-05) (hash(x)=43347374) +37200 val loss 5.9214 +37200 val perplexity 372.9274 +37200 train 6.117315 (lr=1.6723e-05) (hash(x)=57788407) +35000 val loss 5.7945 +35000 val perplexity 328.4980 +35000 train 5.473814 (lr=1.4344e-05) (hash(x)=47111881) +36400 val loss 6.1271 +36400 val perplexity 458.0939 +36400 train 6.201708 (lr=2.5570e-05) (hash(x)=52661268) +37300 val loss 5.9186 +37300 val perplexity 371.8936 +37300 train 5.694643 (lr=1.6580e-05) (hash(x)=46333526) +35100 val loss 5.7934 +35100 val perplexity 328.1255 +35100 train 5.811529 (lr=1.4229e-05) (hash(x)=52899306) +36500 val loss 6.1314 +36500 val perplexity 460.0856 +36500 train 6.002098 (lr=2.5356e-05) (hash(x)=47765383) +37400 val loss 5.9194 +37400 val perplexity 372.1934 +37400 train 5.719808 (lr=1.6438e-05) (hash(x)=45862038) +35200 val loss 5.7948 +35200 val perplexity 328.5847 +35200 train 5.748470 (lr=1.4115e-05) (hash(x)=49124834) +36600 val loss 6.1339 +36600 val perplexity 461.2357 +36600 train 5.971117 (lr=2.5143e-05) (hash(x)=49302411) +37500 val loss 5.9198 +37500 val perplexity 372.3443 +37500 train 5.635836 (lr=1.6297e-05) (hash(x)=46543051) +35300 val loss 5.7896 +35300 val perplexity 326.8839 +35300 train 5.945773 (lr=1.4001e-05) (hash(x)=54127282) +36700 val loss 6.1357 +36700 val perplexity 462.0757 +36700 train 5.676261 (lr=2.4931e-05) (hash(x)=44113066) +37600 val loss 5.9128 +37600 val perplexity 369.7480 +37600 train 5.879233 (lr=1.6156e-05) (hash(x)=51314450) +35400 val loss 5.8002 +35400 val perplexity 330.3776 +35400 train 5.669840 (lr=1.3887e-05) (hash(x)=51154858) +36800 val loss 6.1364 +36800 val perplexity 462.4066 +36800 train 6.062473 (lr=2.4721e-05) (hash(x)=55896927) +37700 val loss 5.9144 +37700 val perplexity 370.3425 +37700 train 5.778914 (lr=1.6016e-05) (hash(x)=46706763) +35500 val loss 5.7805 +35500 val perplexity 323.9211 +35500 train 5.829420 (lr=1.3775e-05) (hash(x)=51585534) +36900 val loss 6.1396 +36900 val perplexity 463.8754 +36900 train 5.966776 (lr=2.4511e-05) (hash(x)=49812554) +37800 val loss 5.9107 +37800 val perplexity 368.9532 +37800 train 5.765634 (lr=1.5878e-05) (hash(x)=45793085) +35600 val loss 5.7790 +35600 val perplexity 323.4417 +35600 train 5.583309 (lr=1.3663e-05) (hash(x)=46228695) +37000 val loss 6.1361 +37000 val perplexity 462.2492 +37000 train 5.911458 (lr=2.4303e-05) (hash(x)=49391771) +37900 val loss 5.9105 +37900 val perplexity 368.8726 +37900 train 5.771966 (lr=1.5740e-05) (hash(x)=49969493) +35700 val loss 5.7855 +35700 val perplexity 325.5457 +37100 val loss 6.1279 +37100 val perplexity 458.4543 +35700 train 5.764114 (lr=1.3551e-05) (hash(x)=49080481) +37100 train 6.064731 (lr=2.4096e-05) (hash(x)=50515328) +38000 val loss 5.9120 +38000 val perplexity 369.4412 +38000 train 5.700391 (lr=1.5603e-05) (hash(x)=47184489) +37200 val loss 6.1302 +37200 val perplexity 459.5096 +37200 train 6.320570 (lr=2.3890e-05) (hash(x)=57788407) +35800 val loss 5.7784 +35800 val perplexity 323.2407 +35800 train 5.946089 (lr=1.3440e-05) (hash(x)=56743084) +38100 val loss 5.9118 +38100 val perplexity 369.3829 +38100 train 6.399194 (lr=1.5467e-05) (hash(x)=54905742) +37300 val loss 6.1166 +37300 val perplexity 453.3305 +37300 train 5.894850 (lr=2.3686e-05) (hash(x)=46333526) +35900 val loss 5.7812 +35900 val perplexity 324.1445 +35900 train 5.871631 (lr=1.3329e-05) (hash(x)=53068229) +38200 val loss 5.9090 +38200 val perplexity 368.3536 +38200 train 6.178787 (lr=1.5332e-05) (hash(x)=48831719) +37400 val loss 6.1123 +37400 val perplexity 451.3916 +37400 train 5.903808 (lr=2.3483e-05) (hash(x)=45862038) +36000 val loss 5.7791 +36000 val perplexity 323.4611 +36000 train 5.956838 (lr=1.3219e-05) (hash(x)=54363049) +38300 val loss 5.9125 +38300 val perplexity 369.6474 +38300 train 5.759835 (lr=1.5198e-05) (hash(x)=48944700) +37500 val loss 6.1163 +37500 val perplexity 453.2015 +37500 train 5.844558 (lr=2.3281e-05) (hash(x)=46543051) +36100 val loss 5.7844 +36100 val perplexity 325.1889 +36100 train 5.697713 (lr=1.3110e-05) (hash(x)=50555767) +38400 val loss 5.9094 +38400 val perplexity 368.4789 +38400 train 5.544498 (lr=1.5064e-05) (hash(x)=46504848) +37600 val loss 6.1285 +37600 val perplexity 458.7383 +37600 train 6.088625 (lr=2.3080e-05) (hash(x)=51314450) +36200 val loss 5.7789 +36200 val perplexity 323.4190 +36200 train 5.935506 (lr=1.3001e-05) (hash(x)=52103710) +38500 val loss 5.9102 +38500 val perplexity 368.7902 +38500 train 5.647353 (lr=1.4932e-05) (hash(x)=50148948) +37700 val loss 6.1204 +37700 val perplexity 455.0315 +37700 train 5.980180 (lr=2.2881e-05) (hash(x)=46706763) +36300 val loss 5.7805 +36300 val perplexity 323.9157 +36300 train 5.575034 (lr=1.2893e-05) (hash(x)=43347374) +38600 val loss 5.9123 +38600 val perplexity 369.5478 +38600 train 6.689418 (lr=1.4801e-05) (hash(x)=60660913) +37800 val loss 6.1189 +37800 val perplexity 454.3582 +37800 train 5.963765 (lr=2.2682e-05) (hash(x)=45793085) +36400 val loss 5.7763 +36400 val perplexity 322.5495 +38700 val loss 5.9106 +38700 val perplexity 368.9344 +36400 train 5.812431 (lr=1.2785e-05) (hash(x)=52661268) +38700 train 5.705847 (lr=1.4670e-05) (hash(x)=50256252) +37900 val loss 6.1147 +37900 val perplexity 452.4417 +37900 train 5.977998 (lr=2.2485e-05) (hash(x)=49969493) +38800 val loss 5.9057 +38800 val perplexity 367.1224 +38800 train 5.963464 (lr=1.4541e-05) (hash(x)=52592415) +36500 val loss 5.7815 +36500 val perplexity 324.2580 +36500 train 5.695755 (lr=1.2678e-05) (hash(x)=47765383) +38000 val loss 6.1155 +38000 val perplexity 452.8185 +38000 train 5.888125 (lr=2.2290e-05) (hash(x)=47184489) +38900 val loss 5.9059 +38900 val perplexity 367.2014 +38900 train 5.842779 (lr=1.4412e-05) (hash(x)=51945430) +36600 val loss 5.7770 +36600 val perplexity 322.7749 +36600 train 5.621260 (lr=1.2572e-05) (hash(x)=49302411) +38100 val loss 6.1191 +38100 val perplexity 454.4779 +38100 train 6.524765 (lr=2.2096e-05) (hash(x)=54905742) +39000 val loss 5.9003 +39000 val perplexity 365.1581 +39000 train 6.111910 (lr=1.4285e-05) (hash(x)=51838869) +36700 val loss 5.7835 +36700 val perplexity 324.8792 +36700 train 5.340133 (lr=1.2466e-05) (hash(x)=44113066) +38200 val loss 6.1137 +38200 val perplexity 451.9871 +38200 train 6.343554 (lr=2.1903e-05) (hash(x)=48831719) +39100 val loss 5.8984 +39100 val perplexity 364.4516 +39100 train 6.018938 (lr=1.4158e-05) (hash(x)=54556054) +36800 val loss 5.7789 +36800 val perplexity 323.3910 +36800 train 5.646072 (lr=1.2360e-05) (hash(x)=55896927) +38300 val loss 6.1177 +38300 val perplexity 453.8232 +38300 train 5.962441 (lr=2.1711e-05) (hash(x)=48944700) +39200 val loss 5.8986 +39200 val perplexity 364.5089 +39200 train 6.256696 (lr=1.4032e-05) (hash(x)=48915509) +36900 val loss 5.7768 +36900 val perplexity 322.7098 +36900 train 5.590997 (lr=1.2256e-05) (hash(x)=49812554) +38400 val loss 6.1172 +38400 val perplexity 453.5770 +38400 train 5.777618 (lr=2.1521e-05) (hash(x)=46504848) +39300 val loss 5.9054 +39300 val perplexity 367.0246 +39300 train 5.709824 (lr=1.3908e-05) (hash(x)=46121589) +37000 val loss 5.7821 +37000 val perplexity 324.4427 +37000 train 5.613402 (lr=1.2152e-05) (hash(x)=49391771) +38500 val loss 6.1143 +38500 val perplexity 452.2820 +38500 train 5.850760 (lr=2.1332e-05) (hash(x)=50148948) +39400 val loss 5.9016 +39400 val perplexity 365.6397 +39400 train 5.460722 (lr=1.3784e-05) (hash(x)=45556201) +37100 val loss 5.7746 +37100 val perplexity 322.0222 +37100 train 5.731025 (lr=1.2048e-05) (hash(x)=50515328) +38600 val loss 6.1168 +38600 val perplexity 453.4129 +38600 train 6.839215 (lr=2.1144e-05) (hash(x)=60660913) +39500 val loss 5.8980 +39500 val perplexity 364.2948 +39500 train 5.912900 (lr=1.3661e-05) (hash(x)=47014749) +37200 val loss 5.7738 +37200 val perplexity 321.7449 +37200 train 5.990785 (lr=1.1945e-05) (hash(x)=57788407) +38700 val loss 6.1175 +38700 val perplexity 453.7334 +38700 train 5.911247 (lr=2.0957e-05) (hash(x)=50256252) +39600 val loss 5.9021 +39600 val perplexity 365.8049 +39600 train 5.702456 (lr=1.3540e-05) (hash(x)=47395114) +37300 val loss 5.7715 +37300 val perplexity 321.0235 +37300 train 5.544409 (lr=1.1843e-05) (hash(x)=46333526) +38800 val loss 6.1133 +38800 val perplexity 451.8223 +38800 train 6.149075 (lr=2.0773e-05) (hash(x)=52592415) +39700 val loss 5.9055 +39700 val perplexity 367.0372 +39700 train 6.017768 (lr=1.3419e-05) (hash(x)=50858008) +37400 val loss 5.7680 +37400 val perplexity 319.8995 +37400 train 5.567012 (lr=1.1741e-05) (hash(x)=45862038) +38900 val loss 6.1136 +38900 val perplexity 451.9839 +38900 train 6.061025 (lr=2.0589e-05) (hash(x)=51945430) +39800 val loss 5.8998 +39800 val perplexity 364.9705 +39800 train 5.619897 (lr=1.3299e-05) (hash(x)=44096434) +37500 val loss 5.7672 +37500 val perplexity 319.6557 +37500 train 5.480631 (lr=1.1640e-05) (hash(x)=46543051) +39000 val loss 6.1031 +39000 val perplexity 447.2501 +39000 train 6.308416 (lr=2.0407e-05) (hash(x)=51838869) +39900 val loss 5.9007 +39900 val perplexity 365.2943 +39900 train 5.654382 (lr=1.3180e-05) (hash(x)=46902151) +37600 val loss 5.7685 +37600 val perplexity 320.0498 +37600 train 5.724071 (lr=1.1540e-05) (hash(x)=51314450) +39100 val loss 6.1018 +39100 val perplexity 446.6725 +39100 train 6.199195 (lr=2.0226e-05) (hash(x)=54556054) +40000 val loss 5.8979 +40000 val perplexity 364.2790 +40000 train 5.670296 (lr=1.3063e-05) (hash(x)=46735297) +37700 val loss 5.7655 +37700 val perplexity 319.1130 +37700 train 5.618969 (lr=1.1440e-05) (hash(x)=46706763) +39200 val loss 6.1041 +39200 val perplexity 447.7082 +39200 train 6.436305 (lr=2.0046e-05) (hash(x)=48915509) +40100 val loss 5.8982 +40100 val perplexity 364.3735 +40100 train 5.852563 (lr=1.2946e-05) (hash(x)=48035170) +37800 val loss 5.7649 +37800 val perplexity 318.8958 +37800 train 5.625665 (lr=1.1341e-05) (hash(x)=45793085) +39300 val loss 6.1047 +39300 val perplexity 447.9523 +39300 train 5.907686 (lr=1.9868e-05) (hash(x)=46121589) +40200 val loss 5.8964 +40200 val perplexity 363.7285 +40200 train 5.715671 (lr=1.2830e-05) (hash(x)=46651322) +37900 val loss 5.7650 +37900 val perplexity 318.9442 +37900 train 5.611680 (lr=1.1243e-05) (hash(x)=49969493) +39400 val loss 6.0985 +39400 val perplexity 445.2034 +39400 train 5.652236 (lr=1.9692e-05) (hash(x)=45556201) +40300 val loss 5.8976 +40300 val perplexity 364.1667 +40300 train 5.741758 (lr=1.2716e-05) (hash(x)=46378099) +38000 val loss 5.7623 +38000 val perplexity 318.0941 +38000 train 5.559613 (lr=1.1145e-05) (hash(x)=47184489) +39500 val loss 6.1009 +39500 val perplexity 446.2423 +39500 train 6.076194 (lr=1.9516e-05) (hash(x)=47014749) +40400 val loss 5.8924 +40400 val perplexity 362.2703 +40400 train 5.743531 (lr=1.2602e-05) (hash(x)=46495438) +38100 val loss 5.7646 +38100 val perplexity 318.8094 +38100 train 6.227178 (lr=1.1048e-05) (hash(x)=54905742) +39600 val loss 6.1081 +39600 val perplexity 449.4911 +39600 train 5.875370 (lr=1.9342e-05) (hash(x)=47395114) +40500 val loss 5.8888 +40500 val perplexity 360.9549 +40500 train 6.038678 (lr=1.2489e-05) (hash(x)=52059147) +38200 val loss 5.7636 +38200 val perplexity 318.5088 +38200 train 6.033066 (lr=1.0951e-05) (hash(x)=48831719) +39700 val loss 6.1090 +39700 val perplexity 449.8992 +39700 train 6.190334 (lr=1.9170e-05) (hash(x)=50858008) +40600 val loss 5.8898 +40600 val perplexity 361.3468 +40600 train 6.023067 (lr=1.2378e-05) (hash(x)=54885045) +38300 val loss 5.7640 +38300 val perplexity 318.6113 +38300 train 5.599991 (lr=1.0855e-05) (hash(x)=48944700) +39800 val loss 6.0991 +39800 val perplexity 445.4457 +39800 train 5.810238 (lr=1.8999e-05) (hash(x)=44096434) +40700 val loss 5.8854 +40700 val perplexity 359.7634 +40700 train 6.309203 (lr=1.2267e-05) (hash(x)=53213971) +38400 val loss 5.7637 +38400 val perplexity 318.5376 +38400 train 5.379463 (lr=1.0760e-05) (hash(x)=46504848) +39900 val loss 6.0979 +39900 val perplexity 444.9122 +39900 train 5.845292 (lr=1.8829e-05) (hash(x)=46902151) +40800 val loss 5.8834 +40800 val perplexity 359.0162 +40800 train 5.555313 (lr=1.2158e-05) (hash(x)=45133794) +38500 val loss 5.7629 +38500 val perplexity 318.2579 +38500 train 5.494953 (lr=1.0666e-05) (hash(x)=50148948) +40000 val loss 6.1015 +40000 val perplexity 446.5135 +40000 train 5.857305 (lr=1.8661e-05) (hash(x)=46735297) +40900 val loss 5.8835 +40900 val perplexity 359.0653 +40900 train 6.047424 (lr=1.2049e-05) (hash(x)=56546547) +38600 val loss 5.7637 +38600 val perplexity 318.5261 +38600 train 6.555881 (lr=1.0572e-05) (hash(x)=60660913) +40100 val loss 6.1004 +40100 val perplexity 446.0259 +40100 train 6.029654 (lr=1.8494e-05) (hash(x)=48035170) +41000 val loss 5.8819 +41000 val perplexity 358.4984 +41000 train 6.263039 (lr=1.1942e-05) (hash(x)=49552270) +38700 val loss 5.7654 +38700 val perplexity 319.0681 +38700 train 5.562728 (lr=1.0479e-05) (hash(x)=50256252) +40200 val loss 6.0984 +40200 val perplexity 445.1240 +40200 train 5.900217 (lr=1.8329e-05) (hash(x)=46651322) +41100 val loss 5.8834 +41100 val perplexity 359.0388 +41100 train 6.105730 (lr=1.1836e-05) (hash(x)=51222370) +38800 val loss 5.7634 +38800 val perplexity 318.4179 +38800 train 5.842839 (lr=1.0386e-05) (hash(x)=52592415) +40300 val loss 6.1051 +40300 val perplexity 448.1296 +40300 train 5.948763 (lr=1.8165e-05) (hash(x)=46378099) +41200 val loss 5.8920 +41200 val perplexity 362.1415 +41200 train 5.928735 (lr=1.1730e-05) (hash(x)=50883608) +38900 val loss 5.7602 +38900 val perplexity 317.4065 +38900 train 5.703421 (lr=1.0294e-05) (hash(x)=51945430) +40400 val loss 6.0977 +40400 val perplexity 444.8231 +40400 train 5.962878 (lr=1.8003e-05) (hash(x)=46495438) +41300 val loss 5.8873 +41300 val perplexity 360.4363 +41300 train 6.133073 (lr=1.1626e-05) (hash(x)=52996828) +39000 val loss 5.7592 +39000 val perplexity 317.1058 +39000 train 5.992633 (lr=1.0203e-05) (hash(x)=51838869) +40500 val loss 6.0893 +40500 val perplexity 441.0985 +40500 train 6.257522 (lr=1.7842e-05) (hash(x)=52059147) +41400 val loss 5.8895 +41400 val perplexity 361.2230 +41400 train 5.853330 (lr=1.1523e-05) (hash(x)=48822716) +39100 val loss 5.7557 +39100 val perplexity 315.9767 +39100 train 5.885694 (lr=1.0113e-05) (hash(x)=54556054) +40600 val loss 6.0907 +40600 val perplexity 441.7514 +40600 train 6.233351 (lr=1.7683e-05) (hash(x)=54885045) +41500 val loss 5.8887 +41500 val perplexity 360.9361 +41500 train 5.860559 (lr=1.1421e-05) (hash(x)=50936577) +39200 val loss 5.7568 +39200 val perplexity 316.3292 +39200 train 6.130731 (lr=1.0023e-05) (hash(x)=48915509) +40700 val loss 6.0848 +40700 val perplexity 439.1400 +40700 train 6.507118 (lr=1.7525e-05) (hash(x)=53213971) +41600 val loss 5.8957 +41600 val perplexity 363.4856 +41600 train 5.931068 (lr=1.1320e-05) (hash(x)=44375046) +39300 val loss 5.7592 +39300 val perplexity 317.0928 +39300 train 5.582416 (lr=9.9341e-06) (hash(x)=46121589) +40800 val loss 6.0815 +40800 val perplexity 437.7063 +40800 train 5.749034 (lr=1.7368e-05) (hash(x)=45133794) +41700 val loss 5.8856 +41700 val perplexity 359.8251 +41700 train 5.654349 (lr=1.1220e-05) (hash(x)=44060021) +40900 val loss 6.0838 +40900 val perplexity 438.6890 +40900 train 6.203508 (lr=1.7214e-05) (hash(x)=56546547) +39400 val loss 5.7545 +39400 val perplexity 315.6202 +39400 train 5.293779 (lr=9.8458e-06) (hash(x)=45556201) +41800 val loss 5.8851 +41800 val perplexity 359.6262 +41800 train 6.167309 (lr=1.1121e-05) (hash(x)=57765221) +41000 val loss 6.0783 +41000 val perplexity 436.3056 +41000 train 6.447348 (lr=1.7060e-05) (hash(x)=49552270) +39500 val loss 5.7557 +39500 val perplexity 315.9789 +39500 train 5.796134 (lr=9.7581e-06) (hash(x)=47014749) +41900 val loss 5.8858 +41900 val perplexity 359.8881 +41900 train 5.638933 (lr=1.1024e-05) (hash(x)=46051470) +41100 val loss 6.0833 +41100 val perplexity 438.4789 +41100 train 6.289819 (lr=1.6908e-05) (hash(x)=51222370) +39600 val loss 5.7580 +39600 val perplexity 316.7281 +39600 train 5.561605 (lr=9.6712e-06) (hash(x)=47395114) +42000 val loss 5.8856 +42000 val perplexity 359.8166 +42000 train 6.123823 (lr=1.0927e-05) (hash(x)=52077616) +39700 val loss 5.7605 +39700 val perplexity 317.5004 +39700 train 5.875614 (lr=9.5849e-06) (hash(x)=50858008) +41200 val loss 6.0801 +41200 val perplexity 437.0869 +41200 train 6.125017 (lr=1.6758e-05) (hash(x)=50883608) +42100 val loss 5.8841 +42100 val perplexity 359.2952 +42100 train 5.741609 (lr=1.0831e-05) (hash(x)=47845199) +39800 val loss 5.7552 +39800 val perplexity 315.8245 +39800 train 5.476305 (lr=9.4994e-06) (hash(x)=44096434) +41300 val loss 6.0845 +41300 val perplexity 438.9784 +41300 train 6.330495 (lr=1.6609e-05) (hash(x)=52996828) +42200 val loss 5.8834 +42200 val perplexity 359.0181 +42200 train 6.128910 (lr=1.0737e-05) (hash(x)=51549823) +39900 val loss 5.7576 +39900 val perplexity 316.5782 +39900 train 5.512799 (lr=9.4146e-06) (hash(x)=46902151) +41400 val loss 6.0846 +41400 val perplexity 439.0529 +41400 train 6.062356 (lr=1.6462e-05) (hash(x)=48822716) +42300 val loss 5.8727 +42300 val perplexity 355.2144 +42300 train 6.019602 (lr=1.0644e-05) (hash(x)=56922131) +41500 val loss 6.0899 +41500 val perplexity 441.3937 +40000 val loss 5.7535 +40000 val perplexity 315.2927 +41500 train 6.054962 (lr=1.6316e-05) (hash(x)=50936577) +40000 train 5.562321 (lr=9.3305e-06) (hash(x)=46735297) +42400 val loss 5.8732 +42400 val perplexity 355.3774 +42400 train 5.817928 (lr=1.0552e-05) (hash(x)=49004372) +41600 val loss 6.0941 +41600 val perplexity 443.2212 +41600 train 6.104293 (lr=1.6172e-05) (hash(x)=44375046) +40100 val loss 5.7523 +40100 val perplexity 314.9220 +40100 train 5.715792 (lr=9.2472e-06) (hash(x)=48035170) +42500 val loss 5.8702 +42500 val perplexity 354.3189 +42500 train 5.842021 (lr=1.0460e-05) (hash(x)=50651839) +41700 val loss 6.0870 +41700 val perplexity 440.1031 +41700 train 5.825902 (lr=1.6029e-05) (hash(x)=44060021) +40200 val loss 5.7523 +40200 val perplexity 314.9005 +40200 train 5.575696 (lr=9.1646e-06) (hash(x)=46651322) +42600 val loss 5.8698 +42600 val perplexity 354.1635 +42600 train 6.166778 (lr=1.0370e-05) (hash(x)=50767721) +41800 val loss 6.0914 +41800 val perplexity 442.0266 +41800 train 6.348953 (lr=1.5888e-05) (hash(x)=57765221) +40300 val loss 5.7509 +40300 val perplexity 314.4647 +40300 train 5.593280 (lr=9.0827e-06) (hash(x)=46378099) +42700 val loss 5.8683 +42700 val perplexity 353.6617 +42700 train 5.581938 (lr=1.0282e-05) (hash(x)=49099183) +41900 val loss 6.0879 +41900 val perplexity 440.4934 +41900 train 5.849139 (lr=1.5748e-05) (hash(x)=46051470) +40400 val loss 5.7455 +40400 val perplexity 312.7840 +40400 train 5.585403 (lr=9.0015e-06) (hash(x)=46495438) +42800 val loss 5.8679 +42800 val perplexity 353.4950 +42800 train 6.413287 (lr=1.0194e-05) (hash(x)=42272413) +42000 val loss 6.0918 +42000 val perplexity 442.2172 +42000 train 6.311883 (lr=1.5610e-05) (hash(x)=52077616) +40500 val loss 5.7442 +40500 val perplexity 312.3704 +40500 train 5.897409 (lr=8.9211e-06) (hash(x)=52059147) +42900 val loss 5.8702 +42900 val perplexity 354.3152 +42900 train 5.672939 (lr=1.0107e-05) (hash(x)=48582863) +42100 val loss 6.0902 +42100 val perplexity 441.4968 +42100 train 5.944408 (lr=1.5474e-05) (hash(x)=47845199) +40600 val loss 5.7484 +40600 val perplexity 313.7004 +40600 train 5.902262 (lr=8.8414e-06) (hash(x)=54885045) +43000 val loss 5.8681 +43000 val perplexity 353.5818 +43000 train 5.837842 (lr=1.0022e-05) (hash(x)=48703446) +42200 val loss 6.0861 +42200 val perplexity 439.7129 +42200 train 6.341642 (lr=1.5339e-05) (hash(x)=51549823) +40700 val loss 5.7442 +40700 val perplexity 312.3774 +40700 train 6.190853 (lr=8.7624e-06) (hash(x)=53213971) +43100 val loss 5.8689 +43100 val perplexity 353.8753 +43100 train 5.806827 (lr=9.9373e-06) (hash(x)=48730321) +42300 val loss 6.0799 +42300 val perplexity 436.9808 +42300 train 6.246227 (lr=1.5205e-05) (hash(x)=56922131) +40800 val loss 5.7420 +40800 val perplexity 311.6921 +40800 train 5.419962 (lr=8.6842e-06) (hash(x)=45133794) +43200 val loss 5.8685 +43200 val perplexity 353.7095 +43200 train 6.086263 (lr=9.8541e-06) (hash(x)=56536090) +42400 val loss 6.0783 +42400 val perplexity 436.2948 +42400 train 6.009583 (lr=1.5074e-05) (hash(x)=49004372) +43300 val loss 5.8681 +43300 val perplexity 353.5628 +43300 train 5.982976 (lr=9.7720e-06) (hash(x)=54154116) +40900 val loss 5.7432 +40900 val perplexity 312.0596 +40900 train 5.886569 (lr=8.6068e-06) (hash(x)=56546547) +42500 val loss 6.0735 +42500 val perplexity 434.1761 +42500 train 6.045653 (lr=1.4943e-05) (hash(x)=50651839) +43400 val loss 5.8719 +43400 val perplexity 354.9122 +43400 train 5.750636 (lr=9.6911e-06) (hash(x)=50058055) +41000 val loss 5.7402 +41000 val perplexity 311.1349 +41000 train 6.131865 (lr=8.5301e-06) (hash(x)=49552270) +42600 val loss 6.0687 +42600 val perplexity 432.1059 +42600 train 6.374171 (lr=1.4815e-05) (hash(x)=50767721) +43500 val loss 5.8686 +43500 val perplexity 353.7496 +43500 train 5.801444 (lr=9.6113e-06) (hash(x)=48743802) +41100 val loss 5.7393 +41100 val perplexity 310.8314 +41100 train 5.977370 (lr=8.4541e-06) (hash(x)=51222370) +42700 val loss 6.0701 +42700 val perplexity 432.7057 +42700 train 5.794621 (lr=1.4688e-05) (hash(x)=49099183) +43600 val loss 5.8708 +43600 val perplexity 354.5286 +43600 train 5.519044 (lr=9.5326e-06) (hash(x)=42792886) +41200 val loss 5.7427 +41200 val perplexity 311.9193 +41200 train 5.779368 (lr=8.3789e-06) (hash(x)=50883608) +42800 val loss 6.0708 +42800 val perplexity 433.0135 +42800 train 6.560636 (lr=1.4563e-05) (hash(x)=42272413) +43700 val loss 5.8688 +43700 val perplexity 353.8321 +43700 train 6.247068 (lr=9.4552e-06) (hash(x)=56446070) +41300 val loss 5.7443 +41300 val perplexity 312.3972 +41300 train 5.995403 (lr=8.3045e-06) (hash(x)=52996828) +42900 val loss 6.0737 +42900 val perplexity 434.2632 +42900 train 5.865055 (lr=1.4439e-05) (hash(x)=48582863) +43800 val loss 5.8704 +43800 val perplexity 354.3770 +43800 train 5.713625 (lr=9.3788e-06) (hash(x)=45584354) +41400 val loss 5.7400 +41400 val perplexity 311.0588 +41400 train 5.687901 (lr=8.2308e-06) (hash(x)=48822716) +43000 val loss 6.0699 +43000 val perplexity 432.6428 +43000 train 6.018393 (lr=1.4317e-05) (hash(x)=48703446) +43900 val loss 5.8692 +43900 val perplexity 353.9805 +43900 train 5.711111 (lr=9.3036e-06) (hash(x)=49339253) +41500 val loss 5.7436 +41500 val perplexity 312.1848 +41500 train 5.718872 (lr=8.1579e-06) (hash(x)=50936577) +43100 val loss 6.0662 +43100 val perplexity 431.0327 +43100 train 5.987951 (lr=1.4196e-05) (hash(x)=48730321) +44000 val loss 5.8701 +44000 val perplexity 354.2998 +44000 train 5.685949 (lr=9.2296e-06) (hash(x)=46183203) +41600 val loss 5.7488 +41600 val perplexity 313.8020 +41600 train 5.803505 (lr=8.0858e-06) (hash(x)=44375046) +43200 val loss 6.0691 +43200 val perplexity 432.3110 +43200 train 6.268177 (lr=1.4077e-05) (hash(x)=56536090) +44100 val loss 5.8633 +44100 val perplexity 351.8832 +44100 train 5.847696 (lr=9.1568e-06) (hash(x)=47849630) +41700 val loss 5.7438 +41700 val perplexity 312.2614 +41700 train 5.521793 (lr=8.0144e-06) (hash(x)=44060021) +43300 val loss 6.0682 +43300 val perplexity 431.8818 +43300 train 6.172038 (lr=1.3960e-05) (hash(x)=54154116) +44200 val loss 5.8638 +44200 val perplexity 352.0425 +44200 train 6.163978 (lr=9.0851e-06) (hash(x)=49834275) +41800 val loss 5.7417 +41800 val perplexity 311.6032 +41800 train 6.056554 (lr=7.9438e-06) (hash(x)=57765221) +43400 val loss 6.0726 +43400 val perplexity 433.8108 +43400 train 5.960909 (lr=1.3844e-05) (hash(x)=50058055) +44300 val loss 5.8622 +44300 val perplexity 351.4916 +44300 train 6.319765 (lr=9.0146e-06) (hash(x)=62535257) +41900 val loss 5.7419 +41900 val perplexity 311.6466 +41900 train 5.494161 (lr=7.8740e-06) (hash(x)=46051470) +43500 val loss 6.0713 +43500 val perplexity 433.2254 +43500 train 5.989706 (lr=1.3730e-05) (hash(x)=48743802) +44400 val loss 5.8612 +44400 val perplexity 351.1554 +44400 train 5.913574 (lr=8.9453e-06) (hash(x)=49253957) +42000 val loss 5.7454 +42000 val perplexity 312.7515 +42000 train 5.963854 (lr=7.8050e-06) (hash(x)=52077616) +43600 val loss 6.0686 +43600 val perplexity 432.0546 +43600 train 5.709352 (lr=1.3618e-05) (hash(x)=42792886) +44500 val loss 5.8609 +44500 val perplexity 351.0523 +44500 train 5.816366 (lr=8.8771e-06) (hash(x)=55368339) +42100 val loss 5.7410 +42100 val perplexity 311.3617 +42100 train 5.579000 (lr=7.7368e-06) (hash(x)=47845199) +43700 val loss 6.0663 +43700 val perplexity 431.0718 +43700 train 6.482207 (lr=1.3507e-05) (hash(x)=56446070) +44600 val loss 5.8624 +44600 val perplexity 351.5555 +44600 train 5.944849 (lr=8.8101e-06) (hash(x)=47098476) +42200 val loss 5.7395 +42200 val perplexity 310.9067 +42200 train 5.996667 (lr=7.6693e-06) (hash(x)=51549823) +43800 val loss 6.0681 +43800 val perplexity 431.8659 +43800 train 5.916797 (lr=1.3398e-05) (hash(x)=45584354) +44700 val loss 5.8614 +44700 val perplexity 351.2013 +44700 train 5.784379 (lr=8.7443e-06) (hash(x)=48280562) +43900 val loss 6.0683 +43900 val perplexity 431.9393 +42300 val loss 5.7336 +42300 val perplexity 309.0953 +43900 train 5.914937 (lr=1.3291e-05) (hash(x)=49339253) +42300 train 5.855470 (lr=7.6027e-06) (hash(x)=56922131) +44800 val loss 5.8671 +44800 val perplexity 353.2086 +44800 train 6.136794 (lr=8.6797e-06) (hash(x)=55591638) +44000 val loss 6.0671 +44000 val perplexity 431.4092 +44000 train 5.857544 (lr=1.3185e-05) (hash(x)=46183203) +42400 val loss 5.7331 +42400 val perplexity 308.9134 +42400 train 5.686144 (lr=7.5368e-06) (hash(x)=49004372) +44900 val loss 5.8650 +44900 val perplexity 352.4780 +44900 train 6.042705 (lr=8.6163e-06) (hash(x)=53757748) +44100 val loss 6.0625 +44100 val perplexity 429.4332 +44100 train 6.027684 (lr=1.3081e-05) (hash(x)=47849630) +42500 val loss 5.7304 +42500 val perplexity 308.0865 +42500 train 5.707544 (lr=7.4717e-06) (hash(x)=50651839) +45000 val loss 5.8605 +45000 val perplexity 350.8926 +45000 train 6.009489 (lr=8.5540e-06) (hash(x)=51685087) +44200 val loss 6.0632 +44200 val perplexity 429.7634 +44200 train 6.403512 (lr=1.2979e-05) (hash(x)=49834275) +42600 val loss 5.7309 +42600 val perplexity 308.2605 +42600 train 6.003824 (lr=7.4074e-06) (hash(x)=50767721) +45100 val loss 5.8595 +45100 val perplexity 350.5572 +45100 train 5.871101 (lr=8.4930e-06) (hash(x)=50093774) +44300 val loss 6.0562 +44300 val perplexity 426.7346 +44300 train 6.502149 (lr=1.2878e-05) (hash(x)=62535257) +42700 val loss 5.7305 +42700 val perplexity 308.1259 +42700 train 5.442407 (lr=7.3440e-06) (hash(x)=49099183) +45200 val loss 5.8601 +45200 val perplexity 350.7615 +45200 train 5.580861 (lr=8.4331e-06) (hash(x)=43460450) +44400 val loss 6.0591 +44400 val perplexity 427.9775 +44400 train 6.109484 (lr=1.2779e-05) (hash(x)=49253957) +42800 val loss 5.7267 +42800 val perplexity 306.9621 +42800 train 6.192804 (lr=7.2813e-06) (hash(x)=42272413) +45300 val loss 5.8632 +45300 val perplexity 351.8497 +45300 train 6.055310 (lr=8.3745e-06) (hash(x)=49935488) +44500 val loss 6.0592 +44500 val perplexity 428.0197 +44500 train 6.028224 (lr=1.2682e-05) (hash(x)=55368339) +45400 val loss 5.8620 +45400 val perplexity 351.4144 +45400 train 5.832631 (lr=8.3170e-06) (hash(x)=49447929) +42900 val loss 5.7333 +42900 val perplexity 308.9987 +42900 train 5.529157 (lr=7.2194e-06) (hash(x)=48582863) +44600 val loss 6.0570 +44600 val perplexity 427.1129 +44600 train 6.093005 (lr=1.2586e-05) (hash(x)=47098476) +45500 val loss 5.8609 +45500 val perplexity 351.0280 +45500 train 5.806209 (lr=8.2607e-06) (hash(x)=50713904) +43000 val loss 5.7272 +43000 val perplexity 307.1086 +43000 train 5.717058 (lr=7.1583e-06) (hash(x)=48703446) +45600 val loss 5.8931 +45600 val perplexity 362.5154 +45600 train 5.801927 (lr=8.2057e-06) (hash(x)=47674606) +44700 val loss 6.0582 +44700 val perplexity 427.6148 +44700 train 5.950908 (lr=1.2492e-05) (hash(x)=48280562) +43100 val loss 5.7278 +43100 val perplexity 307.2908 +43100 train 5.654952 (lr=7.0981e-06) (hash(x)=48730321) +45700 val loss 5.8621 +45700 val perplexity 351.4761 +45700 train 5.754256 (lr=8.1518e-06) (hash(x)=51539617) +44800 val loss 6.0574 +44800 val perplexity 427.2695 +44800 train 6.307104 (lr=1.2400e-05) (hash(x)=55591638) +43200 val loss 5.7288 +43200 val perplexity 307.6085 +43200 train 5.956896 (lr=7.0386e-06) (hash(x)=56536090) +44900 val loss 6.0572 +44900 val perplexity 427.1797 +44900 train 6.209588 (lr=1.2309e-05) (hash(x)=53757748) +45800 val loss 5.8620 +45800 val perplexity 351.4243 +45800 train 5.833173 (lr=8.0992e-06) (hash(x)=44448785) +43300 val loss 5.7274 +43300 val perplexity 307.1761 +43300 train 5.866704 (lr=6.9800e-06) (hash(x)=54154116) +45000 val loss 6.0571 +45000 val perplexity 427.1190 +45000 train 6.189182 (lr=1.2220e-05) (hash(x)=51685087) +45900 val loss 5.8606 +45900 val perplexity 350.9274 +45900 train 5.672136 (lr=8.0478e-06) (hash(x)=51499105) +43400 val loss 5.7301 +43400 val perplexity 307.9856 +43400 train 5.614723 (lr=6.9222e-06) (hash(x)=50058055) +45100 val loss 6.0571 +45100 val perplexity 427.1398 +45100 train 6.055492 (lr=1.2133e-05) (hash(x)=50093774) +46000 val loss 5.8559 +46000 val perplexity 349.2964 +46000 train 5.756037 (lr=7.9976e-06) (hash(x)=48359464) +43500 val loss 5.7306 +43500 val perplexity 308.1495 +43500 train 5.641300 (lr=6.8652e-06) (hash(x)=48743802) +45200 val loss 6.0550 +45200 val perplexity 426.2207 +45200 train 5.782551 (lr=1.2047e-05) (hash(x)=43460450) +46100 val loss 5.8582 +46100 val perplexity 350.1075 +46100 train 5.897924 (lr=7.9485e-06) (hash(x)=51885986) +43600 val loss 5.7286 +43600 val perplexity 307.5317 +43600 train 5.374582 (lr=6.8090e-06) (hash(x)=42792886) +45300 val loss 6.0555 +45300 val perplexity 426.4360 +45300 train 6.190984 (lr=1.1964e-05) (hash(x)=49935488) +46200 val loss 5.8500 +46200 val perplexity 347.2492 +46200 train 6.391445 (lr=7.9008e-06) (hash(x)=65186615) +43700 val loss 5.7287 +43700 val perplexity 307.5639 +43700 train 6.129159 (lr=6.7537e-06) (hash(x)=56446070) +45400 val loss 6.0603 +45400 val perplexity 428.5037 +45400 train 6.018839 (lr=1.1881e-05) (hash(x)=49447929) +46300 val loss 5.8518 +46300 val perplexity 347.8571 +46300 train 5.823817 (lr=7.8542e-06) (hash(x)=49626999) +43800 val loss 5.7300 +43800 val perplexity 307.9766 +43800 train 5.549335 (lr=6.6992e-06) (hash(x)=45584354) +45500 val loss 6.0581 +45500 val perplexity 427.5738 +45500 train 5.983250 (lr=1.1801e-05) (hash(x)=50713904) +46400 val loss 5.8525 +46400 val perplexity 348.0972 +46400 train 5.741714 (lr=7.8088e-06) (hash(x)=43325701) +43900 val loss 5.7271 +43900 val perplexity 307.0764 +43900 train 5.571040 (lr=6.6455e-06) (hash(x)=49339253) +45600 val loss 6.0809 +45600 val perplexity 437.4357 +45600 train 5.990339 (lr=1.1722e-05) (hash(x)=47674606) +46500 val loss 5.8523 +46500 val perplexity 348.0345 +46500 train 6.141178 (lr=7.7647e-06) (hash(x)=54028595) +44000 val loss 5.7273 +44000 val perplexity 307.1531 +44000 train 5.546133 (lr=6.5926e-06) (hash(x)=46183203) +45700 val loss 6.0576 +45700 val perplexity 427.3579 +45700 train 5.974851 (lr=1.1645e-05) (hash(x)=51539617) +46600 val loss 5.8544 +46600 val perplexity 348.7543 +46600 train 5.707172 (lr=7.7218e-06) (hash(x)=44519175) +44100 val loss 5.7220 +44100 val perplexity 305.5253 +44100 train 5.738289 (lr=6.5406e-06) (hash(x)=47849630) +45800 val loss 6.0620 +45800 val perplexity 429.2244 +46700 val loss 5.8538 +46700 val perplexity 348.5613 +45800 train 6.021019 (lr=1.1570e-05) (hash(x)=44448785) +46700 train 5.908800 (lr=7.6801e-06) (hash(x)=48357998) +44200 val loss 5.7261 +44200 val perplexity 306.7854 +44200 train 6.044593 (lr=6.4894e-06) (hash(x)=49834275) +45900 val loss 6.0583 +45900 val perplexity 427.6523 +46800 val loss 5.8497 +46800 val perplexity 347.1213 +45900 train 5.886087 (lr=1.1497e-05) (hash(x)=51499105) +46800 train 6.041516 (lr=7.6397e-06) (hash(x)=55911353) +44300 val loss 5.7182 +44300 val perplexity 304.3456 +44300 train 6.210258 (lr=6.4390e-06) (hash(x)=62535257) +46900 val loss 5.8517 +46900 val perplexity 347.8130 +46000 val loss 6.0527 +46000 val perplexity 425.2731 +46900 train 5.773681 (lr=7.6004e-06) (hash(x)=47897187) +46000 train 5.946332 (lr=1.1425e-05) (hash(x)=48359464) +47000 val loss 5.8494 +47000 val perplexity 347.0211 +47000 train 5.433744 (lr=7.5624e-06) (hash(x)=43196571) +44400 val loss 5.7177 +44400 val perplexity 304.2032 +44400 train 5.762341 (lr=6.3895e-06) (hash(x)=49253957) +46100 val loss 6.0516 +46100 val perplexity 424.7917 +46100 train 6.077628 (lr=1.1355e-05) (hash(x)=51885986) +47100 val loss 5.8501 +47100 val perplexity 347.2580 +47100 train 5.718368 (lr=7.5257e-06) (hash(x)=51224987) +44500 val loss 5.7176 +44500 val perplexity 304.1715 +46200 val loss 6.0464 +46200 val perplexity 422.5769 +44500 train 5.675300 (lr=6.3408e-06) (hash(x)=55368339) +46200 train 6.599186 (lr=1.1287e-05) (hash(x)=65186615) +47200 val loss 5.8540 +47200 val perplexity 348.6273 +47200 train 5.550478 (lr=7.4901e-06) (hash(x)=47943697) +46300 val loss 6.0460 +46300 val perplexity 422.4326 +46300 train 6.010680 (lr=1.1220e-05) (hash(x)=49626999) +44600 val loss 5.7185 +44600 val perplexity 304.4501 +44600 train 5.850135 (lr=6.2929e-06) (hash(x)=47098476) +47300 val loss 5.8548 +47300 val perplexity 348.9183 +47300 train 5.792697 (lr=7.4558e-06) (hash(x)=47351003) +46400 val loss 6.0468 +46400 val perplexity 422.7397 +46400 train 5.928525 (lr=1.1155e-05) (hash(x)=43325701) +44700 val loss 5.7176 +44700 val perplexity 304.1606 +44700 train 5.639624 (lr=6.2459e-06) (hash(x)=48280562) +47400 val loss 5.8528 +47400 val perplexity 348.2169 +47400 train 5.919446 (lr=7.4228e-06) (hash(x)=55562243) +46500 val loss 6.0487 +46500 val perplexity 423.5460 +46500 train 6.316158 (lr=1.1092e-05) (hash(x)=54028595) +44800 val loss 5.7194 +44800 val perplexity 304.7143 +44800 train 6.008594 (lr=6.1998e-06) (hash(x)=55591638) +47500 val loss 5.8510 +47500 val perplexity 347.5935 +47500 train 5.729331 (lr=7.3909e-06) (hash(x)=53544850) +46600 val loss 6.0485 +46600 val perplexity 423.4939 +46600 train 5.879036 (lr=1.1031e-05) (hash(x)=44519175) +44900 val loss 5.7217 +44900 val perplexity 305.4323 +44900 train 5.873918 (lr=6.1545e-06) (hash(x)=53757748) +47600 val loss 5.8521 +47600 val perplexity 347.9716 +47600 train 6.031857 (lr=7.3603e-06) (hash(x)=43634907) +46700 val loss 6.0500 +46700 val perplexity 424.1289 +46700 train 6.119004 (lr=1.0972e-05) (hash(x)=48357998) +45000 val loss 5.7176 +45000 val perplexity 304.1754 +45000 train 5.889555 (lr=6.1100e-06) (hash(x)=51685087) +47700 val loss 5.8513 +47700 val perplexity 347.6973 +47700 train 5.841846 (lr=7.3310e-06) (hash(x)=47909383) +46800 val loss 6.0481 +46800 val perplexity 423.3104 +46800 train 6.212865 (lr=1.0914e-05) (hash(x)=55911353) +45100 val loss 5.7174 +45100 val perplexity 304.1154 +45100 train 5.737835 (lr=6.0664e-06) (hash(x)=50093774) +47800 val loss 5.8470 +47800 val perplexity 346.2068 +47800 train 5.702913 (lr=7.3029e-06) (hash(x)=45871079) +46900 val loss 6.0458 +46900 val perplexity 422.3537 +46900 train 5.969927 (lr=1.0858e-05) (hash(x)=47897187) +45200 val loss 5.7165 +45200 val perplexity 303.8303 +45200 train 5.434013 (lr=6.0237e-06) (hash(x)=43460450) +47900 val loss 5.8458 +47900 val perplexity 345.7645 +47900 train 5.847965 (lr=7.2760e-06) (hash(x)=47333324) +47000 val loss 6.0454 +47000 val perplexity 422.1632 +47000 train 5.631547 (lr=1.0803e-05) (hash(x)=43196571) +45300 val loss 5.7197 +45300 val perplexity 304.8224 +45300 train 5.949478 (lr=5.9818e-06) (hash(x)=49935488) +48000 val loss 5.8457 +48000 val perplexity 345.7522 +48000 train 5.926929 (lr=7.2504e-06) (hash(x)=52758020) +47100 val loss 6.0468 +47100 val perplexity 422.7625 +47100 train 5.896503 (lr=1.0751e-05) (hash(x)=51224987) +45400 val loss 5.7188 +45400 val perplexity 304.5240 +45400 train 5.714464 (lr=5.9407e-06) (hash(x)=49447929) +48100 val loss 5.8417 +48100 val perplexity 344.3708 +48100 train 5.853239 (lr=7.2260e-06) (hash(x)=49806349) +47200 val loss 6.0475 +47200 val perplexity 423.0751 +47200 train 5.746195 (lr=1.0700e-05) (hash(x)=47943697) +45500 val loss 5.7183 +45500 val perplexity 304.3906 +45500 train 5.659379 (lr=5.9005e-06) (hash(x)=50713904) +48200 val loss 5.8440 +48200 val perplexity 345.1598 +48200 train 5.899774 (lr=7.2029e-06) (hash(x)=53220839) +47300 val loss 6.0511 +47300 val perplexity 424.5892 +47300 train 5.972193 (lr=1.0651e-05) (hash(x)=47351003) +48300 val loss 5.8452 +48300 val perplexity 345.5555 +48300 train 5.856475 (lr=7.1810e-06) (hash(x)=56052541) +45600 val loss 5.7401 +45600 val perplexity 311.0983 +45600 train 5.674036 (lr=5.8612e-06) (hash(x)=47674606) +47400 val loss 6.0474 +47400 val perplexity 423.0206 +47400 train 6.107438 (lr=1.0604e-05) (hash(x)=55562243) +48400 val loss 5.8411 +48400 val perplexity 344.1588 +48400 train 5.725373 (lr=7.1603e-06) (hash(x)=44482356) +45700 val loss 5.7195 +45700 val perplexity 304.7594 +45700 train 5.608114 (lr=5.8227e-06) (hash(x)=51539617) +47500 val loss 6.0458 +47500 val perplexity 422.3304 +47500 train 5.906383 (lr=1.0558e-05) (hash(x)=53544850) +48500 val loss 5.8416 +48500 val perplexity 344.3341 +48500 train 5.554918 (lr=7.1409e-06) (hash(x)=45714818) +45800 val loss 5.7209 +45800 val perplexity 305.1709 +45800 train 5.699752 (lr=5.7851e-06) (hash(x)=44448785) +47600 val loss 6.0501 +47600 val perplexity 424.1624 +47600 train 6.212331 (lr=1.0515e-05) (hash(x)=43634907) +48600 val loss 5.8423 +48600 val perplexity 344.5629 +48600 train 5.602136 (lr=7.1228e-06) (hash(x)=49476556) +45900 val loss 5.7186 +45900 val perplexity 304.4800 +45900 train 5.515782 (lr=5.7484e-06) (hash(x)=51499105) +47700 val loss 6.0454 +47700 val perplexity 422.1596 +47700 train 6.017856 (lr=1.0473e-05) (hash(x)=47909383) +48700 val loss 5.8426 +48700 val perplexity 344.6624 +48700 train 5.536637 (lr=7.1059e-06) (hash(x)=42508579) +46000 val loss 5.7148 +46000 val perplexity 303.3325 +46000 train 5.614646 (lr=5.7125e-06) (hash(x)=48359464) +47800 val loss 6.0433 +47800 val perplexity 421.2838 +47800 train 5.923380 (lr=1.0433e-05) (hash(x)=45871079) +48800 val loss 5.8451 +48800 val perplexity 345.5361 +48800 train 6.052786 (lr=7.0902e-06) (hash(x)=52737449) +46100 val loss 5.7171 +46100 val perplexity 304.0354 +46100 train 5.770822 (lr=5.6775e-06) (hash(x)=51885986) +47900 val loss 6.0408 +47900 val perplexity 420.2432 +47900 train 6.021464 (lr=1.0394e-05) (hash(x)=47333324) +48900 val loss 5.8437 +48900 val perplexity 345.0556 +48900 train 5.683334 (lr=7.0758e-06) (hash(x)=47057569) +46200 val loss 5.7129 +46200 val perplexity 302.7400 +46200 train 6.260228 (lr=5.6434e-06) (hash(x)=65186615) +48000 val loss 6.0431 +48000 val perplexity 421.1812 +48000 train 6.103298 (lr=1.0358e-05) (hash(x)=52758020) +49000 val loss 5.8492 +49000 val perplexity 346.9598 +49000 train 5.825048 (lr=7.0627e-06) (hash(x)=49908975) +46300 val loss 5.7136 +46300 val perplexity 302.9592 +46300 train 5.667426 (lr=5.6101e-06) (hash(x)=49626999) +48100 val loss 6.0383 +48100 val perplexity 419.1610 +48100 train 6.059237 (lr=1.0323e-05) (hash(x)=49806349) +49100 val loss 5.8464 +49100 val perplexity 345.9872 +49100 train 5.681278 (lr=7.0508e-06) (hash(x)=48427414) +46400 val loss 5.7126 +46400 val perplexity 302.6713 +48200 val loss 6.0352 +48200 val perplexity 417.8884 +46400 train 5.609452 (lr=5.5777e-06) (hash(x)=43325701) +48200 train 6.101655 (lr=1.0290e-05) (hash(x)=53220839) +49200 val loss 5.8453 +49200 val perplexity 345.6060 +49200 train 5.658861 (lr=7.0401e-06) (hash(x)=50246074) +48300 val loss 6.0348 +48300 val perplexity 417.7147 +48300 train 6.066577 (lr=1.0259e-05) (hash(x)=56052541) +46500 val loss 5.7142 +46500 val perplexity 303.1279 +46500 train 6.013310 (lr=5.5462e-06) (hash(x)=54028595) +49300 val loss 5.8465 +49300 val perplexity 346.0131 +49300 train 5.907238 (lr=7.0307e-06) (hash(x)=47715359) +48400 val loss 6.0330 +48400 val perplexity 416.9503 +46600 val loss 5.7100 +46600 val perplexity 301.8763 +46600 train 5.570869 (lr=5.5156e-06) (hash(x)=44519175) +48400 train 5.892282 (lr=1.0229e-05) (hash(x)=44482356) +49400 val loss 5.8469 +49400 val perplexity 346.1647 +49400 train 5.822443 (lr=7.0226e-06) (hash(x)=50175867) +46700 val loss 5.7136 +46700 val perplexity 302.9503 +46700 train 5.749952 (lr=5.4858e-06) (hash(x)=48357998) +48500 val loss 6.0343 +48500 val perplexity 417.5217 +48500 train 5.764053 (lr=1.0201e-05) (hash(x)=45714818) +46800 val loss 5.7132 +46800 val perplexity 302.8454 +46800 train 5.879297 (lr=5.4569e-06) (hash(x)=55911353) +49500 val loss 5.8512 +49500 val perplexity 347.6361 +49500 train 5.676702 (lr=7.0157e-06) (hash(x)=49336040) +48600 val loss 6.0346 +48600 val perplexity 417.6282 +48600 train 5.787707 (lr=1.0175e-05) (hash(x)=49476556) +46900 val loss 5.7112 +46900 val perplexity 302.2209 +46900 train 5.637585 (lr=5.4289e-06) (hash(x)=47897187) +49600 val loss 5.8519 +49600 val perplexity 347.8839 +49600 train 5.900337 (lr=7.0100e-06) (hash(x)=52039357) +48700 val loss 6.0347 +48700 val perplexity 417.6910 +48700 train 5.723627 (lr=1.0151e-05) (hash(x)=42508579) +47000 val loss 5.7112 +47000 val perplexity 302.2247 +47000 train 5.284904 (lr=5.4017e-06) (hash(x)=43196571) +49700 val loss 5.8527 +49700 val perplexity 348.1769 +49700 train 5.638364 (lr=7.0056e-06) (hash(x)=47568707) +48800 val loss 6.0380 +48800 val perplexity 419.0475 +48800 train 6.229567 (lr=1.0129e-05) (hash(x)=52737449) +47100 val loss 5.7113 +47100 val perplexity 302.2637 +47100 train 5.542101 (lr=5.3755e-06) (hash(x)=51224987) +49800 val loss 5.8571 +49800 val perplexity 349.7072 +49800 train 5.771938 (lr=7.0025e-06) (hash(x)=48451274) +48900 val loss 6.0367 +48900 val perplexity 418.5146 +47200 val loss 5.7128 +47200 val perplexity 302.7038 +48900 train 5.891450 (lr=1.0108e-05) (hash(x)=47057569) +47200 train 5.417474 (lr=5.3501e-06) (hash(x)=47943697) +49900 val loss 5.8548 +49900 val perplexity 348.9065 +49900 train 5.546834 (lr=7.0006e-06) (hash(x)=44523603) +47300 val loss 5.7134 +47300 val perplexity 302.8896 +47300 train 5.650457 (lr=5.3256e-06) (hash(x)=47351003) +49000 val loss 6.0453 +49000 val perplexity 422.1228 +49000 train 6.011272 (lr=1.0090e-05) (hash(x)=49908975) +49999 val loss 5.8445 +49999 val perplexity 345.3444 +47400 val loss 5.7120 +47400 val perplexity 302.4785 +47400 train 5.765558 (lr=5.3020e-06) (hash(x)=55562243) +49100 val loss 6.0398 +49100 val perplexity 419.8189 +49100 train 5.864618 (lr=1.0073e-05) (hash(x)=48427414) +47500 val loss 5.7122 +47500 val perplexity 302.5348 +47500 train 5.578689 (lr=5.2792e-06) (hash(x)=53544850) +49200 val loss 6.0385 +49200 val perplexity 419.2824 +49200 train 5.857993 (lr=1.0057e-05) (hash(x)=50246074) +47600 val loss 5.7123 +47600 val perplexity 302.5630 +47600 train 5.958771 (lr=5.2574e-06) (hash(x)=43634907) +49300 val loss 6.0412 +49300 val perplexity 420.4065 +49300 train 6.071166 (lr=1.0044e-05) (hash(x)=47715359) +47700 val loss 5.7123 +47700 val perplexity 302.5704 +47700 train 5.694991 (lr=5.2364e-06) (hash(x)=47909383) +49400 val loss 6.0446 +49400 val perplexity 421.8107 +49400 train 6.025251 (lr=1.0032e-05) (hash(x)=50175867) +47800 val loss 5.7095 +47800 val perplexity 301.7066 +47800 train 5.585462 (lr=5.2163e-06) (hash(x)=45871079) +49500 val loss 6.0472 +49500 val perplexity 422.9289 +49500 train 5.871203 (lr=1.0022e-05) (hash(x)=49336040) +47900 val loss 5.7073 +47900 val perplexity 301.0569 +47900 train 5.720872 (lr=5.1972e-06) (hash(x)=47333324) +49600 val loss 6.0544 +49600 val perplexity 425.9853 +49600 train 6.064258 (lr=1.0014e-05) (hash(x)=52039357) +48000 val loss 5.7077 +48000 val perplexity 301.1822 +48000 train 5.798008 (lr=5.1788e-06) (hash(x)=52758020) +49700 val loss 6.0505 +49700 val perplexity 424.3354 +49700 train 5.830864 (lr=1.0008e-05) (hash(x)=47568707) +48100 val loss 5.7056 +48100 val perplexity 300.5576 +48100 train 5.700547 (lr=5.1614e-06) (hash(x)=49806349) +49800 val loss 6.0525 +49800 val perplexity 425.1691 +49800 train 5.946240 (lr=1.0004e-05) (hash(x)=48451274) +48200 val loss 5.7051 +48200 val perplexity 300.4040 +48200 train 5.781303 (lr=5.1449e-06) (hash(x)=53220839) +49900 val loss 6.0540 +49900 val perplexity 425.8073 +49900 train 5.733783 (lr=1.0001e-05) (hash(x)=44523603) +48300 val loss 5.7046 +48300 val perplexity 300.2323 +48300 train 5.714827 (lr=5.1293e-06) (hash(x)=56052541) +49999 val loss 6.0411 +49999 val perplexity 420.3536 +48400 val loss 5.7032 +48400 val perplexity 299.8373 +48400 train 5.594895 (lr=5.1145e-06) (hash(x)=44482356) +48500 val loss 5.7032 +48500 val perplexity 299.8374 +48500 train 5.403605 (lr=5.1007e-06) (hash(x)=45714818) +48600 val loss 5.7045 +48600 val perplexity 300.2178 +48600 train 5.458147 (lr=5.0877e-06) (hash(x)=49476556) +48700 val loss 5.7060 +48700 val perplexity 300.6585 +48700 train 5.399137 (lr=5.0756e-06) (hash(x)=42508579) diff --git a/attention_kindselective_n_heads2_seed1340/model_02500.pt b/attention_kindselective_n_heads2_seed1340/model_02500.pt index 473b3cc45f008ad013670fa4bacbf70244a8f26b..8617b408eaa5c03c920f0207feaa12718d83e800 100644 --- a/attention_kindselective_n_heads2_seed1340/model_02500.pt +++ b/attention_kindselective_n_heads2_seed1340/model_02500.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db5e71e2651a7e4c04adf1b1bdd2fd67898b9838ff7ce2857cd735f4766b5532 +oid sha256:a43c3fea68b44f6e94a64fabd4bec96f4af33541fb79fc8ab291548d61ad3ba8 size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_05000.pt b/attention_kindselective_n_heads2_seed1340/model_05000.pt index 6a5728e2e1b189f44a9462ec4aed9dd4b14ad527..534f5ce06d11d6963eb798dc85425cb1b5f1d97d 100644 --- a/attention_kindselective_n_heads2_seed1340/model_05000.pt +++ b/attention_kindselective_n_heads2_seed1340/model_05000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:87a3a4e1a720bf19b1c7f850b83a50505c837892b3964a6f17b36ac335b8a606 +oid sha256:cedd4c5d0d207b268fe96d32662f008ce85caf9422332a5f205480009a0bae8a size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_07500.pt b/attention_kindselective_n_heads2_seed1340/model_07500.pt index b716248628433516e6300010672ae386bd9d8dfa..66ebfd3368d945ea47dfb35e4e2cb4f8fa1fa235 100644 --- a/attention_kindselective_n_heads2_seed1340/model_07500.pt +++ b/attention_kindselective_n_heads2_seed1340/model_07500.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:933638ce43638ef9bd5900408f7bbcb30a32be65bd1c3e240216a1265da81d33 +oid sha256:cf3ae491c351dd98c03ca4270a47dadf6b7299816fd4d4d28e612be080c8eaf6 size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_10000.pt b/attention_kindselective_n_heads2_seed1340/model_10000.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5d0be82aafd14dada0797f701e6666b91eaa03c --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/model_10000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3917438735ab535c976c8cf747154e5ff137b3226377c17494de2db52a5dbc9 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_12500.pt b/attention_kindselective_n_heads2_seed1340/model_12500.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1a7eb2266e99c7b8c4ed79fbfc21587b956a9f4 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/model_12500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82f39c782dc0ecfa52b440f257e68f9fbd5d1c95cb1327b31ba1a4a90fddbd14 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_15000.pt b/attention_kindselective_n_heads2_seed1340/model_15000.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0b673ba6972275623da4056004c290a90c37588 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/model_15000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9a999d0f39f00dac906e79883b2c60cacea327110d17ce2aeb2ca063f6dd405 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_17500.pt b/attention_kindselective_n_heads2_seed1340/model_17500.pt new file mode 100644 index 0000000000000000000000000000000000000000..f21606eafcd7ee65de929519901d9ecbf05c4585 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/model_17500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77730d4395b567ca5bcbb5ee491c47a6347ea335cdafbcaa57606e056f99d5be +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_20000.pt b/attention_kindselective_n_heads2_seed1340/model_20000.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4dd22160671a19d595f7a51401bb98352f36acc --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/model_20000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1b0f211ab6773a5b61676a5550d5e97c97a33cb69ba778acf90934dbc1edd6b +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_22500.pt b/attention_kindselective_n_heads2_seed1340/model_22500.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1931a1089e622d35f7f338038a60eb4105ce4a6 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/model_22500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:704498bc3ab665573a16170d899809e4f5b2e0433ee9568c75b1a299e86e63d0 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_25000.pt b/attention_kindselective_n_heads2_seed1340/model_25000.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e44cddaf203cafb2b7b34973b270ac7f068201e --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/model_25000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abad2d98112c3e440e66c5dbf182d0a97537a774c60116df1cc4e601a19b6d31 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_27500.pt b/attention_kindselective_n_heads2_seed1340/model_27500.pt new file mode 100644 index 0000000000000000000000000000000000000000..25562fc194bd00fed67c27c4b8f75181027ce733 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/model_27500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:231c011ec9ed5d97d25c7481c92c73d78b16474dea2f6e86258c8ef2055ce77f +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_30000.pt b/attention_kindselective_n_heads2_seed1340/model_30000.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e6da3aa7a7d28b84c404a91b637a58465f39419 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/model_30000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c5fa0b4c0e1c96c7546e72b0608230b2b1be0609b274c5f83c59202f5463a12 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_32500.pt b/attention_kindselective_n_heads2_seed1340/model_32500.pt new file mode 100644 index 0000000000000000000000000000000000000000..307835a5183aa325552d286901b2d9c9a266d92c --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/model_32500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7db2e6312252b187d62e6ebdbc4915b0d541396bd5728579d3b5167316a4611 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_35000.pt b/attention_kindselective_n_heads2_seed1340/model_35000.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e324026bfe83798b49d0c0d6f070465bc5ff463 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/model_35000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49399b8ac5d522d2716a77e73e6a22da180c585b1c865fdc6647f0186d199c0a +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_37500.pt b/attention_kindselective_n_heads2_seed1340/model_37500.pt new file mode 100644 index 0000000000000000000000000000000000000000..2abba94f607d621041bde491cffb6eb62378ad2e --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/model_37500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cee4edf4b83913b1bfc2b1150a9871f06b8205e075d2edd1d1f43cc015cd3ef +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_40000.pt b/attention_kindselective_n_heads2_seed1340/model_40000.pt new file mode 100644 index 0000000000000000000000000000000000000000..6de2d7103a58c63c65ab1530d4e29943c0b2ef95 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/model_40000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e12690a1fa5e37625d4caa7ed93f085e14857ea2fd92fde5d9c686eaadf988cd +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_42500.pt b/attention_kindselective_n_heads2_seed1340/model_42500.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b55aae57ae4e861546d7699642429a6b9cc64ad --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/model_42500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fbfa201aad46b6a3b9db8b8a6e3ffe19b2b0bf7135c941a10352a0f659dbd2f +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_45000.pt b/attention_kindselective_n_heads2_seed1340/model_45000.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b9bfdb758f96e75cf5506b553cee2844ccaf284 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/model_45000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dd2d44c2ef7a35bb4a5a6a4caae579ea0b5bbefab52fcc539b46efb5e230c44 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_47500.pt b/attention_kindselective_n_heads2_seed1340/model_47500.pt new file mode 100644 index 0000000000000000000000000000000000000000..06a66d3a3ac8ee1c4dd4f64ec85d1e1efb886d46 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/model_47500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:671e09d821aad0b05acfd58c4cb5943479619d40c185ec2b6e6081321e58589a +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/model_49999.pt b/attention_kindselective_n_heads2_seed1340/model_49999.pt new file mode 100644 index 0000000000000000000000000000000000000000..810f732bb1702e0df2534b034e0a9dd8b3ba8cfa --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/model_49999.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ade59c1a7b1f27604ba9d522da69929f07674b7e48af67cb6f07699745ecf35 +size 38587970 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_02500.pt b/attention_kindselective_n_heads2_seed1340/optimizer_02500.pt index 035c55e1b50e39811bdd6a3852edc14545d28dec..3aa37668665ab051744b172656f49288d0e804e8 100644 --- a/attention_kindselective_n_heads2_seed1340/optimizer_02500.pt +++ b/attention_kindselective_n_heads2_seed1340/optimizer_02500.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad182381a14e875a3c04a9d0fce780b0641e040b738f8454b13df6255a740e34 +oid sha256:f8b2588354f55480e0234fb6a453af90f8dce086ada918b3cfb81acfd1ade067 size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_05000.pt b/attention_kindselective_n_heads2_seed1340/optimizer_05000.pt index 667bea6c6237453d6d2f1d66f75747ef029a1f26..93ba36664ddacf6bbc231d55047909ff0a511313 100644 --- a/attention_kindselective_n_heads2_seed1340/optimizer_05000.pt +++ b/attention_kindselective_n_heads2_seed1340/optimizer_05000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89e51f2fe0afb3163bdb42b8d6c2d3968fb0c1d3a51b33ea7ae5061524b2b719 +oid sha256:a0fd46e04dbb546ef5f7f70b8792710a6b2c3d746613f6b100e414a89e996728 size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_07500.pt b/attention_kindselective_n_heads2_seed1340/optimizer_07500.pt index 84af818a6b45e3df41354c297304dfff3c01917a..3d7b82171cc3c9567754aa300a42d0d1dded51cf 100644 --- a/attention_kindselective_n_heads2_seed1340/optimizer_07500.pt +++ b/attention_kindselective_n_heads2_seed1340/optimizer_07500.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c475776587ad7676fd824bc7c9c9570e94a3ffb32776966605d3ccf70784164 +oid sha256:06a62406b060c40f3fc2ef5b420b68e53741a74ba8388e186bc8ea3757ec7472 size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_10000.pt b/attention_kindselective_n_heads2_seed1340/optimizer_10000.pt new file mode 100644 index 0000000000000000000000000000000000000000..592cb6db43b371b2a1d56454da5a3775fd3ed0ba --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/optimizer_10000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fca96eb06f8fcae983adc5b5fd192a2b465e885c3ce28a93a16dff13711aed2a +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_12500.pt b/attention_kindselective_n_heads2_seed1340/optimizer_12500.pt new file mode 100644 index 0000000000000000000000000000000000000000..e88626bc78c9ac9dd32fa225f24f68bc35733a8a --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/optimizer_12500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:976e03738be53dea6d55c73ba3ec3c146d8423d50974c2d70bf10f0f5942be6a +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_15000.pt b/attention_kindselective_n_heads2_seed1340/optimizer_15000.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3018d53784aba1d372484db42f40270fd0cda22 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/optimizer_15000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccb9e25a713df0525fc8663aa13e51a23de1821fe91a03628167abe36ea38647 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_17500.pt b/attention_kindselective_n_heads2_seed1340/optimizer_17500.pt new file mode 100644 index 0000000000000000000000000000000000000000..7634a54943f4f011c3c82902431864b85a260dc4 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/optimizer_17500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f09d15b8a367be854baf5c8608688081f0710680a6364d6cfac81c9d751acf67 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_20000.pt b/attention_kindselective_n_heads2_seed1340/optimizer_20000.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0ee652acbf3a347d34dfbe4427bb3e7cf9a8ecb --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/optimizer_20000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e238640eaa708aa5d306f56d91790321b5bfa8e6c0dc770b07dea34c1eb7de1f +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_22500.pt b/attention_kindselective_n_heads2_seed1340/optimizer_22500.pt new file mode 100644 index 0000000000000000000000000000000000000000..84e6ab01d60ca18cc119a285a013a76d3bc63555 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/optimizer_22500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b84fcade25645820cc712b0b50986685542b78af0f11c9d5779617e41883b28 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_25000.pt b/attention_kindselective_n_heads2_seed1340/optimizer_25000.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f6a6c47fa6c94911221e15600ef85761c2dd254 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/optimizer_25000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0345fb665f4a9124835ba1fa36906d37e6402ad0cc34129b35cf08f1f03a412 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_27500.pt b/attention_kindselective_n_heads2_seed1340/optimizer_27500.pt new file mode 100644 index 0000000000000000000000000000000000000000..02f46a54167993958dcec795e3d5294c1b23bc62 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/optimizer_27500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a7b48a02e56eaa60ecd353f6f6ca16dabbe3aca0844593838434ead44e59fa4 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_30000.pt b/attention_kindselective_n_heads2_seed1340/optimizer_30000.pt new file mode 100644 index 0000000000000000000000000000000000000000..251420b3569496e38ffebfdea90d17f9009b363f --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/optimizer_30000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b2deb47856a5a5c04ac084381911f4eb2ed9d4f2e122cf75eaa373d6a0d06d9 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_32500.pt b/attention_kindselective_n_heads2_seed1340/optimizer_32500.pt new file mode 100644 index 0000000000000000000000000000000000000000..9db9de0a936a8cc94cbcdc4b2f807195876caa43 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/optimizer_32500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2758cd788b5b1aa7913b9d292af233fdef11fced987cca41dc5f9a8ebfdd73f6 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_35000.pt b/attention_kindselective_n_heads2_seed1340/optimizer_35000.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5358ea7ff52cddf598f473011d4eb2c9c1e3dc7 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/optimizer_35000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:287a9f12979f3a2fe6db8b8ed14e540067d6f702530f08fae301c9ed10d640b0 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_37500.pt b/attention_kindselective_n_heads2_seed1340/optimizer_37500.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9aed3ba8951bd7e6de0a2fceffad98f76d2661d --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/optimizer_37500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39099accc12524192b5e056c310a41a15cb2732a5494880dc53425c10f951945 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_40000.pt b/attention_kindselective_n_heads2_seed1340/optimizer_40000.pt new file mode 100644 index 0000000000000000000000000000000000000000..21d171adaebf334ce24615a075c0b8f82b149188 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/optimizer_40000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfa8c38e106b13173b0d03014522cc51af92bb1d830433a5e97e26aa89b43989 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_42500.pt b/attention_kindselective_n_heads2_seed1340/optimizer_42500.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8bbb71550a87f48ef7b4157cdcfc558e759541f --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/optimizer_42500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1624e56abfb76aa3296c9513beeac34e72980c3261c49b8b9fff585341b61b64 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_45000.pt b/attention_kindselective_n_heads2_seed1340/optimizer_45000.pt new file mode 100644 index 0000000000000000000000000000000000000000..9274490b3ec45c144e024765474854960787cab0 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/optimizer_45000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db558aca5edc7dc4484cfa7ed272ef293cac9a8a59c2e7ecb072dcf22b3c4fc1 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_47500.pt b/attention_kindselective_n_heads2_seed1340/optimizer_47500.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb737ad70de4bd04e828bb7e077d639d7996f34d --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/optimizer_47500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a33cd629f7b0ddcb5fe3a20bc9eeb53fd933901026e245136c3cc30eeeb0cf06 +size 70895430 diff --git a/attention_kindselective_n_heads2_seed1340/optimizer_49999.pt b/attention_kindselective_n_heads2_seed1340/optimizer_49999.pt new file mode 100644 index 0000000000000000000000000000000000000000..99f19698ac2e6817ae276b499b50670157d25f66 --- /dev/null +++ b/attention_kindselective_n_heads2_seed1340/optimizer_49999.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0b2e4ee681ee034716693975b2f3696f223c079ff54afce878108a7d45599c2 +size 70895430