andrew-healey commited on
Commit
e38c4b5
·
verified ·
1 Parent(s): c845ecb

Upload folder using huggingface_hub

Browse files
attention_kindselective_n_heads8_seed1340/args.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_6/attention_kindselective_n_heads8_seed1340", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 8, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_6", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1340, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 7e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "7e-5_10240_8_1340", "n_embd": 512}
attention_kindselective_n_heads8_seed1340/dataloader_02500.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b2ea67f78ff5a7970d0db044ff7ee527b3dc065f295fd30f588df4b44b568d0
3
+ size 964
attention_kindselective_n_heads8_seed1340/dataloader_05000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f03ed2ebf741f15e13c79e6cc1e9a19b308450d81cc3b4d8d0338c63d77ca59
3
+ size 964
attention_kindselective_n_heads8_seed1340/dataloader_07500.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82590037fb2eecbec961f7967a8dd1b8d85515d31a252f66b92b8139858a8b7c
3
+ size 964
attention_kindselective_n_heads8_seed1340/dataloader_09999.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c544303717d12355a69b8ffb1eb109434e4fdccfd5a61279b6e8ba2e870d6700
3
+ size 964
attention_kindselective_n_heads8_seed1340/log2.txt ADDED
@@ -0,0 +1,1209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ max_steps: 10000
2
+ 0 val loss 11.0744
3
+ 0 val perplexity 64500.1523
4
+ 0 val loss 11.0744
5
+ 0 val perplexity 64500.2734
6
+ 0 val loss 11.0744
7
+ 0 val perplexity 64501.0742
8
+ 0 val loss 11.0744
9
+ 0 val perplexity 64500.1523
10
+ 0 train 11.079541 (lr=3.5000e-07) (hash(x)=57791809)
11
+ 0 train 11.079541 (lr=1.5000e-07) (hash(x)=57791809)
12
+ 0 train 11.079541 (lr=2.5000e-07) (hash(x)=57791809)
13
+ 0 train 11.079541 (lr=1.0000e-07) (hash(x)=57791809)
14
+ 100 val loss 9.5254
15
+ 100 val perplexity 13704.0791
16
+ 100 train 9.545608 (lr=3.5350e-05) (hash(x)=48211824)
17
+ 100 val loss 9.9436
18
+ 100 val perplexity 20819.3242
19
+ 100 val loss 9.6354
20
+ 100 val perplexity 15296.7988
21
+ 100 train 9.972991 (lr=1.0100e-05) (hash(x)=48211824)
22
+ 100 train 9.657048 (lr=2.5250e-05) (hash(x)=48211824)
23
+ 100 val loss 9.7540
24
+ 100 val perplexity 17222.1250
25
+ 100 train 9.779591 (lr=1.5150e-05) (hash(x)=48211824)
26
+ 200 val loss 8.2580
27
+ 200 val perplexity 3858.4006
28
+ 200 train 8.204547 (lr=7.0000e-05) (hash(x)=50375849)
29
+ 200 val loss 9.4624
30
+ 200 val perplexity 12866.3037
31
+ 200 train 9.433879 (lr=2.0000e-05) (hash(x)=50375849)
32
+ 200 val loss 9.1285
33
+ 200 val perplexity 9214.0938
34
+ 200 val loss 8.8667
35
+ 200 val perplexity 7091.8809
36
+ 200 train 9.091208 (lr=3.0000e-05) (hash(x)=50375849)
37
+ 200 train 8.824692 (lr=5.0000e-05) (hash(x)=50375849)
38
+ 300 val loss 7.7658
39
+ 300 val perplexity 2358.4961
40
+ 300 train 8.038077 (lr=6.9984e-05) (hash(x)=57250808)
41
+ 300 val loss 8.8112
42
+ 300 val perplexity 6709.1323
43
+ 300 train 8.984029 (lr=1.9995e-05) (hash(x)=57250808)
44
+ 300 val loss 8.1672
45
+ 300 val perplexity 3523.3943
46
+ 300 train 8.388642 (lr=2.9993e-05) (hash(x)=57250808)
47
+ 300 val loss 7.9807
48
+ 300 val perplexity 2924.0942
49
+ 300 train 8.223899 (lr=4.9988e-05) (hash(x)=57250808)
50
+ 400 val loss 7.5800
51
+ 400 val perplexity 1958.6438
52
+ 400 train 8.198520 (lr=6.9935e-05) (hash(x)=62519858)
53
+ 400 val loss 8.2008
54
+ 400 val perplexity 3643.9265
55
+ 400 train 8.714503 (lr=1.9982e-05) (hash(x)=62519858)
56
+ 400 val loss 7.7380
57
+ 400 val perplexity 2293.7683
58
+ 400 train 8.321421 (lr=2.9972e-05) (hash(x)=62519858)
59
+ 400 val loss 7.6893
60
+ 400 val perplexity 2184.7803
61
+ 400 train 8.280112 (lr=4.9954e-05) (hash(x)=62519858)
62
+ 500 val loss 7.4864
63
+ 500 val perplexity 1783.6954
64
+ 500 train 7.365067 (lr=6.9854e-05) (hash(x)=47226806)
65
+ 500 val loss 7.9001
66
+ 500 val perplexity 2697.5540
67
+ 500 train 7.817379 (lr=1.9958e-05) (hash(x)=47226806)
68
+ 500 val loss 7.5787
69
+ 500 val perplexity 1956.1013
70
+ 500 train 7.463418 (lr=2.9938e-05) (hash(x)=47226806)
71
+ 500 val loss 7.5497
72
+ 500 val perplexity 1900.0834
73
+ 500 train 7.439267 (lr=4.9896e-05) (hash(x)=47226806)
74
+ 600 val loss 7.4098
75
+ 600 val perplexity 1652.0936
76
+ 600 train 7.449531 (lr=6.9741e-05) (hash(x)=51149322)
77
+ 600 val loss 7.7479
78
+ 600 val perplexity 2316.7852
79
+ 600 train 7.808668 (lr=1.9926e-05) (hash(x)=51149322)
80
+ 600 val loss 7.4978
81
+ 600 val perplexity 1804.1498
82
+ 600 train 7.543031 (lr=2.9889e-05) (hash(x)=51149322)
83
+ 600 val loss 7.4542
84
+ 600 val perplexity 1727.0232
85
+ 600 train 7.500797 (lr=4.9815e-05) (hash(x)=51149322)
86
+ 700 val loss 7.3662
87
+ 700 val perplexity 1581.5858
88
+ 700 train 7.328444 (lr=6.9596e-05) (hash(x)=51564551)
89
+ 700 val loss 7.6381
90
+ 700 val perplexity 2075.7202
91
+ 700 train 7.601674 (lr=1.9885e-05) (hash(x)=51564551)
92
+ 700 val loss 7.4472
93
+ 700 val perplexity 1715.0033
94
+ 700 train 7.412354 (lr=2.9827e-05) (hash(x)=51564551)
95
+ 700 val loss 7.4139
96
+ 700 val perplexity 1658.8911
97
+ 700 train 7.377543 (lr=4.9712e-05) (hash(x)=51564551)
98
+ 800 val loss 7.3339
99
+ 800 val perplexity 1531.3365
100
+ 800 train 7.086210 (lr=6.9419e-05) (hash(x)=45093459)
101
+ 800 val loss 7.5613
102
+ 800 val perplexity 1922.2756
103
+ 800 train 7.341402 (lr=1.9834e-05) (hash(x)=45093459)
104
+ 800 val loss 7.3970
105
+ 800 val perplexity 1631.0750
106
+ 800 train 7.159473 (lr=2.9751e-05) (hash(x)=45093459)
107
+ 800 val loss 7.3797
108
+ 800 val perplexity 1603.0342
109
+ 800 train 7.141194 (lr=4.9585e-05) (hash(x)=45093459)
110
+ 900 val loss 7.3133
111
+ 900 val perplexity 1500.0953
112
+ 900 train 7.530349 (lr=6.9210e-05) (hash(x)=54988361)
113
+ 900 val loss 7.5042
114
+ 900 val perplexity 1815.7388
115
+ 900 train 7.794142 (lr=1.9774e-05) (hash(x)=54988361)
116
+ 900 val loss 7.3566
117
+ 900 val perplexity 1566.5011
118
+ 900 train 7.629829 (lr=2.9662e-05) (hash(x)=54988361)
119
+ 900 val loss 7.3483
120
+ 900 val perplexity 1553.5599
121
+ 900 train 7.611835 (lr=4.9436e-05) (hash(x)=54988361)
122
+ 1000 val loss 7.3015
123
+ 1000 val perplexity 1482.5197
124
+ 1000 train 7.151457 (lr=6.8970e-05) (hash(x)=47588648)
125
+ 1000 val loss 7.4704
126
+ 1000 val perplexity 1755.3352
127
+ 1000 train 7.308885 (lr=1.9706e-05) (hash(x)=47588648)
128
+ 1000 val loss 7.3298
129
+ 1000 val perplexity 1525.1024
130
+ 1000 train 7.173532 (lr=2.9558e-05) (hash(x)=47588648)
131
+ 1000 val loss 7.3243
132
+ 1000 val perplexity 1516.7701
133
+ 1000 train 7.170599 (lr=4.9264e-05) (hash(x)=47588648)
134
+ 1100 val loss 7.2816
135
+ 1100 val perplexity 1453.3807
136
+ 1100 val loss 7.4328
137
+ 1100 val perplexity 1690.5720
138
+ 1100 train 6.891258 (lr=6.8698e-05) (hash(x)=37984588)
139
+ 1100 train 7.063292 (lr=1.9628e-05) (hash(x)=37984588)
140
+ 1100 val loss 7.2966
141
+ 1100 val perplexity 1475.2267
142
+ 1100 train 6.895032 (lr=2.9442e-05) (hash(x)=37984588)
143
+ 1100 val loss 7.2944
144
+ 1100 val perplexity 1472.0934
145
+ 1100 train 6.891774 (lr=4.9070e-05) (hash(x)=37984588)
146
+ 1200 val loss 7.2477
147
+ 1200 val perplexity 1404.8997
148
+ 1200 train 7.403861 (lr=6.8395e-05) (hash(x)=56333817)
149
+ 1200 val loss 7.4056
150
+ 1200 val perplexity 1645.1812
151
+ 1200 train 7.572041 (lr=1.9542e-05) (hash(x)=56333817)
152
+ 1200 val loss 7.2510
153
+ 1200 val perplexity 1409.5781
154
+ 1200 train 7.413085 (lr=2.9312e-05) (hash(x)=56333817)
155
+ 1200 val loss 7.2683
156
+ 1200 val perplexity 1434.0852
157
+ 1200 train 7.427673 (lr=4.8854e-05) (hash(x)=56333817)
158
+ 1300 val loss 7.2387
159
+ 1300 val perplexity 1392.2408
160
+ 1300 val loss 7.3904
161
+ 1300 val perplexity 1620.3455
162
+ 1300 train 7.393292 (lr=6.8062e-05) (hash(x)=53454056)
163
+ 1300 train 7.515656 (lr=1.9446e-05) (hash(x)=53454056)
164
+ 1300 val loss 7.2148
165
+ 1300 val perplexity 1359.3795
166
+ 1300 train 7.371761 (lr=2.9169e-05) (hash(x)=53454056)
167
+ 1300 val loss 7.2425
168
+ 1300 val perplexity 1397.6387
169
+ 1300 train 7.405399 (lr=4.8616e-05) (hash(x)=53454056)
170
+ 1400 val loss 7.2102
171
+ 1400 val perplexity 1353.1600
172
+ 1400 train 7.318605 (lr=6.7698e-05) (hash(x)=55284163)
173
+ 1400 val loss 7.3416
174
+ 1400 val perplexity 1543.2137
175
+ 1400 train 7.421918 (lr=1.9342e-05) (hash(x)=55284163)
176
+ 1400 val loss 7.1703
177
+ 1400 val perplexity 1300.1732
178
+ 1400 train 7.267991 (lr=2.9013e-05) (hash(x)=55284163)
179
+ 1400 val loss 7.2076
180
+ 1400 val perplexity 1349.6500
181
+ 1400 train 7.308177 (lr=4.8356e-05) (hash(x)=55284163)
182
+ 1500 val loss 7.1755
183
+ 1500 val perplexity 1307.0302
184
+ 1500 train 6.990077 (lr=6.7304e-05) (hash(x)=48162598)
185
+ 1500 val loss 7.3069
186
+ 1500 val perplexity 1490.5138
187
+ 1500 train 7.126318 (lr=1.9230e-05) (hash(x)=48162598)
188
+ 1500 val loss 7.1275
189
+ 1500 val perplexity 1245.7611
190
+ 1500 train 6.953938 (lr=2.8845e-05) (hash(x)=48162598)
191
+ 1500 val loss 7.1882
192
+ 1500 val perplexity 1323.7600
193
+ 1500 train 7.001610 (lr=4.8074e-05) (hash(x)=48162598)
194
+ 1600 val loss 7.1604
195
+ 1600 val perplexity 1287.4865
196
+ 1600 train 7.235291 (lr=6.6881e-05) (hash(x)=54214535)
197
+ 1600 val loss 7.2764
198
+ 1600 val perplexity 1445.7960
199
+ 1600 train 7.336366 (lr=1.9109e-05) (hash(x)=54214535)
200
+ 1600 val loss 7.0945
201
+ 1600 val perplexity 1205.3110
202
+ 1600 train 7.182061 (lr=2.8663e-05) (hash(x)=54214535)
203
+ 1600 val loss 7.1639
204
+ 1600 val perplexity 1291.9230
205
+ 1600 train 7.246048 (lr=4.7772e-05) (hash(x)=54214535)
206
+ 1700 val loss 7.1594
207
+ 1700 val perplexity 1286.1304
208
+ 1700 train 7.379553 (lr=6.6428e-05) (hash(x)=53525003)
209
+ 1700 val loss 7.2678
210
+ 1700 val perplexity 1433.3823
211
+ 1700 train 7.495866 (lr=1.8979e-05) (hash(x)=53525003)
212
+ 1700 val loss 7.0801
213
+ 1700 val perplexity 1188.0975
214
+ 1700 train 7.320861 (lr=2.8469e-05) (hash(x)=53525003)
215
+ 1700 val loss 7.1502
216
+ 1700 val perplexity 1274.3279
217
+ 1700 train 7.380573 (lr=4.7448e-05) (hash(x)=53525003)
218
+ 1800 val loss 7.1304
219
+ 1800 val perplexity 1249.4257
220
+ 1800 train 7.203484 (lr=6.5947e-05) (hash(x)=51848994)
221
+ 1800 val loss 7.2311
222
+ 1800 val perplexity 1381.7793
223
+ 1800 train 7.303027 (lr=1.8842e-05) (hash(x)=51848994)
224
+ 1800 val loss 7.0370
225
+ 1800 val perplexity 1137.9156
226
+ 1800 train 7.110418 (lr=2.8263e-05) (hash(x)=51848994)
227
+ 1800 val loss 7.1128
228
+ 1800 val perplexity 1227.6375
229
+ 1800 train 7.184612 (lr=4.7105e-05) (hash(x)=51848994)
230
+ 1900 val loss 7.1304
231
+ 1900 val perplexity 1249.3917
232
+ 1900 train 6.945359 (lr=6.5437e-05) (hash(x)=48405987)
233
+ 1900 val loss 7.2135
234
+ 1900 val perplexity 1357.6111
235
+ 1900 train 7.020858 (lr=1.8696e-05) (hash(x)=48405987)
236
+ 1900 val loss 7.0105
237
+ 1900 val perplexity 1108.2396
238
+ 1900 train 6.820072 (lr=2.8044e-05) (hash(x)=48405987)
239
+ 1900 val loss 7.0923
240
+ 1900 val perplexity 1202.6534
241
+ 1900 train 6.898511 (lr=4.6741e-05) (hash(x)=48405987)
242
+ 2000 val loss 7.0969
243
+ 2000 val perplexity 1208.1680
244
+ 2000 train 7.451231 (lr=6.4900e-05) (hash(x)=58592291)
245
+ 2000 val loss 7.1832
246
+ 2000 val perplexity 1317.0741
247
+ 2000 train 7.533998 (lr=1.8543e-05) (hash(x)=58592291)
248
+ 2000 val loss 6.9804
249
+ 2000 val perplexity 1075.3429
250
+ 2000 train 7.339841 (lr=2.7814e-05) (hash(x)=58592291)
251
+ 2000 val loss 7.0678
252
+ 2000 val perplexity 1173.6071
253
+ 2000 train 7.429214 (lr=4.6357e-05) (hash(x)=58592291)
254
+ 2100 val loss 7.0806
255
+ 2100 val perplexity 1188.7402
256
+ 2100 train 7.179558 (lr=6.4335e-05) (hash(x)=51167081)
257
+ 2100 val loss 7.1699
258
+ 2100 val perplexity 1299.7511
259
+ 2100 train 7.262040 (lr=1.8382e-05) (hash(x)=51167081)
260
+ 2100 val loss 6.9650
261
+ 2100 val perplexity 1058.9535
262
+ 2100 train 7.059167 (lr=2.7572e-05) (hash(x)=51167081)
263
+ 2100 val loss 7.0631
264
+ 2100 val perplexity 1168.0159
265
+ 2100 train 7.157566 (lr=4.5954e-05) (hash(x)=51167081)
266
+ 2200 val loss 7.0772
267
+ 2200 val perplexity 1184.5981
268
+ 2200 train 7.055668 (lr=6.3745e-05) (hash(x)=47994988)
269
+ 2200 val loss 7.1410
270
+ 2200 val perplexity 1262.7300
271
+ 2200 train 7.130829 (lr=1.8213e-05) (hash(x)=47994988)
272
+ 2200 val loss 6.9354
273
+ 2200 val perplexity 1028.0294
274
+ 2200 train 6.934694 (lr=2.7319e-05) (hash(x)=47994988)
275
+ 2200 val loss 7.0341
276
+ 2200 val perplexity 1134.6560
277
+ 2200 train 7.013680 (lr=4.5532e-05) (hash(x)=47994988)
278
+ 2300 val loss 7.0683
279
+ 2300 val perplexity 1174.1478
280
+ 2300 train 7.064332 (lr=6.3128e-05) (hash(x)=47377604)
281
+ 2300 val loss 7.1320
282
+ 2300 val perplexity 1251.4082
283
+ 2300 train 7.126850 (lr=1.8036e-05) (hash(x)=47377604)
284
+ 2300 val loss 6.9205
285
+ 2300 val perplexity 1012.7773
286
+ 2300 train 6.902592 (lr=2.7055e-05) (hash(x)=47377604)
287
+ 2400 val loss 7.0480
288
+ 2400 val perplexity 1150.5189
289
+ 2300 val loss 7.0176
290
+ 2300 val perplexity 1116.0878
291
+ 2400 train 7.102792 (lr=6.2486e-05) (hash(x)=53554323)
292
+ 2300 train 7.011597 (lr=4.5091e-05) (hash(x)=47377604)
293
+ 2400 val loss 7.1077
294
+ 2400 val perplexity 1221.3223
295
+ 2400 train 7.168054 (lr=1.7853e-05) (hash(x)=53554323)
296
+ 2400 val loss 6.8895
297
+ 2400 val perplexity 981.9048
298
+ 2400 train 6.956583 (lr=2.6780e-05) (hash(x)=53554323)
299
+ 2500 val loss 7.0545
300
+ 2500 val perplexity 1158.1093
301
+ 2400 val loss 6.9913
302
+ 2400 val perplexity 1087.1801
303
+ 2500 val loss 7.0940
304
+ 2500 val perplexity 1204.7324
305
+ 2500 train 7.091203 (lr=6.1819e-05) (hash(x)=50780417)
306
+ 2400 train 7.054268 (lr=4.4633e-05) (hash(x)=53554323)
307
+ 2500 train 7.134183 (lr=1.7663e-05) (hash(x)=50780417)
308
+ 2500 val loss 6.8720
309
+ 2500 val perplexity 964.8711
310
+ 2500 train 6.929877 (lr=2.6494e-05) (hash(x)=50780417)
311
+ 2600 val loss 7.0427
312
+ 2600 val perplexity 1144.4280
313
+ 2600 train 6.935129 (lr=6.1128e-05) (hash(x)=46453562)
314
+ 2500 val loss 6.9796
315
+ 2500 val perplexity 1074.4991
316
+ 2600 val loss 7.0705
317
+ 2600 val perplexity 1176.6801
318
+ 2600 train 6.983191 (lr=1.7465e-05) (hash(x)=46453562)
319
+ 2500 train 7.024747 (lr=4.4156e-05) (hash(x)=50780417)
320
+ 2600 val loss 6.8481
321
+ 2600 val perplexity 942.0485
322
+ 2600 train 6.759700 (lr=2.6198e-05) (hash(x)=46453562)
323
+ 2700 val loss 7.0361
324
+ 2700 val perplexity 1136.9296
325
+ 2700 train 7.019847 (lr=6.0414e-05) (hash(x)=54404221)
326
+ 2700 val loss 7.0556
327
+ 2700 val perplexity 1159.3474
328
+ 2700 train 7.057378 (lr=1.7261e-05) (hash(x)=54404221)
329
+ 2600 val loss 6.9612
330
+ 2600 val perplexity 1054.9165
331
+ 2600 train 6.864810 (lr=4.3663e-05) (hash(x)=46453562)
332
+ 2700 val loss 6.8362
333
+ 2700 val perplexity 930.9543
334
+ 2700 train 6.829805 (lr=2.5892e-05) (hash(x)=54404221)
335
+ 2800 val loss 7.0333
336
+ 2800 val perplexity 1133.7198
337
+ 2800 train 7.761267 (lr=5.9677e-05) (hash(x)=59318895)
338
+ 2800 val loss 7.0442
339
+ 2800 val perplexity 1146.1642
340
+ 2800 train 7.818988 (lr=1.7051e-05) (hash(x)=59318895)
341
+ 2700 val loss 6.9554
342
+ 2700 val perplexity 1048.7799
343
+ 2700 train 6.948943 (lr=4.3153e-05) (hash(x)=54404221)
344
+ 2800 val loss 6.8236
345
+ 2800 val perplexity 919.3036
346
+ 2800 train 7.550114 (lr=2.5576e-05) (hash(x)=59318895)
347
+ 2900 val loss 7.0409
348
+ 2900 val perplexity 1142.4042
349
+ 2900 train 7.038341 (lr=5.8919e-05) (hash(x)=47845760)
350
+ 2900 val loss 7.0287
351
+ 2900 val perplexity 1128.5403
352
+ 2900 train 7.030903 (lr=1.6834e-05) (hash(x)=47845760)
353
+ 2800 val loss 6.9471
354
+ 2800 val perplexity 1040.0840
355
+ 2800 train 7.687397 (lr=4.2627e-05) (hash(x)=59318895)
356
+ 2900 val loss 6.8010
357
+ 2900 val perplexity 898.7311
358
+ 2900 train 6.806674 (lr=2.5251e-05) (hash(x)=47845760)
359
+ 3000 val loss 7.0295
360
+ 3000 val perplexity 1129.4146
361
+ 3000 train 6.731463 (lr=5.8140e-05) (hash(x)=44336167)
362
+ 3000 val loss 7.0128
363
+ 3000 val perplexity 1110.7865
364
+ 3000 train 6.712865 (lr=1.6611e-05) (hash(x)=44336167)
365
+ 2900 val loss 6.9376
366
+ 2900 val perplexity 1030.3340
367
+ 2900 train 6.940581 (lr=4.2085e-05) (hash(x)=47845760)
368
+ 3000 val loss 6.7838
369
+ 3000 val perplexity 883.4333
370
+ 3000 train 6.473313 (lr=2.4917e-05) (hash(x)=44336167)
371
+ 3100 val loss 7.0272
372
+ 3100 val perplexity 1126.9003
373
+ 3100 train 7.329674 (lr=5.7340e-05) (hash(x)=44479330)
374
+ 3100 val loss 6.9967
375
+ 3100 val perplexity 1093.0607
376
+ 3100 train 7.308694 (lr=1.6383e-05) (hash(x)=44479330)
377
+ 3000 val loss 6.9218
378
+ 3000 val perplexity 1014.1106
379
+ 3000 train 6.614216 (lr=4.1529e-05) (hash(x)=44336167)
380
+ 3100 val loss 6.7648
381
+ 3100 val perplexity 866.7640
382
+ 3100 train 7.090666 (lr=2.4574e-05) (hash(x)=44479330)
383
+ 3200 val loss 7.0308
384
+ 3200 val perplexity 1130.9462
385
+ 3200 train 7.100553 (lr=5.6522e-05) (hash(x)=54593096)
386
+ 3200 val loss 6.9944
387
+ 3200 val perplexity 1090.5176
388
+ 3200 train 7.079770 (lr=1.6149e-05) (hash(x)=54593096)
389
+ 3100 val loss 6.9163
390
+ 3100 val perplexity 1008.5403
391
+ 3100 train 7.235619 (lr=4.0957e-05) (hash(x)=44479330)
392
+ 3200 val loss 6.7609
393
+ 3200 val perplexity 863.3954
394
+ 3200 train 6.855031 (lr=2.4224e-05) (hash(x)=54593096)
395
+ 3300 val loss 7.0069
396
+ 3300 val perplexity 1104.2101
397
+ 3300 train 6.964014 (lr=5.5684e-05) (hash(x)=45347643)
398
+ 3300 val loss 6.9772
399
+ 3300 val perplexity 1071.8771
400
+ 3300 train 6.932001 (lr=1.5910e-05) (hash(x)=45347643)
401
+ 3200 val loss 6.9159
402
+ 3200 val perplexity 1008.1403
403
+ 3200 train 7.010983 (lr=4.0373e-05) (hash(x)=54593096)
404
+ 3300 val loss 6.7374
405
+ 3300 val perplexity 843.3978
406
+ 3300 train 6.732250 (lr=2.3865e-05) (hash(x)=45347643)
407
+ 3400 val loss 6.9972
408
+ 3400 val perplexity 1093.6138
409
+ 3400 train 7.143245 (lr=5.4829e-05) (hash(x)=47797247)
410
+ 3400 val loss 6.9505
411
+ 3400 val perplexity 1043.6790
412
+ 3400 train 7.096212 (lr=1.5666e-05) (hash(x)=47797247)
413
+ 3300 val loss 6.9194
414
+ 3300 val perplexity 1011.7419
415
+ 3300 train 6.877916 (lr=3.9775e-05) (hash(x)=45347643)
416
+ 3400 val loss 6.7233
417
+ 3400 val perplexity 831.5302
418
+ 3400 train 6.864102 (lr=2.3498e-05) (hash(x)=47797247)
419
+ 3500 val loss 6.9898
420
+ 3500 val perplexity 1085.4670
421
+ 3500 train 6.855062 (lr=5.3958e-05) (hash(x)=46115683)
422
+ 3500 val loss 6.9364
423
+ 3500 val perplexity 1029.0530
424
+ 3500 train 6.807493 (lr=1.5416e-05) (hash(x)=46115683)
425
+ 3400 val loss 6.8847
426
+ 3400 val perplexity 977.1854
427
+ 3400 train 7.030772 (lr=3.9164e-05) (hash(x)=47797247)
428
+ 3500 val loss 6.6976
429
+ 3500 val perplexity 810.4366
430
+ 3500 train 6.536535 (lr=2.3125e-05) (hash(x)=46115683)
431
+ 3600 val loss 6.9711
432
+ 3600 val perplexity 1065.3461
433
+ 3600 train 6.831712 (lr=5.3070e-05) (hash(x)=44502074)
434
+ 3600 val loss 6.9254
435
+ 3600 val perplexity 1017.8438
436
+ 3600 train 6.797766 (lr=1.5163e-05) (hash(x)=44502074)
437
+ 3500 val loss 6.8773
438
+ 3500 val perplexity 969.9569
439
+ 3500 train 6.728076 (lr=3.8541e-05) (hash(x)=46115683)
440
+ 3600 val loss 6.6856
441
+ 3600 val perplexity 800.8084
442
+ 3600 train 6.566401 (lr=2.2744e-05) (hash(x)=44502074)
443
+ 3700 val loss 6.9689
444
+ 3700 val perplexity 1063.0332
445
+ 3700 train 7.124398 (lr=5.2167e-05) (hash(x)=55388443)
446
+ 3700 val loss 6.9106
447
+ 3700 val perplexity 1002.8619
448
+ 3700 train 7.083196 (lr=1.4905e-05) (hash(x)=55388443)
449
+ 3700 val loss 6.6702
450
+ 3700 val perplexity 788.5517
451
+ 3600 val loss 6.8693
452
+ 3600 val perplexity 962.2554
453
+ 3700 train 6.845844 (lr=2.2357e-05) (hash(x)=55388443)
454
+ 3600 train 6.741370 (lr=3.7907e-05) (hash(x)=44502074)
455
+ 3800 val loss 6.9651
456
+ 3800 val perplexity 1058.9833
457
+ 3800 train 6.742047 (lr=5.1251e-05) (hash(x)=43790341)
458
+ 3800 val loss 6.8986
459
+ 3800 val perplexity 990.8743
460
+ 3800 train 6.670383 (lr=1.4643e-05) (hash(x)=43790341)
461
+ 3800 val loss 6.6571
462
+ 3800 val perplexity 778.3168
463
+ 3800 train 6.436588 (lr=2.1965e-05) (hash(x)=43790341)
464
+ 3700 val loss 6.8565
465
+ 3700 val perplexity 950.0204
466
+ 3700 train 7.029319 (lr=3.7262e-05) (hash(x)=55388443)
467
+ 3900 val loss 6.9636
468
+ 3900 val perplexity 1057.3928
469
+ 3900 train 6.931797 (lr=5.0321e-05) (hash(x)=50013318)
470
+ 3900 val loss 6.8866
471
+ 3900 val perplexity 979.1108
472
+ 3900 train 6.861619 (lr=1.4377e-05) (hash(x)=50013318)
473
+ 3900 val loss 6.6434
474
+ 3900 val perplexity 767.6965
475
+ 3900 train 6.618001 (lr=2.1566e-05) (hash(x)=50013318)
476
+ 3800 val loss 6.8465
477
+ 3800 val perplexity 940.5781
478
+ 3800 train 6.626799 (lr=3.6608e-05) (hash(x)=43790341)
479
+ 4000 val loss 6.9586
480
+ 4000 val perplexity 1052.1274
481
+ 4000 train 7.005599 (lr=4.9379e-05) (hash(x)=51704787)
482
+ 4000 val loss 6.8761
483
+ 4000 val perplexity 968.8410
484
+ 4000 train 6.948847 (lr=1.4108e-05) (hash(x)=51704787)
485
+ 4000 val loss 6.6273
486
+ 4000 val perplexity 755.4662
487
+ 4000 train 6.686940 (lr=2.1162e-05) (hash(x)=51704787)
488
+ 3900 val loss 6.8405
489
+ 3900 val perplexity 934.9680
490
+ 3900 train 6.808554 (lr=3.5944e-05) (hash(x)=50013318)
491
+ 4100 val loss 6.9637
492
+ 4100 val perplexity 1057.5693
493
+ 4100 train 7.027716 (lr=4.8426e-05) (hash(x)=50821964)
494
+ 4100 val loss 6.8649
495
+ 4100 val perplexity 958.0593
496
+ 4100 train 6.937582 (lr=1.3836e-05) (hash(x)=50821964)
497
+ 4100 val loss 6.6167
498
+ 4100 val perplexity 747.4631
499
+ 4100 train 6.681018 (lr=2.0754e-05) (hash(x)=50821964)
500
+ 4000 val loss 6.8263
501
+ 4000 val perplexity 921.7486
502
+ 4000 train 6.867013 (lr=3.5271e-05) (hash(x)=51704787)
503
+ 4200 val loss 6.9647
504
+ 4200 val perplexity 1058.6071
505
+ 4200 train 6.991130 (lr=4.7463e-05) (hash(x)=49675080)
506
+ 4200 val loss 6.8555
507
+ 4200 val perplexity 949.0610
508
+ 4200 train 6.890470 (lr=1.3561e-05) (hash(x)=49675080)
509
+ 4200 val loss 6.6082
510
+ 4200 val perplexity 741.1528
511
+ 4200 train 6.649536 (lr=2.0341e-05) (hash(x)=49675080)
512
+ 4100 val loss 6.8227
513
+ 4100 val perplexity 918.4326
514
+ 4100 train 6.889338 (lr=3.4590e-05) (hash(x)=50821964)
515
+ 4300 val loss 6.9660
516
+ 4300 val perplexity 1059.9851
517
+ 4300 train 6.611563 (lr=4.6490e-05) (hash(x)=43239281)
518
+ 4300 val loss 6.8436
519
+ 4300 val perplexity 937.8506
520
+ 4300 train 6.500687 (lr=1.3283e-05) (hash(x)=43239281)
521
+ 4300 val loss 6.5957
522
+ 4300 val perplexity 731.9608
523
+ 4300 train 6.246088 (lr=1.9924e-05) (hash(x)=43239281)
524
+ 4200 val loss 6.8298
525
+ 4200 val perplexity 925.0024
526
+ 4200 train 6.867212 (lr=3.3902e-05) (hash(x)=49675080)
527
+ 4400 val loss 6.9531
528
+ 4400 val perplexity 1046.4331
529
+ 4400 train 6.618497 (lr=4.5509e-05) (hash(x)=45076737)
530
+ 4400 val loss 6.8358
531
+ 4400 val perplexity 930.5940
532
+ 4400 train 6.493437 (lr=1.3003e-05) (hash(x)=45076737)
533
+ 4400 val loss 6.5910
534
+ 4400 val perplexity 728.4740
535
+ 4400 train 6.258480 (lr=1.9504e-05) (hash(x)=45076737)
536
+ 4500 val loss 6.9524
537
+ 4500 val perplexity 1045.6770
538
+ 4500 train 7.078568 (lr=4.4521e-05) (hash(x)=57930262)
539
+ 4300 val loss 6.8005
540
+ 4300 val perplexity 898.3245
541
+ 4300 train 6.453156 (lr=3.3207e-05) (hash(x)=43239281)
542
+ 4500 val loss 6.8233
543
+ 4500 val perplexity 918.9745
544
+ 4500 train 6.945100 (lr=1.2720e-05) (hash(x)=57930262)
545
+ 4500 val loss 6.5777
546
+ 4500 val perplexity 718.9064
547
+ 4500 train 6.710505 (lr=1.9081e-05) (hash(x)=57930262)
548
+ 4600 val loss 6.9370
549
+ 4600 val perplexity 1029.6769
550
+ 4600 train 6.716589 (lr=4.3527e-05) (hash(x)=46721614)
551
+ 4400 val loss 6.8052
552
+ 4400 val perplexity 902.5490
553
+ 4600 val loss 6.8141
554
+ 4600 val perplexity 910.5617
555
+ 4400 train 6.455774 (lr=3.2507e-05) (hash(x)=45076737)
556
+ 4600 train 6.585222 (lr=1.2436e-05) (hash(x)=46721614)
557
+ 4600 val loss 6.5683
558
+ 4600 val perplexity 712.1240
559
+ 4600 train 6.316262 (lr=1.8655e-05) (hash(x)=46721614)
560
+ 4700 val loss 6.9392
561
+ 4700 val perplexity 1031.9817
562
+ 4700 train 6.750459 (lr=4.2528e-05) (hash(x)=49837920)
563
+ 4700 val loss 6.8054
564
+ 4700 val perplexity 902.6893
565
+ 4700 train 6.615937 (lr=1.2151e-05) (hash(x)=49837920)
566
+ 4500 val loss 6.7860
567
+ 4500 val perplexity 885.3331
568
+ 4500 train 6.911989 (lr=3.1801e-05) (hash(x)=57930262)
569
+ 4700 val loss 6.5557
570
+ 4700 val perplexity 703.2458
571
+ 4700 train 6.345678 (lr=1.8226e-05) (hash(x)=49837920)
572
+ 4800 val loss 6.9232
573
+ 4800 val perplexity 1015.5730
574
+ 4800 train 7.062541 (lr=4.1525e-05) (hash(x)=48380045)
575
+ 4800 val loss 6.7953
576
+ 4800 val perplexity 893.6684
577
+ 4800 train 6.939090 (lr=1.1864e-05) (hash(x)=48380045)
578
+ 4600 val loss 6.7746
579
+ 4600 val perplexity 875.3221
580
+ 4600 train 6.539246 (lr=3.1091e-05) (hash(x)=46721614)
581
+ 4800 val loss 6.5463
582
+ 4800 val perplexity 696.6906
583
+ 4800 train 6.705057 (lr=1.7796e-05) (hash(x)=48380045)
584
+ 4900 val loss 6.9166
585
+ 4900 val perplexity 1008.8375
586
+ 4900 train 6.791525 (lr=4.0518e-05) (hash(x)=44202577)
587
+ 4900 val loss 6.7833
588
+ 4900 val perplexity 882.9680
589
+ 4900 train 6.638532 (lr=1.1577e-05) (hash(x)=44202577)
590
+ 4700 val loss 6.7656
591
+ 4700 val perplexity 867.4582
592
+ 4700 train 6.562875 (lr=3.0377e-05) (hash(x)=49837920)
593
+ 4900 val loss 6.5368
594
+ 4900 val perplexity 690.0550
595
+ 4900 train 6.406009 (lr=1.7365e-05) (hash(x)=44202577)
596
+ 5000 val loss 6.9087
597
+ 5000 val perplexity 1000.9222
598
+ 5000 train 6.899564 (lr=3.9510e-05) (hash(x)=52038024)
599
+ 5000 val loss 6.7696
600
+ 5000 val perplexity 870.9617
601
+ 5000 train 6.774422 (lr=1.1288e-05) (hash(x)=52038024)
602
+ 4800 val loss 6.7573
603
+ 4800 val perplexity 860.2833
604
+ 4800 train 6.900402 (lr=2.9661e-05) (hash(x)=48380045)
605
+ 5000 val loss 6.5254
606
+ 5000 val perplexity 682.2469
607
+ 5000 train 6.508739 (lr=1.6933e-05) (hash(x)=52038024)
608
+ 5100 val loss 6.8995
609
+ 5100 val perplexity 991.7739
610
+ 5100 train 7.089410 (lr=3.8500e-05) (hash(x)=53700038)
611
+ 5100 val loss 6.7566
612
+ 5100 val perplexity 859.7440
613
+ 5100 train 6.936323 (lr=1.1000e-05) (hash(x)=53700038)
614
+ 4900 val loss 6.7479
615
+ 4900 val perplexity 852.2854
616
+ 4900 train 6.608902 (lr=2.8942e-05) (hash(x)=44202577)
617
+ 5200 val loss 6.8979
618
+ 5200 val perplexity 990.1672
619
+ 5200 train 6.912906 (lr=3.7490e-05) (hash(x)=48137625)
620
+ 5100 val loss 6.5085
621
+ 5100 val perplexity 670.8521
622
+ 5100 train 6.672117 (lr=1.6500e-05) (hash(x)=53700038)
623
+ 5200 val loss 6.7483
624
+ 5200 val perplexity 852.6248
625
+ 5200 train 6.789814 (lr=1.0712e-05) (hash(x)=48137625)
626
+ 5000 val loss 6.7348
627
+ 5000 val perplexity 841.1663
628
+ 5000 train 6.733591 (lr=2.8221e-05) (hash(x)=52038024)
629
+ 5300 val loss 6.8939
630
+ 5300 val perplexity 986.2891
631
+ 5300 train 6.749971 (lr=3.6482e-05) (hash(x)=43161573)
632
+ 5200 val loss 6.5073
633
+ 5200 val perplexity 670.0002
634
+ 5200 train 6.552045 (lr=1.6067e-05) (hash(x)=48137625)
635
+ 5300 val loss 6.7398
636
+ 5300 val perplexity 845.4045
637
+ 5300 train 6.591602 (lr=1.0423e-05) (hash(x)=43161573)
638
+ 5100 val loss 6.7271
639
+ 5100 val perplexity 834.7480
640
+ 5100 train 6.914661 (lr=2.7500e-05) (hash(x)=53700038)
641
+ 5400 val loss 6.8924
642
+ 5400 val perplexity 984.7928
643
+ 5400 train 7.057383 (lr=3.5475e-05) (hash(x)=56673322)
644
+ 5300 val loss 6.4953
645
+ 5300 val perplexity 662.0350
646
+ 5300 train 6.333315 (lr=1.5635e-05) (hash(x)=43161573)
647
+ 5400 val loss 6.7355
648
+ 5400 val perplexity 841.7444
649
+ 5400 train 6.927864 (lr=1.0136e-05) (hash(x)=56673322)
650
+ 5200 val loss 6.7212
651
+ 5200 val perplexity 829.8277
652
+ 5200 train 6.751077 (lr=2.6779e-05) (hash(x)=48137625)
653
+ 5500 val loss 6.8680
654
+ 5500 val perplexity 961.0191
655
+ 5500 train 7.047554 (lr=3.4472e-05) (hash(x)=53468295)
656
+ 5400 val loss 6.4899
657
+ 5400 val perplexity 658.4774
658
+ 5500 val loss 6.7250
659
+ 5500 val perplexity 832.9433
660
+ 5400 train 6.677568 (lr=1.5204e-05) (hash(x)=56673322)
661
+ 5500 train 6.919577 (lr=9.8491e-06) (hash(x)=53468295)
662
+ 5300 val loss 6.7096
663
+ 5300 val perplexity 820.2217
664
+ 5300 train 6.554457 (lr=2.6058e-05) (hash(x)=43161573)
665
+ 5600 val loss 6.8605
666
+ 5600 val perplexity 953.8215
667
+ 5600 train 7.188080 (lr=3.3473e-05) (hash(x)=59287280)
668
+ 5600 val loss 6.7138
669
+ 5600 val perplexity 823.6735
670
+ 5500 val loss 6.4886
671
+ 5500 val perplexity 657.5712
672
+ 5600 train 7.046098 (lr=9.5636e-06) (hash(x)=59287280)
673
+ 5500 train 6.689147 (lr=1.4774e-05) (hash(x)=53468295)
674
+ 5400 val loss 6.7112
675
+ 5400 val perplexity 821.5338
676
+ 5400 train 6.880655 (lr=2.5339e-05) (hash(x)=56673322)
677
+ 5700 val loss 6.8700
678
+ 5700 val perplexity 962.9370
679
+ 5700 train 7.115635 (lr=3.2479e-05) (hash(x)=57575806)
680
+ 5700 val loss 6.7076
681
+ 5700 val perplexity 818.6053
682
+ 5700 train 6.936852 (lr=9.2796e-06) (hash(x)=57575806)
683
+ 5600 val loss 6.4805
684
+ 5600 val perplexity 652.3239
685
+ 5600 train 6.805542 (lr=1.4345e-05) (hash(x)=59287280)
686
+ 5500 val loss 6.7051
687
+ 5500 val perplexity 816.5854
688
+ 5500 train 6.890529 (lr=2.4623e-05) (hash(x)=53468295)
689
+ 5800 val loss 6.8754
690
+ 5800 val perplexity 968.1921
691
+ 5800 train 6.828178 (lr=3.1491e-05) (hash(x)=46897279)
692
+ 5800 val loss 6.7025
693
+ 5800 val perplexity 814.4427
694
+ 5800 train 6.697758 (lr=8.9973e-06) (hash(x)=46897279)
695
+ 5700 val loss 6.4664
696
+ 5700 val perplexity 643.1498
697
+ 5700 train 6.672063 (lr=1.3919e-05) (hash(x)=57575806)
698
+ 5600 val loss 6.6962
699
+ 5600 val perplexity 809.3145
700
+ 5600 train 7.011116 (lr=2.3909e-05) (hash(x)=59287280)
701
+ 5900 val loss 6.8609
702
+ 5900 val perplexity 954.2668
703
+ 5900 train 6.717267 (lr=3.0510e-05) (hash(x)=47565679)
704
+ 5900 val loss 6.6951
705
+ 5900 val perplexity 808.4362
706
+ 5900 train 6.575296 (lr=8.7171e-06) (hash(x)=47565679)
707
+ 5800 val loss 6.4659
708
+ 5800 val perplexity 642.8671
709
+ 5800 train 6.516923 (lr=1.3496e-05) (hash(x)=46897279)
710
+ 5700 val loss 6.6804
711
+ 5700 val perplexity 796.6323
712
+ 5700 train 6.919859 (lr=2.3199e-05) (hash(x)=57575806)
713
+ 6000 val loss 6.8543
714
+ 6000 val perplexity 947.9298
715
+ 6000 train 6.732675 (lr=2.9537e-05) (hash(x)=51590090)
716
+ 6000 val loss 6.6898
717
+ 6000 val perplexity 804.1218
718
+ 6000 train 6.565755 (lr=8.4393e-06) (hash(x)=51590090)
719
+ 5900 val loss 6.4598
720
+ 5900 val perplexity 638.9581
721
+ 5900 train 6.347005 (lr=1.3076e-05) (hash(x)=47565679)
722
+ 5800 val loss 6.6835
723
+ 5800 val perplexity 799.0714
724
+ 5800 train 6.678881 (lr=2.2493e-05) (hash(x)=46897279)
725
+ 6100 val loss 6.8477
726
+ 6100 val perplexity 941.7269
727
+ 6100 train 7.248047 (lr=2.8574e-05) (hash(x)=59732271)
728
+ 6100 val loss 6.6780
729
+ 6100 val perplexity 794.7079
730
+ 6100 train 7.095581 (lr=8.1640e-06) (hash(x)=59732271)
731
+ 6000 val loss 6.4558
732
+ 6000 val perplexity 636.3523
733
+ 6000 train 6.310637 (lr=1.2659e-05) (hash(x)=51590090)
734
+ 5900 val loss 6.6804
735
+ 5900 val perplexity 796.6262
736
+ 5900 train 6.560131 (lr=2.1793e-05) (hash(x)=47565679)
737
+ 6200 val loss 6.8392
738
+ 6200 val perplexity 933.7579
739
+ 6200 train 6.832141 (lr=2.7621e-05) (hash(x)=46394422)
740
+ 6200 val loss 6.6687
741
+ 6200 val perplexity 787.3771
742
+ 6200 train 6.698349 (lr=7.8917e-06) (hash(x)=46394422)
743
+ 6100 val loss 6.4453
744
+ 6100 val perplexity 629.7390
745
+ 6100 train 6.882823 (lr=1.2246e-05) (hash(x)=59732271)
746
+ 6000 val loss 6.6699
747
+ 6000 val perplexity 788.3404
748
+ 6000 train 6.557931 (lr=2.1098e-05) (hash(x)=51590090)
749
+ 6300 val loss 6.8440
750
+ 6300 val perplexity 938.2667
751
+ 6300 train 6.859726 (lr=2.6679e-05) (hash(x)=53748145)
752
+ 6300 val loss 6.6657
753
+ 6300 val perplexity 785.0262
754
+ 6300 train 6.699683 (lr=7.6226e-06) (hash(x)=53748145)
755
+ 6200 val loss 6.4406
756
+ 6200 val perplexity 626.7773
757
+ 6200 train 6.547752 (lr=1.1838e-05) (hash(x)=46394422)
758
+ 6100 val loss 6.6590
759
+ 6100 val perplexity 779.7674
760
+ 6100 train 7.079908 (lr=2.0410e-05) (hash(x)=59732271)
761
+ 6400 val loss 6.8349
762
+ 6400 val perplexity 929.7654
763
+ 6400 train 6.713243 (lr=2.5749e-05) (hash(x)=46054751)
764
+ 6400 val loss 6.6582
765
+ 6400 val perplexity 779.1374
766
+ 6400 train 6.540910 (lr=7.3569e-06) (hash(x)=46054751)
767
+ 6300 val loss 6.4352
768
+ 6300 val perplexity 623.3842
769
+ 6300 train 6.474613 (lr=1.1434e-05) (hash(x)=53748145)
770
+ 6500 val loss 6.8288
771
+ 6500 val perplexity 924.0612
772
+ 6500 train 7.062670 (lr=2.4833e-05) (hash(x)=51816809)
773
+ 6200 val loss 6.6508
774
+ 6200 val perplexity 773.4122
775
+ 6200 train 6.688356 (lr=1.9729e-05) (hash(x)=46394422)
776
+ 6500 val loss 6.6497
777
+ 6500 val perplexity 772.5416
778
+ 6500 train 6.918588 (lr=7.0950e-06) (hash(x)=51816809)
779
+ 6400 val loss 6.4275
780
+ 6400 val perplexity 618.6315
781
+ 6400 train 6.307833 (lr=1.1035e-05) (hash(x)=46054751)
782
+ 6600 val loss 6.8306
783
+ 6600 val perplexity 925.7701
784
+ 6600 train 6.681698 (lr=2.3930e-05) (hash(x)=52453336)
785
+ 6300 val loss 6.6429
786
+ 6300 val perplexity 767.3305
787
+ 6300 train 6.688078 (lr=1.9056e-05) (hash(x)=53748145)
788
+ 6600 val loss 6.6472
789
+ 6600 val perplexity 770.6442
790
+ 6600 train 6.543104 (lr=6.8372e-06) (hash(x)=52453336)
791
+ 6500 val loss 6.4223
792
+ 6500 val perplexity 615.4333
793
+ 6500 train 6.724624 (lr=1.0643e-05) (hash(x)=51816809)
794
+ 6700 val loss 6.8288
795
+ 6700 val perplexity 924.0959
796
+ 6700 train 6.853996 (lr=2.3042e-05) (hash(x)=49108775)
797
+ 6400 val loss 6.6365
798
+ 6400 val perplexity 762.4250
799
+ 6400 train 6.516543 (lr=1.8392e-05) (hash(x)=46054751)
800
+ 6700 val loss 6.6395
801
+ 6700 val perplexity 764.6962
802
+ 6700 train 6.668485 (lr=6.5835e-06) (hash(x)=49108775)
803
+ 6600 val loss 6.4189
804
+ 6600 val perplexity 613.3281
805
+ 6600 train 6.276807 (lr=1.0256e-05) (hash(x)=52453336)
806
+ 6800 val loss 6.8176
807
+ 6800 val perplexity 913.8017
808
+ 6800 train 6.734222 (lr=2.2171e-05) (hash(x)=46745396)
809
+ 6500 val loss 6.6280
810
+ 6500 val perplexity 755.9351
811
+ 6500 train 6.887875 (lr=1.7738e-05) (hash(x)=51816809)
812
+ 6800 val loss 6.6352
813
+ 6800 val perplexity 761.4408
814
+ 6800 train 6.550888 (lr=6.3345e-06) (hash(x)=46745396)
815
+ 6700 val loss 6.4147
816
+ 6700 val perplexity 610.7313
817
+ 6700 train 6.440950 (lr=9.8753e-06) (hash(x)=49108775)
818
+ 6900 val loss 6.8119
819
+ 6900 val perplexity 908.6169
820
+ 6900 train 6.969987 (lr=2.1316e-05) (hash(x)=46534986)
821
+ 6900 val loss 6.6270
822
+ 6900 val perplexity 755.2350
823
+ 6900 train 6.828880 (lr=6.0902e-06) (hash(x)=46534986)
824
+ 6600 val loss 6.6247
825
+ 6600 val perplexity 753.4476
826
+ 6600 train 6.471210 (lr=1.7093e-05) (hash(x)=52453336)
827
+ 6800 val loss 6.4107
828
+ 6800 val perplexity 608.3489
829
+ 6800 train 6.298498 (lr=9.5017e-06) (hash(x)=46745396)
830
+ 7000 val loss 6.8048
831
+ 7000 val perplexity 902.1484
832
+ 7000 train 7.197929 (lr=2.0478e-05) (hash(x)=49317888)
833
+ 7000 val loss 6.6200
834
+ 7000 val perplexity 749.9446
835
+ 7000 train 6.991396 (lr=5.8510e-06) (hash(x)=49317888)
836
+ 6900 val loss 6.4029
837
+ 6900 val perplexity 603.6034
838
+ 6700 val loss 6.6157
839
+ 6700 val perplexity 746.7203
840
+ 6900 train 6.632865 (lr=9.1353e-06) (hash(x)=46534986)
841
+ 6700 train 6.645592 (lr=1.6459e-05) (hash(x)=49108775)
842
+ 7100 val loss 6.8015
843
+ 7100 val perplexity 899.2177
844
+ 7100 train 6.811685 (lr=1.9660e-05) (hash(x)=50360484)
845
+ 7100 val loss 6.6150
846
+ 7100 val perplexity 746.1974
847
+ 7100 train 6.616625 (lr=5.6170e-06) (hash(x)=50360484)
848
+ 7000 val loss 6.3969
849
+ 7000 val perplexity 599.9835
850
+ 7000 train 6.782572 (lr=8.7764e-06) (hash(x)=49317888)
851
+ 6800 val loss 6.6173
852
+ 6800 val perplexity 747.9319
853
+ 6800 train 6.525100 (lr=1.5836e-05) (hash(x)=46745396)
854
+ 7200 val loss 6.7989
855
+ 7200 val perplexity 896.8526
856
+ 7200 train 6.673613 (lr=1.8860e-05) (hash(x)=49515094)
857
+ 7200 val loss 6.6101
858
+ 7200 val perplexity 742.5444
859
+ 7200 train 6.475906 (lr=5.3886e-06) (hash(x)=49515094)
860
+ 7100 val loss 6.3945
861
+ 7100 val perplexity 598.5322
862
+ 7100 train 6.410477 (lr=8.4255e-06) (hash(x)=50360484)
863
+ 6900 val loss 6.6010
864
+ 6900 val perplexity 735.8467
865
+ 6900 train 6.792136 (lr=1.5225e-05) (hash(x)=46534986)
866
+ 7300 val loss 6.7962
867
+ 7300 val perplexity 894.4124
868
+ 7300 train 6.783378 (lr=1.8081e-05) (hash(x)=51546861)
869
+ 7300 val loss 6.6118
870
+ 7300 val perplexity 743.8251
871
+ 7300 train 6.599928 (lr=5.1659e-06) (hash(x)=51546861)
872
+ 7200 val loss 6.3870
873
+ 7200 val perplexity 594.0425
874
+ 7200 train 6.228770 (lr=8.0829e-06) (hash(x)=49515094)
875
+ 7000 val loss 6.5920
876
+ 7000 val perplexity 729.2160
877
+ 7000 train 6.981744 (lr=1.4627e-05) (hash(x)=49317888)
878
+ 7400 val loss 6.7896
879
+ 7400 val perplexity 888.5211
880
+ 7400 train 6.785089 (lr=1.7323e-05) (hash(x)=48320948)
881
+ 7400 val loss 6.6037
882
+ 7400 val perplexity 737.7843
883
+ 7400 train 6.612427 (lr=4.9493e-06) (hash(x)=48320948)
884
+ 7300 val loss 6.3924
885
+ 7300 val perplexity 597.2842
886
+ 7300 train 6.382062 (lr=7.7489e-06) (hash(x)=51546861)
887
+ 7100 val loss 6.5873
888
+ 7100 val perplexity 725.8042
889
+ 7100 train 6.607678 (lr=1.4043e-05) (hash(x)=50360484)
890
+ 7500 val loss 6.7864
891
+ 7500 val perplexity 885.7453
892
+ 7500 train 6.726026 (lr=1.6586e-05) (hash(x)=40167457)
893
+ 7500 val loss 6.6045
894
+ 7500 val perplexity 738.4333
895
+ 7500 train 6.547146 (lr=4.7389e-06) (hash(x)=40167457)
896
+ 7400 val loss 6.3855
897
+ 7400 val perplexity 593.1888
898
+ 7400 train 6.382369 (lr=7.4239e-06) (hash(x)=48320948)
899
+ 7200 val loss 6.5815
900
+ 7200 val perplexity 721.6158
901
+ 7200 train 6.439557 (lr=1.3471e-05) (hash(x)=49515094)
902
+ 7600 val loss 6.7853
903
+ 7600 val perplexity 884.7520
904
+ 7600 train 6.721845 (lr=1.5872e-05) (hash(x)=49942165)
905
+ 7600 val loss 6.5947
906
+ 7600 val perplexity 731.2087
907
+ 7600 train 6.535936 (lr=4.5349e-06) (hash(x)=49942165)
908
+ 7500 val loss 6.3889
909
+ 7500 val perplexity 595.2056
910
+ 7500 train 6.335147 (lr=7.1083e-06) (hash(x)=40167457)
911
+ 7300 val loss 6.5827
912
+ 7300 val perplexity 722.4566
913
+ 7300 train 6.567333 (lr=1.2915e-05) (hash(x)=51546861)
914
+ 7700 val loss 6.7805
915
+ 7700 val perplexity 880.5395
916
+ 7700 train 6.506702 (lr=1.5181e-05) (hash(x)=48853311)
917
+ 7700 val loss 6.5914
918
+ 7700 val perplexity 728.8103
919
+ 7700 train 6.309133 (lr=4.3375e-06) (hash(x)=48853311)
920
+ 7600 val loss 6.3745
921
+ 7600 val perplexity 586.7179
922
+ 7600 train 6.329422 (lr=6.8023e-06) (hash(x)=49942165)
923
+ 7400 val loss 6.5734
924
+ 7400 val perplexity 715.8018
925
+ 7400 train 6.580335 (lr=1.2373e-05) (hash(x)=48320948)
926
+ 7800 val loss 6.7783
927
+ 7800 val perplexity 878.5657
928
+ 7800 train 6.677202 (lr=1.4514e-05) (hash(x)=48510117)
929
+ 7800 val loss 6.5876
930
+ 7800 val perplexity 726.0670
931
+ 7800 train 6.485681 (lr=4.1470e-06) (hash(x)=48510117)
932
+ 7700 val loss 6.3719
933
+ 7700 val perplexity 585.1765
934
+ 7700 train 6.077825 (lr=6.5062e-06) (hash(x)=48853311)
935
+ 7500 val loss 6.5774
936
+ 7500 val perplexity 718.6973
937
+ 7500 train 6.505441 (lr=1.1847e-05) (hash(x)=40167457)
938
+ 7900 val loss 6.7788
939
+ 7900 val perplexity 879.0137
940
+ 7900 train 6.777487 (lr=1.3872e-05) (hash(x)=48339781)
941
+ 7900 val loss 6.5867
942
+ 7900 val perplexity 725.3773
943
+ 7900 train 6.586193 (lr=3.9635e-06) (hash(x)=48339781)
944
+ 7800 val loss 6.3706
945
+ 7800 val perplexity 584.4119
946
+ 7800 train 6.258163 (lr=6.2205e-06) (hash(x)=48510117)
947
+ 7600 val loss 6.5672
948
+ 7600 val perplexity 711.3448
949
+ 7600 train 6.511395 (lr=1.1337e-05) (hash(x)=49942165)
950
+ 8000 val loss 6.7733
951
+ 8000 val perplexity 874.1851
952
+ 8000 train 6.894838 (lr=1.3255e-05) (hash(x)=54927320)
953
+ 8000 val loss 6.5813
954
+ 8000 val perplexity 721.4551
955
+ 8000 train 6.694423 (lr=3.7873e-06) (hash(x)=54927320)
956
+ 7900 val loss 6.3693
957
+ 7900 val perplexity 583.6351
958
+ 7900 train 6.374780 (lr=5.9453e-06) (hash(x)=48339781)
959
+ 7700 val loss 6.5615
960
+ 7700 val perplexity 707.3499
961
+ 7700 train 6.275876 (lr=1.0844e-05) (hash(x)=48853311)
962
+ 8100 val loss 6.7714
963
+ 8100 val perplexity 872.5671
964
+ 8100 train 6.479921 (lr=1.2665e-05) (hash(x)=46461786)
965
+ 8100 val loss 6.5788
966
+ 8100 val perplexity 719.7039
967
+ 8100 train 6.297609 (lr=3.6184e-06) (hash(x)=46461786)
968
+ 8000 val loss 6.3640
969
+ 8000 val perplexity 580.5896
970
+ 8000 train 6.444406 (lr=5.6809e-06) (hash(x)=54927320)
971
+ 7800 val loss 6.5596
972
+ 7800 val perplexity 705.9629
973
+ 7800 train 6.453285 (lr=1.0367e-05) (hash(x)=48510117)
974
+ 8200 val loss 6.7708
975
+ 8200 val perplexity 872.0380
976
+ 8200 train 6.699470 (lr=1.2100e-05) (hash(x)=51536260)
977
+ 8200 val loss 6.5759
978
+ 8200 val perplexity 717.6141
979
+ 8200 train 6.503042 (lr=3.4572e-06) (hash(x)=51536260)
980
+ 8100 val loss 6.3618
981
+ 8100 val perplexity 579.2977
982
+ 8100 train 6.076931 (lr=5.4277e-06) (hash(x)=46461786)
983
+ 7900 val loss 6.5584
984
+ 7900 val perplexity 705.1194
985
+ 7900 train 6.559155 (lr=9.9088e-06) (hash(x)=48339781)
986
+ 8300 val loss 6.7694
987
+ 8300 val perplexity 870.7827
988
+ 8300 train 6.520802 (lr=1.1563e-05) (hash(x)=44770722)
989
+ 8300 val loss 6.5741
990
+ 8300 val perplexity 716.2737
991
+ 8300 train 6.306182 (lr=3.3037e-06) (hash(x)=44770722)
992
+ 8200 val loss 6.3588
993
+ 8200 val perplexity 577.5776
994
+ 8200 train 6.273705 (lr=5.1858e-06) (hash(x)=51536260)
995
+ 8400 val loss 6.7649
996
+ 8400 val perplexity 866.8718
997
+ 8400 train 6.764847 (lr=1.1053e-05) (hash(x)=50104957)
998
+ 8000 val loss 6.5552
999
+ 8000 val perplexity 702.9102
1000
+ 8000 train 6.663426 (lr=9.4682e-06) (hash(x)=54927320)
1001
+ 8400 val loss 6.5708
1002
+ 8400 val perplexity 713.9655
1003
+ 8400 train 6.575215 (lr=3.1581e-06) (hash(x)=50104957)
1004
+ 8300 val loss 6.3586
1005
+ 8300 val perplexity 577.4501
1006
+ 8300 train 6.086116 (lr=4.9556e-06) (hash(x)=44770722)
1007
+ 8500 val loss 6.7612
1008
+ 8500 val perplexity 863.7002
1009
+ 8500 train 6.898350 (lr=1.0572e-05) (hash(x)=50132971)
1010
+ 8100 val loss 6.5496
1011
+ 8100 val perplexity 698.9459
1012
+ 8100 train 6.264082 (lr=9.0461e-06) (hash(x)=46461786)
1013
+ 8500 val loss 6.5681
1014
+ 8500 val perplexity 711.9845
1015
+ 8500 train 6.696193 (lr=3.0206e-06) (hash(x)=50132971)
1016
+ 8400 val loss 6.3557
1017
+ 8400 val perplexity 575.7655
1018
+ 8400 train 6.362148 (lr=4.7372e-06) (hash(x)=50104957)
1019
+ 8600 val loss 6.7565
1020
+ 8600 val perplexity 859.6239
1021
+ 8600 train 6.720913 (lr=1.0119e-05) (hash(x)=52193699)
1022
+ 8200 val loss 6.5469
1023
+ 8200 val perplexity 697.1107
1024
+ 8200 train 6.480287 (lr=8.6430e-06) (hash(x)=51536260)
1025
+ 8600 val loss 6.5642
1026
+ 8600 val perplexity 709.2699
1027
+ 8600 train 6.535054 (lr=2.8913e-06) (hash(x)=52193699)
1028
+ 8500 val loss 6.3527
1029
+ 8500 val perplexity 574.0456
1030
+ 8500 train 6.457326 (lr=4.5309e-06) (hash(x)=50132971)
1031
+ 8700 val loss 6.7556
1032
+ 8700 val perplexity 858.8803
1033
+ 8700 train 6.753688 (lr=9.6960e-06) (hash(x)=47902319)
1034
+ 8300 val loss 6.5480
1035
+ 8300 val perplexity 697.8151
1036
+ 8300 train 6.279721 (lr=8.2593e-06) (hash(x)=44770722)
1037
+ 8700 val loss 6.5619
1038
+ 8700 val perplexity 707.6201
1039
+ 8700 train 6.558250 (lr=2.7703e-06) (hash(x)=47902319)
1040
+ 8600 val loss 6.3497
1041
+ 8600 val perplexity 572.3204
1042
+ 8600 train 6.325282 (lr=4.3369e-06) (hash(x)=52193699)
1043
+ 8800 val loss 6.7519
1044
+ 8800 val perplexity 855.7248
1045
+ 8800 train 7.023129 (lr=9.3021e-06) (hash(x)=54904230)
1046
+ 8400 val loss 6.5447
1047
+ 8400 val perplexity 695.5720
1048
+ 8400 train 6.552617 (lr=7.8953e-06) (hash(x)=50104957)
1049
+ 8800 val loss 6.5575
1050
+ 8800 val perplexity 704.4779
1051
+ 8800 train 6.865329 (lr=2.6577e-06) (hash(x)=54904230)
1052
+ 8700 val loss 6.3472
1053
+ 8700 val perplexity 570.8699
1054
+ 8700 train 6.359556 (lr=4.1554e-06) (hash(x)=47902319)
1055
+ 8900 val loss 6.7474
1056
+ 8900 val perplexity 851.8157
1057
+ 8900 train 6.624123 (lr=8.9382e-06) (hash(x)=46311615)
1058
+ 8500 val loss 6.5417
1059
+ 8500 val perplexity 693.4591
1060
+ 8500 train 6.659644 (lr=7.5515e-06) (hash(x)=50132971)
1061
+ 8900 val loss 6.5562
1062
+ 8900 val perplexity 703.5956
1063
+ 8900 train 6.433506 (lr=2.5538e-06) (hash(x)=46311615)
1064
+ 8800 val loss 6.3426
1065
+ 8800 val perplexity 568.2990
1066
+ 8800 train 6.660896 (lr=3.9866e-06) (hash(x)=54904230)
1067
+ 9000 val loss 6.7487
1068
+ 9000 val perplexity 852.9575
1069
+ 9000 train 6.639927 (lr=8.6047e-06) (hash(x)=48535188)
1070
+ 8600 val loss 6.5351
1071
+ 8600 val perplexity 688.9059
1072
+ 8600 train 6.505647 (lr=7.2282e-06) (hash(x)=52193699)
1073
+ 9000 val loss 6.5569
1074
+ 9000 val perplexity 704.0800
1075
+ 9000 train 6.438684 (lr=2.4585e-06) (hash(x)=48535188)
1076
+ 8900 val loss 6.3420
1077
+ 8900 val perplexity 567.9246
1078
+ 8900 train 6.216720 (lr=3.8307e-06) (hash(x)=46311615)
1079
+ 9100 val loss 6.7458
1080
+ 9100 val perplexity 850.4387
1081
+ 9100 train 6.777208 (lr=8.3020e-06) (hash(x)=51757372)
1082
+ 8700 val loss 6.5346
1083
+ 8700 val perplexity 688.5260
1084
+ 8700 train 6.549554 (lr=6.9257e-06) (hash(x)=47902319)
1085
+ 9100 val loss 6.5527
1086
+ 9100 val perplexity 701.1675
1087
+ 9100 train 6.597699 (lr=2.3720e-06) (hash(x)=51757372)
1088
+ 9000 val loss 6.3423
1089
+ 9000 val perplexity 568.1190
1090
+ 9000 train 6.216728 (lr=3.6877e-06) (hash(x)=48535188)
1091
+ 9200 val loss 6.7434
1092
+ 9200 val perplexity 848.4285
1093
+ 9200 train 6.608101 (lr=8.0302e-06) (hash(x)=51131708)
1094
+ 9200 val loss 6.5506
1095
+ 9200 val perplexity 699.6799
1096
+ 9200 train 6.395917 (lr=2.2943e-06) (hash(x)=51131708)
1097
+ 8800 val loss 6.5294
1098
+ 8800 val perplexity 684.9847
1099
+ 8800 train 6.834278 (lr=6.6444e-06) (hash(x)=54904230)
1100
+ 9100 val loss 6.3364
1101
+ 9100 val perplexity 564.7474
1102
+ 9100 train 6.369995 (lr=3.5580e-06) (hash(x)=51757372)
1103
+ 9300 val loss 6.7413
1104
+ 9300 val perplexity 846.6523
1105
+ 9300 train 6.670128 (lr=7.7898e-06) (hash(x)=44784276)
1106
+ 9300 val loss 6.5488
1107
+ 9300 val perplexity 698.4379
1108
+ 9300 train 6.492771 (lr=2.2256e-06) (hash(x)=44784276)
1109
+ 8900 val loss 6.5279
1110
+ 8900 val perplexity 683.9315
1111
+ 8900 train 6.411209 (lr=6.3845e-06) (hash(x)=46311615)
1112
+ 9200 val loss 6.3365
1113
+ 9200 val perplexity 564.8207
1114
+ 9200 train 6.164150 (lr=3.4415e-06) (hash(x)=51131708)
1115
+ 9400 val loss 6.7413
1116
+ 9400 val perplexity 846.6563
1117
+ 9400 train 6.825796 (lr=7.5809e-06) (hash(x)=51981169)
1118
+ 9400 val loss 6.5481
1119
+ 9400 val perplexity 697.9249
1120
+ 9400 train 6.641167 (lr=2.1660e-06) (hash(x)=51981169)
1121
+ 9000 val loss 6.5335
1122
+ 9000 val perplexity 687.7910
1123
+ 9000 train 6.425806 (lr=6.1462e-06) (hash(x)=48535188)
1124
+ 9300 val loss 6.3350
1125
+ 9300 val perplexity 563.9686
1126
+ 9300 train 6.284737 (lr=3.3385e-06) (hash(x)=44784276)
1127
+ 9500 val loss 6.7437
1128
+ 9500 val perplexity 848.6773
1129
+ 9500 train 6.667314 (lr=7.4038e-06) (hash(x)=47232936)
1130
+ 9500 val loss 6.5462
1131
+ 9500 val perplexity 696.6165
1132
+ 9500 train 6.484810 (lr=2.1154e-06) (hash(x)=47232936)
1133
+ 9100 val loss 6.5242
1134
+ 9100 val perplexity 681.4025
1135
+ 9100 train 6.559541 (lr=5.9300e-06) (hash(x)=51757372)
1136
+ 9400 val loss 6.3345
1137
+ 9400 val perplexity 563.6874
1138
+ 9400 train 6.435622 (lr=3.2490e-06) (hash(x)=51981169)
1139
+ 9600 val loss 6.7411
1140
+ 9600 val perplexity 846.4912
1141
+ 9600 train 6.770916 (lr=7.2586e-06) (hash(x)=53800450)
1142
+ 9600 val loss 6.5455
1143
+ 9600 val perplexity 696.0746
1144
+ 9600 train 6.573491 (lr=2.0739e-06) (hash(x)=53800450)
1145
+ 9200 val loss 6.5233
1146
+ 9200 val perplexity 680.8189
1147
+ 9200 train 6.369421 (lr=5.7359e-06) (hash(x)=51131708)
1148
+ 9500 val loss 6.3333
1149
+ 9500 val perplexity 563.0075
1150
+ 9500 train 6.274373 (lr=3.1730e-06) (hash(x)=47232936)
1151
+ 9700 val loss 6.7405
1152
+ 9700 val perplexity 845.9715
1153
+ 9700 train 6.864671 (lr=7.1456e-06) (hash(x)=55768123)
1154
+ 9700 val loss 6.5442
1155
+ 9700 val perplexity 695.2151
1156
+ 9700 train 6.679401 (lr=2.0416e-06) (hash(x)=55768123)
1157
+ 9300 val loss 6.5223
1158
+ 9300 val perplexity 680.1148
1159
+ 9300 train 6.468675 (lr=5.5641e-06) (hash(x)=44784276)
1160
+ 9600 val loss 6.3314
1161
+ 9600 val perplexity 561.9553
1162
+ 9600 train 6.342242 (lr=3.1108e-06) (hash(x)=53800450)
1163
+ 9800 val loss 6.7390
1164
+ 9800 val perplexity 844.7344
1165
+ 9800 train 6.674843 (lr=7.0647e-06) (hash(x)=47745177)
1166
+ 9800 val loss 6.5437
1167
+ 9800 val perplexity 694.8572
1168
+ 9800 train 6.458412 (lr=2.0185e-06) (hash(x)=47745177)
1169
+ 9400 val loss 6.5186
1170
+ 9400 val perplexity 677.6028
1171
+ 9400 train 6.622711 (lr=5.4149e-06) (hash(x)=51981169)
1172
+ 9700 val loss 6.3310
1173
+ 9700 val perplexity 561.7244
1174
+ 9700 train 6.463716 (lr=3.0624e-06) (hash(x)=55768123)
1175
+ 9900 val loss 6.7386
1176
+ 9900 val perplexity 844.3973
1177
+ 9900 train 6.993019 (lr=7.0162e-06) (hash(x)=56592246)
1178
+ 9900 val loss 6.5427
1179
+ 9900 val perplexity 694.1486
1180
+ 9900 train 6.819444 (lr=2.0046e-06) (hash(x)=56592246)
1181
+ 9500 val loss 6.5166
1182
+ 9500 val perplexity 676.2491
1183
+ 9500 train 6.458758 (lr=5.2884e-06) (hash(x)=47232936)
1184
+ 9999 val loss 6.7382
1185
+ 9999 val perplexity 844.0145
1186
+ 9800 val loss 6.3319
1187
+ 9800 val perplexity 562.2196
1188
+ 9800 train 6.251424 (lr=3.0277e-06) (hash(x)=47745177)
1189
+ 9999 val loss 6.5435
1190
+ 9999 val perplexity 694.7287
1191
+ 9600 val loss 6.5156
1192
+ 9600 val perplexity 675.5681
1193
+ 9600 train 6.540743 (lr=5.1847e-06) (hash(x)=53800450)
1194
+ 9900 val loss 6.3312
1195
+ 9900 val perplexity 561.8361
1196
+ 9900 train 6.614131 (lr=3.0069e-06) (hash(x)=56592246)
1197
+ 9999 val loss 6.3315
1198
+ 9999 val perplexity 562.0116
1199
+ 9700 val loss 6.5144
1200
+ 9700 val perplexity 674.7742
1201
+ 9700 train 6.652847 (lr=5.1040e-06) (hash(x)=55768123)
1202
+ 9800 val loss 6.5141
1203
+ 9800 val perplexity 674.5625
1204
+ 9800 train 6.437059 (lr=5.0462e-06) (hash(x)=47745177)
1205
+ 9900 val loss 6.5129
1206
+ 9900 val perplexity 673.7457
1207
+ 9900 train 6.774715 (lr=5.0116e-06) (hash(x)=56592246)
1208
+ 9999 val loss 6.5129
1209
+ 9999 val perplexity 673.7534
attention_kindselective_n_heads8_seed1340/model_02500.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:546196196f4e135fa42acf649a491e59d4e404903294df6008574f8fcb529fcc
3
+ size 257976706
attention_kindselective_n_heads8_seed1340/model_05000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e0d6f68d89b38b064adf1c87e26bc401ffd6c6c1f9e6dc751711747053f53c2
3
+ size 257976706
attention_kindselective_n_heads8_seed1340/model_07500.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5836d089233e1744dd72efd8c6b025464fc61c5cbee97c1610b275dd0c748e5
3
+ size 257976706
attention_kindselective_n_heads8_seed1340/model_09999.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:803e7f1d779f4eaa96237848c494b11c7ea9c19ab7ee06466b575e3a7025fff8
3
+ size 257976706
attention_kindselective_n_heads8_seed1340/optimizer_02500.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47e117f9e735979d69333522e6bf577f699bf362355432d8a8d55ee77a8e102a
3
+ size 509672838
attention_kindselective_n_heads8_seed1340/optimizer_05000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dc9e23aa59ecd4e79f1695ce1f36b3fbb68e7d1c9daabc7bb7ab239cb5ec48f
3
+ size 509672838
attention_kindselective_n_heads8_seed1340/optimizer_07500.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7b908f20aa5a501633bcacffab720c0cbaea5aa7a8a0dfc45110899b03c5827
3
+ size 509672838
attention_kindselective_n_heads8_seed1340/optimizer_09999.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3f6b50490bbd4ce73aeef1748bd4d1d083b3dd237165be68b2a0cc0f109a719
3
+ size 509672838