andrew-healey committed
Commit 4c77236 · verified · 1 Parent(s): 64967b4

Upload folder using huggingface_hub

Files changed (26)
  1. attention_kindselective_n_heads4_seed1340/args.json +1 -1
  2. attention_kindselective_n_heads4_seed1340/dataloader_10000.pt +3 -0
  3. attention_kindselective_n_heads4_seed1340/dataloader_12500.pt +3 -0
  4. attention_kindselective_n_heads4_seed1340/dataloader_42500.pt +3 -0
  5. attention_kindselective_n_heads4_seed1340/dataloader_45000.pt +3 -0
  6. attention_kindselective_n_heads4_seed1340/dataloader_47500.pt +3 -0
  7. attention_kindselective_n_heads4_seed1340/dataloader_49999.pt +3 -0
  8. attention_kindselective_n_heads4_seed1340/log2.txt +1111 -590
  9. attention_kindselective_n_heads4_seed1340/model_02500.pt +1 -1
  10. attention_kindselective_n_heads4_seed1340/model_05000.pt +1 -1
  11. attention_kindselective_n_heads4_seed1340/model_07500.pt +1 -1
  12. attention_kindselective_n_heads4_seed1340/model_10000.pt +3 -0
  13. attention_kindselective_n_heads4_seed1340/model_12500.pt +3 -0
  14. attention_kindselective_n_heads4_seed1340/model_42500.pt +3 -0
  15. attention_kindselective_n_heads4_seed1340/model_45000.pt +3 -0
  16. attention_kindselective_n_heads4_seed1340/model_47500.pt +3 -0
  17. attention_kindselective_n_heads4_seed1340/model_49999.pt +3 -0
  18. attention_kindselective_n_heads4_seed1340/optimizer_02500.pt +1 -1
  19. attention_kindselective_n_heads4_seed1340/optimizer_05000.pt +1 -1
  20. attention_kindselective_n_heads4_seed1340/optimizer_07500.pt +1 -1
  21. attention_kindselective_n_heads4_seed1340/optimizer_10000.pt +3 -0
  22. attention_kindselective_n_heads4_seed1340/optimizer_12500.pt +3 -0
  23. attention_kindselective_n_heads4_seed1340/optimizer_42500.pt +3 -0
  24. attention_kindselective_n_heads4_seed1340/optimizer_45000.pt +3 -0
  25. attention_kindselective_n_heads4_seed1340/optimizer_47500.pt +3 -0
  26. attention_kindselective_n_heads4_seed1340/optimizer_49999.pt +3 -0
attention_kindselective_n_heads4_seed1340/args.json CHANGED
@@ -1 +1 @@
- {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_6/attention_kindselective_n_heads4_seed1340", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_6", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1340, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 0.0001, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "10e-5_10240_4_1340", "n_embd": 256}
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_7/attention_kindselective_n_heads4_seed1340", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 50000, "warmup_steps": 200, "group": "wider_is_better_7", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1340, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 7e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "7e-5_10240_4_1340", "n_embd": 256}
attention_kindselective_n_heads4_seed1340/dataloader_10000.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f3858f6c832feea78a674d8c5c384061cc7d4f22cddbd0a2be6de33bc91e2c72
+ size 964
attention_kindselective_n_heads4_seed1340/dataloader_12500.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ab3779d33c2e0a7873fcd8c39402e44260740665950323ad1445480ec339965a
+ size 964
attention_kindselective_n_heads4_seed1340/dataloader_42500.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bf6d24c78d89100d146bce9f26be940db3d71092473d9b55db97d6b35531eac2
+ size 964
attention_kindselective_n_heads4_seed1340/dataloader_45000.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:22bb90b43d81f3da5454f91a70e1ed29aeb2f470a727ce38390ff8a5c4924889
+ size 964
attention_kindselective_n_heads4_seed1340/dataloader_47500.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:55507725e6988f190e4963078652fafa6b68e8d4f79221387612612babf3e1c1
+ size 964
attention_kindselective_n_heads4_seed1340/dataloader_49999.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:47776cddb8021172f048a950b83f25b692cb340214b800ce3837c15ceb58907c
+ size 964
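The *.pt files added in this commit are tracked with Git LFS, so the diff records only the pointer file (spec version, sha256 oid, byte size) rather than the tensors themselves. A minimal sketch for fetching and inspecting the real payload with huggingface_hub and torch; the repo id is a placeholder and the checkpoint layout is an assumption:

import torch
from huggingface_hub import hf_hub_download

# Placeholder repo id: substitute the Hugging Face repo this commit belongs to.
REPO_ID = "<user-or-org>/<repo-name>"
FILENAME = "attention_kindselective_n_heads4_seed1340/model_49999.pt"

# hf_hub_download resolves the LFS pointer and returns a local cached path.
local_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

# Load on CPU; whether this is a raw state_dict or a wrapper dict is not specified here.
ckpt = torch.load(local_path, map_location="cpu")
print(type(ckpt), list(ckpt)[:5] if isinstance(ckpt, dict) else ckpt)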
attention_kindselective_n_heads4_seed1340/log2.txt CHANGED
@@ -1,592 +1,1113 @@
1
- max_steps: 10000
2
  0 val loss 11.2703
3
  0 val perplexity 78458.0234
4
- 0 train 11.272942 (lr=7.5000e-07) (hash(x)=57791809)
5
- 100 val loss 9.1994
6
- 100 val perplexity 9891.5127
7
- 100 train 9.216334 (lr=7.5750e-05) (hash(x)=48211824)
8
- 200 val loss 7.7349
9
- 200 val perplexity 2286.7070
10
- 200 train 7.673902 (lr=1.5000e-04) (hash(x)=50375849)
11
- 0 train 11.272942 (lr=5.0000e-07) (hash(x)=57791809)
12
- 300 val loss 7.6076
13
- 300 val perplexity 2013.5125
14
- 300 train 7.925827 (lr=1.4997e-04) (hash(x)=57250808)
15
- 100 val loss 9.5575
16
- 100 val perplexity 14150.3975
17
- 100 train 9.571983 (lr=5.0500e-05) (hash(x)=48211824)
18
- 400 val loss 7.5727
19
- 400 val perplexity 1944.4767
20
- 400 train 8.180161 (lr=1.4986e-04) (hash(x)=62519858)
21
- 200 val loss 8.0717
22
- 200 val perplexity 3202.6396
23
- 200 train 8.018360 (lr=1.0000e-04) (hash(x)=50375849)
24
- 500 val loss 7.5053
25
- 500 val perplexity 1817.6418
26
- 500 train 7.404214 (lr=1.4969e-04) (hash(x)=47226806)
27
- 300 val loss 7.6542
28
- 300 val perplexity 2109.4519
29
- 300 train 7.955753 (lr=9.9977e-05) (hash(x)=57250808)
30
- 600 val loss 7.4856
31
- 600 val perplexity 1782.2722
32
- 600 train 7.516066 (lr=1.4945e-04) (hash(x)=51149322)
33
- 400 val loss 7.6014
34
- 400 val perplexity 2001.0934
35
- 400 train 8.229440 (lr=9.9908e-05) (hash(x)=62519858)
36
- 700 val loss 7.4777
37
- 700 val perplexity 1768.1790
38
- 700 train 7.494768 (lr=1.4913e-04) (hash(x)=51564551)
39
- 500 val loss 7.5660
40
- 500 val perplexity 1931.4121
41
- 500 train 7.453595 (lr=9.9792e-05) (hash(x)=47226806)
42
- 800 val loss 7.4734
43
- 800 val perplexity 1760.6666
44
- 800 train 7.244524 (lr=1.4876e-04) (hash(x)=45093459)
45
- 600 val loss 7.5616
46
- 600 val perplexity 1922.8881
47
- 600 train 7.599857 (lr=9.9631e-05) (hash(x)=51149322)
48
- 900 val loss 7.4478
49
- 900 val perplexity 1716.0348
50
- 900 train 7.690024 (lr=1.4831e-04) (hash(x)=54988361)
51
- 700 val loss 7.5504
52
- 700 val perplexity 1901.4637
53
- 700 train 7.545293 (lr=9.9423e-05) (hash(x)=51564551)
54
- 1000 val loss 7.4670
55
- 1000 val perplexity 1749.4158
56
- 1000 train 7.367063 (lr=1.4779e-04) (hash(x)=47588648)
57
- 800 val loss 7.5201
58
- 800 val perplexity 1844.7793
59
- 800 train 7.278911 (lr=9.9170e-05) (hash(x)=45093459)
60
- 1100 val loss 7.4328
61
- 1100 val perplexity 1690.5156
62
- 1100 train 7.082927 (lr=1.4721e-04) (hash(x)=37984588)
63
- 900 val loss 7.5207
64
- 900 val perplexity 1845.7920
65
- 900 train 7.809628 (lr=9.8872e-05) (hash(x)=54988361)
66
- 1200 val loss 7.3818
67
- 1200 val perplexity 1606.4569
68
- 1200 train 7.558693 (lr=1.4656e-04) (hash(x)=56333817)
69
- 1000 val loss 7.5099
70
- 1000 val perplexity 1825.9989
71
- 1000 train 7.391204 (lr=9.8528e-05) (hash(x)=47588648)
72
- 1300 val loss 7.3642
73
- 1300 val perplexity 1578.5209
74
- 1300 train 7.491827 (lr=1.4585e-04) (hash(x)=53454056)
75
- 1100 val loss 7.4699
76
- 1100 val perplexity 1754.4908
77
- 1100 train 7.058946 (lr=9.8140e-05) (hash(x)=37984588)
78
- 1400 val loss 7.3266
79
- 1400 val perplexity 1520.1290
80
- 1400 train 7.428581 (lr=1.4507e-04) (hash(x)=55284163)
81
- 1200 val loss 7.4695
82
- 1200 val perplexity 1753.7841
83
- 1200 train 7.647787 (lr=9.7708e-05) (hash(x)=56333817)
84
- 1500 val loss 7.3234
85
- 1500 val perplexity 1515.3085
86
- 1500 train 7.260311 (lr=1.4422e-04) (hash(x)=48162598)
87
- 1300 val loss 7.4999
88
- 1300 val perplexity 1807.8933
89
- 1300 train 7.617690 (lr=9.7231e-05) (hash(x)=53454056)
90
- 1600 val loss 7.2709
91
- 1600 val perplexity 1437.7860
92
- 1600 train 7.330367 (lr=1.4332e-04) (hash(x)=54214535)
93
- 1400 val loss 7.4034
94
- 1400 val perplexity 1641.5452
95
- 1400 train 7.515925 (lr=9.6711e-05) (hash(x)=55284163)
96
- 1700 val loss 7.3486
97
- 1700 val perplexity 1554.0712
98
- 1700 train 7.561377 (lr=1.4235e-04) (hash(x)=53525003)
99
- 1500 val loss 7.3976
100
- 1500 val perplexity 1631.9938
101
- 1500 train 7.301088 (lr=9.6149e-05) (hash(x)=48162598)
102
- 1800 val loss 7.2820
103
- 1800 val perplexity 1453.8403
104
- 1800 train 7.356642 (lr=1.4131e-04) (hash(x)=51848994)
105
- 1600 val loss 7.4057
106
- 1600 val perplexity 1645.3561
107
- 1600 train 7.469926 (lr=9.5544e-05) (hash(x)=54214535)
108
- 1900 val loss 7.3030
109
- 1900 val perplexity 1484.7439
110
- 1900 train 7.137829 (lr=1.4022e-04) (hash(x)=48405987)
111
- 1700 val loss 7.3512
112
- 1700 val perplexity 1558.1410
113
- 1700 train 7.569810 (lr=9.4897e-05) (hash(x)=53525003)
114
- 2000 val loss 7.2506
115
- 2000 val perplexity 1409.0001
116
- 2000 train 7.602566 (lr=1.3907e-04) (hash(x)=58592291)
117
- 1800 val loss 7.3285
118
- 1800 val perplexity 1523.0544
119
- 1800 train 7.415604 (lr=9.4209e-05) (hash(x)=51848994)
120
- 2100 val loss 7.2532
121
- 2100 val perplexity 1412.6349
122
- 2100 train 7.341368 (lr=1.3786e-04) (hash(x)=51167081)
123
- 1900 val loss 7.3569
124
- 1900 val perplexity 1566.9419
125
- 1900 train 7.183960 (lr=9.3481e-05) (hash(x)=48405987)
126
- 2200 val loss 7.2590
127
- 2200 val perplexity 1420.7731
128
- 2200 train 7.260748 (lr=1.3660e-04) (hash(x)=47994988)
129
- 2000 val loss 7.3472
130
- 2000 val perplexity 1551.8170
131
- 2000 train 7.699927 (lr=9.2714e-05) (hash(x)=58592291)
132
- 2300 val loss 7.2521
133
- 2300 val perplexity 1411.0697
134
- 2300 train 7.250330 (lr=1.3527e-04) (hash(x)=47377604)
135
- 2100 val loss 7.3025
136
- 2100 val perplexity 1483.9901
137
- 2100 train 7.402580 (lr=9.1908e-05) (hash(x)=51167081)
138
- 2400 val loss 7.2556
139
- 2400 val perplexity 1416.0265
140
- 2400 train 7.304438 (lr=1.3390e-04) (hash(x)=53554323)
141
- 2200 val loss 7.3241
142
- 2200 val perplexity 1516.3789
143
- 2200 train 7.340111 (lr=9.1064e-05) (hash(x)=47994988)
144
- 2500 val loss 7.3073
145
- 2500 val perplexity 1491.1608
146
- 2500 train 7.313736 (lr=1.3247e-04) (hash(x)=50780417)
147
- 2300 val loss 7.2707
148
- 2300 val perplexity 1437.5735
149
- 2300 train 7.255850 (lr=9.0182e-05) (hash(x)=47377604)
150
- 2600 val loss 7.2764
151
- 2600 val perplexity 1445.7822
152
- 2600 train 7.182763 (lr=1.3099e-04) (hash(x)=46453562)
153
- 2400 val loss 7.2498
154
- 2400 val perplexity 1407.8685
155
- 2400 train 7.306433 (lr=8.9265e-05) (hash(x)=53554323)
156
- 2700 val loss 7.2792
157
- 2700 val perplexity 1449.8409
158
- 2700 train 7.287231 (lr=1.2946e-04) (hash(x)=54404221)
159
- 2500 val loss 7.2853
160
- 2500 val perplexity 1458.7178
161
- 2500 train 7.310247 (lr=8.8313e-05) (hash(x)=50780417)
162
- 2800 val loss 7.2837
163
- 2800 val perplexity 1456.3575
164
- 2800 train 7.949651 (lr=1.2788e-04) (hash(x)=59318895)
165
- 2600 val loss 7.2457
166
- 2600 val perplexity 1402.0936
167
- 2600 train 7.150898 (lr=8.7326e-05) (hash(x)=46453562)
168
- 2900 val loss 7.2333
169
- 2900 val perplexity 1384.8267
170
- 2900 train 7.221962 (lr=1.2626e-04) (hash(x)=47845760)
171
- 2700 val loss 7.2470
172
- 2700 val perplexity 1403.8638
173
- 2700 train 7.253963 (lr=8.6306e-05) (hash(x)=54404221)
174
- 3000 val loss 7.2471
175
- 3000 val perplexity 1403.9862
176
- 3000 train 6.979670 (lr=1.2459e-04) (hash(x)=44336167)
177
- 2800 val loss 7.2707
178
- 2800 val perplexity 1437.6050
179
- 2800 train 7.978097 (lr=8.5254e-05) (hash(x)=59318895)
180
- 3100 val loss 7.2154
181
- 3100 val perplexity 1360.2776
182
- 3100 train 7.447736 (lr=1.2287e-04) (hash(x)=44479330)
183
- 2900 val loss 7.2657
184
- 2900 val perplexity 1430.3938
185
- 2900 train 7.266902 (lr=8.4170e-05) (hash(x)=47845760)
186
- 3200 val loss 7.2325
187
- 3200 val perplexity 1383.7072
188
- 3200 train 7.285394 (lr=1.2112e-04) (hash(x)=54593096)
189
- 3000 val loss 7.2402
190
- 3000 val perplexity 1394.3390
191
- 3000 train 6.979912 (lr=8.3057e-05) (hash(x)=44336167)
192
- 3300 val loss 7.2123
193
- 3300 val perplexity 1356.0752
194
- 3300 train 7.084377 (lr=1.1932e-04) (hash(x)=45347643)
195
- 3100 val loss 7.2144
196
- 3100 val perplexity 1358.8521
197
- 3100 train 7.504636 (lr=8.1915e-05) (hash(x)=44479330)
198
- 3400 val loss 7.2006
199
- 3400 val perplexity 1340.2240
200
- 3400 train 7.304337 (lr=1.1749e-04) (hash(x)=47797247)
201
- 3200 val loss 7.2444
202
- 3200 val perplexity 1400.2949
203
- 3200 train 7.282361 (lr=8.0745e-05) (hash(x)=54593096)
204
- 3500 val loss 7.1473
205
- 3500 val perplexity 1270.7017
206
- 3500 train 7.050789 (lr=1.1562e-04) (hash(x)=46115683)
207
- 3300 val loss 7.2259
208
- 3300 val perplexity 1374.5964
209
- 3300 train 7.102276 (lr=7.9549e-05) (hash(x)=45347643)
210
- 3600 val loss 7.1462
211
- 3600 val perplexity 1269.2581
212
- 3600 train 6.955878 (lr=1.1372e-04) (hash(x)=44502074)
213
- 3400 val loss 7.2068
214
- 3400 val perplexity 1348.5294
215
- 3400 train 7.334606 (lr=7.8328e-05) (hash(x)=47797247)
216
- 3700 val loss 7.1355
217
- 3700 val perplexity 1255.8024
218
- 3700 train 7.249683 (lr=1.1179e-04) (hash(x)=55388443)
219
- 3500 val loss 7.1718
220
- 3500 val perplexity 1302.1456
221
- 3500 train 7.073598 (lr=7.7082e-05) (hash(x)=46115683)
222
- 3800 val loss 7.1516
223
- 3800 val perplexity 1276.1625
224
- 3800 train 6.977693 (lr=1.0982e-04) (hash(x)=43790341)
225
- 3600 val loss 7.1680
226
- 3600 val perplexity 1297.1829
227
- 3600 train 7.016492 (lr=7.5814e-05) (hash(x)=44502074)
228
- 3900 val loss 7.1724
229
- 3900 val perplexity 1302.9469
230
- 3900 train 7.196667 (lr=1.0783e-04) (hash(x)=50013318)
231
- 3700 val loss 7.1518
232
- 3700 val perplexity 1276.4393
233
- 3700 train 7.289806 (lr=7.4525e-05) (hash(x)=55388443)
234
- 4000 val loss 7.1715
235
- 4000 val perplexity 1301.7539
236
- 4000 train 7.247557 (lr=1.0581e-04) (hash(x)=51704787)
237
- 3800 val loss 7.1593
238
- 3800 val perplexity 1286.0034
239
- 3800 train 6.950498 (lr=7.3215e-05) (hash(x)=43790341)
240
- 4100 val loss 7.1315
241
- 4100 val perplexity 1250.7228
242
- 4100 train 7.198262 (lr=1.0377e-04) (hash(x)=50821964)
243
- 3900 val loss 7.1414
244
- 3900 val perplexity 1263.2340
245
- 3900 train 7.136201 (lr=7.1887e-05) (hash(x)=50013318)
246
- 4200 val loss 7.1552
247
- 4200 val perplexity 1280.7772
248
- 4200 train 7.200123 (lr=1.0171e-04) (hash(x)=49675080)
249
- 4000 val loss 7.1388
250
- 4000 val perplexity 1259.8586
251
- 4000 train 7.210222 (lr=7.0541e-05) (hash(x)=51704787)
252
- 4300 val loss 7.1326
253
- 4300 val perplexity 1252.0898
254
- 4300 train 6.829738 (lr=9.9622e-05) (hash(x)=43239281)
255
- 4100 val loss 7.1296
256
- 4100 val perplexity 1248.3210
257
- 4100 train 7.190423 (lr=6.9180e-05) (hash(x)=50821964)
258
- 4400 val loss 7.1795
259
- 4400 val perplexity 1312.2640
260
- 4400 train 6.915021 (lr=9.7520e-05) (hash(x)=45076737)
261
- 4200 val loss 7.1301
262
- 4200 val perplexity 1249.0374
263
- 4200 train 7.166576 (lr=6.7804e-05) (hash(x)=49675080)
264
- 4500 val loss 7.1361
265
- 4500 val perplexity 1256.5530
266
- 4500 train 7.226453 (lr=9.5403e-05) (hash(x)=57930262)
267
- 4300 val loss 7.1375
268
- 4300 val perplexity 1258.2251
269
- 4300 train 6.828071 (lr=6.6414e-05) (hash(x)=43239281)
270
- 4600 val loss 7.1293
271
- 4600 val perplexity 1247.9628
272
- 4600 train 6.932743 (lr=9.3273e-05) (hash(x)=46721614)
273
- 4400 val loss 7.1356
274
- 4400 val perplexity 1255.8755
275
- 4400 train 6.854441 (lr=6.5013e-05) (hash(x)=45076737)
276
- 4700 val loss 7.1447
277
- 4700 val perplexity 1267.3850
278
- 4700 train 7.003017 (lr=9.1132e-05) (hash(x)=49837920)
279
- 4500 val loss 7.1212
280
- 4500 val perplexity 1237.9576
281
- 4500 train 7.217383 (lr=6.3602e-05) (hash(x)=57930262)
282
- 4800 val loss 7.1396
283
- 4800 val perplexity 1260.9735
284
- 4800 train 7.256048 (lr=8.8982e-05) (hash(x)=48380045)
285
- 4600 val loss 7.1148
286
- 4600 val perplexity 1230.0862
287
- 4600 train 6.938160 (lr=6.2182e-05) (hash(x)=46721614)
288
- 4900 val loss 7.1166
289
- 4900 val perplexity 1232.3048
290
- 4900 train 7.003822 (lr=8.6825e-05) (hash(x)=44202577)
291
- 4700 val loss 7.1160
292
- 4700 val perplexity 1231.4736
293
- 4700 train 6.945326 (lr=6.0754e-05) (hash(x)=49837920)
294
- 5000 val loss 7.1251
295
- 5000 val perplexity 1242.7483
296
- 5000 train 7.123570 (lr=8.4663e-05) (hash(x)=52038024)
297
- 4800 val loss 7.1030
298
- 4800 val perplexity 1215.5854
299
- 4800 train 7.221812 (lr=5.9321e-05) (hash(x)=48380045)
300
- 5100 val loss 7.0881
301
- 5100 val perplexity 1197.5981
302
- 5100 train 7.271852 (lr=8.2500e-05) (hash(x)=53700038)
303
- 4900 val loss 7.0901
304
- 4900 val perplexity 1200.0150
305
- 4900 train 6.977787 (lr=5.7883e-05) (hash(x)=44202577)
306
- 5200 val loss 7.0702
307
- 5200 val perplexity 1176.3390
308
- 5200 train 7.041414 (lr=8.0337e-05) (hash(x)=48137625)
309
- 5000 val loss 7.0778
310
- 5000 val perplexity 1185.3610
311
- 5000 train 7.090936 (lr=5.6442e-05) (hash(x)=52038024)
312
- 5300 val loss 7.0610
313
- 5300 val perplexity 1165.6478
314
- 5300 train 6.929040 (lr=7.8175e-05) (hash(x)=43161573)
315
- 5100 val loss 7.0503
316
- 5100 val perplexity 1153.2191
317
- 5100 train 7.230675 (lr=5.5000e-05) (hash(x)=53700038)
318
- 5400 val loss 7.0882
319
- 5400 val perplexity 1197.7939
320
- 5400 train 7.294000 (lr=7.6018e-05) (hash(x)=56673322)
321
- 5200 val loss 7.0469
322
- 5200 val perplexity 1149.2523
323
- 5200 train 7.025224 (lr=5.3558e-05) (hash(x)=48137625)
324
- 5500 val loss 7.0674
325
- 5500 val perplexity 1173.1265
326
- 5500 train 7.230914 (lr=7.3868e-05) (hash(x)=53468295)
327
- 5300 val loss 7.0382
328
- 5300 val perplexity 1139.3370
329
- 5300 train 6.922906 (lr=5.2117e-05) (hash(x)=43161573)
330
- 5600 val loss 7.0512
331
- 5600 val perplexity 1154.2776
332
- 5600 train 7.362746 (lr=7.1727e-05) (hash(x)=59287280)
333
- 5400 val loss 7.0431
334
- 5400 val perplexity 1144.9668
335
- 5400 train 7.235172 (lr=5.0679e-05) (hash(x)=56673322)
336
- 5700 val loss 7.0557
337
- 5700 val perplexity 1159.4276
338
- 5700 train 7.293931 (lr=6.9597e-05) (hash(x)=57575806)
339
- 5500 val loss 7.0248
340
- 5500 val perplexity 1124.2102
341
- 5500 train 7.204517 (lr=4.9246e-05) (hash(x)=53468295)
342
- 5800 val loss 7.0477
343
- 5800 val perplexity 1150.1986
344
- 5800 train 6.968884 (lr=6.7480e-05) (hash(x)=46897279)
345
- 5600 val loss 7.0278
346
- 5600 val perplexity 1127.5576
347
- 5600 train 7.337178 (lr=4.7818e-05) (hash(x)=59287280)
348
- 5900 val loss 7.0581
349
- 5900 val perplexity 1162.2921
350
- 5900 train 6.934814 (lr=6.5378e-05) (hash(x)=47565679)
351
- 5700 val loss 7.0104
352
- 5700 val perplexity 1108.0684
353
- 5700 train 7.266252 (lr=4.6398e-05) (hash(x)=57575806)
354
- 5800 val loss 6.9975
355
- 5800 val perplexity 1093.8611
356
- 5800 train 6.937305 (lr=4.4987e-05) (hash(x)=46897279)
357
- 6000 val loss 7.0583
358
- 6000 val perplexity 1162.4668
359
- 6000 train 6.945346 (lr=6.3294e-05) (hash(x)=51590090)
360
- 5900 val loss 7.0068
361
- 5900 val perplexity 1104.0801
362
- 5900 train 6.872268 (lr=4.3586e-05) (hash(x)=47565679)
363
- 6100 val loss 7.0384
364
- 6100 val perplexity 1139.5461
365
- 6100 train 7.429985 (lr=6.1230e-05) (hash(x)=59732271)
366
- 6000 val loss 6.9759
367
- 6000 val perplexity 1070.5658
368
- 6000 train 6.855279 (lr=4.2196e-05) (hash(x)=51590090)
369
- 6200 val loss 7.0536
370
- 6200 val perplexity 1157.0466
371
- 6200 train 7.013299 (lr=5.9188e-05) (hash(x)=46394422)
372
- 6100 val loss 6.9696
373
- 6100 val perplexity 1063.7852
374
- 6100 train 7.359732 (lr=4.0820e-05) (hash(x)=59732271)
375
- 6200 val loss 6.9557
376
- 6200 val perplexity 1049.0975
377
- 6200 train 6.948706 (lr=3.9459e-05) (hash(x)=46394422)
378
- 6300 val loss 7.0336
379
- 6300 val perplexity 1134.1621
380
- 6300 train 7.057237 (lr=5.7169e-05) (hash(x)=53748145)
381
- 6300 val loss 6.9495
382
- 6300 val perplexity 1042.5876
383
- 6300 train 6.970834 (lr=3.8113e-05) (hash(x)=53748145)
384
- 6400 val loss 7.0264
385
- 6400 val perplexity 1125.9877
386
- 6400 train 6.902154 (lr=5.5177e-05) (hash(x)=46054751)
387
- 6400 val loss 6.9335
388
- 6400 val perplexity 1026.0587
389
- 6400 train 6.816382 (lr=3.6785e-05) (hash(x)=46054751)
390
- 6500 val loss 6.9250
391
- 6500 val perplexity 1017.3809
392
- 6500 train 7.130113 (lr=3.5475e-05) (hash(x)=51816809)
393
- 6500 val loss 7.0105
394
- 6500 val perplexity 1108.1730
395
- 6500 train 7.192453 (lr=5.3213e-05) (hash(x)=51816809)
396
- 6600 val loss 6.9167
397
- 6600 val perplexity 1009.0242
398
- 6600 train 6.753559 (lr=3.4186e-05) (hash(x)=52453336)
399
- 6600 val loss 7.0129
400
- 6600 val perplexity 1110.8755
401
- 6600 train 6.860138 (lr=5.1279e-05) (hash(x)=52453336)
402
- 6700 val loss 6.9028
403
- 6700 val perplexity 995.0874
404
- 6700 train 6.934293 (lr=3.2918e-05) (hash(x)=49108775)
405
- 6800 val loss 6.8958
406
- 6800 val perplexity 988.1391
407
- 6800 train 6.812268 (lr=3.1672e-05) (hash(x)=46745396)
408
- 6700 val loss 6.9999
409
- 6700 val perplexity 1096.5558
410
- 6700 train 7.014852 (lr=4.9377e-05) (hash(x)=49108775)
411
- 6900 val loss 6.8921
412
- 6900 val perplexity 984.4801
413
- 6900 train 7.014542 (lr=3.0451e-05) (hash(x)=46534986)
414
- 6800 val loss 7.0024
415
- 6800 val perplexity 1099.2440
416
- 6800 train 6.900960 (lr=4.7509e-05) (hash(x)=46745396)
417
- 7000 val loss 6.8785
418
- 7000 val perplexity 971.1417
419
- 7000 train 7.170157 (lr=2.9255e-05) (hash(x)=49317888)
420
- 7100 val loss 6.8711
421
- 7100 val perplexity 964.0015
422
- 7100 train 6.891702 (lr=2.8085e-05) (hash(x)=50360484)
423
- 6900 val loss 6.9860
424
- 6900 val perplexity 1081.3929
425
- 6900 train 7.101790 (lr=4.5676e-05) (hash(x)=46534986)
426
- 7200 val loss 6.8633
427
- 7200 val perplexity 956.5050
428
- 7200 train 6.759152 (lr=2.6943e-05) (hash(x)=49515094)
429
- 7000 val loss 6.9851
430
- 7000 val perplexity 1080.4318
431
- 7000 train 7.236263 (lr=4.3882e-05) (hash(x)=49317888)
432
- 7300 val loss 6.8535
433
- 7300 val perplexity 947.2141
434
- 7300 train 6.831221 (lr=2.5830e-05) (hash(x)=51546861)
435
- 7400 val loss 6.8538
436
- 7400 val perplexity 947.4413
437
- 7400 train 6.841997 (lr=2.4746e-05) (hash(x)=48320948)
438
- 7100 val loss 6.9737
439
- 7100 val perplexity 1068.2135
440
- 7100 train 6.995534 (lr=4.2128e-05) (hash(x)=50360484)
441
- 7500 val loss 6.8490
442
- 7500 val perplexity 942.9482
443
- 7500 train 6.744160 (lr=2.3694e-05) (hash(x)=40167457)
444
- 7200 val loss 6.9736
445
- 7200 val perplexity 1068.0883
446
- 7200 train 6.880153 (lr=4.0414e-05) (hash(x)=49515094)
447
- 7600 val loss 6.8423
448
- 7600 val perplexity 936.6636
449
- 7600 train 6.787523 (lr=2.2674e-05) (hash(x)=49942165)
450
- 7300 val loss 6.9733
451
- 7300 val perplexity 1067.7426
452
- 7300 train 6.945283 (lr=3.8745e-05) (hash(x)=51546861)
453
- 7700 val loss 6.8377
454
- 7700 val perplexity 932.3729
455
- 7700 train 6.593207 (lr=2.1687e-05) (hash(x)=48853311)
456
- 7800 val loss 6.8349
457
- 7800 val perplexity 929.7131
458
- 7800 train 6.757367 (lr=2.0735e-05) (hash(x)=48510117)
459
- 7400 val loss 6.9641
460
- 7400 val perplexity 1058.0121
461
- 7400 train 6.963182 (lr=3.7120e-05) (hash(x)=48320948)
462
- 7900 val loss 6.8322
463
- 7900 val perplexity 927.1918
464
- 7900 train 6.843838 (lr=1.9818e-05) (hash(x)=48339781)
465
- 7500 val loss 6.9680
466
- 7500 val perplexity 1062.0619
467
- 7500 train 6.844945 (lr=3.5541e-05) (hash(x)=40167457)
468
- 8000 val loss 6.8184
469
- 8000 val perplexity 914.4979
470
- 8000 train 6.935864 (lr=1.8936e-05) (hash(x)=54927320)
471
- 8100 val loss 6.8138
472
- 8100 val perplexity 910.3555
473
- 8100 train 6.570134 (lr=1.8092e-05) (hash(x)=46461786)
474
- 7600 val loss 6.9606
475
- 7600 val perplexity 1054.3075
476
- 7600 train 6.885594 (lr=3.4011e-05) (hash(x)=49942165)
477
- 8200 val loss 6.8039
478
- 8200 val perplexity 901.3185
479
- 8200 train 6.741070 (lr=1.7286e-05) (hash(x)=51536260)
480
- 7700 val loss 6.9648
481
- 7700 val perplexity 1058.6656
482
- 7700 train 6.718952 (lr=3.2531e-05) (hash(x)=48853311)
483
- 8300 val loss 6.7965
484
- 8300 val perplexity 894.6973
485
- 8300 train 6.555057 (lr=1.6519e-05) (hash(x)=44770722)
486
- 8400 val loss 6.7932
487
- 8400 val perplexity 891.7589
488
- 8400 train 6.784442 (lr=1.5791e-05) (hash(x)=50104957)
489
- 7800 val loss 6.9621
490
- 7800 val perplexity 1055.7982
491
- 7800 train 6.868794 (lr=3.1102e-05) (hash(x)=48510117)
492
- 8500 val loss 6.7882
493
- 8500 val perplexity 887.2971
494
- 8500 train 6.884346 (lr=1.5103e-05) (hash(x)=50132971)
495
- 7900 val loss 6.9607
496
- 7900 val perplexity 1054.3367
497
- 7900 train 6.978685 (lr=2.9726e-05) (hash(x)=48339781)
498
- 8600 val loss 6.7788
499
- 8600 val perplexity 879.0137
500
- 8600 train 6.735719 (lr=1.4456e-05) (hash(x)=52193699)
501
- 8700 val loss 6.7764
502
- 8700 val perplexity 876.9025
503
- 8700 train 6.774114 (lr=1.3851e-05) (hash(x)=47902319)
504
- 8000 val loss 6.9558
505
- 8000 val perplexity 1049.2601
506
- 8000 train 7.068865 (lr=2.8405e-05) (hash(x)=54927320)
507
- 8800 val loss 6.7649
508
- 8800 val perplexity 866.8619
509
- 8800 train 7.029649 (lr=1.3289e-05) (hash(x)=54904230)
510
- 8100 val loss 6.9537
511
- 8100 val perplexity 1047.0311
512
- 8100 train 6.719138 (lr=2.7138e-05) (hash(x)=46461786)
513
- 8900 val loss 6.7611
514
- 8900 val perplexity 863.6302
515
- 8900 train 6.677911 (lr=1.2769e-05) (hash(x)=46311615)
516
- 9000 val loss 6.7601
517
- 9000 val perplexity 862.7271
518
- 9000 train 6.630702 (lr=1.2292e-05) (hash(x)=48535188)
519
- 8200 val loss 6.9487
520
- 8200 val perplexity 1041.8079
521
- 8200 train 6.892978 (lr=2.5929e-05) (hash(x)=51536260)
522
- 9100 val loss 6.7496
523
- 9100 val perplexity 853.6826
524
- 9100 train 6.778772 (lr=1.1860e-05) (hash(x)=51757372)
525
- 8300 val loss 6.9447
526
- 8300 val perplexity 1037.6498
527
- 8300 train 6.711561 (lr=2.4778e-05) (hash(x)=44770722)
528
- 9200 val loss 6.7470
529
- 9200 val perplexity 851.5067
530
- 9200 train 6.636959 (lr=1.1472e-05) (hash(x)=51131708)
531
- 8400 val loss 6.9416
532
- 8400 val perplexity 1034.3774
533
- 8400 train 6.938999 (lr=2.3686e-05) (hash(x)=50104957)
534
- 9300 val loss 6.7415
535
- 9300 val perplexity 846.8267
536
- 9300 train 6.686966 (lr=1.1128e-05) (hash(x)=44784276)
537
- 8500 val loss 6.9362
538
- 8500 val perplexity 1028.8685
539
- 8500 train 7.034618 (lr=2.2655e-05) (hash(x)=50132971)
540
- 9400 val loss 6.7376
541
- 9400 val perplexity 843.5647
542
- 9400 train 6.831891 (lr=1.0830e-05) (hash(x)=51981169)
543
- 8600 val loss 6.9281
544
- 8600 val perplexity 1020.5430
545
- 8600 train 6.879009 (lr=2.1685e-05) (hash(x)=52193699)
546
- 9500 val loss 6.7318
547
- 9500 val perplexity 838.6479
548
- 9500 train 6.684929 (lr=1.0577e-05) (hash(x)=47232936)
549
- 8700 val loss 6.9265
550
- 8700 val perplexity 1018.9719
551
- 8700 train 6.928772 (lr=2.0777e-05) (hash(x)=47902319)
552
- 9600 val loss 6.7301
553
- 9600 val perplexity 837.1972
554
- 9600 train 6.762636 (lr=1.0369e-05) (hash(x)=53800450)
555
- 8800 val loss 6.9224
556
- 8800 val perplexity 1014.7075
557
- 8800 train 7.177705 (lr=1.9933e-05) (hash(x)=54904230)
558
- 9700 val loss 6.7257
559
- 9700 val perplexity 833.5361
560
- 9700 train 6.842899 (lr=1.0208e-05) (hash(x)=55768123)
561
- 8900 val loss 6.9157
562
- 8900 val perplexity 1008.0081
563
- 8900 train 6.824413 (lr=1.9153e-05) (hash(x)=46311615)
564
- 9800 val loss 6.7242
565
- 9800 val perplexity 832.2679
566
- 9800 train 6.656127 (lr=1.0092e-05) (hash(x)=47745177)
567
- 9900 val loss 6.7249
568
- 9900 val perplexity 832.9302
569
- 9900 train 6.977684 (lr=1.0023e-05) (hash(x)=56592246)
570
- 9000 val loss 6.9129
571
- 9000 val perplexity 1005.1110
572
- 9000 train 6.765456 (lr=1.8439e-05) (hash(x)=48535188)
573
- 9999 val loss 6.7242
574
- 9999 val perplexity 832.3088
575
- 9100 val loss 6.9100
576
- 9100 val perplexity 1002.2089
577
- 9100 train 6.928776 (lr=1.7790e-05) (hash(x)=51757372)
578
- 9200 val loss 6.9081
579
- 9200 val perplexity 1000.3215
580
- 9200 train 6.806732 (lr=1.7208e-05) (hash(x)=51131708)
581
- 9300 val loss 6.9059
582
- 9300 val perplexity 998.1198
583
- 9300 train 6.841065 (lr=1.6692e-05) (hash(x)=44784276)
584
- 9400 val loss 6.9072
585
- 9400 val perplexity 999.4905
586
- 9400 train 6.993570 (lr=1.6245e-05) (hash(x)=51981169)
587
- 9500 val loss 6.9063
588
- 9500 val perplexity 998.5016
589
- 9500 train 6.858905 (lr=1.5865e-05) (hash(x)=47232936)
590
- 9600 val loss 6.9042
591
- 9600 val perplexity 996.4511
592
- 9600 train 6.943208 (lr=1.5554e-05) (hash(x)=53800450)
1
+ max_steps: 50000
2
+ 1100 val loss 7.3709
3
+ 1100 val perplexity 1589.1416
4
+ 1100 train 6.997883 (lr=9.9927e-05) (hash(x)=37984588)
5
+ 40100 val loss 5.8536
6
+ 40100 val perplexity 348.4711
7
+ 40100 train 5.793257 (lr=9.2472e-06) (hash(x)=48035170)
8
+ 1200 val loss 7.3289
9
+ 1200 val perplexity 1523.7263
10
+ 1200 train 7.504585 (lr=9.9910e-05) (hash(x)=56333817)
11
  0 val loss 11.2703
12
  0 val perplexity 78458.0234
13
+ 1300 val loss 7.3412
14
+ 1300 val perplexity 1542.5471
15
+ 1300 train 7.483638 (lr=9.9892e-05) (hash(x)=53454056)
16
+ 40200 val loss 5.8481
17
+ 40200 val perplexity 346.5657
18
+ 40200 train 5.675584 (lr=9.1646e-06) (hash(x)=46651322)
19
+ 1400 val loss 7.2560
20
+ 1400 val perplexity 1416.6073
21
+ 1400 train 7.362457 (lr=9.9871e-05) (hash(x)=55284163)
22
+ 40300 val loss 5.8508
23
+ 40300 val perplexity 347.5022
24
+ 40300 train 5.707502 (lr=9.0827e-06) (hash(x)=46378099)
25
+ 1500 val loss 7.3036
26
+ 1500 val perplexity 1485.6129
27
+ 1500 train 7.158344 (lr=9.9849e-05) (hash(x)=48162598)
28
+ 0 train 11.272942 (lr=3.5000e-07) (hash(x)=57791809)
29
+ 1600 val loss 7.2907
30
+ 1600 val perplexity 1466.6249
31
+ 1600 train 7.350899 (lr=9.9825e-05) (hash(x)=54214535)
32
+ 40400 val loss 5.8443
33
+ 40400 val perplexity 345.2604
34
+ 40400 train 5.696771 (lr=9.0015e-06) (hash(x)=46495438)
35
+ 100 val loss 9.7259
36
+ 100 val perplexity 16745.7324
37
+ 100 train 9.735422 (lr=3.5350e-05) (hash(x)=48211824)
38
+ 1700 val loss 7.2683
39
+ 1700 val perplexity 1434.0831
40
+ 1700 train 7.474521 (lr=9.9799e-05) (hash(x)=53525003)
41
+ 200 val loss 8.1014
42
+ 200 val perplexity 3298.9370
43
+ 200 train 8.058278 (lr=7.0000e-05) (hash(x)=50375849)
44
+ 40500 val loss 5.8388
45
+ 40500 val perplexity 343.3772
46
+ 40500 train 5.996032 (lr=8.9211e-06) (hash(x)=52059147)
47
+ 1800 val loss 7.2337
48
+ 1800 val perplexity 1385.3571
49
+ 1800 train 7.303545 (lr=9.9771e-05) (hash(x)=51848994)
50
+ 300 val loss 7.6442
51
+ 300 val perplexity 2088.4370
52
+ 300 train 7.951274 (lr=6.9999e-05) (hash(x)=57250808)
53
+ 400 val loss 7.5686
54
+ 400 val perplexity 1936.4075
55
+ 400 train 8.221130 (lr=6.9997e-05) (hash(x)=62519858)
56
+ 40600 val loss 5.8407
57
+ 40600 val perplexity 344.0050
58
+ 40600 train 5.996709 (lr=8.8414e-06) (hash(x)=54885045)
59
+ 1900 val loss 7.2157
60
+ 1900 val perplexity 1360.6837
61
+ 1900 train 7.053372 (lr=9.9741e-05) (hash(x)=48405987)
62
+ 500 val loss 7.5147
63
+ 500 val perplexity 1834.7661
64
+ 500 train 7.395103 (lr=6.9994e-05) (hash(x)=47226806)
65
+ 40700 val loss 5.8398
66
+ 40700 val perplexity 343.7109
67
+ 40700 train 6.230294 (lr=8.7624e-06) (hash(x)=53213971)
68
+ 600 val loss 7.4733
69
+ 600 val perplexity 1760.4728
70
+ 600 train 7.511261 (lr=6.9990e-05) (hash(x)=51149322)
71
+ 2000 val loss 7.1997
72
+ 2000 val perplexity 1338.9938
73
+ 2000 train 7.547995 (lr=9.9710e-05) (hash(x)=58592291)
74
+ 700 val loss 7.4403
75
+ 700 val perplexity 1703.2408
76
+ 700 train 7.431039 (lr=6.9984e-05) (hash(x)=51564551)
77
+ 40800 val loss 5.8359
78
+ 40800 val perplexity 342.3692
79
+ 40800 train 5.498599 (lr=8.6842e-06) (hash(x)=45133794)
80
+ 2100 val loss 7.1929
81
+ 2100 val perplexity 1329.9009
82
+ 2100 train 7.272621 (lr=9.9677e-05) (hash(x)=51167081)
83
+ 800 val loss 7.4083
84
+ 800 val perplexity 1649.6486
85
+ 800 train 7.160203 (lr=6.9977e-05) (hash(x)=45093459)
86
+ 40900 val loss 5.8379
87
+ 40900 val perplexity 343.0574
88
+ 40900 train 5.988767 (lr=8.6068e-06) (hash(x)=56546547)
89
+ 900 val loss 7.3864
90
+ 900 val perplexity 1613.9653
91
+ 900 train 7.685310 (lr=6.9969e-05) (hash(x)=54988361)
92
+ 2200 val loss 7.1644
93
+ 2200 val perplexity 1292.5675
94
+ 2200 train 7.169672 (lr=9.9642e-05) (hash(x)=47994988)
95
+ 1000 val loss 7.3699
96
+ 1000 val perplexity 1587.5004
97
+ 1000 train 7.238439 (lr=6.9960e-05) (hash(x)=47588648)
98
+ 41000 val loss 5.8354
99
+ 41000 val perplexity 342.1882
100
+ 41000 train 6.221378 (lr=8.5301e-06) (hash(x)=49552270)
101
+ 2300 val loss 7.1801
102
+ 2300 val perplexity 1313.0338
103
+ 2300 train 7.161005 (lr=9.9606e-05) (hash(x)=47377604)
104
+ 1100 val loss 7.3479
105
+ 1100 val perplexity 1552.9474
106
+ 1100 train 6.955181 (lr=6.9949e-05) (hash(x)=37984588)
107
+ 1200 val loss 7.3201
108
+ 1200 val perplexity 1510.3381
109
+ 1200 train 7.489902 (lr=6.9937e-05) (hash(x)=56333817)
110
+ 41100 val loss 5.8354
111
+ 41100 val perplexity 342.2155
112
+ 41100 train 6.067726 (lr=8.4541e-06) (hash(x)=51222370)
113
+ 2400 val loss 7.1777
114
+ 2400 val perplexity 1309.8896
115
+ 2400 train 7.223081 (lr=9.9567e-05) (hash(x)=53554323)
116
+ 1300 val loss 7.3031
117
+ 1300 val perplexity 1484.8395
118
+ 1300 train 7.442085 (lr=6.9924e-05) (hash(x)=53454056)
119
+ 41200 val loss 5.8369
120
+ 41200 val perplexity 342.7026
121
+ 41200 train 5.903095 (lr=8.3789e-06) (hash(x)=50883608)
122
+ 2500 val loss 7.1743
123
+ 2500 val perplexity 1305.3995
124
+ 2500 train 7.199438 (lr=9.9527e-05) (hash(x)=50780417)
125
+ 1400 val loss 7.2946
126
+ 1400 val perplexity 1472.2584
127
+ 1400 train 7.407144 (lr=6.9910e-05) (hash(x)=55284163)
128
+ 1500 val loss 7.3019
129
+ 1500 val perplexity 1483.1653
130
+ 1500 train 7.143060 (lr=6.9894e-05) (hash(x)=48162598)
131
+ 41300 val loss 5.8382
132
+ 41300 val perplexity 343.1662
133
+ 41300 train 6.058548 (lr=8.3045e-06) (hash(x)=52996828)
134
+ 2600 val loss 7.1702
135
+ 2600 val perplexity 1300.0759
136
+ 2600 train 7.071150 (lr=9.9485e-05) (hash(x)=46453562)
137
+ 1600 val loss 7.2579
138
+ 1600 val perplexity 1419.3342
139
+ 1600 train 7.321304 (lr=6.9877e-05) (hash(x)=54214535)
140
+ 41400 val loss 5.8361
141
+ 41400 val perplexity 342.4566
142
+ 41400 train 5.811955 (lr=8.2308e-06) (hash(x)=48822716)
143
+ 2700 val loss 7.2418
144
+ 2700 val perplexity 1396.6613
145
+ 2700 train 7.236431 (lr=9.9442e-05) (hash(x)=54404221)
146
+ 1700 val loss 7.2608
147
+ 1700 val perplexity 1423.4137
148
+ 1700 train 7.450740 (lr=6.9859e-05) (hash(x)=53525003)
149
+ 1800 val loss 7.2223
150
+ 1800 val perplexity 1369.6931
151
+ 1800 train 7.294693 (lr=6.9840e-05) (hash(x)=51848994)
152
+ 41500 val loss 5.8375
153
+ 41500 val perplexity 342.9267
154
+ 41500 train 5.816633 (lr=8.1579e-06) (hash(x)=50936577)
155
+ 2800 val loss 7.1606
156
+ 2800 val perplexity 1287.7007
157
+ 2800 train 7.852295 (lr=9.9396e-05) (hash(x)=59318895)
158
+ 1900 val loss 7.2239
159
+ 1900 val perplexity 1371.8162
160
+ 1900 train 7.046358 (lr=6.9819e-05) (hash(x)=48405987)
161
+ 41600 val loss 5.8377
162
+ 41600 val perplexity 343.0029
163
+ 41600 train 5.881047 (lr=8.0858e-06) (hash(x)=44375046)
164
+ 2900 val loss 7.1619
165
+ 2900 val perplexity 1289.3486
166
+ 2900 train 7.149612 (lr=9.9349e-05) (hash(x)=47845760)
167
+ 2000 val loss 7.2136
168
+ 2000 val perplexity 1357.7904
169
+ 2000 train 7.559543 (lr=6.9797e-05) (hash(x)=58592291)
170
+ 2100 val loss 7.2009
171
+ 2100 val perplexity 1340.6804
172
+ 2100 train 7.274922 (lr=6.9774e-05) (hash(x)=51167081)
173
+ 41700 val loss 5.8362
174
+ 41700 val perplexity 342.4806
175
+ 41700 train 5.599354 (lr=8.0144e-06) (hash(x)=44060021)
176
+ 3000 val loss 7.1622
177
+ 3000 val perplexity 1289.7507
178
+ 3000 train 6.873025 (lr=9.9300e-05) (hash(x)=44336167)
179
+ 2200 val loss 7.1883
180
+ 2200 val perplexity 1323.8024
181
+ 2200 train 7.174533 (lr=6.9750e-05) (hash(x)=47994988)
182
+ 41800 val loss 5.8358
183
+ 41800 val perplexity 342.3502
184
+ 41800 train 6.128627 (lr=7.9438e-06) (hash(x)=57765221)
185
+ 2300 val loss 7.2004
186
+ 2300 val perplexity 1340.0284
187
+ 3100 val loss 7.1277
188
+ 3100 val perplexity 1246.0415
189
+ 2300 train 7.185493 (lr=6.9724e-05) (hash(x)=47377604)
190
+ 3100 train 7.390228 (lr=9.9249e-05) (hash(x)=44479330)
191
+ 2400 val loss 7.1732
192
+ 2400 val perplexity 1303.9562
193
+ 2400 train 7.220889 (lr=6.9697e-05) (hash(x)=53554323)
194
+ 3200 val loss 7.1646
195
+ 3200 val perplexity 1292.8153
196
+ 3200 train 7.241447 (lr=9.9197e-05) (hash(x)=54593096)
197
+ 41900 val loss 5.8348
198
+ 41900 val perplexity 341.9940
199
+ 41900 train 5.597972 (lr=7.8740e-06) (hash(x)=46051470)
200
+ 2500 val loss 7.1729
201
+ 2500 val perplexity 1303.6392
202
+ 2500 train 7.214406 (lr=6.9669e-05) (hash(x)=50780417)
203
+ 3300 val loss 7.1778
204
+ 3300 val perplexity 1309.9883
205
+ 3300 train 7.088508 (lr=9.9142e-05) (hash(x)=45347643)
206
+ 2600 val loss 7.1263
207
+ 2600 val perplexity 1244.2609
208
+ 2600 train 7.028562 (lr=6.9640e-05) (hash(x)=46453562)
209
+ 42000 val loss 5.8415
210
+ 42000 val perplexity 344.2894
211
+ 42000 train 6.066402 (lr=7.8050e-06) (hash(x)=52077616)
212
+ 2700 val loss 7.1115
213
+ 2700 val perplexity 1225.9312
214
+ 2700 train 7.080199 (lr=6.9609e-05) (hash(x)=54404221)
215
+ 3400 val loss 7.1109
216
+ 3400 val perplexity 1225.2228
217
+ 3400 train 7.224405 (lr=9.9086e-05) (hash(x)=47797247)
218
+ 42100 val loss 5.8336
219
+ 42100 val perplexity 341.5976
220
+ 42100 train 5.685880 (lr=7.7368e-06) (hash(x)=47845199)
221
+ 2800 val loss 7.1089
222
+ 2800 val perplexity 1222.8175
223
+ 2800 train 7.750087 (lr=6.9577e-05) (hash(x)=59318895)
224
+ 2900 val loss 7.1042
225
+ 2900 val perplexity 1217.0992
226
+ 2900 train 7.102781 (lr=6.9544e-05) (hash(x)=47845760)
227
+ 3500 val loss 7.1129
228
+ 3500 val perplexity 1227.7335
229
+ 3500 train 7.017200 (lr=9.9028e-05) (hash(x)=46115683)
230
+ 42200 val loss 5.8312
231
+ 42200 val perplexity 340.7710
232
+ 42200 train 6.088301 (lr=7.6693e-06) (hash(x)=51549823)
233
+ 3000 val loss 7.0754
234
+ 3000 val perplexity 1182.5112
235
+ 3000 train 6.773137 (lr=6.9510e-05) (hash(x)=44336167)
236
+ 3600 val loss 7.1055
237
+ 3600 val perplexity 1218.6689
238
+ 3600 train 6.937795 (lr=9.8969e-05) (hash(x)=44502074)
239
+ 42300 val loss 5.8243
240
+ 42300 val perplexity 338.4359
241
+ 42300 train 5.965167 (lr=7.6027e-06) (hash(x)=56922131)
242
+ 3100 val loss 7.0511
243
+ 3100 val perplexity 1154.1840
244
+ 3100 train 7.331172 (lr=6.9474e-05) (hash(x)=44479330)
245
+ 3200 val loss 7.0365
246
+ 3200 val perplexity 1137.3547
247
+ 3200 train 7.123116 (lr=6.9438e-05) (hash(x)=54593096)
248
+ 3700 val loss 7.0926
249
+ 3700 val perplexity 1202.9988
250
+ 3700 train 7.232045 (lr=9.8908e-05) (hash(x)=55388443)
251
+ 42400 val loss 5.8262
252
+ 42400 val perplexity 339.0629
253
+ 42400 train 5.787869 (lr=7.5368e-06) (hash(x)=49004372)
254
+ 3300 val loss 7.0396
255
+ 3300 val perplexity 1140.9235
256
+ 3300 train 6.961339 (lr=6.9400e-05) (hash(x)=45347643)
257
+ 3800 val loss 7.0881
258
+ 3800 val perplexity 1197.5861
259
+ 3800 train 6.871030 (lr=9.8845e-05) (hash(x)=43790341)
260
+ 42500 val loss 5.8224
261
+ 42500 val perplexity 337.7739
262
+ 42500 train 5.810077 (lr=7.4717e-06) (hash(x)=50651839)
263
+ 3400 val loss 7.0278
264
+ 3400 val perplexity 1127.5500
265
+ 3400 train 7.154087 (lr=6.9360e-05) (hash(x)=47797247)
266
+ 3500 val loss 7.0127
267
+ 3500 val perplexity 1110.6456
268
+ 3500 train 6.888947 (lr=6.9320e-05) (hash(x)=46115683)
269
+ 3900 val loss 7.0927
270
+ 3900 val perplexity 1203.1331
271
+ 3900 train 7.089813 (lr=9.8780e-05) (hash(x)=50013318)
272
+ 42600 val loss 5.8226
273
+ 42600 val perplexity 337.8404
274
+ 42600 train 6.146226 (lr=7.4074e-06) (hash(x)=50767721)
275
+ 3600 val loss 6.9927
276
+ 3600 val perplexity 1088.6067
277
+ 3600 train 6.841239 (lr=6.9278e-05) (hash(x)=44502074)
278
+ 4000 val loss 7.0888
279
+ 4000 val perplexity 1198.4447
280
+ 4000 train 7.173865 (lr=9.8713e-05) (hash(x)=51704787)
281
+ 42700 val loss 5.8274
282
+ 42700 val perplexity 339.4667
283
+ 42700 train 5.559310 (lr=7.3440e-06) (hash(x)=49099183)
284
+ 3700 val loss 6.9963
285
+ 3700 val perplexity 1092.6323
286
+ 3700 train 7.130051 (lr=6.9235e-05) (hash(x)=55388443)
287
+ 3800 val loss 6.9835
288
+ 3800 val perplexity 1078.6748
289
+ 3800 train 6.767923 (lr=6.9191e-05) (hash(x)=43790341)
290
+ 4100 val loss 7.0791
291
+ 4100 val perplexity 1186.8921
292
+ 4100 train 7.136350 (lr=9.8645e-05) (hash(x)=50821964)
293
+ 42800 val loss 5.8205
294
+ 42800 val perplexity 337.1293
295
+ 42800 train 6.236982 (lr=7.2813e-06) (hash(x)=42272413)
296
+ 3900 val loss 6.9793
297
+ 3900 val perplexity 1074.1350
298
+ 3900 train 6.971794 (lr=6.9146e-05) (hash(x)=50013318)
299
+ 4200 val loss 7.1331
300
+ 4200 val perplexity 1252.7737
301
+ 4200 train 7.145490 (lr=9.8575e-05) (hash(x)=49675080)
302
+ 42900 val loss 5.8239
303
+ 42900 val perplexity 338.2816
304
+ 42900 train 5.616502 (lr=7.2194e-06) (hash(x)=48582863)
305
+ 4000 val loss 6.9700
306
+ 4000 val perplexity 1064.2661
307
+ 4000 train 7.038657 (lr=6.9099e-05) (hash(x)=51704787)
308
+ 4100 val loss 6.9672
309
+ 4100 val perplexity 1061.2119
310
+ 4100 train 7.042075 (lr=6.9051e-05) (hash(x)=50821964)
311
+ 4300 val loss 7.1120
312
+ 4300 val perplexity 1226.5977
313
+ 4300 train 6.766655 (lr=9.8503e-05) (hash(x)=43239281)
314
+ 43000 val loss 5.8217
315
+ 43000 val perplexity 337.5590
316
+ 43000 train 5.796019 (lr=7.1583e-06) (hash(x)=48703446)
317
+ 4200 val loss 6.9870
318
+ 4200 val perplexity 1082.4434
319
+ 4200 train 6.998270 (lr=6.9002e-05) (hash(x)=49675080)
320
+ 4400 val loss 7.1208
321
+ 4400 val perplexity 1237.4618
322
+ 4400 train 6.820117 (lr=9.8430e-05) (hash(x)=45076737)
323
+ 4300 val loss 6.9632
324
+ 4300 val perplexity 1056.9971
325
+ 4300 train 6.617874 (lr=6.8952e-05) (hash(x)=43239281)
326
+ 43100 val loss 5.8203
327
+ 43100 val perplexity 337.0722
328
+ 43100 train 5.743421 (lr=7.0981e-06) (hash(x)=48730321)
329
+ 4400 val loss 6.9505
330
+ 4400 val perplexity 1043.6844
331
+ 4400 train 6.636121 (lr=6.8901e-05) (hash(x)=45076737)
332
+ 4500 val loss 7.1026
333
+ 4500 val perplexity 1215.1410
334
+ 4500 train 7.188966 (lr=9.8355e-05) (hash(x)=57930262)
335
+ 43200 val loss 5.8184
336
+ 43200 val perplexity 336.4272
337
+ 43200 train 6.053661 (lr=7.0386e-06) (hash(x)=56536090)
338
+ 4500 val loss 6.9386
339
+ 4500 val perplexity 1031.2871
340
+ 4500 train 7.046677 (lr=6.8848e-05) (hash(x)=57930262)
341
+ 4600 val loss 7.0943
342
+ 4600 val perplexity 1205.0604
343
+ 4600 train 6.878264 (lr=9.8278e-05) (hash(x)=46721614)
344
+ 4600 val loss 6.9329
345
+ 4600 val perplexity 1025.4572
346
+ 4600 train 6.712116 (lr=6.8794e-05) (hash(x)=46721614)
347
+ 43300 val loss 5.8176
348
+ 43300 val perplexity 336.1527
349
+ 43300 train 5.962167 (lr=6.9800e-06) (hash(x)=54154116)
350
+ 4700 val loss 6.9091
351
+ 4700 val perplexity 1001.3547
352
+ 4700 train 6.706449 (lr=6.8739e-05) (hash(x)=49837920)
353
+ 4700 val loss 7.0600
354
+ 4700 val perplexity 1164.4540
355
+ 4700 train 6.883050 (lr=9.8199e-05) (hash(x)=49837920)
356
+ 4800 val loss 6.8954
357
+ 4800 val perplexity 987.7471
358
+ 4800 train 7.025270 (lr=6.8683e-05) (hash(x)=48380045)
359
+ 43400 val loss 5.8258
360
+ 43400 val perplexity 338.9396
361
+ 43400 train 5.723001 (lr=6.9222e-06) (hash(x)=50058055)
362
+ 4800 val loss 7.0530
363
+ 4800 val perplexity 1156.3136
364
+ 4900 val loss 6.8825
365
+ 4900 val perplexity 975.0323
366
+ 4800 train 7.163837 (lr=9.8119e-05) (hash(x)=48380045)
367
+ 4900 train 6.764085 (lr=6.8626e-05) (hash(x)=44202577)
368
+ 43500 val loss 5.8194
369
+ 43500 val perplexity 336.7863
370
+ 43500 train 5.749609 (lr=6.8652e-06) (hash(x)=48743802)
371
+ 5000 val loss 6.8525
372
+ 5000 val perplexity 946.2589
373
+ 5000 train 6.858996 (lr=6.8567e-05) (hash(x)=52038024)
374
+ 4900 val loss 7.0500
375
+ 4900 val perplexity 1152.8193
376
+ 4900 train 6.937270 (lr=9.8036e-05) (hash(x)=44202577)
377
+ 5100 val loss 6.8317
378
+ 5100 val perplexity 926.8068
379
+ 5100 train 7.012638 (lr=6.8507e-05) (hash(x)=53700038)
380
+ 43600 val loss 5.8188
381
+ 43600 val perplexity 336.5771
382
+ 43600 train 5.458486 (lr=6.8090e-06) (hash(x)=42792886)
383
+ 5200 val loss 6.8274
384
+ 5200 val perplexity 922.8022
385
+ 5200 train 6.814481 (lr=6.8446e-05) (hash(x)=48137625)
386
+ 5000 val loss 7.0187
387
+ 5000 val perplexity 1117.3700
388
+ 5000 train 7.044736 (lr=9.7953e-05) (hash(x)=52038024)
389
+ 43700 val loss 5.8163
390
+ 43700 val perplexity 335.7397
391
+ 43700 train 6.229331 (lr=6.7537e-06) (hash(x)=56446070)
392
+ 5300 val loss 6.8135
393
+ 5300 val perplexity 910.0942
394
+ 5300 train 6.670180 (lr=6.8384e-05) (hash(x)=43161573)
395
+ 5100 val loss 7.0031
396
+ 5100 val perplexity 1100.0636
397
+ 5100 train 7.189824 (lr=9.7867e-05) (hash(x)=53700038)
398
+ 5400 val loss 6.8088
399
+ 5400 val perplexity 905.7951
400
+ 5400 train 7.007874 (lr=6.8320e-05) (hash(x)=56673322)
401
+ 43800 val loss 5.8178
402
+ 43800 val perplexity 336.2392
403
+ 43800 train 5.653908 (lr=6.6992e-06) (hash(x)=45584354)
404
+ 5500 val loss 6.8175
405
+ 5500 val perplexity 913.7433
406
+ 5500 train 6.971771 (lr=6.8256e-05) (hash(x)=53468295)
407
+ 5200 val loss 6.9942
408
+ 5200 val perplexity 1090.2700
409
+ 5200 train 6.960814 (lr=9.7780e-05) (hash(x)=48137625)
410
+ 43900 val loss 5.8161
411
+ 43900 val perplexity 335.6672
412
+ 43900 train 5.664525 (lr=6.6455e-06) (hash(x)=49339253)
413
+ 5600 val loss 6.7675
414
+ 5600 val perplexity 869.1533
415
+ 5600 train 7.060785 (lr=6.8190e-05) (hash(x)=59287280)
416
+ 5300 val loss 6.9828
417
+ 5300 val perplexity 1077.8917
418
+ 5300 train 6.854216 (lr=9.7691e-05) (hash(x)=43161573)
419
+ 5700 val loss 6.7728
420
+ 5700 val perplexity 873.7137
421
+ 5700 train 7.033890 (lr=6.8123e-05) (hash(x)=57575806)
422
+ 44000 val loss 5.8158
423
+ 44000 val perplexity 335.5459
424
+ 44000 train 5.635589 (lr=6.5926e-06) (hash(x)=46183203)
425
+ 5800 val loss 6.7736
426
+ 5800 val perplexity 874.4873
427
+ 5800 train 6.723006 (lr=6.8055e-05) (hash(x)=46897279)
428
+ 5400 val loss 6.9617
429
+ 5400 val perplexity 1055.4479
430
+ 5400 train 7.139781 (lr=9.7600e-05) (hash(x)=56673322)
431
+ 44100 val loss 5.8135
432
+ 44100 val perplexity 334.7792
433
+ 44100 train 5.802389 (lr=6.5406e-06) (hash(x)=47849630)
434
+ 5900 val loss 6.7703
435
+ 5900 val perplexity 871.5712
436
+ 5900 train 6.636249 (lr=6.7985e-05) (hash(x)=47565679)
437
+ 5500 val loss 6.9375
438
+ 5500 val perplexity 1030.2126
439
+ 5500 train 7.112161 (lr=9.7508e-05) (hash(x)=53468295)
440
+ 6000 val loss 6.7804
441
+ 6000 val perplexity 880.4198
442
+ 6000 train 6.648599 (lr=6.7915e-05) (hash(x)=51590090)
443
+ 44200 val loss 5.8127
444
+ 44200 val perplexity 334.5341
445
+ 44200 train 6.146719 (lr=6.4894e-06) (hash(x)=49834275)
446
+ 6100 val loss 6.7618
447
+ 6100 val perplexity 864.2370
448
+ 6100 train 7.137344 (lr=6.7843e-05) (hash(x)=59732271)
449
+ 5600 val loss 6.9085
450
+ 5600 val perplexity 1000.7457
451
+ 5600 train 7.224924 (lr=9.7414e-05) (hash(x)=59287280)
452
+ 44300 val loss 5.8095
453
+ 44300 val perplexity 333.4532
454
+ 44300 train 6.254009 (lr=6.4390e-06) (hash(x)=62535257)
455
+ 6200 val loss 6.7642
456
+ 6200 val perplexity 866.2368
457
+ 6200 train 6.741118 (lr=6.7770e-05) (hash(x)=46394422)
458
+ 5700 val loss 6.9043
459
+ 5700 val perplexity 996.5828
460
+ 5700 train 7.165204 (lr=9.7318e-05) (hash(x)=57575806)
461
+ 6300 val loss 6.7599
462
+ 6300 val perplexity 862.5737
463
+ 6300 train 6.768298 (lr=6.7696e-05) (hash(x)=53748145)
464
+ 44400 val loss 5.8078
465
+ 44400 val perplexity 332.8983
466
+ 44400 train 5.866293 (lr=6.3895e-06) (hash(x)=49253957)
467
+ 6400 val loss 6.7398
468
+ 6400 val perplexity 845.4134
469
+ 6400 train 6.611226 (lr=6.7621e-05) (hash(x)=46054751)
470
+ 5800 val loss 6.8937
471
+ 5800 val perplexity 986.0107
472
+ 5800 train 6.843620 (lr=9.7221e-05) (hash(x)=46897279)
473
+ 6500 val loss 6.7447
474
+ 6500 val perplexity 849.5231
475
+ 6500 train 7.008954 (lr=6.7545e-05) (hash(x)=51816809)
476
+ 44500 val loss 5.8086
477
+ 44500 val perplexity 333.1573
478
+ 44500 train 5.787062 (lr=6.3408e-06) (hash(x)=55368339)
479
+ 5900 val loss 6.8839
480
+ 5900 val perplexity 976.4327
481
+ 5900 train 6.751657 (lr=9.7122e-05) (hash(x)=47565679)
482
+ 6600 val loss 6.7294
483
+ 6600 val perplexity 836.6632
484
+ 6600 train 6.619945 (lr=6.7467e-05) (hash(x)=52453336)
485
+ 44600 val loss 5.8085
486
+ 44600 val perplexity 333.1111
487
+ 44600 train 5.920068 (lr=6.2929e-06) (hash(x)=47098476)
488
+ 6700 val loss 6.7275
489
+ 6700 val perplexity 835.0275
490
+ 6700 train 6.752491 (lr=6.7389e-05) (hash(x)=49108775)
491
+ 6000 val loss 6.8816
492
+ 6000 val perplexity 974.1851
493
+ 6000 train 6.755291 (lr=9.7021e-05) (hash(x)=51590090)
494
+ 6800 val loss 6.7240
495
+ 6800 val perplexity 832.1128
496
+ 6800 train 6.615003 (lr=6.7309e-05) (hash(x)=46745396)
497
+ 44700 val loss 5.8119
498
+ 44700 val perplexity 334.2415
499
+ 44700 train 5.744551 (lr=6.2459e-06) (hash(x)=48280562)
500
+ 6100 val loss 6.8433
501
+ 6100 val perplexity 937.6213
502
+ 6100 train 7.231963 (lr=9.6919e-05) (hash(x)=59732271)
503
+ 6900 val loss 6.7129
504
+ 6900 val perplexity 822.9876
505
+ 6900 train 6.874073 (lr=6.7228e-05) (hash(x)=46534986)
506
+ 44800 val loss 5.8111
507
+ 44800 val perplexity 333.9803
508
+ 44800 train 6.081460 (lr=6.1998e-06) (hash(x)=55591638)
509
+ 7000 val loss 6.7105
510
+ 7000 val perplexity 821.0043
511
+ 7000 train 7.102075 (lr=6.7146e-05) (hash(x)=49317888)
512
+ 6200 val loss 6.8340
513
+ 6200 val perplexity 928.8552
514
+ 6200 train 6.807478 (lr=9.6815e-05) (hash(x)=46394422)
515
+ 7100 val loss 6.7149
516
+ 7100 val perplexity 824.5883
517
+ 7100 train 6.716461 (lr=6.7063e-05) (hash(x)=50360484)
518
+ 44900 val loss 5.8106
519
+ 44900 val perplexity 333.8185
520
+ 44900 train 5.963002 (lr=6.1545e-06) (hash(x)=53757748)
521
+ 6300 val loss 6.8275
522
+ 6300 val perplexity 922.9268
523
+ 6300 train 6.854854 (lr=9.6709e-05) (hash(x)=53748145)
524
+ 7200 val loss 6.7289
525
+ 7200 val perplexity 836.1858
526
+ 7200 train 6.611360 (lr=6.6978e-05) (hash(x)=49515094)
527
+ 45000 val loss 5.8083
528
+ 45000 val perplexity 333.0545
529
+ 45000 train 5.965748 (lr=6.1100e-06) (hash(x)=51685087)
530
+ 7300 val loss 6.7329
531
+ 7300 val perplexity 839.6050
532
+ 7300 train 6.724388 (lr=6.6893e-05) (hash(x)=51546861)
533
+ 6400 val loss 6.8335
534
+ 6400 val perplexity 928.4501
535
+ 6400 train 6.675262 (lr=9.6602e-05) (hash(x)=46054751)
536
+ 7400 val loss 6.7554
537
+ 7400 val perplexity 858.6415
538
+ 7400 train 6.747338 (lr=6.6806e-05) (hash(x)=48320948)
539
+ 45100 val loss 5.8066
540
+ 45100 val perplexity 332.4754
541
+ 45100 train 5.804672 (lr=6.0664e-06) (hash(x)=50093774)
542
+ 6500 val loss 6.8217
543
+ 6500 val perplexity 917.5349
544
+ 6500 train 7.011214 (lr=9.6493e-05) (hash(x)=51816809)
545
+ 7500 val loss 6.7581
546
+ 7500 val perplexity 861.0138
547
+ 7500 train 6.601406 (lr=6.6718e-05) (hash(x)=40167457)
548
+ 45200 val loss 5.8079
549
+ 45200 val perplexity 332.9253
550
+ 45200 train 5.531459 (lr=6.0237e-06) (hash(x)=43460450)
551
+ 7600 val loss 6.7487
552
+ 7600 val perplexity 852.9794
553
+ 7600 train 6.694093 (lr=6.6630e-05) (hash(x)=49942165)
554
+ 6600 val loss 6.8168
555
+ 6600 val perplexity 913.0578
556
+ 6600 train 6.709125 (lr=9.6382e-05) (hash(x)=52453336)
557
+ 7700 val loss 6.7365
558
+ 7700 val perplexity 842.6336
559
+ 7700 train 6.467313 (lr=6.6540e-05) (hash(x)=48853311)
560
+ 45300 val loss 5.8075
561
+ 45300 val perplexity 332.7967
562
+ 45300 train 6.013202 (lr=5.9818e-06) (hash(x)=49935488)
563
+ 6700 val loss 6.7775
564
+ 6700 val perplexity 877.8773
565
+ 6700 train 6.801557 (lr=9.6270e-05) (hash(x)=49108775)
566
+ 7800 val loss 6.6977
567
+ 7800 val perplexity 810.5665
568
+ 7800 train 6.587886 (lr=6.6448e-05) (hash(x)=48510117)
569
+ 7900 val loss 6.6830
570
+ 7900 val perplexity 798.6885
571
+ 7900 train 6.674704 (lr=6.6356e-05) (hash(x)=48339781)
572
+ 45400 val loss 5.8084
573
+ 45400 val perplexity 333.0822
574
+ 45400 train 5.796069 (lr=5.9407e-06) (hash(x)=49447929)
575
+ 6800 val loss 6.7620
576
+ 6800 val perplexity 864.3433
577
+ 6800 train 6.641977 (lr=9.6156e-05) (hash(x)=46745396)
578
+ 8000 val loss 6.6692
579
+ 8000 val perplexity 787.8011
580
+ 8000 train 6.791812 (lr=6.6263e-05) (hash(x)=54927320)
581
+ 45500 val loss 5.8073
582
+ 45500 val perplexity 332.7321
583
+ 45500 train 5.758606 (lr=5.9005e-06) (hash(x)=50713904)
584
+ 8100 val loss 6.6685
585
+ 8100 val perplexity 787.1943
586
+ 8100 train 6.383808 (lr=6.6169e-05) (hash(x)=46461786)
587
+ 6900 val loss 6.7623
588
+ 6900 val perplexity 864.6273
589
+ 6900 train 6.878799 (lr=9.6040e-05) (hash(x)=46534986)
590
+ 8200 val loss 6.6542
591
+ 8200 val perplexity 776.0621
592
+ 8200 train 6.567985 (lr=6.6073e-05) (hash(x)=51536260)
593
+ 45600 val loss 5.8281
594
+ 45600 val perplexity 339.7057
595
+ 45600 train 5.737741 (lr=5.8612e-06) (hash(x)=47674606)
596
+ 7000 val loss 6.7450
597
+ 7000 val perplexity 849.7970
598
+ 7000 train 7.090025 (lr=9.5923e-05) (hash(x)=49317888)
599
+ 8300 val loss 6.6445
600
+ 8300 val perplexity 768.5745
601
+ 8300 train 6.385070 (lr=6.5976e-05) (hash(x)=44770722)
602
+ 45700 val loss 5.8069
603
+ 45700 val perplexity 332.6016
604
+ 45700 train 5.718114 (lr=5.8227e-06) (hash(x)=51539617)
605
+ 8400 val loss 6.6291
606
+ 8400 val perplexity 756.7707
607
+ 8400 train 6.630779 (lr=6.5879e-05) (hash(x)=50104957)
608
+ 7100 val loss 6.7150
609
+ 7100 val perplexity 824.7158
610
+ 7100 train 6.732314 (lr=9.5804e-05) (hash(x)=50360484)
611
+ 8500 val loss 6.6186
612
+ 8500 val perplexity 748.8944
613
+ 8500 train 6.730096 (lr=6.5780e-05) (hash(x)=50132971)
614
+ 45800 val loss 5.8064
615
+ 45800 val perplexity 332.4121
616
+ 45800 train 5.777891 (lr=5.7851e-06) (hash(x)=44448785)
617
+ 7200 val loss 6.7163
618
+ 7200 val perplexity 825.7302
619
+ 7200 train 6.610737 (lr=9.5683e-05) (hash(x)=49515094)
620
+ 8600 val loss 6.6052
621
+ 8600 val perplexity 738.9441
622
+ 8600 train 6.559481 (lr=6.5680e-05) (hash(x)=52193699)
623
+ 45900 val loss 5.8067
624
+ 45900 val perplexity 332.5286
625
+ 45900 train 5.633348 (lr=5.7484e-06) (hash(x)=51499105)
626
+ 8700 val loss 6.5946
627
+ 8700 val perplexity 731.1027
628
+ 8700 train 6.601634 (lr=6.5579e-05) (hash(x)=47902319)
629
+ 7300 val loss 6.7274
630
+ 7300 val perplexity 834.9750
631
+ 7300 train 6.727911 (lr=9.5561e-05) (hash(x)=51546861)
632
+ 8800 val loss 6.5905
633
+ 8800 val perplexity 728.1722
634
+ 8800 train 6.874655 (lr=6.5477e-05) (hash(x)=54904230)
635
+ 46000 val loss 5.8019
636
+ 46000 val perplexity 330.9422
637
+ 46000 train 5.717524 (lr=5.7125e-06) (hash(x)=48359464)
638
+ 8900 val loss 6.5658
639
+ 8900 val perplexity 710.3594
640
+ 7400 val loss 6.7125
641
+ 7400 val perplexity 822.6318
642
+ 8900 train 6.456809 (lr=6.5374e-05) (hash(x)=46311615)
643
+ 7400 train 6.713262 (lr=9.5437e-05) (hash(x)=48320948)
644
+ 46100 val loss 5.8034
645
+ 46100 val perplexity 331.4209
646
+ 46100 train 5.856997 (lr=5.6775e-06) (hash(x)=51885986)
647
+ 9000 val loss 6.5762
648
+ 9000 val perplexity 717.8407
649
+ 9000 train 6.467047 (lr=6.5270e-05) (hash(x)=48535188)
650
+ 7500 val loss 6.7149
651
+ 7500 val perplexity 824.6104
652
+ 7500 train 6.552715 (lr=9.5312e-05) (hash(x)=40167457)
653
+ 9100 val loss 6.5482
654
+ 9100 val perplexity 697.9618
655
+ 9100 train 6.596004 (lr=6.5164e-05) (hash(x)=51757372)
656
+ 46200 val loss 5.8014
657
+ 46200 val perplexity 330.7549
658
+ 46200 train 6.350838 (lr=5.6434e-06) (hash(x)=65186615)
659
+ 9200 val loss 6.5645
660
+ 9200 val perplexity 709.4841
661
+ 9200 train 6.451322 (lr=6.5058e-05) (hash(x)=51131708)
662
+ 7600 val loss 6.6876
663
+ 7600 val perplexity 802.4150
664
+ 7600 train 6.612953 (lr=9.5185e-05) (hash(x)=49942165)
665
+ 9300 val loss 6.5492
666
+ 9300 val perplexity 698.7104
667
+ 9300 train 6.486413 (lr=6.4951e-05) (hash(x)=44784276)
668
+ 46300 val loss 5.8012
669
+ 46300 val perplexity 330.6895
670
+ 46300 train 5.776049 (lr=5.6101e-06) (hash(x)=49626999)
671
+ 7700 val loss 6.6817
672
+ 7700 val perplexity 797.6483
673
+ 7700 train 6.419989 (lr=9.5057e-05) (hash(x)=48853311)
674
+ 9400 val loss 6.5449
675
+ 9400 val perplexity 695.6748
676
+ 9400 train 6.656165 (lr=6.4842e-05) (hash(x)=51981169)
677
+ 46400 val loss 5.7988
678
+ 46400 val perplexity 329.9163
679
+ 46400 train 5.685073 (lr=5.5777e-06) (hash(x)=43325701)
680
+ 9500 val loss 6.5243
681
+ 9500 val perplexity 681.5016
682
+ 9500 train 6.455208 (lr=6.4733e-05) (hash(x)=47232936)
683
+ 7800 val loss 6.7101
684
+ 7800 val perplexity 820.6340
685
+ 7800 train 6.605823 (lr=9.4926e-05) (hash(x)=48510117)
686
+ 9600 val loss 6.5283
687
+ 9600 val perplexity 684.2013
688
+ 9600 train 6.555980 (lr=6.4622e-05) (hash(x)=53800450)
689
+ 46500 val loss 5.8014
690
+ 46500 val perplexity 330.7682
691
+ 46500 train 6.095033 (lr=5.5462e-06) (hash(x)=54028595)
692
+ 7900 val loss 6.6982
693
+ 7900 val perplexity 810.9125
694
+ 7900 train 6.697021 (lr=9.4795e-05) (hash(x)=48339781)
695
+ 9700 val loss 6.5110
696
+ 9700 val perplexity 672.4990
697
+ 9700 train 6.611804 (lr=6.4511e-05) (hash(x)=55768123)
698
+ 46600 val loss 5.8004
699
+ 46600 val perplexity 330.4477
700
+ 46600 train 5.646887 (lr=5.5156e-06) (hash(x)=44519175)
701
+ 9800 val loss 6.5201
702
+ 9800 val perplexity 678.6589
703
+ 9800 train 6.437235 (lr=6.4398e-05) (hash(x)=47745177)
704
+ 8000 val loss 6.7015
705
+ 8000 val perplexity 813.6622
706
+ 8000 train 6.827817 (lr=9.4661e-05) (hash(x)=54927320)
707
+ 46700 val loss 5.8020
708
+ 46700 val perplexity 330.9630
709
+ 46700 train 5.859873 (lr=5.4858e-06) (hash(x)=48357998)
710
+ 9900 val loss 6.5059
711
+ 9900 val perplexity 669.1097
712
+ 9900 train 6.771001 (lr=6.4284e-05) (hash(x)=56592246)
713
+ 8100 val loss 6.7125
714
+ 8100 val perplexity 822.6165
715
+ 8100 train 6.440390 (lr=9.4526e-05) (hash(x)=46461786)
716
+ 10000 val loss 6.5050
717
+ 10000 val perplexity 668.4611
718
+ 10000 train 6.518945 (lr=6.4170e-05) (hash(x)=51655963)
719
+ 46800 val loss 5.8007
720
+ 46800 val perplexity 330.5271
721
+ 46800 train 5.972873 (lr=5.4569e-06) (hash(x)=55911353)
722
+ 10100 val loss 6.5228
723
+ 10100 val perplexity 680.4901
724
+ 10100 train 6.419588 (lr=6.4054e-05) (hash(x)=49809511)
725
+ 8200 val loss 6.6683
726
+ 8200 val perplexity 787.0919
727
+ 8200 train 6.587510 (lr=9.4390e-05) (hash(x)=51536260)
728
+ 46900 val loss 5.8022
729
+ 46900 val perplexity 331.0330
730
+ 46900 train 5.741434 (lr=5.4289e-06) (hash(x)=47897187)
731
+ 10200 val loss 6.5020
732
+ 10200 val perplexity 666.4471
733
+ 10200 train 6.091907 (lr=6.3937e-05) (hash(x)=42297812)
734
+ 8300 val loss 6.7038
735
+ 8300 val perplexity 815.4675
736
+ 8300 train 6.438375 (lr=9.4252e-05) (hash(x)=44770722)
737
+ 10300 val loss 6.4973
738
+ 10300 val perplexity 663.3268
739
+ 10300 train 6.167974 (lr=6.3820e-05) (hash(x)=55529820)
740
+ 47000 val loss 5.7987
741
+ 47000 val perplexity 329.8616
742
+ 47000 train 5.359689 (lr=5.4017e-06) (hash(x)=43196571)
743
+ 10400 val loss 6.5084
744
+ 10400 val perplexity 670.7539
745
+ 10400 train 6.523131 (lr=6.3701e-05) (hash(x)=53255684)
746
+ 8400 val loss 6.6674
747
+ 8400 val perplexity 786.3558
748
+ 8400 train 6.660437 (lr=9.4112e-05) (hash(x)=50104957)
749
+ 10500 val loss 6.4665
750
+ 10500 val perplexity 643.2431
751
+ 10500 train 6.590051 (lr=6.3581e-05) (hash(x)=54306191)
752
+ 47100 val loss 5.7992
753
+ 47100 val perplexity 330.0356
754
+ 47100 train 5.657062 (lr=5.3755e-06) (hash(x)=51224987)
755
+ 8500 val loss 6.6906
756
+ 8500 val perplexity 804.8203
757
+ 8500 train 6.818645 (lr=9.3971e-05) (hash(x)=50132971)
758
+ 10600 val loss 6.4734
759
+ 10600 val perplexity 647.6887
760
+ 10600 train 6.714907 (lr=6.3460e-05) (hash(x)=60130567)
761
+ 47200 val loss 5.8016
762
+ 47200 val perplexity 330.8251
763
+ 47200 train 5.508888 (lr=5.3501e-06) (hash(x)=47943697)
764
+ 10700 val loss 6.4934
765
+ 10700 val perplexity 660.7467
766
+ 10700 train 6.420184 (lr=6.3339e-05) (hash(x)=50074737)
767
+ 8600 val loss 6.6548
768
+ 8600 val perplexity 776.4918
769
+ 8600 train 6.595765 (lr=9.3828e-05) (hash(x)=52193699)
770
+ 10800 val loss 6.4459
771
+ 10800 val perplexity 630.1255
772
+ 10800 train 6.556093 (lr=6.3216e-05) (hash(x)=51547220)
773
+ 47300 val loss 5.8012
774
+ 47300 val perplexity 330.6942
775
+ 47300 train 5.713035 (lr=5.3256e-06) (hash(x)=47351003)
776
+ 10900 val loss 6.4832
777
+ 10900 val perplexity 654.0450
778
+ 10900 train 6.565746 (lr=6.3092e-05) (hash(x)=55943981)
779
+ 8700 val loss 6.6440
780
+ 8700 val perplexity 768.1542
781
+ 8700 train 6.659471 (lr=9.3684e-05) (hash(x)=47902319)
782
+ 47400 val loss 5.8019
783
+ 47400 val perplexity 330.9364
784
+ 47400 train 5.875019 (lr=5.3020e-06) (hash(x)=55562243)
785
+ 11000 val loss 6.4434
786
+ 11000 val perplexity 628.5189
787
+ 11000 train 6.483435 (lr=6.2968e-05) (hash(x)=46444570)
788
+ 8800 val loss 6.6390
789
+ 8800 val perplexity 764.3397
790
+ 8800 train 6.929801 (lr=9.3538e-05) (hash(x)=54904230)
791
+ 11100 val loss 6.4500
792
+ 11100 val perplexity 632.7070
793
+ 11100 train 6.459686 (lr=6.2842e-05) (hash(x)=49589063)
794
+ 47500 val loss 5.8001
795
+ 47500 val perplexity 330.3456
796
+ 47500 train 5.675170 (lr=5.2792e-06) (hash(x)=53544850)
797
+ 11200 val loss 6.4506
798
+ 11200 val perplexity 633.0538
799
+ 11200 train 6.457446 (lr=6.2715e-05) (hash(x)=51392283)
800
+ 8900 val loss 6.6381
801
+ 8900 val perplexity 763.6228
802
+ 8900 train 6.531604 (lr=9.3391e-05) (hash(x)=46311615)
803
+ 47600 val loss 5.8024
804
+ 47600 val perplexity 331.0871
805
+ 47600 train 5.985342 (lr=5.2574e-06) (hash(x)=43634907)
806
+ 11300 val loss 6.4364
807
+ 11300 val perplexity 624.1540
808
+ 11300 train 6.397132 (lr=6.2588e-05) (hash(x)=45081133)
809
+ 9000 val loss 6.6386
810
+ 9000 val perplexity 764.0009
811
+ 9000 train 6.519309 (lr=9.3242e-05) (hash(x)=48535188)
812
+ 11400 val loss 6.4289
813
+ 11400 val perplexity 619.4864
814
+ 11400 train 6.524812 (lr=6.2459e-05) (hash(x)=53700397)
815
+ 47700 val loss 5.7999
816
+ 47700 val perplexity 330.2570
817
+ 47700 train 5.776142 (lr=5.2364e-06) (hash(x)=47909383)
818
+ 11500 val loss 6.4207
819
+ 11500 val perplexity 614.4413
820
+ 11500 train 6.142918 (lr=6.2330e-05) (hash(x)=43839088)
821
+ 9100 val loss 6.6102
822
+ 9100 val perplexity 742.6683
823
+ 9100 train 6.649930 (lr=9.3092e-05) (hash(x)=51757372)
824
+ 47800 val loss 5.7953
825
+ 47800 val perplexity 328.7346
826
+ 47800 train 5.674577 (lr=5.2163e-06) (hash(x)=45871079)
827
+ 11600 val loss 6.4324
828
+ 11600 val perplexity 621.6376
829
+ 11600 train 6.465709 (lr=6.2199e-05) (hash(x)=48088111)
830
+ 9200 val loss 6.6127
831
+ 9200 val perplexity 744.4720
832
+ 9200 train 6.500169 (lr=9.2940e-05) (hash(x)=51131708)
833
+ 11700 val loss 6.4299
834
+ 11700 val perplexity 620.0857
835
+ 11700 train 6.745090 (lr=6.2068e-05) (hash(x)=55108226)
836
+ 47900 val loss 5.7951
837
+ 47900 val perplexity 328.6861
838
+ 47900 train 5.784526 (lr=5.1972e-06) (hash(x)=47333324)
839
+ 11800 val loss 6.4176
840
+ 11800 val perplexity 612.5077
841
+ 11800 train 6.684676 (lr=6.1936e-05) (hash(x)=58524839)
842
+ 9300 val loss 6.5997
843
+ 9300 val perplexity 734.8544
844
+ 9300 train 6.554953 (lr=9.2786e-05) (hash(x)=44784276)
845
+ 48000 val loss 5.7948
846
+ 48000 val perplexity 328.5829
847
+ 48000 train 5.876556 (lr=5.1788e-06) (hash(x)=52758020)
848
+ 11900 val loss 6.4150
849
+ 11900 val perplexity 610.9667
850
+ 11900 train 6.081831 (lr=6.1802e-05) (hash(x)=43864078)
851
+ 9400 val loss 6.5917
852
+ 9400 val perplexity 729.0008
853
+ 9400 train 6.715466 (lr=9.2632e-05) (hash(x)=51981169)
854
+ 12000 val loss 6.4093
855
+ 12000 val perplexity 607.4685
856
+ 12000 train 6.095185 (lr=6.1668e-05) (hash(x)=43448544)
857
+ 48100 val loss 5.7937
858
+ 48100 val perplexity 328.2226
859
+ 48100 train 5.802242 (lr=5.1614e-06) (hash(x)=49806349)
860
+ 12100 val loss 6.3978
861
+ 12100 val perplexity 600.5402
862
+ 12100 train 6.396197 (lr=6.1533e-05) (hash(x)=55200399)
863
+ 9500 val loss 6.6014
864
+ 9500 val perplexity 736.1489
865
+ 9500 train 6.523407 (lr=9.2475e-05) (hash(x)=47232936)
866
+ 12200 val loss 6.4067
867
+ 12200 val perplexity 605.8679
868
+ 48200 val loss 5.7914
869
+ 48200 val perplexity 327.4593
870
+ 12200 train 6.546592 (lr=6.1397e-05) (hash(x)=57627314)
871
+ 48200 train 5.868467 (lr=5.1449e-06) (hash(x)=53220839)
872
+ 9600 val loss 6.5844
873
+ 9600 val perplexity 723.6813
874
+ 9600 train 6.593096 (lr=9.2317e-05) (hash(x)=53800450)
875
+ 12300 val loss 6.4145
876
+ 12300 val perplexity 610.6370
877
+ 12300 train 6.693074 (lr=6.1260e-05) (hash(x)=53617087)
878
+ 48300 val loss 5.7940
879
+ 48300 val perplexity 328.3305
880
+ 48300 train 5.822805 (lr=5.1293e-06) (hash(x)=56052541)
881
+ 12400 val loss 6.4177
882
+ 12400 val perplexity 612.5649
883
+ 12400 train 6.262563 (lr=6.1122e-05) (hash(x)=51135678)
884
+ 9700 val loss 6.5799
885
+ 9700 val perplexity 720.4314
886
+ 9700 train 6.675194 (lr=9.2158e-05) (hash(x)=55768123)
887
+ 48400 val loss 5.7905
888
+ 48400 val perplexity 327.1818
889
+ 48400 train 5.670732 (lr=5.1145e-06) (hash(x)=44482356)
890
+ 12500 val loss 6.4237
891
+ 12500 val perplexity 616.2940
892
+ 12500 train 6.428000 (lr=6.0984e-05) (hash(x)=48025130)
893
+ 12600 val loss 6.4127
894
+ 12600 val perplexity 609.5201
895
+ 9800 val loss 6.5909
896
+ 9800 val perplexity 728.4636
897
+ 12600 train 6.470107 (lr=6.0844e-05) (hash(x)=52135695)
898
+ 9800 train 6.517842 (lr=9.1997e-05) (hash(x)=47745177)
899
+ 48500 val loss 5.7897
900
+ 48500 val perplexity 326.9112
901
+ 48500 train 5.520173 (lr=5.1007e-06) (hash(x)=45714818)
902
+ 12700 val loss 6.4382
903
+ 12700 val perplexity 625.2830
904
+ 12700 train 6.303355 (lr=6.0703e-05) (hash(x)=51888613)
905
+ 9900 val loss 6.5974
906
+ 9900 val perplexity 733.2065
907
+ 9900 train 6.867185 (lr=9.1835e-05) (hash(x)=56592246)
908
+ 48600 val loss 5.7913
909
+ 48600 val perplexity 327.4302
910
+ 48600 train 5.555061 (lr=5.0877e-06) (hash(x)=49476556)
911
+ 12800 val loss 6.4182
912
+ 12800 val perplexity 612.8937
913
+ 12800 train 6.351943 (lr=6.0562e-05) (hash(x)=50418818)
914
+ 12900 val loss 6.4053
915
+ 12900 val perplexity 605.0304
916
+ 12900 train 7.257672 (lr=6.0420e-05) (hash(x)=58649585)
917
+ 10000 val loss 6.6087
918
+ 10000 val perplexity 741.5094
919
+ 10000 train 6.601171 (lr=9.1671e-05) (hash(x)=51655963)
920
+ 48700 val loss 5.7916
921
+ 48700 val perplexity 327.5210
922
+ 48700 train 5.483535 (lr=5.0756e-06) (hash(x)=42508579)
923
+ 13000 val loss 6.3998
924
+ 13000 val perplexity 601.7260
925
+ 13000 train 6.663928 (lr=6.0277e-05) (hash(x)=54567307)
926
+ 10100 val loss 6.5917
927
+ 10100 val perplexity 728.9938
928
+ 10100 train 6.492918 (lr=9.1506e-05) (hash(x)=49809511)
929
+ 13100 val loss 6.3887
930
+ 13100 val perplexity 595.0941
931
+ 13100 train 6.565659 (lr=6.0133e-05) (hash(x)=52071473)
932
+ 48800 val loss 5.7920
933
+ 48800 val perplexity 327.6678
934
+ 48800 train 6.004168 (lr=5.0644e-06) (hash(x)=52737449)
935
+ 13200 val loss 6.3823
936
+ 13200 val perplexity 591.2615
937
+ 13200 train 6.148626 (lr=5.9988e-05) (hash(x)=46293092)
938
+ 10200 val loss 6.5790
939
+ 10200 val perplexity 719.8549
940
+ 10200 train 6.172993 (lr=9.1339e-05) (hash(x)=42297812)
941
+ 48900 val loss 5.7912
942
+ 48900 val perplexity 327.4165
943
+ 48900 train 5.636753 (lr=5.0542e-06) (hash(x)=47057569)
944
+ 13300 val loss 6.3882
945
+ 13300 val perplexity 594.8053
946
+ 13300 train 6.513732 (lr=5.9842e-05) (hash(x)=56511467)
947
+ 10300 val loss 6.5718
948
+ 10300 val perplexity 714.6194
949
+ 10300 train 6.288669 (lr=9.1171e-05) (hash(x)=55529820)
950
+ 13400 val loss 6.3714
951
+ 13400 val perplexity 584.8615
952
+ 13400 train 6.486885 (lr=5.9695e-05) (hash(x)=54753763)
953
+ 49000 val loss 5.7940
954
+ 49000 val perplexity 328.3136
955
+ 49000 train 5.770644 (lr=5.0448e-06) (hash(x)=49908975)
956
+ 13500 val loss 6.3630
957
+ 13500 val perplexity 579.9861
958
+ 13500 train 6.510321 (lr=5.9548e-05) (hash(x)=53610247)
959
+ 10400 val loss 6.5765
960
+ 10400 val perplexity 717.9875
961
+ 10400 train 6.559584 (lr=9.1001e-05) (hash(x)=53255684)
962
+ 49100 val loss 5.7920
963
+ 49100 val perplexity 327.6564
964
+ 49100 train 5.614700 (lr=5.0363e-06) (hash(x)=48427414)
965
+ 13600 val loss 6.3478
966
+ 13600 val perplexity 571.2138
967
+ 13600 train 6.225275 (lr=5.9400e-05) (hash(x)=47526249)
968
+ 10500 val loss 6.5377
969
+ 10500 val perplexity 690.7025
970
+ 10500 train 6.650892 (lr=9.0830e-05) (hash(x)=54306191)
971
+ 13700 val loss 6.3612
972
+ 13700 val perplexity 578.9458
973
+ 13700 train 6.217256 (lr=5.9251e-05) (hash(x)=51185517)
974
+ 49200 val loss 5.7897
975
+ 49200 val perplexity 326.9223
976
+ 49200 train 5.618961 (lr=5.0286e-06) (hash(x)=50246074)
977
+ 13800 val loss 6.3604
978
+ 13800 val perplexity 578.4911
979
+ 13800 train 6.032686 (lr=5.9101e-05) (hash(x)=45953529)
980
+ 10600 val loss 6.5441
981
+ 10600 val perplexity 695.1240
982
+ 10600 train 6.779467 (lr=9.0658e-05) (hash(x)=60130567)
983
+ 49300 val loss 5.7902
984
+ 49300 val perplexity 327.0729
985
+ 49300 train 5.844873 (lr=5.0219e-06) (hash(x)=47715359)
986
+ 13900 val loss 6.3799
987
+ 13900 val perplexity 589.8560
988
+ 13900 train 6.307417 (lr=5.8950e-05) (hash(x)=47238157)
989
+ 14000 val loss 6.3578
990
+ 14000 val perplexity 577.0012
991
+ 14000 train 6.338602 (lr=5.8799e-05) (hash(x)=54250750)
992
+ 10700 val loss 6.5191
993
+ 10700 val perplexity 677.9451
994
+ 10700 train 6.443255 (lr=9.0484e-05) (hash(x)=50074737)
995
+ 49400 val loss 5.7951
996
+ 49400 val perplexity 328.7013
997
+ 49400 train 5.789733 (lr=5.0161e-06) (hash(x)=50175867)
998
+ 14100 val loss 6.3644
999
+ 14100 val perplexity 580.7674
1000
+ 14100 train 6.351201 (lr=5.8646e-05) (hash(x)=48198552)
1001
+ 10800 val loss 6.5320
1002
+ 10800 val perplexity 686.7475
1003
+ 10800 train 6.645676 (lr=9.0308e-05) (hash(x)=51547220)
1004
+ 14200 val loss 6.3626
1005
+ 14200 val perplexity 579.7433
1006
+ 49500 val loss 5.7927
1007
+ 49500 val perplexity 327.8968
1008
+ 14200 train 6.497223 (lr=5.8493e-05) (hash(x)=52020690)
1009
+ 49500 train 5.631345 (lr=5.0112e-06) (hash(x)=49336040)
1010
+ 14300 val loss 6.3541
1011
+ 14300 val perplexity 574.8251
1012
+ 14300 train 6.156728 (lr=5.8339e-05) (hash(x)=45165483)
1013
+ 10900 val loss 6.5470
1014
+ 10900 val perplexity 697.1835
1015
+ 10900 train 6.629424 (lr=9.0132e-05) (hash(x)=55943981)
1016
+ 49600 val loss 5.7945
1017
+ 49600 val perplexity 328.5037
1018
+ 49600 train 5.834286 (lr=5.0072e-06) (hash(x)=52039357)
1019
+ 14400 val loss 6.3595
1020
+ 14400 val perplexity 577.9749
1021
+ 14400 train 6.475995 (lr=5.8184e-05) (hash(x)=52184072)
1022
+ 11000 val loss 6.5333
1023
+ 11000 val perplexity 687.6558
1024
+ 11000 train 6.565971 (lr=8.9954e-05) (hash(x)=46444570)
1025
+ 14500 val loss 6.3595
1026
+ 14500 val perplexity 577.9520
1027
+ 14500 train 6.706707 (lr=5.8029e-05) (hash(x)=56046436)
1028
+ 49700 val loss 5.7948
1029
+ 49700 val perplexity 328.5858
1030
+ 49700 train 5.590436 (lr=5.0040e-06) (hash(x)=47568707)
1031
+ 14600 val loss 6.3534
1032
+ 14600 val perplexity 574.4607
1033
+ 14600 train 6.230230 (lr=5.7872e-05) (hash(x)=52029694)
1034
+ 11100 val loss 6.5385
1035
+ 11100 val perplexity 691.2264
1036
+ 11100 train 6.547019 (lr=8.9774e-05) (hash(x)=49589063)
1037
+ 49800 val loss 5.7962
1038
+ 49800 val perplexity 329.0321
1039
+ 49800 train 5.719766 (lr=5.0018e-06) (hash(x)=48451274)
1040
+ 14700 val loss 6.3699
1041
+ 14700 val perplexity 584.0244
1042
+ 14700 train 6.211578 (lr=5.7715e-05) (hash(x)=50258224)
1043
+ 11200 val loss 6.5242
1044
+ 11200 val perplexity 681.4029
1045
+ 11200 train 6.523566 (lr=8.9593e-05) (hash(x)=51392283)
1046
+ 14800 val loss 6.3676
1047
+ 14800 val perplexity 582.6866
1048
+ 14800 train 5.692614 (lr=5.7558e-05) (hash(x)=42112262)
1049
+ 49900 val loss 5.7984
1050
+ 49900 val perplexity 329.7762
1051
+ 49900 train 5.486496 (lr=5.0004e-06) (hash(x)=44523603)
1052
+ 14900 val loss 6.3608
1053
+ 14900 val perplexity 578.7267
1054
+ 14900 train 6.101720 (lr=5.7399e-05) (hash(x)=47219933)
1055
+ 11300 val loss 6.5394
1056
+ 11300 val perplexity 691.8585
1057
+ 11300 train 6.495648 (lr=8.9411e-05) (hash(x)=45081133)
1058
+ 49999 val loss 5.7909
1059
+ 49999 val perplexity 327.3206
1060
+ 15000 val loss 6.4009
1061
+ 15000 val perplexity 602.3866
1062
+ 15000 train 6.002666 (lr=5.7240e-05) (hash(x)=58309309)
1063
+ 11400 val loss 6.5171
1064
+ 11400 val perplexity 676.6274
1065
+ 11400 train 6.611968 (lr=8.9227e-05) (hash(x)=53700397)
1066
+ 15100 val loss 6.3498
1067
+ 15100 val perplexity 572.3857
1068
+ 15100 train 6.254695 (lr=5.7079e-05) (hash(x)=48756049)
1069
+ 15200 val loss 6.3577
1070
+ 15200 val perplexity 576.9346
1071
+ 15200 train 6.047930 (lr=5.6919e-05) (hash(x)=49791737)
1072
+ 11500 val loss 6.5199
1073
+ 11500 val perplexity 678.5172
1074
+ 11500 train 6.260948 (lr=8.9043e-05) (hash(x)=43839088)
1075
+ 15300 val loss 6.3336
1076
+ 15300 val perplexity 563.2054
1077
+ 15300 train 6.302131 (lr=5.6757e-05) (hash(x)=53084126)
1078
+ 11600 val loss 6.5099
1079
+ 11600 val perplexity 671.7664
1080
+ 11600 train 6.543172 (lr=8.8856e-05) (hash(x)=48088111)
1081
+ 15400 val loss 6.3310
1082
+ 15400 val perplexity 561.6938
1083
+ 15400 train 6.422942 (lr=5.6595e-05) (hash(x)=55041679)
1084
+ 15500 val loss 6.3303
1085
+ 15500 val perplexity 561.3511
1086
+ 15500 train 6.124517 (lr=5.6432e-05) (hash(x)=43249867)
1087
+ 11700 val loss 6.5147
1088
+ 11700 val perplexity 674.9705
1089
+ 11700 train 6.829165 (lr=8.8668e-05) (hash(x)=55108226)
1090
+ 15600 val loss 6.3211
1091
+ 15600 val perplexity 556.1926
1092
+ 15600 train 6.312618 (lr=5.6268e-05) (hash(x)=49006517)
1093
+ 15700 val loss 6.3168
1094
+ 15700 val perplexity 553.8163
1095
+ 15700 train 6.645555 (lr=5.6104e-05) (hash(x)=60986839)
1096
+ 11800 val loss 6.5320
1097
+ 11800 val perplexity 686.7426
1098
+ 11800 train 6.764431 (lr=8.8479e-05) (hash(x)=58524839)
1099
+ 15800 val loss 6.3371
1100
+ 15800 val perplexity 565.1663
1101
+ 15800 train 6.246183 (lr=5.5938e-05) (hash(x)=48354906)
1102
+ 11900 val loss 6.5044
1103
+ 11900 val perplexity 668.0739
1104
+ 11900 train 6.170547 (lr=8.8289e-05) (hash(x)=43864078)
1105
+ 15900 val loss 6.3422
1106
+ 15900 val perplexity 568.0605
1107
+ 15900 train 6.207072 (lr=5.5773e-05) (hash(x)=52679780)
1108
+ 16000 val loss 6.3104
1109
+ 16000 val perplexity 550.2414
1110
+ 16000 train 6.475496 (lr=5.5606e-05) (hash(x)=58049587)
1111
+ 12000 val loss 6.5055
1112
+ 12000 val perplexity 668.7924
1113
+ 12000 train 6.222349 (lr=8.8097e-05) (hash(x)=43448544)
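Note on reading the log above: each "val perplexity" entry matches exp() of the corresponding "val loss" (the loss being mean cross-entropy in nats per token). A minimal sketch, checked against the step-7400 entry logged above; this snippet is illustrative and is not part of the training code:

import math

# The logged "val perplexity" is exp(val loss); the small difference comes from
# the loss being printed with only four decimal places.
val_loss = 6.7554            # "7400 val loss" from the log above
print(math.exp(val_loss))    # ~858.7, vs. the logged "7400 val perplexity 858.6415"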
attention_kindselective_n_heads4_seed1340/model_02500.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e5d1d69356d9c198afaea93cf7ccf66f2d5809310214a3e417a834f53abe451a
+ oid sha256:e856700b2ba8c1c78fec371f511980a9165c3a304e65b1e2ecf19f9a9ba2c4f5
  size 92843394
attention_kindselective_n_heads4_seed1340/model_05000.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3969de9631fa1091a1c449d2a388db43ecd8c763683bd08da56a12468a2b209a
+ oid sha256:6ce0f91188a61cf2a20045bdc8d3dc3070278a30ac99975978bf3d232dd03f70
  size 92843394
attention_kindselective_n_heads4_seed1340/model_07500.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cf4eafbd63635551ef8204a5b1e4fa01c7111a78c8eb15db22b64873e52bad4b
+ oid sha256:21edaf2540d7212d65d0bd210419f7d6a0f9281cde1d06f864c3ec1d63fff124
  size 92843394
attention_kindselective_n_heads4_seed1340/model_10000.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e28d809d0626f302c7278753b8c13072c86cc80bea7caeb91264050622dd5e7
+ size 92843394
attention_kindselective_n_heads4_seed1340/model_12500.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:69a6cd3908266914a289ba86b8a7d2af8ce51bf4034f6b670c6730502d6f65d4
+ size 92843394
attention_kindselective_n_heads4_seed1340/model_42500.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7e937e5bde413b5f1d5cd2e7a03428c77cca6ac8ef7547cc1d8aa0565c33ea0d
+ size 92843394
attention_kindselective_n_heads4_seed1340/model_45000.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:03ef3fa0f39e905fced4c8e9f6d4c4c36d3dfdec8a8589b65a5f4c6363469015
+ size 92843394
attention_kindselective_n_heads4_seed1340/model_47500.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:58579aca525c6640a50ed6de3f5bc997ccf568b1c74ac74080a8413181c72725
+ size 92843394
attention_kindselective_n_heads4_seed1340/model_49999.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ddd2671bc53d3d47c2a3597b04a352fe6e56e1aa82a19bed3ee6f4e205890842
+ size 92843394
attention_kindselective_n_heads4_seed1340/optimizer_02500.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:06179ff7fbca3cf6f6d3d1546cd7e6f27c331a63a9a71de710896c329a40b635
+ oid sha256:71d5d80bc8621c01c7517a9c8966efcf316b98161adb2e069c5201828111ea9b
  size 179406214
attention_kindselective_n_heads4_seed1340/optimizer_05000.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:534c88f2b95620226732b552d992f6d228437ef793d6a67da093fc7cc42ce5e8
+ oid sha256:4cf0986c29b0869c7e9016f8a369414e67e321e3e73344a91dc6a3451b88ff99
  size 179406214
attention_kindselective_n_heads4_seed1340/optimizer_07500.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1f9ae9b15de67f44a4e1d5dcb110a3995dd6867e2ff4a70347ffd32b561b4229
+ oid sha256:f45a3b4647bd40c9e04b365a1e5c1703f4207fa3698dc385fc2cfd5bc0ea4bd5
  size 179406214
attention_kindselective_n_heads4_seed1340/optimizer_10000.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:06bc9c8845c72cf18f14e45b43f7c4c19b17aa50a69cff163f981ea53f88744f
+ size 179406214
attention_kindselective_n_heads4_seed1340/optimizer_12500.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:61803c8687b0d4b6de778e759324ef063faecd26bcfb6e193e45505fe759b8d9
+ size 179406214
attention_kindselective_n_heads4_seed1340/optimizer_42500.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dffbcca299af84d6d052050b7c9c7230f9052fbb489bbc7104607f111ccd656e
+ size 179406214
attention_kindselective_n_heads4_seed1340/optimizer_45000.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8b4dba3c28b0a3140da1cda29187583bfb0ffbc5b49bd20693dae0ef1a68772c
+ size 179406214
attention_kindselective_n_heads4_seed1340/optimizer_47500.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee4537d10c929400a183bc037015dc153913766e3bc65eeecde61d2e9ef87311
+ size 179406214
attention_kindselective_n_heads4_seed1340/optimizer_49999.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f5280d94beeaac57235e9ca63bb2521ca98af393b93180fe9a674caafe9bb168
+ size 179406214
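Note on the checkpoint entries above: the .pt files in this commit are stored through git-lfs, so the diff shows pointer files (spec version, sha256 object id, byte size) rather than the binary weights. A minimal sketch for parsing such a pointer and verifying a downloaded blob against it; the helper names and paths are illustrative, not part of this repo:

import hashlib
from pathlib import Path

def read_lfs_pointer(pointer_path):
    # Parse the "key value" lines of a git-lfs pointer file:
    #   version https://git-lfs.github.com/spec/v1
    #   oid sha256:<64 hex chars>
    #   size <bytes>
    lines = [l for l in Path(pointer_path).read_text().splitlines() if l.strip()]
    fields = dict(l.split(" ", 1) for l in lines)
    fields["size"] = int(fields["size"])
    return fields

def verify_checkpoint(pointer_path, blob_path):
    # Check a downloaded .pt blob against the size and sha256 oid in its pointer.
    pointer = read_lfs_pointer(pointer_path)
    data = Path(blob_path).read_bytes()
    assert len(data) == pointer["size"], "size mismatch"
    assert "sha256:" + hashlib.sha256(data).hexdigest() == pointer["oid"], "oid mismatch"

# Example (illustrative paths):
# verify_checkpoint("model_49999.pt.pointer", "model_49999.pt")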