andrew-healey commited on
Commit
1ed1645
·
verified ·
1 Parent(s): f68044c

Upload folder using huggingface_hub

Browse files
attention_kindselective_n_heads8_seed1341/args.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_6/attention_kindselective_n_heads8_seed1341", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 8, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_6", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1341, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 3e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "3e-5_10240_8_1341", "n_embd": 512}
attention_kindselective_n_heads8_seed1341/dataloader_02500.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b2ea67f78ff5a7970d0db044ff7ee527b3dc065f295fd30f588df4b44b568d0
3
+ size 964
attention_kindselective_n_heads8_seed1341/dataloader_05000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f03ed2ebf741f15e13c79e6cc1e9a19b308450d81cc3b4d8d0338c63d77ca59
3
+ size 964
attention_kindselective_n_heads8_seed1341/dataloader_07500.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82590037fb2eecbec961f7967a8dd1b8d85515d31a252f66b92b8139858a8b7c
3
+ size 964
attention_kindselective_n_heads8_seed1341/dataloader_09999.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c544303717d12355a69b8ffb1eb109434e4fdccfd5a61279b6e8ba2e870d6700
3
+ size 964
attention_kindselective_n_heads8_seed1341/log2.txt ADDED
@@ -0,0 +1,1209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ max_steps: 10000
2
+ 0 val loss 11.0680
3
+ 0 val perplexity 64089.5781
4
+ 0 val loss 11.0680
5
+ 0 val perplexity 64089.5781
6
+ 0 val loss 11.0680
7
+ 0 val perplexity 64089.5781
8
+ 0 val loss 11.0680
9
+ 0 val perplexity 64089.5781
10
+ 0 train 11.048663 (lr=1.0000e-07) (hash(x)=47078120)
11
+ 0 train 11.048663 (lr=2.5000e-07) (hash(x)=47078120)
12
+ 0 train 11.048663 (lr=3.5000e-07) (hash(x)=47078120)
13
+ 0 train 11.048676 (lr=1.5000e-07) (hash(x)=47078120)
14
+ 100 val loss 9.8889
15
+ 100 val perplexity 19711.1328
16
+ 100 train 9.874242 (lr=1.0100e-05) (hash(x)=43429388)
17
+ 100 val loss 9.4787
18
+ 100 val perplexity 13077.7705
19
+ 100 val loss 9.7282
20
+ 100 val perplexity 16783.9766
21
+ 100 train 9.448478 (lr=3.5350e-05) (hash(x)=43429388)
22
+ 100 val loss 9.5889
23
+ 100 val perplexity 14601.5840
24
+ 100 train 9.713765 (lr=1.5150e-05) (hash(x)=43429388)
25
+ 100 train 9.563825 (lr=2.5250e-05) (hash(x)=43429388)
26
+ 200 val loss 9.4174
27
+ 200 val perplexity 12300.9268
28
+ 200 train 9.477206 (lr=2.0000e-05) (hash(x)=52929681)
29
+ 200 val loss 8.3919
30
+ 200 val perplexity 4411.0986
31
+ 200 val loss 8.0984
32
+ 200 val perplexity 3289.3396
33
+ 200 train 8.515907 (lr=5.0000e-05) (hash(x)=52929681)
34
+ 200 val loss 9.1730
35
+ 200 val perplexity 9633.6963
36
+ 200 train 8.248425 (lr=7.0000e-05) (hash(x)=52929681)
37
+ 200 train 9.244697 (lr=3.0000e-05) (hash(x)=52929681)
38
+ 300 val loss 8.8455
39
+ 300 val perplexity 6943.1738
40
+ 300 train 8.874369 (lr=1.9995e-05) (hash(x)=49930367)
41
+ 300 val loss 7.7464
42
+ 300 val perplexity 2313.2307
43
+ 300 train 7.757118 (lr=4.9988e-05) (hash(x)=49930367)
44
+ 300 val loss 7.6647
45
+ 300 val perplexity 2131.7766
46
+ 300 val loss 8.2538
47
+ 300 val perplexity 3842.1523
48
+ 300 train 7.684315 (lr=6.9984e-05) (hash(x)=49930367)
49
+ 300 train 8.276509 (lr=2.9993e-05) (hash(x)=49930367)
50
+ 400 val loss 8.2382
51
+ 400 val perplexity 3782.5420
52
+ 400 train 8.121539 (lr=1.9982e-05) (hash(x)=48542946)
53
+ 400 val loss 7.5870
54
+ 400 val perplexity 1972.4014
55
+ 400 train 7.386682 (lr=4.9954e-05) (hash(x)=48542946)
56
+ 400 val loss 7.5243
57
+ 400 val perplexity 1852.4517
58
+ 400 val loss 7.8717
59
+ 400 val perplexity 2621.8926
60
+ 400 train 7.326735 (lr=6.9935e-05) (hash(x)=48542946)
61
+ 400 train 7.709966 (lr=2.9972e-05) (hash(x)=48542946)
62
+ 500 val loss 7.9134
63
+ 500 val perplexity 2733.5874
64
+ 500 train 8.054297 (lr=1.9958e-05) (hash(x)=55286048)
65
+ 500 val loss 7.6640
66
+ 500 val perplexity 2130.2502
67
+ 500 val loss 7.5074
68
+ 500 val perplexity 1821.5359
69
+ 500 val loss 7.4518
70
+ 500 val perplexity 1722.9878
71
+ 500 train 7.837950 (lr=2.9938e-05) (hash(x)=55286048)
72
+ 500 train 7.714735 (lr=4.9896e-05) (hash(x)=55286048)
73
+ 500 train 7.667075 (lr=6.9854e-05) (hash(x)=55286048)
74
+ 600 val loss 7.7274
75
+ 600 val perplexity 2269.6853
76
+ 600 train 7.917661 (lr=1.9926e-05) (hash(x)=51577760)
77
+ 600 val loss 7.5601
78
+ 600 val perplexity 1920.0514
79
+ 600 train 7.793221 (lr=2.9889e-05) (hash(x)=51577760)
80
+ 600 val loss 7.4491
81
+ 600 val perplexity 1718.2578
82
+ 600 val loss 7.3973
83
+ 600 val perplexity 1631.6017
84
+ 600 train 7.691466 (lr=4.9815e-05) (hash(x)=51577760)
85
+ 600 train 7.632826 (lr=6.9741e-05) (hash(x)=51577760)
86
+ 700 val loss 7.6300
87
+ 700 val perplexity 2059.1208
88
+ 700 train 7.758367 (lr=1.9885e-05) (hash(x)=57433471)
89
+ 700 val loss 7.4870
90
+ 700 val perplexity 1784.6235
91
+ 700 train 7.647078 (lr=2.9827e-05) (hash(x)=57433471)
92
+ 700 val loss 7.3347
93
+ 700 val perplexity 1532.5104
94
+ 700 train 7.507759 (lr=6.9596e-05) (hash(x)=57433471)
95
+ 700 val loss 7.3973
96
+ 700 val perplexity 1631.5386
97
+ 700 train 7.570853 (lr=4.9712e-05) (hash(x)=57433471)
98
+ 800 val loss 7.5663
99
+ 800 val perplexity 1931.9979
100
+ 800 train 7.500276 (lr=1.9834e-05) (hash(x)=49799291)
101
+ 800 val loss 7.4398
102
+ 800 val perplexity 1702.4897
103
+ 800 train 7.366629 (lr=2.9751e-05) (hash(x)=49799291)
104
+ 800 val loss 7.3000
105
+ 800 val perplexity 1480.3511
106
+ 800 train 7.210431 (lr=6.9419e-05) (hash(x)=49799291)
107
+ 800 val loss 7.3632
108
+ 800 val perplexity 1576.9290
109
+ 800 train 7.281087 (lr=4.9585e-05) (hash(x)=49799291)
110
+ 900 val loss 7.5209
111
+ 900 val perplexity 1846.1362
112
+ 900 train 7.526892 (lr=1.9774e-05) (hash(x)=49502839)
113
+ 900 val loss 7.4191
114
+ 900 val perplexity 1667.5374
115
+ 900 train 7.422843 (lr=2.9662e-05) (hash(x)=49502839)
116
+ 900 val loss 7.2387
117
+ 900 val perplexity 1392.2841
118
+ 900 train 7.215050 (lr=6.9210e-05) (hash(x)=49502839)
119
+ 900 val loss 7.3366
120
+ 900 val perplexity 1535.4969
121
+ 900 train 7.333570 (lr=4.9436e-05) (hash(x)=49502839)
122
+ 1000 val loss 7.4822
123
+ 1000 val perplexity 1776.1934
124
+ 1000 train 7.707015 (lr=1.9706e-05) (hash(x)=51142904)
125
+ 1000 val loss 7.3874
126
+ 1000 val perplexity 1615.5121
127
+ 1000 train 7.649211 (lr=2.9558e-05) (hash(x)=51142904)
128
+ 1000 val loss 7.1808
129
+ 1000 val perplexity 1313.9176
130
+ 1000 train 7.483747 (lr=6.8970e-05) (hash(x)=51142904)
131
+ 1000 val loss 7.3008
132
+ 1000 val perplexity 1481.4540
133
+ 1000 train 7.578658 (lr=4.9264e-05) (hash(x)=51142904)
134
+ 1100 val loss 7.4532
135
+ 1100 val perplexity 1725.2938
136
+ 1100 train 7.515146 (lr=1.9628e-05) (hash(x)=52751086)
137
+ 1100 val loss 7.3649
138
+ 1100 val perplexity 1579.5704
139
+ 1100 train 7.431870 (lr=2.9442e-05) (hash(x)=52751086)
140
+ 1100 val loss 7.1476
141
+ 1100 val perplexity 1271.1102
142
+ 1100 train 7.207986 (lr=6.8698e-05) (hash(x)=52751086)
143
+ 1100 val loss 7.2667
144
+ 1100 val perplexity 1431.8098
145
+ 1100 train 7.324548 (lr=4.9070e-05) (hash(x)=52751086)
146
+ 1200 val loss 7.4319
147
+ 1200 val perplexity 1688.9976
148
+ 1200 train 7.388729 (lr=1.9542e-05) (hash(x)=51538621)
149
+ 1200 val loss 7.3563
150
+ 1200 val perplexity 1566.0082
151
+ 1200 train 7.301937 (lr=2.9312e-05) (hash(x)=51538621)
152
+ 1200 val loss 7.1481
153
+ 1200 val perplexity 1271.6545
154
+ 1200 train 7.103180 (lr=6.8395e-05) (hash(x)=51538621)
155
+ 1200 val loss 7.2521
156
+ 1200 val perplexity 1411.0454
157
+ 1200 train 7.200856 (lr=4.8854e-05) (hash(x)=51538621)
158
+ 1300 val loss 7.4095
159
+ 1300 val perplexity 1651.6470
160
+ 1300 train 7.466151 (lr=1.9446e-05) (hash(x)=52034040)
161
+ 1300 val loss 7.3408
162
+ 1300 val perplexity 1541.8721
163
+ 1300 train 7.412338 (lr=2.9169e-05) (hash(x)=52034040)
164
+ 1300 val loss 7.0932
165
+ 1300 val perplexity 1203.7023
166
+ 1300 train 7.166739 (lr=6.8062e-05) (hash(x)=52034040)
167
+ 1300 val loss 7.2236
168
+ 1300 val perplexity 1371.4453
169
+ 1300 train 7.305515 (lr=4.8616e-05) (hash(x)=52034040)
170
+ 1400 val loss 7.3843
171
+ 1400 val perplexity 1610.4229
172
+ 1400 train 7.408330 (lr=1.9342e-05) (hash(x)=50640105)
173
+ 1400 val loss 7.3167
174
+ 1400 val perplexity 1505.1840
175
+ 1400 train 7.352315 (lr=2.9013e-05) (hash(x)=50640105)
176
+ 1400 val loss 7.0789
177
+ 1400 val perplexity 1186.6193
178
+ 1400 train 7.134275 (lr=6.7698e-05) (hash(x)=50640105)
179
+ 1400 val loss 7.2080
180
+ 1400 val perplexity 1350.1521
181
+ 1400 train 7.266603 (lr=4.8356e-05) (hash(x)=50640105)
182
+ 1500 val loss 7.3619
183
+ 1500 val perplexity 1574.9045
184
+ 1500 train 7.287918 (lr=1.9230e-05) (hash(x)=49016270)
185
+ 1500 val loss 7.2903
186
+ 1500 val perplexity 1465.9382
187
+ 1500 train 7.219070 (lr=2.8845e-05) (hash(x)=49016270)
188
+ 1500 val loss 7.0480
189
+ 1500 val perplexity 1150.5200
190
+ 1500 train 6.976660 (lr=6.7304e-05) (hash(x)=49016270)
191
+ 1500 val loss 7.1885
192
+ 1500 val perplexity 1324.1281
193
+ 1500 train 7.115997 (lr=4.8074e-05) (hash(x)=49016270)
194
+ 1600 val loss 7.3388
195
+ 1600 val perplexity 1538.8591
196
+ 1600 train 7.111701 (lr=1.9109e-05) (hash(x)=46100488)
197
+ 1600 val loss 7.2736
198
+ 1600 val perplexity 1441.7178
199
+ 1600 train 7.037468 (lr=2.8663e-05) (hash(x)=46100488)
200
+ 1600 val loss 7.0191
201
+ 1600 val perplexity 1117.7505
202
+ 1600 train 6.790306 (lr=6.6881e-05) (hash(x)=46100488)
203
+ 1600 val loss 7.1701
204
+ 1600 val perplexity 1299.9185
205
+ 1600 train 6.944131 (lr=4.7772e-05) (hash(x)=46100488)
206
+ 1700 val loss 7.3212
207
+ 1700 val perplexity 1512.0359
208
+ 1700 train 7.369852 (lr=1.8979e-05) (hash(x)=49185350)
209
+ 1700 val loss 7.2640
210
+ 1700 val perplexity 1427.8956
211
+ 1700 train 7.317546 (lr=2.8469e-05) (hash(x)=49185350)
212
+ 1700 val loss 7.0044
213
+ 1700 val perplexity 1101.4866
214
+ 1700 train 7.065816 (lr=6.6428e-05) (hash(x)=49185350)
215
+ 1700 val loss 7.1566
216
+ 1700 val perplexity 1282.5233
217
+ 1700 train 7.217639 (lr=4.7448e-05) (hash(x)=49185350)
218
+ 1800 val loss 7.3080
219
+ 1800 val perplexity 1492.1678
220
+ 1800 train 7.221244 (lr=1.8842e-05) (hash(x)=48024574)
221
+ 1800 val loss 7.2471
222
+ 1800 val perplexity 1403.9862
223
+ 1800 train 7.151842 (lr=2.8263e-05) (hash(x)=48024574)
224
+ 1800 val loss 6.9785
225
+ 1800 val perplexity 1073.2799
226
+ 1800 train 6.877046 (lr=6.5947e-05) (hash(x)=48024574)
227
+ 1800 val loss 7.1461
228
+ 1800 val perplexity 1269.1606
229
+ 1900 val loss 7.2891
230
+ 1900 val perplexity 1464.3209
231
+ 1800 train 7.040640 (lr=4.7105e-05) (hash(x)=48024574)
232
+ 1900 train 7.115648 (lr=1.8696e-05) (hash(x)=45823189)
233
+ 1900 val loss 7.2267
234
+ 1900 val perplexity 1375.6698
235
+ 1900 train 7.034698 (lr=2.8044e-05) (hash(x)=45823189)
236
+ 1900 val loss 6.9555
237
+ 1900 val perplexity 1048.9285
238
+ 1900 train 6.739227 (lr=6.5437e-05) (hash(x)=45823189)
239
+ 2000 val loss 7.2767
240
+ 2000 val perplexity 1446.2552
241
+ 2000 train 7.108987 (lr=1.8543e-05) (hash(x)=45703932)
242
+ 1900 val loss 7.1339
243
+ 1900 val perplexity 1253.7621
244
+ 1900 train 6.941078 (lr=4.6741e-05) (hash(x)=45823189)
245
+ 2000 val loss 7.2035
246
+ 2000 val perplexity 1344.1517
247
+ 2000 train 7.027834 (lr=2.7814e-05) (hash(x)=45703932)
248
+ 2000 val loss 6.9407
249
+ 2000 val perplexity 1033.5043
250
+ 2000 train 6.743909 (lr=6.4900e-05) (hash(x)=45703932)
251
+ 2100 val loss 7.2564
252
+ 2100 val perplexity 1417.1031
253
+ 2100 train 7.887122 (lr=1.8382e-05) (hash(x)=58570170)
254
+ 2000 val loss 7.1274
255
+ 2000 val perplexity 1245.6257
256
+ 2000 train 6.943665 (lr=4.6357e-05) (hash(x)=45703932)
257
+ 2100 val loss 7.1834
258
+ 2100 val perplexity 1317.3862
259
+ 2100 train 7.836891 (lr=2.7572e-05) (hash(x)=58570170)
260
+ 2100 val loss 6.9112
261
+ 2100 val perplexity 1003.4402
262
+ 2100 train 7.606760 (lr=6.4335e-05) (hash(x)=58570170)
263
+ 2200 val loss 7.2513
264
+ 2200 val perplexity 1409.9761
265
+ 2200 train 7.275453 (lr=1.8213e-05) (hash(x)=55262880)
266
+ 2100 val loss 7.1219
267
+ 2100 val perplexity 1238.8298
268
+ 2100 train 7.786014 (lr=4.5954e-05) (hash(x)=58570170)
269
+ 2200 val loss 7.1710
270
+ 2200 val perplexity 1301.1643
271
+ 2200 train 7.189610 (lr=2.7319e-05) (hash(x)=55262880)
272
+ 2200 val loss 6.9619
273
+ 2200 val perplexity 1055.6140
274
+ 2200 train 6.981012 (lr=6.3745e-05) (hash(x)=55262880)
275
+ 2300 val loss 7.2239
276
+ 2300 val perplexity 1371.8573
277
+ 2300 train 6.981198 (lr=1.8036e-05) (hash(x)=46415497)
278
+ 2200 val loss 7.1250
279
+ 2200 val perplexity 1242.6696
280
+ 2200 train 7.142995 (lr=4.5532e-05) (hash(x)=55262880)
281
+ 2300 val loss 7.1466
282
+ 2300 val perplexity 1269.8204
283
+ 2300 train 6.890901 (lr=2.7055e-05) (hash(x)=46415497)
284
+ 2400 val loss 7.2045
285
+ 2400 val perplexity 1345.4766
286
+ 2400 train 7.125453 (lr=1.7853e-05) (hash(x)=49272278)
287
+ 2300 val loss 6.8938
288
+ 2300 val perplexity 986.1673
289
+ 2300 train 6.622870 (lr=6.3128e-05) (hash(x)=46415497)
290
+ 2300 val loss 7.1014
291
+ 2300 val perplexity 1213.7175
292
+ 2300 train 6.864444 (lr=4.5091e-05) (hash(x)=46415497)
293
+ 2400 val loss 7.1125
294
+ 2400 val perplexity 1227.2383
295
+ 2400 train 7.022389 (lr=2.6780e-05) (hash(x)=49272278)
296
+ 2500 val loss 7.1825
297
+ 2500 val perplexity 1316.1329
298
+ 2500 train 7.040866 (lr=1.7663e-05) (hash(x)=48390803)
299
+ 2400 val loss 6.8477
300
+ 2400 val perplexity 941.7234
301
+ 2400 train 6.740791 (lr=6.2486e-05) (hash(x)=49272278)
302
+ 2400 val loss 7.0815
303
+ 2400 val perplexity 1189.7360
304
+ 2400 train 6.993971 (lr=4.4633e-05) (hash(x)=49272278)
305
+ 2500 val loss 7.0937
306
+ 2500 val perplexity 1204.3481
307
+ 2500 train 6.950256 (lr=2.6494e-05) (hash(x)=48390803)
308
+ 2500 val loss 6.8303
309
+ 2500 val perplexity 925.4961
310
+ 2600 val loss 7.1665
311
+ 2600 val perplexity 1295.2786
312
+ 2600 train 7.031013 (lr=1.7465e-05) (hash(x)=47450116)
313
+ 2500 train 6.699334 (lr=6.1819e-05) (hash(x)=48390803)
314
+ 2500 val loss 7.0623
315
+ 2500 val perplexity 1167.1173
316
+ 2500 train 6.923334 (lr=4.4156e-05) (hash(x)=48390803)
317
+ 2600 val loss 7.0709
318
+ 2600 val perplexity 1177.1604
319
+ 2600 train 6.918849 (lr=2.6198e-05) (hash(x)=47450116)
320
+ 2700 val loss 7.1567
321
+ 2700 val perplexity 1282.6627
322
+ 2700 train 7.217981 (lr=1.7261e-05) (hash(x)=52681152)
323
+ 2600 val loss 6.8076
324
+ 2600 val perplexity 904.6650
325
+ 2600 train 6.673070 (lr=6.1128e-05) (hash(x)=47450116)
326
+ 2600 val loss 7.0478
327
+ 2600 val perplexity 1150.3018
328
+ 2600 train 6.902493 (lr=4.3663e-05) (hash(x)=47450116)
329
+ 2700 val loss 7.0496
330
+ 2700 val perplexity 1152.3412
331
+ 2700 train 7.117827 (lr=2.5892e-05) (hash(x)=52681152)
332
+ 2800 val loss 7.1363
333
+ 2800 val perplexity 1256.7675
334
+ 2800 train 7.209763 (lr=1.7051e-05) (hash(x)=50664094)
335
+ 2700 val loss 6.7852
336
+ 2700 val perplexity 884.6849
337
+ 2700 train 6.864418 (lr=6.0414e-05) (hash(x)=52681152)
338
+ 2700 val loss 7.0395
339
+ 2700 val perplexity 1140.7711
340
+ 2700 train 7.108398 (lr=4.3153e-05) (hash(x)=52681152)
341
+ 2800 val loss 7.0391
342
+ 2800 val perplexity 1140.3605
343
+ 2800 train 7.111782 (lr=2.5576e-05) (hash(x)=50664094)
344
+ 2900 val loss 7.1164
345
+ 2900 val perplexity 1231.9611
346
+ 2900 train 6.889511 (lr=1.6834e-05) (hash(x)=47067144)
347
+ 2800 val loss 6.7762
348
+ 2800 val perplexity 876.7094
349
+ 2800 train 6.850021 (lr=5.9677e-05) (hash(x)=50664094)
350
+ 2800 val loss 7.0292
351
+ 2800 val perplexity 1129.1125
352
+ 2800 train 7.094847 (lr=4.2627e-05) (hash(x)=50664094)
353
+ 2900 val loss 7.0108
354
+ 2900 val perplexity 1108.5176
355
+ 2900 train 6.776527 (lr=2.5251e-05) (hash(x)=47067144)
356
+ 3000 val loss 7.0995
357
+ 3000 val perplexity 1211.4059
358
+ 3000 train 6.896778 (lr=1.6611e-05) (hash(x)=45015009)
359
+ 2900 val loss 6.7621
360
+ 2900 val perplexity 864.4884
361
+ 2900 train 6.564690 (lr=5.8919e-05) (hash(x)=47067144)
362
+ 2900 val loss 7.0175
363
+ 2900 val perplexity 1115.9957
364
+ 2900 train 6.791795 (lr=4.2085e-05) (hash(x)=47067144)
365
+ 3000 val loss 6.9916
366
+ 3000 val perplexity 1087.4424
367
+ 3000 train 6.785407 (lr=2.4917e-05) (hash(x)=45015009)
368
+ 3100 val loss 7.0851
369
+ 3100 val perplexity 1194.0815
370
+ 3100 train 6.896208 (lr=1.6383e-05) (hash(x)=45245896)
371
+ 3000 val loss 6.7417
372
+ 3000 val perplexity 847.0032
373
+ 3000 train 6.542316 (lr=5.8140e-05) (hash(x)=45015009)
374
+ 3000 val loss 7.0100
375
+ 3000 val perplexity 1107.6675
376
+ 3000 train 6.804945 (lr=4.1529e-05) (hash(x)=45015009)
377
+ 3100 val loss 6.9760
378
+ 3100 val perplexity 1070.6133
379
+ 3100 train 6.779830 (lr=2.4574e-05) (hash(x)=45245896)
380
+ 3200 val loss 7.0673
381
+ 3200 val perplexity 1172.9346
382
+ 3200 train 7.026747 (lr=1.6149e-05) (hash(x)=49995942)
383
+ 3100 val loss 6.7318
384
+ 3100 val perplexity 838.6891
385
+ 3100 train 6.545393 (lr=5.7340e-05) (hash(x)=45245896)
386
+ 3100 val loss 6.9952
387
+ 3100 val perplexity 1091.3998
388
+ 3100 train 6.805078 (lr=4.0957e-05) (hash(x)=45245896)
389
+ 3200 val loss 6.9604
390
+ 3200 val perplexity 1054.0768
391
+ 3200 train 6.913863 (lr=2.4224e-05) (hash(x)=49995942)
392
+ 3300 val loss 7.0569
393
+ 3300 val perplexity 1160.8708
394
+ 3300 train 6.930760 (lr=1.5910e-05) (hash(x)=52311504)
395
+ 3200 val loss 6.7166
396
+ 3200 val perplexity 825.9999
397
+ 3200 train 6.681448 (lr=5.6522e-05) (hash(x)=49995942)
398
+ 3200 val loss 6.9841
399
+ 3200 val perplexity 1079.2999
400
+ 3200 train 6.941720 (lr=4.0373e-05) (hash(x)=49995942)
401
+ 3300 val loss 6.9465
402
+ 3300 val perplexity 1039.4871
403
+ 3300 train 6.819232 (lr=2.3865e-05) (hash(x)=52311504)
404
+ 3400 val loss 7.0490
405
+ 3400 val perplexity 1151.7167
406
+ 3400 train 7.042104 (lr=1.5666e-05) (hash(x)=44332917)
407
+ 3300 val loss 6.7106
408
+ 3300 val perplexity 821.0791
409
+ 3300 train 6.580021 (lr=5.5684e-05) (hash(x)=52311504)
410
+ 3300 val loss 6.9826
411
+ 3300 val perplexity 1077.7489
412
+ 3300 train 6.858939 (lr=3.9775e-05) (hash(x)=52311504)
413
+ 3400 val loss 6.9332
414
+ 3400 val perplexity 1025.7731
415
+ 3400 train 6.936850 (lr=2.3498e-05) (hash(x)=44332917)
416
+ 3500 val loss 7.0323
417
+ 3500 val perplexity 1132.6359
418
+ 3500 train 7.139286 (lr=1.5416e-05) (hash(x)=56517159)
419
+ 3400 val loss 6.7010
420
+ 3400 val perplexity 813.2463
421
+ 3400 train 6.689954 (lr=5.4829e-05) (hash(x)=44332917)
422
+ 3400 val loss 6.9820
423
+ 3400 val perplexity 1077.1057
424
+ 3400 train 6.967134 (lr=3.9164e-05) (hash(x)=44332917)
425
+ 3500 val loss 6.9179
426
+ 3500 val perplexity 1010.1714
427
+ 3500 train 7.036409 (lr=2.3125e-05) (hash(x)=56517159)
428
+ 3600 val loss 7.0287
429
+ 3600 val perplexity 1128.5656
430
+ 3600 train 6.907561 (lr=1.5163e-05) (hash(x)=50720920)
431
+ 3500 val loss 6.6896
432
+ 3500 val perplexity 803.9983
433
+ 3500 train 6.777810 (lr=5.3958e-05) (hash(x)=56517159)
434
+ 3500 val loss 6.9596
435
+ 3500 val perplexity 1053.2172
436
+ 3500 train 7.056179 (lr=3.8541e-05) (hash(x)=56517159)
437
+ 3600 val loss 6.9100
438
+ 3600 val perplexity 1002.2887
439
+ 3600 train 6.787700 (lr=2.2744e-05) (hash(x)=50720920)
440
+ 3700 val loss 7.0122
441
+ 3700 val perplexity 1110.0436
442
+ 3700 train 7.504549 (lr=1.4905e-05) (hash(x)=62727701)
443
+ 3600 val loss 6.6984
444
+ 3600 val perplexity 811.0815
445
+ 3600 train 6.556833 (lr=5.3070e-05) (hash(x)=50720920)
446
+ 3600 val loss 6.9576
447
+ 3600 val perplexity 1051.0614
448
+ 3600 train 6.823028 (lr=3.7907e-05) (hash(x)=50720920)
449
+ 3700 val loss 6.8997
450
+ 3700 val perplexity 991.9734
451
+ 3700 train 7.375038 (lr=2.2357e-05) (hash(x)=62727701)
452
+ 3800 val loss 7.0017
453
+ 3800 val perplexity 1098.4895
454
+ 3800 train 6.844926 (lr=1.4643e-05) (hash(x)=54772539)
455
+ 3700 val loss 6.6738
456
+ 3700 val perplexity 791.3810
457
+ 3700 train 7.154129 (lr=5.2167e-05) (hash(x)=62727701)
458
+ 3700 val loss 6.9509
459
+ 3700 val perplexity 1044.1140
460
+ 3700 train 7.432554 (lr=3.7262e-05) (hash(x)=62727701)
461
+ 3800 val loss 6.8858
462
+ 3800 val perplexity 978.2759
463
+ 3800 train 6.742421 (lr=2.1965e-05) (hash(x)=54772539)
464
+ 3900 val loss 6.9911
465
+ 3900 val perplexity 1086.9178
466
+ 3900 train 6.996634 (lr=1.4377e-05) (hash(x)=52274485)
467
+ 3800 val loss 6.6617
468
+ 3800 val perplexity 781.8557
469
+ 3800 train 6.517062 (lr=5.1251e-05) (hash(x)=54772539)
470
+ 3800 val loss 6.9500
471
+ 3800 val perplexity 1043.1793
472
+ 3800 train 6.794149 (lr=3.6608e-05) (hash(x)=54772539)
473
+ 3900 val loss 6.8733
474
+ 3900 val perplexity 966.1602
475
+ 3900 train 6.879457 (lr=2.1566e-05) (hash(x)=52274485)
476
+ 4000 val loss 6.9853
477
+ 4000 val perplexity 1080.6085
478
+ 4000 train 6.692829 (lr=1.4108e-05) (hash(x)=50118307)
479
+ 3900 val loss 6.6532
480
+ 3900 val perplexity 775.2562
481
+ 3900 train 6.649825 (lr=5.0321e-05) (hash(x)=52274485)
482
+ 4000 val loss 6.8697
483
+ 4000 val perplexity 962.6872
484
+ 3900 val loss 6.9389
485
+ 3900 val perplexity 1031.6731
486
+ 4000 train 6.566363 (lr=2.1162e-05) (hash(x)=50118307)
487
+ 3900 train 6.947895 (lr=3.5944e-05) (hash(x)=52274485)
488
+ 4100 val loss 6.9691
489
+ 4100 val perplexity 1063.2795
490
+ 4100 train 6.554935 (lr=1.3836e-05) (hash(x)=42771647)
491
+ 4000 val loss 6.6577
492
+ 4000 val perplexity 778.7485
493
+ 4000 train 6.355518 (lr=4.9379e-05) (hash(x)=50118307)
494
+ 4100 val loss 6.8494
495
+ 4100 val perplexity 943.3229
496
+ 4100 train 6.423832 (lr=2.0754e-05) (hash(x)=42771647)
497
+ 4000 val loss 6.9340
498
+ 4000 val perplexity 1026.6221
499
+ 4000 train 6.642421 (lr=3.5271e-05) (hash(x)=50118307)
500
+ 4200 val loss 6.9589
501
+ 4200 val perplexity 1052.4305
502
+ 4200 train 7.068559 (lr=1.3561e-05) (hash(x)=51748836)
503
+ 4100 val loss 6.6376
504
+ 4100 val perplexity 763.2496
505
+ 4100 train 6.206556 (lr=4.8426e-05) (hash(x)=42771647)
506
+ 4200 val loss 6.8396
507
+ 4200 val perplexity 934.1204
508
+ 4200 train 6.950802 (lr=2.0341e-05) (hash(x)=51748836)
509
+ 4100 val loss 6.9343
510
+ 4100 val perplexity 1026.8899
511
+ 4100 train 6.515369 (lr=3.4590e-05) (hash(x)=42771647)
512
+ 4300 val loss 6.9447
513
+ 4300 val perplexity 1037.6448
514
+ 4300 train 6.956426 (lr=1.3283e-05) (hash(x)=49021280)
515
+ 4200 val loss 6.6271
516
+ 4200 val perplexity 755.2573
517
+ 4200 train 6.723958 (lr=4.7463e-05) (hash(x)=51748836)
518
+ 4300 val loss 6.8180
519
+ 4300 val perplexity 914.1560
520
+ 4300 train 6.831802 (lr=1.9924e-05) (hash(x)=49021280)
521
+ 4400 val loss 6.9332
522
+ 4400 val perplexity 1025.7461
523
+ 4400 train 7.020908 (lr=1.3003e-05) (hash(x)=55200309)
524
+ 4200 val loss 6.9201
525
+ 4200 val perplexity 1012.4133
526
+ 4200 train 7.022127 (lr=3.3902e-05) (hash(x)=51748836)
527
+ 4300 val loss 6.6103
528
+ 4300 val perplexity 742.6902
529
+ 4300 train 6.629007 (lr=4.6490e-05) (hash(x)=49021280)
530
+ 4400 val loss 6.7991
531
+ 4400 val perplexity 897.0301
532
+ 4400 train 6.883133 (lr=1.9504e-05) (hash(x)=55200309)
533
+ 4500 val loss 6.9237
534
+ 4500 val perplexity 1016.1121
535
+ 4500 train 7.022612 (lr=1.2720e-05) (hash(x)=52085049)
536
+ 4300 val loss 6.9037
537
+ 4300 val perplexity 995.9319
538
+ 4300 train 6.916924 (lr=3.3207e-05) (hash(x)=49021280)
539
+ 4400 val loss 6.6076
540
+ 4400 val perplexity 740.7267
541
+ 4400 train 6.694966 (lr=4.5509e-05) (hash(x)=55200309)
542
+ 4500 val loss 6.7862
543
+ 4500 val perplexity 885.5211
544
+ 4500 train 6.876517 (lr=1.9081e-05) (hash(x)=52085049)
545
+ 4600 val loss 6.9170
546
+ 4600 val perplexity 1009.3023
547
+ 4600 train 6.909331 (lr=1.2436e-05) (hash(x)=48935595)
548
+ 4400 val loss 6.8934
549
+ 4400 val perplexity 985.7658
550
+ 4400 train 6.987580 (lr=3.2507e-05) (hash(x)=55200309)
551
+ 4500 val loss 6.5932
552
+ 4500 val perplexity 730.0834
553
+ 4500 train 6.665695 (lr=4.4521e-05) (hash(x)=52085049)
554
+ 4600 val loss 6.7758
555
+ 4600 val perplexity 876.3883
556
+ 4600 train 6.771263 (lr=1.8655e-05) (hash(x)=48935595)
557
+ 4700 val loss 6.9094
558
+ 4700 val perplexity 1001.5992
559
+ 4700 train 7.105185 (lr=1.2151e-05) (hash(x)=49182380)
560
+ 4500 val loss 6.8863
561
+ 4500 val perplexity 978.7505
562
+ 4500 train 6.974458 (lr=3.1801e-05) (hash(x)=52085049)
563
+ 4600 val loss 6.5769
564
+ 4600 val perplexity 718.3427
565
+ 4600 train 6.586899 (lr=4.3527e-05) (hash(x)=48935595)
566
+ 4800 val loss 6.8964
567
+ 4800 val perplexity 988.7254
568
+ 4700 val loss 6.7589
569
+ 4700 val perplexity 861.7277
570
+ 4800 train 6.694540 (lr=1.1864e-05) (hash(x)=43941929)
571
+ 4700 train 6.979102 (lr=1.8226e-05) (hash(x)=49182380)
572
+ 4600 val loss 6.8921
573
+ 4600 val perplexity 984.4585
574
+ 4600 train 6.891096 (lr=3.1091e-05) (hash(x)=48935595)
575
+ 4700 val loss 6.5731
576
+ 4700 val perplexity 715.5541
577
+ 4700 train 6.820342 (lr=4.2528e-05) (hash(x)=49182380)
578
+ 4800 val loss 6.7492
579
+ 4800 val perplexity 853.3676
580
+ 4900 val loss 6.8910
581
+ 4900 val perplexity 983.3855
582
+ 4900 train 7.058427 (lr=1.1577e-05) (hash(x)=51852773)
583
+ 4800 train 6.550427 (lr=1.7796e-05) (hash(x)=43941929)
584
+ 4700 val loss 6.8718
585
+ 4700 val perplexity 964.7123
586
+ 4700 train 7.057541 (lr=3.0377e-05) (hash(x)=49182380)
587
+ 4800 val loss 6.5603
588
+ 4800 val perplexity 706.4680
589
+ 4800 train 6.383238 (lr=4.1525e-05) (hash(x)=43941929)
590
+ 5000 val loss 6.8868
591
+ 5000 val perplexity 979.2640
592
+ 4900 val loss 6.7417
593
+ 4900 val perplexity 846.9903
594
+ 4900 train 6.903447 (lr=1.7365e-05) (hash(x)=51852773)
595
+ 5000 train 6.477450 (lr=1.1288e-05) (hash(x)=40509616)
596
+ 4800 val loss 6.8656
597
+ 4800 val perplexity 958.6776
598
+ 4800 train 6.658672 (lr=2.9661e-05) (hash(x)=43941929)
599
+ 4900 val loss 6.5601
600
+ 4900 val perplexity 706.3693
601
+ 4900 train 6.714787 (lr=4.0518e-05) (hash(x)=51852773)
602
+ 5000 val loss 6.7326
603
+ 5000 val perplexity 839.3625
604
+ 5100 val loss 6.8764
605
+ 5100 val perplexity 969.1464
606
+ 5100 train 7.171636 (lr=1.1000e-05) (hash(x)=57585369)
607
+ 5000 train 6.303533 (lr=1.6933e-05) (hash(x)=40509616)
608
+ 4900 val loss 6.8608
609
+ 4900 val perplexity 954.1130
610
+ 4900 train 7.019123 (lr=2.8942e-05) (hash(x)=51852773)
611
+ 5000 val loss 6.5599
612
+ 5000 val perplexity 706.1871
613
+ 5000 train 6.147030 (lr=3.9510e-05) (hash(x)=40509616)
614
+ 5200 val loss 6.8709
615
+ 5200 val perplexity 963.7726
616
+ 5200 train 6.894734 (lr=1.0712e-05) (hash(x)=51042313)
617
+ 5100 val loss 6.7199
618
+ 5100 val perplexity 828.7194
619
+ 5100 train 6.972271 (lr=1.6500e-05) (hash(x)=57585369)
620
+ 5000 val loss 6.8607
621
+ 5000 val perplexity 954.0266
622
+ 5000 train 6.446211 (lr=2.8221e-05) (hash(x)=40509616)
623
+ 5100 val loss 6.5459
624
+ 5100 val perplexity 696.3658
625
+ 5100 train 6.742117 (lr=3.8500e-05) (hash(x)=57585369)
626
+ 5300 val loss 6.8614
627
+ 5300 val perplexity 954.7220
628
+ 5300 train 7.021107 (lr=1.0423e-05) (hash(x)=52001684)
629
+ 5200 val loss 6.7183
630
+ 5200 val perplexity 827.4274
631
+ 5200 train 6.726495 (lr=1.6067e-05) (hash(x)=51042313)
632
+ 5100 val loss 6.8445
633
+ 5100 val perplexity 938.6766
634
+ 5100 train 7.109215 (lr=2.7500e-05) (hash(x)=57585369)
635
+ 5200 val loss 6.5457
636
+ 5200 val perplexity 696.2642
637
+ 5400 val loss 6.8580
638
+ 5400 val perplexity 951.4784
639
+ 5200 train 6.539655 (lr=3.7490e-05) (hash(x)=51042313)
640
+ 5400 train 6.782125 (lr=1.0136e-05) (hash(x)=48831647)
641
+ 5300 val loss 6.7052
642
+ 5300 val perplexity 816.6333
643
+ 5300 train 6.861244 (lr=1.5635e-05) (hash(x)=52001684)
644
+ 5200 val loss 6.8397
645
+ 5200 val perplexity 934.1863
646
+ 5200 train 6.856593 (lr=2.6779e-05) (hash(x)=51042313)
647
+ 5500 val loss 6.8556
648
+ 5500 val perplexity 949.1456
649
+ 5500 train 7.206544 (lr=9.8491e-06) (hash(x)=50192069)
650
+ 5300 val loss 6.5281
651
+ 5300 val perplexity 684.1060
652
+ 5300 train 6.689152 (lr=3.6482e-05) (hash(x)=52001684)
653
+ 5400 val loss 6.7006
654
+ 5400 val perplexity 812.8881
655
+ 5400 train 6.625731 (lr=1.5204e-05) (hash(x)=48831647)
656
+ 5300 val loss 6.8333
657
+ 5300 val perplexity 928.2460
658
+ 5300 train 6.993705 (lr=2.6058e-05) (hash(x)=52001684)
659
+ 5600 val loss 6.8466
660
+ 5600 val perplexity 940.6655
661
+ 5600 train 6.654557 (lr=9.5636e-06) (hash(x)=47208852)
662
+ 5400 val loss 6.5233
663
+ 5400 val perplexity 680.8095
664
+ 5400 train 6.443679 (lr=3.5475e-05) (hash(x)=48831647)
665
+ 5500 val loss 6.6987
666
+ 5500 val perplexity 811.3495
667
+ 5500 train 7.087494 (lr=1.4774e-05) (hash(x)=50192069)
668
+ 5400 val loss 6.8253
669
+ 5400 val perplexity 920.8331
670
+ 5400 train 6.749354 (lr=2.5339e-05) (hash(x)=48831647)
671
+ 5700 val loss 6.8428
672
+ 5700 val perplexity 937.1461
673
+ 5700 train 6.436399 (lr=9.2796e-06) (hash(x)=44061694)
674
+ 5500 val loss 6.5335
675
+ 5500 val perplexity 687.8297
676
+ 5500 train 6.943909 (lr=3.4472e-05) (hash(x)=50192069)
677
+ 5600 val loss 6.6884
678
+ 5600 val perplexity 803.0125
679
+ 5600 train 6.500677 (lr=1.4345e-05) (hash(x)=47208852)
680
+ 5500 val loss 6.8219
681
+ 5500 val perplexity 917.7541
682
+ 5500 train 7.200629 (lr=2.4623e-05) (hash(x)=50192069)
683
+ 5800 val loss 6.8371
684
+ 5800 val perplexity 931.7457
685
+ 5800 train 7.215187 (lr=8.9973e-06) (hash(x)=56513279)
686
+ 5600 val loss 6.5178
687
+ 5600 val perplexity 677.0574
688
+ 5700 val loss 6.6834
689
+ 5700 val perplexity 799.0583
690
+ 5600 train 6.333476 (lr=3.3473e-05) (hash(x)=47208852)
691
+ 5700 train 6.274059 (lr=1.3919e-05) (hash(x)=44061694)
692
+ 5600 val loss 6.8152
693
+ 5600 val perplexity 911.5791
694
+ 5600 train 6.629575 (lr=2.3909e-05) (hash(x)=47208852)
695
+ 5900 val loss 6.8312
696
+ 5900 val perplexity 926.2846
697
+ 5900 train 6.992841 (lr=8.7171e-06) (hash(x)=50412818)
698
+ 5800 val loss 6.6761
699
+ 5800 val perplexity 793.2364
700
+ 5700 val loss 6.5278
701
+ 5700 val perplexity 683.8582
702
+ 5800 train 7.079706 (lr=1.3496e-05) (hash(x)=56513279)
703
+ 5700 train 6.099132 (lr=3.2479e-05) (hash(x)=44061694)
704
+ 5700 val loss 6.8144
705
+ 5700 val perplexity 910.8314
706
+ 5700 train 6.405194 (lr=2.3199e-05) (hash(x)=44061694)
707
+ 6000 val loss 6.8262
708
+ 6000 val perplexity 921.7257
709
+ 6000 train 6.613876 (lr=8.4393e-06) (hash(x)=47159634)
710
+ 5900 val loss 6.6691
711
+ 5900 val perplexity 787.6990
712
+ 5900 train 6.819257 (lr=1.3076e-05) (hash(x)=50412818)
713
+ 5800 val loss 6.5141
714
+ 5800 val perplexity 674.6172
715
+ 5800 train 6.898224 (lr=3.1491e-05) (hash(x)=56513279)
716
+ 5800 val loss 6.8042
717
+ 5800 val perplexity 901.6212
718
+ 5800 train 7.188510 (lr=2.2493e-05) (hash(x)=56513279)
719
+ 6100 val loss 6.8191
720
+ 6100 val perplexity 915.1422
721
+ 6100 train 6.918497 (lr=8.1640e-06) (hash(x)=54312795)
722
+ 6000 val loss 6.6627
723
+ 6000 val perplexity 782.6782
724
+ 6000 train 6.458968 (lr=1.2659e-05) (hash(x)=47159634)
725
+ 5900 val loss 6.5100
726
+ 5900 val perplexity 671.8426
727
+ 5900 train 6.649952 (lr=3.0510e-05) (hash(x)=50412818)
728
+ 5900 val loss 6.8038
729
+ 5900 val perplexity 901.3005
730
+ 5900 train 6.949226 (lr=2.1793e-05) (hash(x)=50412818)
731
+ 6200 val loss 6.8130
732
+ 6200 val perplexity 909.6113
733
+ 6200 train 6.990555 (lr=7.8917e-06) (hash(x)=54187587)
734
+ 6100 val loss 6.6554
735
+ 6100 val perplexity 776.9844
736
+ 6100 train 6.756692 (lr=1.2246e-05) (hash(x)=54312795)
737
+ 6000 val loss 6.5064
738
+ 6000 val perplexity 669.4238
739
+ 6000 train 6.328054 (lr=2.9537e-05) (hash(x)=47159634)
740
+ 6000 val loss 6.7957
741
+ 6000 val perplexity 894.0056
742
+ 6000 train 6.589294 (lr=2.1098e-05) (hash(x)=47159634)
743
+ 6300 val loss 6.8062
744
+ 6300 val perplexity 903.3907
745
+ 6300 train 6.916306 (lr=7.6226e-06) (hash(x)=53620387)
746
+ 6200 val loss 6.6462
747
+ 6200 val perplexity 769.8491
748
+ 6200 train 6.825405 (lr=1.1838e-05) (hash(x)=54187587)
749
+ 6100 val loss 6.4902
750
+ 6100 val perplexity 658.6664
751
+ 6100 train 6.581672 (lr=2.8574e-05) (hash(x)=54312795)
752
+ 6100 val loss 6.7919
753
+ 6100 val perplexity 890.6199
754
+ 6100 train 6.885747 (lr=2.0410e-05) (hash(x)=54312795)
755
+ 6400 val loss 6.7978
756
+ 6400 val perplexity 895.8444
757
+ 6400 train 6.737978 (lr=7.3569e-06) (hash(x)=48761774)
758
+ 6300 val loss 6.6364
759
+ 6300 val perplexity 762.3516
760
+ 6300 train 6.762627 (lr=1.1434e-05) (hash(x)=53620387)
761
+ 6200 val loss 6.4772
762
+ 6200 val perplexity 650.1536
763
+ 6200 train 6.615796 (lr=2.7621e-05) (hash(x)=54187587)
764
+ 6200 val loss 6.7828
765
+ 6200 val perplexity 882.5361
766
+ 6200 train 6.953031 (lr=1.9729e-05) (hash(x)=54187587)
767
+ 6500 val loss 6.7933
768
+ 6500 val perplexity 891.8179
769
+ 6500 train 7.034630 (lr=7.0950e-06) (hash(x)=56690281)
770
+ 6400 val loss 6.6275
771
+ 6400 val perplexity 755.6107
772
+ 6400 train 6.551711 (lr=1.1035e-05) (hash(x)=48761774)
773
+ 6300 val loss 6.4718
774
+ 6300 val perplexity 646.6768
775
+ 6300 train 6.592690 (lr=2.6679e-05) (hash(x)=53620387)
776
+ 6300 val loss 6.7704
777
+ 6300 val perplexity 871.6821
778
+ 6300 train 6.878792 (lr=1.9056e-05) (hash(x)=53620387)
779
+ 6600 val loss 6.7888
780
+ 6600 val perplexity 887.8833
781
+ 6600 train 6.580529 (lr=6.8372e-06) (hash(x)=42985269)
782
+ 6500 val loss 6.6192
783
+ 6500 val perplexity 749.3109
784
+ 6500 train 6.858943 (lr=1.0643e-05) (hash(x)=56690281)
785
+ 6400 val loss 6.4743
786
+ 6400 val perplexity 648.2544
787
+ 6400 train 6.393586 (lr=2.5749e-05) (hash(x)=48761774)
788
+ 6400 val loss 6.7612
789
+ 6400 val perplexity 863.6722
790
+ 6400 train 6.697858 (lr=1.8392e-05) (hash(x)=48761774)
791
+ 6700 val loss 6.7863
792
+ 6700 val perplexity 885.5966
793
+ 6700 train 6.804174 (lr=6.5835e-06) (hash(x)=53315447)
794
+ 6600 val loss 6.6162
795
+ 6600 val perplexity 747.0964
796
+ 6600 train 6.430020 (lr=1.0256e-05) (hash(x)=42985269)
797
+ 6500 val loss 6.4627
798
+ 6500 val perplexity 640.7616
799
+ 6500 train 6.703842 (lr=2.4833e-05) (hash(x)=56690281)
800
+ 6500 val loss 6.7577
801
+ 6500 val perplexity 860.6866
802
+ 6500 train 7.011350 (lr=1.7738e-05) (hash(x)=56690281)
803
+ 6800 val loss 6.7796
804
+ 6800 val perplexity 879.7517
805
+ 6800 train 7.271560 (lr=6.3345e-06) (hash(x)=61577166)
806
+ 6700 val loss 6.6127
807
+ 6700 val perplexity 744.4901
808
+ 6700 train 6.654109 (lr=9.8753e-06) (hash(x)=53315447)
809
+ 6600 val loss 6.4587
810
+ 6600 val perplexity 638.2580
811
+ 6600 train 6.275012 (lr=2.3930e-05) (hash(x)=42985269)
812
+ 6900 val loss 6.7779
813
+ 6900 val perplexity 878.2637
814
+ 6900 train 6.883226 (lr=6.0902e-06) (hash(x)=54641005)
815
+ 6600 val loss 6.7546
816
+ 6600 val perplexity 857.9875
817
+ 6600 train 6.544014 (lr=1.7093e-05) (hash(x)=42985269)
818
+ 6800 val loss 6.6055
819
+ 6800 val perplexity 739.1291
820
+ 6800 train 7.100903 (lr=9.5017e-06) (hash(x)=61577166)
821
+ 6700 val loss 6.4604
822
+ 6700 val perplexity 639.3189
823
+ 6700 train 6.494969 (lr=2.3042e-05) (hash(x)=53315447)
824
+ 7000 val loss 6.7713
825
+ 7000 val perplexity 872.4722
826
+ 7000 train 7.330594 (lr=5.8510e-06) (hash(x)=60579512)
827
+ 6700 val loss 6.7507
828
+ 6700 val perplexity 854.6246
829
+ 6700 train 6.784648 (lr=1.6459e-05) (hash(x)=53315447)
830
+ 6900 val loss 6.5959
831
+ 6900 val perplexity 732.0701
832
+ 6900 train 6.713835 (lr=9.1353e-06) (hash(x)=54641005)
833
+ 6800 val loss 6.4498
834
+ 6800 val perplexity 632.5809
835
+ 6800 train 6.904761 (lr=2.2171e-05) (hash(x)=61577166)
836
+ 7100 val loss 6.7694
837
+ 7100 val perplexity 870.7997
838
+ 7100 train 6.665070 (lr=5.6170e-06) (hash(x)=53151549)
839
+ 6800 val loss 6.7475
840
+ 6800 val perplexity 851.9592
841
+ 6800 train 7.248035 (lr=1.5836e-05) (hash(x)=61577166)
842
+ 7000 val loss 6.5925
843
+ 7000 val perplexity 729.6163
844
+ 7000 train 7.168230 (lr=8.7764e-06) (hash(x)=60579512)
845
+ 6900 val loss 6.4431
846
+ 6900 val perplexity 628.3522
847
+ 6900 train 6.581858 (lr=2.1316e-05) (hash(x)=54641005)
848
+ 7200 val loss 6.7639
849
+ 7200 val perplexity 865.9951
850
+ 7200 train 7.761684 (lr=5.3886e-06) (hash(x)=71842455)
851
+ 6900 val loss 6.7337
852
+ 6900 val perplexity 840.2455
853
+ 6900 train 6.854381 (lr=1.5225e-05) (hash(x)=54641005)
854
+ 7100 val loss 6.5893
855
+ 7100 val perplexity 727.2419
856
+ 7100 train 6.488562 (lr=8.4255e-06) (hash(x)=53151549)
857
+ 7000 val loss 6.4382
858
+ 7000 val perplexity 625.2898
859
+ 7000 train 6.964822 (lr=2.0478e-05) (hash(x)=60579512)
860
+ 7300 val loss 6.7625
861
+ 7300 val perplexity 864.8347
862
+ 7300 train 6.528047 (lr=5.1659e-06) (hash(x)=44516452)
863
+ 7000 val loss 6.7305
864
+ 7000 val perplexity 837.5577
865
+ 7000 train 7.277111 (lr=1.4627e-05) (hash(x)=60579512)
866
+ 7200 val loss 6.5813
867
+ 7200 val perplexity 721.4806
868
+ 7200 train 7.597327 (lr=8.0829e-06) (hash(x)=71842455)
869
+ 7100 val loss 6.4414
870
+ 7100 val perplexity 627.2758
871
+ 7100 train 6.345554 (lr=1.9660e-05) (hash(x)=53151549)
872
+ 7400 val loss 6.7592
873
+ 7400 val perplexity 861.9483
874
+ 7400 train 6.410480 (lr=4.9493e-06) (hash(x)=42667710)
875
+ 7100 val loss 6.7267
876
+ 7100 val perplexity 834.4161
877
+ 7100 train 6.616548 (lr=1.4043e-05) (hash(x)=53151549)
878
+ 7300 val loss 6.5815
879
+ 7300 val perplexity 721.5986
880
+ 7300 train 6.349890 (lr=7.7489e-06) (hash(x)=44516452)
881
+ 7200 val loss 6.4361
882
+ 7200 val perplexity 623.9594
883
+ 7200 train 7.351032 (lr=1.8860e-05) (hash(x)=71842455)
884
+ 7500 val loss 6.7555
885
+ 7500 val perplexity 858.8078
886
+ 7500 train 6.505149 (lr=4.7389e-06) (hash(x)=47050797)
887
+ 7200 val loss 6.7204
888
+ 7200 val perplexity 829.1766
889
+ 7200 train 7.733423 (lr=1.3471e-05) (hash(x)=71842455)
890
+ 7400 val loss 6.5739
891
+ 7400 val perplexity 716.1360
892
+ 7400 train 6.221677 (lr=7.4239e-06) (hash(x)=42667710)
893
+ 7300 val loss 6.4313
894
+ 7300 val perplexity 620.9935
895
+ 7300 train 6.198883 (lr=1.8081e-05) (hash(x)=44516452)
896
+ 7600 val loss 6.7532
897
+ 7600 val perplexity 856.8362
898
+ 7600 train 6.666946 (lr=4.5349e-06) (hash(x)=49785056)
899
+ 7300 val loss 6.7178
900
+ 7300 val perplexity 827.0081
901
+ 7300 train 6.492526 (lr=1.2915e-05) (hash(x)=44516452)
902
+ 7500 val loss 6.5720
903
+ 7500 val perplexity 714.8130
904
+ 7500 train 6.316886 (lr=7.1083e-06) (hash(x)=47050797)
905
+ 7400 val loss 6.4278
906
+ 7400 val perplexity 618.8268
907
+ 7400 train 6.063058 (lr=1.7323e-05) (hash(x)=42667710)
908
+ 7700 val loss 6.7499
909
+ 7700 val perplexity 853.9936
910
+ 7700 train 6.595833 (lr=4.3375e-06) (hash(x)=53232030)
911
+ 7400 val loss 6.7131
912
+ 7400 val perplexity 823.0838
913
+ 7400 train 6.368190 (lr=1.2373e-05) (hash(x)=42667710)
914
+ 7600 val loss 6.5660
915
+ 7600 val perplexity 710.5325
916
+ 7600 train 6.483896 (lr=6.8023e-06) (hash(x)=49785056)
917
+ 7500 val loss 6.4225
918
+ 7500 val perplexity 615.5386
919
+ 7500 train 6.168694 (lr=1.6586e-05) (hash(x)=47050797)
920
+ 7800 val loss 6.7466
921
+ 7800 val perplexity 851.1527
922
+ 7800 train 6.579307 (lr=4.1470e-06) (hash(x)=48049749)
923
+ 7500 val loss 6.7074
924
+ 7500 val perplexity 818.4573
925
+ 7500 train 6.456196 (lr=1.1847e-05) (hash(x)=47050797)
926
+ 7700 val loss 6.5620
927
+ 7700 val perplexity 707.7010
928
+ 7700 train 6.402081 (lr=6.5062e-06) (hash(x)=53232030)
929
+ 7600 val loss 6.4167
930
+ 7600 val perplexity 611.9720
931
+ 7600 train 6.321131 (lr=1.5872e-05) (hash(x)=49785056)
932
+ 7900 val loss 6.7407
933
+ 7900 val perplexity 846.1240
934
+ 7900 train 6.525646 (lr=3.9635e-06) (hash(x)=44768513)
935
+ 7600 val loss 6.7034
936
+ 7600 val perplexity 815.2051
937
+ 7600 train 6.616943 (lr=1.1337e-05) (hash(x)=49785056)
938
+ 7800 val loss 6.5586
939
+ 7800 val perplexity 705.2940
940
+ 7800 train 6.375649 (lr=6.2205e-06) (hash(x)=48049749)
941
+ 7700 val loss 6.4153
942
+ 7700 val perplexity 611.1429
943
+ 7700 train 6.237316 (lr=1.5181e-05) (hash(x)=53232030)
944
+ 8000 val loss 6.7368
945
+ 8000 val perplexity 842.8249
946
+ 8000 train 6.609796 (lr=3.7873e-06) (hash(x)=46228039)
947
+ 7700 val loss 6.7032
948
+ 7700 val perplexity 814.9871
949
+ 7700 train 6.551203 (lr=1.0844e-05) (hash(x)=53232030)
950
+ 7900 val loss 6.5509
951
+ 7900 val perplexity 699.8884
952
+ 7900 train 6.348945 (lr=5.9453e-06) (hash(x)=44768513)
953
+ 7800 val loss 6.4185
954
+ 7800 val perplexity 613.0842
955
+ 7800 train 6.233474 (lr=1.4514e-05) (hash(x)=48049749)
956
+ 8100 val loss 6.7341
957
+ 8100 val perplexity 840.5965
958
+ 8100 train 7.155693 (lr=3.6184e-06) (hash(x)=60017091)
959
+ 7800 val loss 6.6980
960
+ 7800 val perplexity 810.7551
961
+ 7800 train 6.527575 (lr=1.0367e-05) (hash(x)=48049749)
962
+ 8000 val loss 6.5460
963
+ 8000 val perplexity 696.4471
964
+ 8000 train 6.408615 (lr=5.6809e-06) (hash(x)=46228039)
965
+ 7900 val loss 6.4048
966
+ 7900 val perplexity 604.7163
967
+ 7900 train 6.222796 (lr=1.3872e-05) (hash(x)=44768513)
968
+ 8200 val loss 6.7300
969
+ 8200 val perplexity 837.1589
970
+ 8200 train 6.721200 (lr=3.4572e-06) (hash(x)=49910198)
971
+ 7900 val loss 6.6898
972
+ 7900 val perplexity 804.1443
973
+ 7900 train 6.489498 (lr=9.9088e-06) (hash(x)=44768513)
974
+ 8100 val loss 6.5439
975
+ 8100 val perplexity 695.0140
976
+ 8100 train 6.955059 (lr=5.4277e-06) (hash(x)=60017091)
977
+ 8300 val loss 6.7268
978
+ 8300 val perplexity 834.4623
979
+ 8300 train 7.089801 (lr=3.3037e-06) (hash(x)=57919055)
980
+ 8000 val loss 6.4018
981
+ 8000 val perplexity 602.9116
982
+ 8000 train 6.253847 (lr=1.3255e-05) (hash(x)=46228039)
983
+ 8000 val loss 6.6854
984
+ 8000 val perplexity 800.5988
985
+ 8000 train 6.562236 (lr=9.4682e-06) (hash(x)=46228039)
986
+ 8200 val loss 6.5399
987
+ 8200 val perplexity 692.2427
988
+ 8200 train 6.549590 (lr=5.1858e-06) (hash(x)=49910198)
989
+ 8400 val loss 6.7268
990
+ 8400 val perplexity 834.4695
991
+ 8400 train 6.833617 (lr=3.1581e-06) (hash(x)=49694964)
992
+ 8100 val loss 6.3971
993
+ 8100 val perplexity 600.1071
994
+ 8100 train 6.769569 (lr=1.2665e-05) (hash(x)=60017091)
995
+ 8100 val loss 6.6832
996
+ 8100 val perplexity 798.8370
997
+ 8100 train 7.114765 (lr=9.0461e-06) (hash(x)=60017091)
998
+ 8300 val loss 6.5345
999
+ 8300 val perplexity 688.5184
1000
+ 8300 train 6.907079 (lr=4.9556e-06) (hash(x)=57919055)
1001
+ 8500 val loss 6.7234
1002
+ 8500 val perplexity 831.6531
1003
+ 8500 train 6.736595 (lr=3.0206e-06) (hash(x)=53762585)
1004
+ 8200 val loss 6.3928
1005
+ 8200 val perplexity 597.5019
1006
+ 8200 train 6.411548 (lr=1.2100e-05) (hash(x)=49910198)
1007
+ 8200 val loss 6.6775
1008
+ 8200 val perplexity 794.3458
1009
+ 8200 train 6.668144 (lr=8.6430e-06) (hash(x)=49910198)
1010
+ 8400 val loss 6.5352
1011
+ 8400 val perplexity 688.9457
1012
+ 8400 train 6.652549 (lr=4.7372e-06) (hash(x)=49694964)
1013
+ 8600 val loss 6.7207
1014
+ 8600 val perplexity 829.4001
1015
+ 8600 train 6.771939 (lr=2.8913e-06) (hash(x)=51166973)
1016
+ 8300 val loss 6.3882
1017
+ 8300 val perplexity 594.7954
1018
+ 8300 train 6.720931 (lr=1.1563e-05) (hash(x)=57919055)
1019
+ 8300 val loss 6.6720
1020
+ 8300 val perplexity 789.9393
1021
+ 8300 train 7.051010 (lr=8.2593e-06) (hash(x)=57919055)
1022
+ 8500 val loss 6.5307
1023
+ 8500 val perplexity 685.8688
1024
+ 8500 train 6.550485 (lr=4.5309e-06) (hash(x)=53762585)
1025
+ 8700 val loss 6.7181
1026
+ 8700 val perplexity 827.2309
1027
+ 8700 train 6.789652 (lr=2.7703e-06) (hash(x)=53968049)
1028
+ 8400 val loss 6.3887
1029
+ 8400 val perplexity 595.0958
1030
+ 8400 train 6.482622 (lr=1.1053e-05) (hash(x)=49694964)
1031
+ 8400 val loss 6.6700
1032
+ 8400 val perplexity 788.4261
1033
+ 8400 train 6.787225 (lr=7.8953e-06) (hash(x)=49694964)
1034
+ 8600 val loss 6.5288
1035
+ 8600 val perplexity 684.6053
1036
+ 8600 train 6.580200 (lr=4.3369e-06) (hash(x)=51166973)
1037
+ 8800 val loss 6.7160
1038
+ 8800 val perplexity 825.5129
1039
+ 8800 train 6.815679 (lr=2.6577e-06) (hash(x)=59231056)
1040
+ 8500 val loss 6.3836
1041
+ 8500 val perplexity 592.0306
1042
+ 8500 train 6.408396 (lr=1.0572e-05) (hash(x)=53762585)
1043
+ 8500 val loss 6.6665
1044
+ 8500 val perplexity 785.6351
1045
+ 8500 train 6.681557 (lr=7.5515e-06) (hash(x)=53762585)
1046
+ 8700 val loss 6.5256
1047
+ 8700 val perplexity 682.3744
1048
+ 8900 val loss 6.7144
1049
+ 8900 val perplexity 824.1870
1050
+ 8700 train 6.605652 (lr=4.1554e-06) (hash(x)=53968049)
1051
+ 8900 train 6.623099 (lr=2.5538e-06) (hash(x)=50488048)
1052
+ 8600 val loss 6.3830
1053
+ 8600 val perplexity 591.6888
1054
+ 8600 train 6.420757 (lr=1.0119e-05) (hash(x)=51166973)
1055
+ 8600 val loss 6.6634
1056
+ 8600 val perplexity 783.2065
1057
+ 8600 train 6.713762 (lr=7.2282e-06) (hash(x)=51166973)
1058
+ 9000 val loss 6.7123
1059
+ 9000 val perplexity 822.4427
1060
+ 9000 train 6.414406 (lr=2.4585e-06) (hash(x)=44492956)
1061
+ 8800 val loss 6.5224
1062
+ 8800 val perplexity 680.2186
1063
+ 8800 train 6.619614 (lr=3.9866e-06) (hash(x)=59231056)
1064
+ 8700 val loss 6.3795
1065
+ 8700 val perplexity 589.6207
1066
+ 8700 train 6.453899 (lr=9.6960e-06) (hash(x)=53968049)
1067
+ 8700 val loss 6.6606
1068
+ 8700 val perplexity 781.0009
1069
+ 8700 train 6.732889 (lr=6.9257e-06) (hash(x)=53968049)
1070
+ 9100 val loss 6.7135
1071
+ 9100 val perplexity 823.4442
1072
+ 9100 train 6.769123 (lr=2.3720e-06) (hash(x)=51134989)
1073
+ 8900 val loss 6.5213
1074
+ 8900 val perplexity 679.4908
1075
+ 8900 train 6.445029 (lr=3.8307e-06) (hash(x)=50488048)
1076
+ 8800 val loss 6.3741
1077
+ 8800 val perplexity 586.4824
1078
+ 8800 train 6.473434 (lr=9.3021e-06) (hash(x)=59231056)
1079
+ 8800 val loss 6.6562
1080
+ 8800 val perplexity 777.5667
1081
+ 8800 train 6.754422 (lr=6.6444e-06) (hash(x)=59231056)
1082
+ 9200 val loss 6.7107
1083
+ 9200 val perplexity 821.1844
1084
+ 9200 train 6.514919 (lr=2.2943e-06) (hash(x)=48636056)
1085
+ 9000 val loss 6.5202
1086
+ 9000 val perplexity 678.7302
1087
+ 9000 train 6.222732 (lr=3.6877e-06) (hash(x)=44492956)
1088
+ 8900 val loss 6.3754
1089
+ 8900 val perplexity 587.2310
1090
+ 8900 train 6.260254 (lr=8.9382e-06) (hash(x)=50488048)
1091
+ 9300 val loss 6.7099
1092
+ 9300 val perplexity 820.5175
1093
+ 9300 train 6.680036 (lr=2.2256e-06) (hash(x)=50200551)
1094
+ 8900 val loss 6.6543
1095
+ 8900 val perplexity 776.0824
1096
+ 8900 train 6.573340 (lr=6.3845e-06) (hash(x)=50488048)
1097
+ 9100 val loss 6.5226
1098
+ 9100 val perplexity 680.3639
1099
+ 9100 train 6.545186 (lr=3.5580e-06) (hash(x)=51134989)
1100
+ 9000 val loss 6.3725
1101
+ 9000 val perplexity 585.4980
1102
+ 9000 train 6.081764 (lr=8.6047e-06) (hash(x)=44492956)
1103
+ 9400 val loss 6.7092
1104
+ 9400 val perplexity 819.8851
1105
+ 9400 train 6.530241 (lr=2.1660e-06) (hash(x)=48057228)
1106
+ 9000 val loss 6.6524
1107
+ 9000 val perplexity 774.6767
1108
+ 9000 train 6.361135 (lr=6.1462e-06) (hash(x)=44492956)
1109
+ 9200 val loss 6.5177
1110
+ 9200 val perplexity 676.9999
1111
+ 9200 train 6.318551 (lr=3.4415e-06) (hash(x)=48636056)
1112
+ 9100 val loss 6.3738
1113
+ 9100 val perplexity 586.2674
1114
+ 9100 train 6.352215 (lr=8.3020e-06) (hash(x)=51134989)
1115
+ 9500 val loss 6.7056
1116
+ 9500 val perplexity 816.9913
1117
+ 9500 train 6.484961 (lr=2.1154e-06) (hash(x)=48125171)
1118
+ 9100 val loss 6.6548
1119
+ 9100 val perplexity 776.4929
1120
+ 9100 train 6.679288 (lr=5.9300e-06) (hash(x)=51134989)
1121
+ 9300 val loss 6.5164
1122
+ 9300 val perplexity 676.1420
1123
+ 9300 train 6.469476 (lr=3.3385e-06) (hash(x)=50200551)
1124
+ 9200 val loss 6.3685
1125
+ 9200 val perplexity 583.1851
1126
+ 9200 train 6.174442 (lr=8.0302e-06) (hash(x)=48636056)
1127
+ 9600 val loss 6.7056
1128
+ 9600 val perplexity 816.9484
1129
+ 9600 train 6.674135 (lr=2.0739e-06) (hash(x)=53375853)
1130
+ 9200 val loss 6.6493
1131
+ 9200 val perplexity 772.2683
1132
+ 9200 train 6.453731 (lr=5.7359e-06) (hash(x)=48636056)
1133
+ 9400 val loss 6.5164
1134
+ 9400 val perplexity 676.1276
1135
+ 9400 train 6.336080 (lr=3.2490e-06) (hash(x)=48057228)
1136
+ 9300 val loss 6.3682
1137
+ 9300 val perplexity 583.0174
1138
+ 9300 train 6.318146 (lr=7.7898e-06) (hash(x)=50200551)
1139
+ 9700 val loss 6.7019
1140
+ 9700 val perplexity 813.9532
1141
+ 9700 train 7.475645 (lr=2.0416e-06) (hash(x)=53924631)
1142
+ 9300 val loss 6.6490
1143
+ 9300 val perplexity 772.0294
1144
+ 9300 train 6.610506 (lr=5.5641e-06) (hash(x)=50200551)
1145
+ 9500 val loss 6.5129
1146
+ 9500 val perplexity 673.7589
1147
+ 9500 train 6.297124 (lr=3.1730e-06) (hash(x)=48125171)
1148
+ 9400 val loss 6.3696
1149
+ 9400 val perplexity 583.8489
1150
+ 9400 train 6.185252 (lr=7.5809e-06) (hash(x)=48057228)
1151
+ 9800 val loss 6.6990
1152
+ 9800 val perplexity 811.5999
1153
+ 9800 train 6.732530 (lr=2.0185e-06) (hash(x)=48895047)
1154
+ 9400 val loss 6.6472
1155
+ 9400 val perplexity 770.5902
1156
+ 9400 train 6.478499 (lr=5.4149e-06) (hash(x)=48057228)
1157
+ 9600 val loss 6.5126
1158
+ 9600 val perplexity 673.5715
1159
+ 9600 train 6.459769 (lr=3.1108e-06) (hash(x)=53375853)
1160
+ 9500 val loss 6.3674
1161
+ 9500 val perplexity 582.5411
1162
+ 9500 train 6.146292 (lr=7.4038e-06) (hash(x)=48125171)
1163
+ 9900 val loss 6.6963
1164
+ 9900 val perplexity 809.4128
1165
+ 9900 train 6.532602 (lr=2.0046e-06) (hash(x)=44269923)
1166
+ 9500 val loss 6.6440
1167
+ 9500 val perplexity 768.1942
1168
+ 9500 train 6.421329 (lr=5.2884e-06) (hash(x)=48125171)
1169
+ 9700 val loss 6.5079
1170
+ 9700 val perplexity 670.4332
1171
+ 9700 train 7.313867 (lr=3.0624e-06) (hash(x)=53924631)
1172
+ 9600 val loss 6.3645
1173
+ 9600 val perplexity 580.8505
1174
+ 9600 train 6.295037 (lr=7.2586e-06) (hash(x)=53375853)
1175
+ 9999 val loss 6.6978
1176
+ 9999 val perplexity 810.6125
1177
+ 9600 val loss 6.6432
1178
+ 9600 val perplexity 767.5720
1179
+ 9600 train 6.602069 (lr=5.1847e-06) (hash(x)=53375853)
1180
+ 9800 val loss 6.5053
1181
+ 9800 val perplexity 668.6689
1182
+ 9800 train 6.551248 (lr=3.0277e-06) (hash(x)=48895047)
1183
+ 9700 val loss 6.3601
1184
+ 9700 val perplexity 578.3121
1185
+ 9700 train 7.180431 (lr=7.1456e-06) (hash(x)=53924631)
1186
+ 9700 val loss 6.6379
1187
+ 9700 val perplexity 763.5204
1188
+ 9700 train 7.429144 (lr=5.1040e-06) (hash(x)=53924631)
1189
+ 9900 val loss 6.5029
1190
+ 9900 val perplexity 667.0677
1191
+ 9900 train 6.344713 (lr=3.0069e-06) (hash(x)=44269923)
1192
+ 9800 val loss 6.3598
1193
+ 9800 val perplexity 578.1113
1194
+ 9800 train 6.405945 (lr=7.0647e-06) (hash(x)=48895047)
1195
+ 9800 val loss 6.6355
1196
+ 9800 val perplexity 761.6445
1197
+ 9800 train 6.674919 (lr=5.0462e-06) (hash(x)=48895047)
1198
+ 9999 val loss 6.5035
1199
+ 9999 val perplexity 667.4902
1200
+ 9900 val loss 6.6316
1201
+ 9900 val perplexity 758.6907
1202
+ 9900 train 6.478441 (lr=5.0116e-06) (hash(x)=44269923)
1203
+ 9900 val loss 6.3549
1204
+ 9900 val perplexity 575.2765
1205
+ 9900 train 6.186326 (lr=7.0162e-06) (hash(x)=44269923)
1206
+ 9999 val loss 6.6315
1207
+ 9999 val perplexity 758.6360
1208
+ 9999 val loss 6.3557
1209
+ 9999 val perplexity 575.7891
attention_kindselective_n_heads8_seed1341/model_02500.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e860e923a556180e7e2dade3cce2e338a0c6e4ced0183df1bef4b801ac4ed341
3
+ size 257976706
attention_kindselective_n_heads8_seed1341/model_05000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f185c3159438e039d4fc515e37f75f445947587e36943760fb5202a8375b6af4
3
+ size 257976706
attention_kindselective_n_heads8_seed1341/model_07500.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee0c71f6868299f0a2ff54ed1ec8d3aeaf7c4572b36fbacc656c3c31f9cbf2a1
3
+ size 257976706
attention_kindselective_n_heads8_seed1341/model_09999.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef70a6883ed7ace4af08d40c7efad1fdddd7b9a3923d2826bd87c81631ed7525
3
+ size 257976706
attention_kindselective_n_heads8_seed1341/optimizer_02500.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bd67e3d6826de3f55ad5d5f1c04b26f72da5c15b4892f1263072c7dfd955a14
3
+ size 509672838
attention_kindselective_n_heads8_seed1341/optimizer_05000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba33e8595171564264bbf96f4a16fc0a8b17e06aca96550034da8efea6848d93
3
+ size 509672838
attention_kindselective_n_heads8_seed1341/optimizer_07500.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a08c66319f568af505a5def305e7297e4f583c6ba4f94fce35a6492eaaf6622d
3
+ size 509672838
attention_kindselective_n_heads8_seed1341/optimizer_09999.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0de2886281b4cfd84dfa959b6a06d6011db4a724bda0944d7119a668fadebcb4
3
+ size 509672838