andrew-healey committed
Commit f61ac5d · verified · 1 Parent(s): 35c2929

Upload folder using huggingface_hub

attention_kindselective_n_heads2_seed1338/args.json CHANGED
@@ -1 +1 @@
- {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_4/attention_kindselective_n_heads2_seed1338", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 2, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_4", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1338, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 30720, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 3e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "3e-5_30720_2_1338", "n_embd": 128}
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_6/attention_kindselective_n_heads2_seed1338", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 2, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_6", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1338, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "5e-5_10240_2_1338", "n_embd": 128}
attention_kindselective_n_heads2_seed1338/dataloader_02500.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:db41c5e5513137877487a93451adf8ec4ed2448ab6e9471ebd5595c8e3293875
+ oid sha256:8b2ea67f78ff5a7970d0db044ff7ee527b3dc065f295fd30f588df4b44b568d0
  size 964
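
The .pt files are tracked with Git LFS, so this hunk (and every model_*.pt and optimizer_*.pt hunk below) only touches the three-line pointer file: the spec version, the sha256 oid of the blob, and its size in bytes. Only the oid changes here; the sizes are identical, so the regenerated dataloader states have the same byte length but different contents. A minimal sketch of parsing such a pointer, following the key-value format defined by the spec URL shown above:

```python
def parse_lfs_pointer(text: str) -> dict:
    """Parse a Git LFS spec-v1 pointer file.

    A pointer is a short text file of 'key value' lines, e.g.:
        version https://git-lfs.github.com/spec/v1
        oid sha256:8b2ea67f78ff5a7970d0db044ff7ee527b3dc065f295fd30f588df4b44b568d0
        size 964
    """
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    algo, digest = fields["oid"].split(":", 1)
    return {"version": fields["version"],
            "hash_algo": algo,   # sha256 in spec v1
            "oid": digest,       # hash of the actual blob content
            "size": int(fields["size"])}

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:8b2ea67f78ff5a7970d0db044ff7ee527b3dc065f295fd30f588df4b44b568d0
size 964"""
assert parse_lfs_pointer(pointer)["size"] == 964
```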
attention_kindselective_n_heads2_seed1338/dataloader_05000.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f6df8947c6ab773db1947914387d3db345a84828521d3a64bae9b652e1b0a410
+ oid sha256:7f03ed2ebf741f15e13c79e6cc1e9a19b308450d81cc3b4d8d0338c63d77ca59
  size 964
attention_kindselective_n_heads2_seed1338/dataloader_07500.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:169891a726a7ff746d1a7aa99f459a66d85ceb4e9f2583f790f5b8501f97b6af
+ oid sha256:82590037fb2eecbec961f7967a8dd1b8d85515d31a252f66b92b8139858a8b7c
  size 964
attention_kindselective_n_heads2_seed1338/dataloader_09999.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e685a568a36c792ccbe7b5fcae0b9d630955e589991190bd8902836cea6a91df
+ oid sha256:c544303717d12355a69b8ffb1eb109434e4fdccfd5a61279b6e8ba2e870d6700
  size 964
attention_kindselective_n_heads2_seed1338/log2.txt CHANGED
@@ -1,303 +1,606 @@
  max_steps: 10000
- 0 val loss 11.7630
- 0 val perplexity 128408.9922
- 0 train 11.769258 (lr=1.5000e-07) (hash(x)=150327452)
- 100 val loss 10.4010
- 100 val perplexity 32894.0078
- 100 train 10.388559 (lr=1.5150e-05) (hash(x)=166441190)
- 200 val loss 9.3220
- 200 val perplexity 11180.8262
- 200 train 9.420354 (lr=3.0000e-05) (hash(x)=166780046)
- 300 val loss 8.1198
- 300 val perplexity 3360.3025
- 300 train 8.114033 (lr=2.9993e-05) (hash(x)=159835303)
- 400 val loss 7.7643
- 400 val perplexity 2355.1169
- 400 train 7.589100 (lr=2.9972e-05) (hash(x)=155040610)
- 500 val loss 7.6660
- 500 val perplexity 2134.4812
- 500 train 7.423435 (lr=2.9938e-05) (hash(x)=130190460)
- 600 val loss 7.6109
- 600 val perplexity 2020.0375
- 600 train 7.641937 (lr=2.9889e-05) (hash(x)=155504036)
- 700 val loss 7.5711
- 700 val perplexity 1941.2490
- 700 train 7.414425 (lr=2.9827e-05) (hash(x)=137347213)
- 800 val loss 7.5448
- 800 val perplexity 1890.9220
- 800 train 7.399529 (lr=2.9751e-05) (hash(x)=143823248)
- 900 val loss 7.5278
- 900 val perplexity 1859.0282
- 900 train 7.648718 (lr=2.9662e-05) (hash(x)=156260416)
- 1000 val loss 7.5065
- 1000 val perplexity 1819.8811
- 1000 train 7.474764 (lr=2.9558e-05) (hash(x)=143734685)
- 1100 val loss 7.4984
- 1100 val perplexity 1805.1945
- 1100 train 7.444295 (lr=2.9442e-05) (hash(x)=160013925)
- 1200 val loss 7.4866
- 1200 val perplexity 1784.0110
- 1200 train 7.261100 (lr=2.9312e-05) (hash(x)=150678249)
- 1300 val loss 7.4759
- 1300 val perplexity 1764.9208
- 1300 train 7.281197 (lr=2.9169e-05) (hash(x)=149073315)
- 1400 val loss 7.4569
- 1400 val perplexity 1731.7219
- 1400 train 7.886369 (lr=2.9013e-05) (hash(x)=175802021)
- 1500 val loss 7.4343
- 1500 val perplexity 1693.1479
- 1500 train 7.685257 (lr=2.8845e-05) (hash(x)=171034639)
- 1600 val loss 7.4127
- 1600 val perplexity 1656.9448
- 1600 train 7.640902 (lr=2.8663e-05) (hash(x)=158681215)
- 1700 val loss 7.3985
- 1700 val perplexity 1633.6101
- 1700 train 7.341957 (lr=2.8469e-05) (hash(x)=152116061)
- 1800 val loss 7.3886
- 1800 val perplexity 1617.4152
- 1800 train 7.323990 (lr=2.8263e-05) (hash(x)=146108145)
- 1900 val loss 7.3732
- 1900 val perplexity 1592.7101
- 1900 train 7.219169 (lr=2.8044e-05) (hash(x)=147598108)
- 2000 val loss 7.3618
- 2000 val perplexity 1574.7190
- 2000 train 7.232282 (lr=2.7814e-05) (hash(x)=154996086)
- 2100 val loss 7.3471
- 2100 val perplexity 1551.6771
- 2100 train 7.141953 (lr=2.7572e-05) (hash(x)=153396183)
- 2200 val loss 7.3012
- 2200 val perplexity 1482.1154
- 2200 train 7.291854 (lr=2.7319e-05) (hash(x)=153885445)
- 2300 val loss 7.2835
- 2300 val perplexity 1456.0382
- 2300 train 7.268628 (lr=2.7055e-05) (hash(x)=159666385)
- 2400 val loss 7.2455
- 2400 val perplexity 1401.8269
- 2400 train 7.193364 (lr=2.6780e-05) (hash(x)=142353087)
- 2500 val loss 7.2123
- 2500 val perplexity 1355.9503
- 2500 train 7.208140 (lr=2.6494e-05) (hash(x)=146491718)
- 2600 val loss 7.1809
- 2600 val perplexity 1314.0718
- 2600 train 7.118464 (lr=2.6198e-05) (hash(x)=150750353)
- 2700 val loss 7.1504
- 2700 val perplexity 1274.6262
- 2700 train 6.869782 (lr=2.5892e-05) (hash(x)=129849193)
- 2800 val loss 7.1357
- 2800 val perplexity 1256.0383
- 2800 train 6.996274 (lr=2.5576e-05) (hash(x)=152767913)
- 2900 val loss 7.1052
- 2900 val perplexity 1218.3030
- 2900 train 6.898202 (lr=2.5251e-05) (hash(x)=146531140)
- 3000 val loss 7.0670
- 3000 val perplexity 1172.6589
- 3000 train 7.135310 (lr=2.4917e-05) (hash(x)=151562048)
- 3100 val loss 7.0301
- 3100 val perplexity 1130.1343
- 3100 train 7.015220 (lr=2.4574e-05) (hash(x)=146001424)
- 3200 val loss 7.0035
- 3200 val perplexity 1100.5248
- 3200 train 7.012419 (lr=2.4224e-05) (hash(x)=166486165)
- 3300 val loss 6.9697
- 3300 val perplexity 1063.8522
- 3300 train 6.894574 (lr=2.3865e-05) (hash(x)=150866680)
- 3400 val loss 6.9519
- 3400 val perplexity 1045.1385
- 3400 train 6.863424 (lr=2.3498e-05) (hash(x)=143900419)
- 3500 val loss 6.9233
- 3500 val perplexity 1015.6485
- 3500 train 6.757936 (lr=2.3125e-05) (hash(x)=148845794)
- 3600 val loss 6.9006
- 3600 val perplexity 992.9057
- 3600 train 6.751880 (lr=2.2744e-05) (hash(x)=145667796)
- 3700 val loss 6.8779
- 3700 val perplexity 970.6259
- 3700 train 6.897323 (lr=2.2357e-05) (hash(x)=163563851)
- 3800 val loss 6.8582
- 3800 val perplexity 951.6208
- 3800 train 6.815534 (lr=2.1965e-05) (hash(x)=147488689)
- 3900 val loss 6.8149
- 3900 val perplexity 911.3462
- 3900 train 6.802123 (lr=2.1566e-05) (hash(x)=148186608)
- 4000 val loss 6.7898
- 4000 val perplexity 888.6940
- 4000 train 6.638687 (lr=2.1162e-05) (hash(x)=142970187)
- 4100 val loss 6.7610
- 4100 val perplexity 863.5334
- 4100 train 6.777088 (lr=2.0754e-05) (hash(x)=141584883)
- 4200 val loss 6.7359
- 4200 val perplexity 842.1255
- 4200 train 6.537104 (lr=2.0341e-05) (hash(x)=145664585)
- 4300 val loss 6.7157
- 4300 val perplexity 825.2897
- 4300 train 6.550407 (lr=1.9924e-05) (hash(x)=143736499)
- 4400 val loss 6.7043
- 4400 val perplexity 815.9366
- 4400 train 6.524540 (lr=1.9504e-05) (hash(x)=151883322)
- 4500 val loss 6.6853
- 4500 val perplexity 800.5656
- 4500 train 6.591630 (lr=1.9081e-05) (hash(x)=153904871)
- 4600 val loss 6.6625
- 4600 val perplexity 782.4913
- 4600 train 6.722117 (lr=1.8655e-05) (hash(x)=154893521)
- 4700 val loss 6.6334
- 4700 val perplexity 760.0438
- 4700 train 6.709569 (lr=1.8226e-05) (hash(x)=152323949)
- 4800 val loss 6.6148
- 4800 val perplexity 746.0839
- 4800 train 6.544276 (lr=1.7796e-05) (hash(x)=154104619)
- 4900 val loss 6.5995
- 4900 val perplexity 734.7310
- 4900 train 6.678655 (lr=1.7365e-05) (hash(x)=146311426)
- 5000 val loss 6.5837
- 5000 val perplexity 723.2397
- 5000 train 6.608577 (lr=1.6933e-05) (hash(x)=156741847)
- 5100 val loss 6.5716
- 5100 val perplexity 714.5206
- 5100 train 6.365155 (lr=1.6500e-05) (hash(x)=142086346)
- 5200 val loss 6.5636
- 5200 val perplexity 708.8334
- 5200 train 6.348958 (lr=1.6067e-05) (hash(x)=150265428)
- 5300 val loss 6.5557
- 5300 val perplexity 703.2093
- 5300 train 6.452664 (lr=1.5635e-05) (hash(x)=151339108)
- 5400 val loss 6.5387
- 5400 val perplexity 691.3912
- 5400 train 6.534310 (lr=1.5204e-05) (hash(x)=154654372)
- 5500 val loss 6.5184
- 5500 val perplexity 677.4675
- 5500 train 6.529605 (lr=1.4774e-05) (hash(x)=150575051)
- 5600 val loss 6.5039
- 5600 val perplexity 667.7398
- 5600 train 6.391018 (lr=1.4345e-05) (hash(x)=140396423)
- 5700 val loss 6.4901
- 5700 val perplexity 658.5672
- 5700 train 6.374229 (lr=1.3919e-05) (hash(x)=144678758)
- 5800 val loss 6.4787
- 5800 val perplexity 651.1094
- 5800 train 6.583964 (lr=1.3496e-05) (hash(x)=151992743)
- 5900 val loss 6.4715
- 5900 val perplexity 646.4711
- 5900 train 6.251740 (lr=1.3076e-05) (hash(x)=144396927)
- 6000 val loss 6.4619
- 6000 val perplexity 640.2845
- 6000 train 6.434009 (lr=1.2659e-05) (hash(x)=165478625)
- 6100 val loss 6.4656
- 6100 val perplexity 642.6783
- 6100 train 6.216571 (lr=1.2246e-05) (hash(x)=147088621)
- 6200 val loss 6.4535
- 6200 val perplexity 634.8896
- 6200 train 6.240591 (lr=1.1838e-05) (hash(x)=140794994)
- 6300 val loss 6.4400
- 6300 val perplexity 626.4254
- 6300 train 6.364601 (lr=1.1434e-05) (hash(x)=134780906)
- 6400 val loss 6.4261
- 6400 val perplexity 617.7345
- 6400 train 6.413403 (lr=1.1035e-05) (hash(x)=149023655)
- 6500 val loss 6.4153
- 6500 val perplexity 611.1179
- 6500 train 6.281328 (lr=1.0643e-05) (hash(x)=147497796)
- 6600 val loss 6.4143
- 6600 val perplexity 610.5001
- 6600 train 6.261950 (lr=1.0256e-05) (hash(x)=152902689)
- 6700 val loss 6.4073
- 6700 val perplexity 606.2618
- 6700 train 6.460744 (lr=9.8753e-06) (hash(x)=153846046)
- 6800 val loss 6.3954
- 6800 val perplexity 599.0741
- 6800 train 6.533175 (lr=9.5017e-06) (hash(x)=158512738)
- 6900 val loss 6.3901
- 6900 val perplexity 595.9360
- 6900 train 6.872580 (lr=9.1353e-06) (hash(x)=156849968)
- 7000 val loss 6.3825
- 7000 val perplexity 591.4265
- 7000 train 6.211296 (lr=8.7764e-06) (hash(x)=142395855)
- 7100 val loss 6.3768
- 7100 val perplexity 588.0710
- 7100 train 6.247732 (lr=8.4255e-06) (hash(x)=147114884)
- 7200 val loss 6.3739
- 7200 val perplexity 586.3677
- 7200 train 6.394201 (lr=8.0829e-06) (hash(x)=156979839)
- 7300 val loss 6.3695
- 7300 val perplexity 583.7812
- 7300 train 6.134778 (lr=7.7489e-06) (hash(x)=145584373)
- 7400 val loss 6.3681
- 7400 val perplexity 582.9282
- 7400 train 6.108246 (lr=7.4239e-06) (hash(x)=141508204)
- 7500 val loss 6.3592
- 7500 val perplexity 577.7770
- 7500 train 6.410224 (lr=7.1083e-06) (hash(x)=148803965)
- 7600 val loss 6.3525
- 7600 val perplexity 573.9427
- 7600 train 6.412134 (lr=6.8023e-06) (hash(x)=151019676)
- 7700 val loss 6.3469
- 7700 val perplexity 570.7330
- 7700 train 6.363933 (lr=6.5062e-06) (hash(x)=143155750)
- 7800 val loss 6.3412
- 7800 val perplexity 567.4922
- 7800 train 6.365192 (lr=6.2205e-06) (hash(x)=152569653)
- 7900 val loss 6.3391
- 7900 val perplexity 566.2675
- 7900 train 6.198283 (lr=5.9453e-06) (hash(x)=143519455)
- 8000 val loss 6.3370
- 8000 val perplexity 565.0965
- 8000 train 6.521562 (lr=5.6809e-06) (hash(x)=161180944)
- 8100 val loss 6.3356
- 8100 val perplexity 564.3146
- 8100 train 6.367569 (lr=5.4277e-06) (hash(x)=154107345)
- 8200 val loss 6.3359
- 8200 val perplexity 564.4596
- 8200 train 6.350157 (lr=5.1858e-06) (hash(x)=152486517)
- 8300 val loss 6.3262
- 8300 val perplexity 559.0314
- 8300 train 6.355000 (lr=4.9556e-06) (hash(x)=156167749)
- 8400 val loss 6.3224
- 8400 val perplexity 556.9216
- 8400 train 6.399919 (lr=4.7372e-06) (hash(x)=149155006)
- 8500 val loss 6.3170
- 8500 val perplexity 553.9238
- 8500 train 6.291015 (lr=4.5309e-06) (hash(x)=147844390)
- 8600 val loss 6.3133
- 8600 val perplexity 551.8853
- 8600 train 6.573327 (lr=4.3369e-06) (hash(x)=165753320)
- 8700 val loss 6.3135
- 8700 val perplexity 551.9460
- 8700 train 6.098110 (lr=4.1554e-06) (hash(x)=146079979)
- 8800 val loss 6.3118
- 8800 val perplexity 551.0446
- 8800 train 6.534912 (lr=3.9866e-06) (hash(x)=172259509)
- 8900 val loss 6.3120
- 8900 val perplexity 551.1379
- 8900 train 5.987774 (lr=3.8307e-06) (hash(x)=145148314)
- 9000 val loss 6.3112
- 9000 val perplexity 550.7257
- 9000 train 6.234556 (lr=3.6877e-06) (hash(x)=144250633)
- 9100 val loss 6.3057
- 9100 val perplexity 547.6868
- 9100 train 6.414195 (lr=3.5580e-06) (hash(x)=157219797)
- 9200 val loss 6.2999
- 9200 val perplexity 544.5214
- 9200 train 6.288376 (lr=3.4415e-06) (hash(x)=142743778)
- 9300 val loss 6.2994
- 9300 val perplexity 544.2646
- 9300 train 6.183533 (lr=3.3385e-06) (hash(x)=139669771)
- 9400 val loss 6.2957
- 9400 val perplexity 542.2340
- 9400 train 6.265487 (lr=3.2490e-06) (hash(x)=145916843)
- 9500 val loss 6.2935
- 9500 val perplexity 541.0405
- 9500 train 6.209871 (lr=3.1730e-06) (hash(x)=150196125)
- 9600 val loss 6.2931
- 9600 val perplexity 540.8432
- 9600 train 6.577899 (lr=3.1108e-06) (hash(x)=160041419)
- 9700 val loss 6.2934
- 9700 val perplexity 540.9856
- 9700 train 6.079773 (lr=3.0624e-06) (hash(x)=139931627)
- 9800 val loss 6.2910
- 9800 val perplexity 539.6734
- 9800 train 6.253589 (lr=3.0277e-06) (hash(x)=150370792)
- 9900 val loss 6.2893
- 9900 val perplexity 538.8027
- 9900 train 6.149877 (lr=3.0069e-06) (hash(x)=153014886)
- 9999 val loss 6.2893
- 9999 val perplexity 538.7606
+ max_steps: 10000
+ 0 val loss 11.7640
+ 0 val perplexity 128545.8516
+ 0 val loss 11.7640
+ 0 val perplexity 128545.8516
+ 0 train 11.762399 (lr=2.5000e-07) (hash(x)=50671684)
+ 0 train 11.762399 (lr=3.5000e-07) (hash(x)=50671684)
+ 100 val loss 10.2612
+ 100 val perplexity 28599.8301
+ 100 val loss 10.0855
+ 100 val perplexity 23992.4160
+ 100 train 10.199640 (lr=2.5250e-05) (hash(x)=52740221)
+ 100 train 10.027347 (lr=3.5350e-05) (hash(x)=52740221)
+ 200 val loss 9.0641
+ 200 val perplexity 8639.7002
+ 200 train 9.082980 (lr=5.0000e-05) (hash(x)=49034180)
+ 200 val loss 8.3482
+ 200 val perplexity 4222.5566
+ 200 train 8.351851 (lr=7.0000e-05) (hash(x)=49034180)
+ 300 val loss 8.1940
+ 300 val perplexity 3619.2537
+ 300 train 8.292101 (lr=4.9988e-05) (hash(x)=63180688)
+ 300 val loss 7.7296
+ 300 val perplexity 2274.6714
+ 300 train 7.949653 (lr=6.9984e-05) (hash(x)=63180688)
+ 400 val loss 7.8544
+ 400 val perplexity 2577.1091
+ 400 train 7.816474 (lr=4.9954e-05) (hash(x)=50373500)
+ 400 val loss 7.6567
+ 400 val perplexity 2114.7312
+ 400 train 7.666688 (lr=6.9935e-05) (hash(x)=50373500)
+ 500 val loss 7.6951
+ 500 val perplexity 2197.5679
+ 500 train 7.606697 (lr=4.9896e-05) (hash(x)=44547422)
+ 500 val loss 7.6381
+ 500 val perplexity 2075.7231
+ 500 train 7.539029 (lr=6.9854e-05) (hash(x)=44547422)
+ 600 val loss 7.6593
+ 600 val perplexity 2120.3745
+ 600 train 7.554819 (lr=4.9815e-05) (hash(x)=47184699)
+ 600 val loss 7.6413
+ 600 val perplexity 2082.3872
+ 600 train 7.530231 (lr=6.9741e-05) (hash(x)=47184699)
+ 700 val loss 7.6521
+ 700 val perplexity 2104.9644
+ 700 train 7.565280 (lr=4.9712e-05) (hash(x)=51374582)
+ 700 val loss 7.6389
+ 700 val perplexity 2077.5461
+ 700 train 7.553903 (lr=6.9596e-05) (hash(x)=51374582)
+ 800 val loss 7.6369
+ 800 val perplexity 2073.2136
+ 800 train 7.378779 (lr=4.9585e-05) (hash(x)=46264805)
+ 800 val loss 7.6290
+ 800 val perplexity 2057.0747
+ 800 train 7.379150 (lr=6.9419e-05) (hash(x)=46264805)
+ 900 val loss 7.6547
+ 900 val perplexity 2110.6108
+ 900 train 7.937618 (lr=4.9436e-05) (hash(x)=61178712)
+ 900 val loss 7.6323
+ 900 val perplexity 2063.8235
+ 900 train 7.921933 (lr=6.9210e-05) (hash(x)=61178712)
+ 1000 val loss 7.6362
+ 1000 val perplexity 2071.7578
+ 1000 train 7.647681 (lr=4.9264e-05) (hash(x)=50886520)
+ 1000 val loss 7.6263
+ 1000 val perplexity 2051.3552
+ 1000 train 7.646755 (lr=6.8970e-05) (hash(x)=50886520)
+ 1100 val loss 7.6266
+ 1100 val perplexity 2052.0225
+ 1100 train 7.411226 (lr=4.9070e-05) (hash(x)=48600099)
+ 1100 val loss 7.6231
+ 1100 val perplexity 2044.9672
+ 1100 train 7.445827 (lr=6.8698e-05) (hash(x)=48600099)
+ 1200 val loss 7.6176
+ 1200 val perplexity 2033.6812
+ 1200 train 7.354834 (lr=4.8854e-05) (hash(x)=50146792)
+ 1200 val loss 7.6272
+ 1200 val perplexity 2053.3645
+ 1200 train 7.383700 (lr=6.8395e-05) (hash(x)=50146792)
+ 1300 val loss 7.6195
+ 1300 val perplexity 2037.5619
+ 1300 train 7.491829 (lr=4.8616e-05) (hash(x)=52617313)
+ 1300 val loss 7.6346
+ 1300 val perplexity 2068.6394
+ 1300 train 7.519730 (lr=6.8062e-05) (hash(x)=52617313)
+ 1400 val loss 7.6085
+ 1400 val perplexity 2015.1982
+ 1400 train 7.284451 (lr=4.8356e-05) (hash(x)=49794446)
+ 1400 val loss 7.6309
+ 1400 val perplexity 2060.8892
+ 1400 train 7.316226 (lr=6.7698e-05) (hash(x)=49794446)
+ 1500 val loss 7.6101
+ 1500 val perplexity 2018.5585
+ 1500 train 7.262372 (lr=4.8074e-05) (hash(x)=50766317)
+ 1500 val loss 7.6201
+ 1500 val perplexity 2038.8419
+ 1500 train 7.261213 (lr=6.7304e-05) (hash(x)=50766317)
+ 1600 val loss 7.6007
+ 1600 val perplexity 1999.5605
+ 1600 train 7.493066 (lr=4.7772e-05) (hash(x)=55551175)
+ 1600 val loss 7.5985
+ 1600 val perplexity 1995.1804
+ 1600 train 7.499768 (lr=6.6881e-05) (hash(x)=55551175)
+ 1700 val loss 7.5795
+ 1700 val perplexity 1957.7035
+ 1700 train 7.733157 (lr=4.7448e-05) (hash(x)=56717172)
+ 1700 val loss 7.5674
+ 1700 val perplexity 1934.0377
+ 1700 train 7.691600 (lr=6.6428e-05) (hash(x)=56717172)
+ 1800 val loss 7.5356
+ 1800 val perplexity 1873.6436
+ 1800 train 7.837944 (lr=4.7105e-05) (hash(x)=55376447)
+ 1800 val loss 7.5258
+ 1800 val perplexity 1855.3097
+ 1800 train 7.796223 (lr=6.5947e-05) (hash(x)=55376447)
+ 1900 val loss 7.5072
+ 1900 val perplexity 1821.0981
+ 1900 train 7.223547 (lr=4.6741e-05) (hash(x)=43810837)
+ 1900 val loss 7.5078
+ 1900 val perplexity 1822.2770
+ 1900 train 7.237650 (lr=6.5437e-05) (hash(x)=43810837)
+ 2000 val loss 7.4870
+ 2000 val perplexity 1784.7079
+ 2000 train 7.566131 (lr=4.6357e-05) (hash(x)=50881655)
+ 2000 val loss 7.4835
+ 2000 val perplexity 1778.4316
+ 2000 train 7.579944 (lr=6.4900e-05) (hash(x)=50881655)
+ 2100 val loss 7.4819
+ 2100 val perplexity 1775.6853
+ 2100 train 7.358100 (lr=4.5954e-05) (hash(x)=49386015)
+ 2100 val loss 7.4574
+ 2100 val perplexity 1732.5850
+ 2100 train 7.333408 (lr=6.4335e-05) (hash(x)=49386015)
+ 2200 val loss 7.4734
+ 2200 val perplexity 1760.6473
+ 2200 train 7.392198 (lr=4.5532e-05) (hash(x)=48572079)
+ 2200 val loss 7.4651
+ 2200 val perplexity 1745.9648
+ 2200 train 7.405998 (lr=6.3745e-05) (hash(x)=48572079)
+ 2300 val loss 7.4649
+ 2300 val perplexity 1745.6702
+ 2300 train 7.528904 (lr=4.5091e-05) (hash(x)=54950719)
+ 2300 val loss 7.4181
+ 2300 val perplexity 1665.8230
+ 2300 train 7.485063 (lr=6.3128e-05) (hash(x)=54950719)
+ 2400 val loss 7.4481
+ 2400 val perplexity 1716.5986
+ 2400 train 7.104608 (lr=4.4633e-05) (hash(x)=42190240)
+ 2400 val loss 7.3888
+ 2400 val perplexity 1617.7377
+ 2400 train 7.051908 (lr=6.2486e-05) (hash(x)=42190240)
+ 2500 val loss 7.4384
+ 2500 val perplexity 1700.0844
+ 2500 train 7.503536 (lr=4.4156e-05) (hash(x)=45223539)
+ 2500 val loss 7.3688
+ 2500 val perplexity 1585.7074
+ 2500 train 7.440190 (lr=6.1819e-05) (hash(x)=45223539)
+ 2600 val loss 7.4214
+ 2600 val perplexity 1671.3839
+ 2600 train 7.425591 (lr=4.3663e-05) (hash(x)=54037353)
+ 2600 val loss 7.3434
+ 2600 val perplexity 1546.0243
+ 2600 train 7.361808 (lr=6.1128e-05) (hash(x)=54037353)
+ 2700 val loss 7.3990
+ 2700 val perplexity 1634.3760
+ 2700 train 7.943274 (lr=4.3153e-05) (hash(x)=59131616)
+ 2700 val loss 7.3256
+ 2700 val perplexity 1518.7589
+ 2700 train 7.771473 (lr=6.0414e-05) (hash(x)=59131616)
+ 2800 val loss 7.3895
+ 2800 val perplexity 1618.9755
+ 2800 train 7.214322 (lr=4.2627e-05) (hash(x)=45882743)
+ 2800 val loss 7.2809
+ 2800 val perplexity 1452.2246
+ 2800 train 7.131896 (lr=5.9677e-05) (hash(x)=45882743)
+ 2900 val loss 7.3756
+ 2900 val perplexity 1596.5599
+ 2900 train 6.986534 (lr=4.2085e-05) (hash(x)=43758910)
+ 3000 val loss 7.4025
+ 3000 val perplexity 1640.0563
+ 3000 train 7.337453 (lr=4.1529e-05) (hash(x)=47965974)
+ 2900 val loss 7.2484
+ 2900 val perplexity 1405.8976
+ 2900 train 6.918225 (lr=5.8919e-05) (hash(x)=43758910)
+ 3100 val loss 7.3828
+ 3100 val perplexity 1608.0157
+ 3100 train 7.282964 (lr=4.0957e-05) (hash(x)=48205243)
+ 3000 val loss 7.2421
+ 3000 val perplexity 1397.0543
+ 3000 train 7.161748 (lr=5.8140e-05) (hash(x)=47965974)
+ 3200 val loss 7.3542
+ 3200 val perplexity 1562.7103
+ 3200 train 7.431534 (lr=4.0373e-05) (hash(x)=54511383)
+ 3100 val loss 7.2156
+ 3100 val perplexity 1360.4982
+ 3100 train 7.109374 (lr=5.7340e-05) (hash(x)=48205243)
+ 3300 val loss 7.3452
+ 3300 val perplexity 1548.7839
+ 3300 train 7.324124 (lr=3.9775e-05) (hash(x)=54428388)
+ 3200 val loss 7.2021
+ 3200 val perplexity 1342.2188
+ 3200 train 7.273560 (lr=5.6522e-05) (hash(x)=54511383)
+ 3400 val loss 7.3255
+ 3400 val perplexity 1518.5460
+ 3400 train 7.373497 (lr=3.9164e-05) (hash(x)=48115990)
+ 3300 val loss 7.1854
+ 3300 val perplexity 1320.0015
+ 3300 train 7.137845 (lr=5.5684e-05) (hash(x)=54428388)
+ 3500 val loss 7.3067
+ 3500 val perplexity 1490.2999
+ 3500 train 6.951129 (lr=3.8541e-05) (hash(x)=41137345)
+ 3400 val loss 7.1642
+ 3400 val perplexity 1292.2944
+ 3400 train 7.198205 (lr=5.4829e-05) (hash(x)=48115990)
+ 3600 val loss 7.3099
+ 3600 val perplexity 1495.0908
+ 3600 train 7.210977 (lr=3.7907e-05) (hash(x)=55186224)
+ 3500 val loss 7.1480
+ 3500 val perplexity 1271.5587
+ 3500 train 6.770042 (lr=5.3958e-05) (hash(x)=41137345)
+ 3700 val loss 7.2953
+ 3700 val perplexity 1473.4081
+ 3700 train 7.162465 (lr=3.7262e-05) (hash(x)=54990049)
+ 3600 val loss 7.1600
+ 3600 val perplexity 1286.9684
+ 3600 train 7.056310 (lr=5.3070e-05) (hash(x)=55186224)
+ 3800 val loss 7.2900
+ 3800 val perplexity 1465.5238
+ 3800 train 7.036756 (lr=3.6608e-05) (hash(x)=46288812)
+ 3700 val loss 7.1365
+ 3700 val perplexity 1257.0605
+ 3700 train 7.001248 (lr=5.2167e-05) (hash(x)=54990049)
+ 3900 val loss 7.2680
+ 3900 val perplexity 1433.7078
+ 3900 train 6.930634 (lr=3.5944e-05) (hash(x)=45829773)
+ 3800 val loss 7.1138
+ 3800 val perplexity 1228.7883
+ 3800 train 6.855778 (lr=5.1251e-05) (hash(x)=46288812)
+ 4000 val loss 7.2640
+ 4000 val perplexity 1427.9847
+ 4000 train 7.045815 (lr=3.5271e-05) (hash(x)=52499943)
+ 3900 val loss 7.1105
+ 3900 val perplexity 1224.7800
+ 3900 train 6.770267 (lr=5.0321e-05) (hash(x)=45829773)
+ 4100 val loss 7.2214
+ 4100 val perplexity 1368.4528
+ 4100 train 7.119732 (lr=3.4590e-05) (hash(x)=48563796)
+ 4000 val loss 7.0887
+ 4000 val perplexity 1198.3544
+ 4000 train 6.878484 (lr=4.9379e-05) (hash(x)=52499943)
+ 4200 val loss 7.1987
+ 4200 val perplexity 1337.7422
+ 4200 train 7.161106 (lr=3.3902e-05) (hash(x)=49165143)
+ 4100 val loss 7.0505
+ 4100 val perplexity 1153.4874
+ 4100 train 6.955088 (lr=4.8426e-05) (hash(x)=48563796)
+ 4300 val loss 7.1848
+ 4300 val perplexity 1319.2406
+ 4300 train 7.263635 (lr=3.3207e-05) (hash(x)=50973176)
+ 4200 val loss 6.9988
+ 4200 val perplexity 1095.3329
+ 4200 train 6.969129 (lr=4.7463e-05) (hash(x)=49165143)
+ 4400 val loss 7.1592
+ 4400 val perplexity 1285.8815
+ 4400 train 7.206085 (lr=3.2507e-05) (hash(x)=55275124)
+ 4300 val loss 6.9748
+ 4300 val perplexity 1069.3761
+ 4300 train 7.032444 (lr=4.6490e-05) (hash(x)=50973176)
+ 4500 val loss 7.1423
+ 4500 val perplexity 1264.3616
+ 4500 train 7.480568 (lr=3.1801e-05) (hash(x)=58646505)
+ 4400 val loss 6.9587
+ 4400 val perplexity 1052.2388
+ 4400 train 7.023558 (lr=4.5509e-05) (hash(x)=55275124)
+ 4600 val loss 7.1201
+ 4600 val perplexity 1236.5511
+ 4600 train 6.934882 (lr=3.1091e-05) (hash(x)=42554666)
+ 4500 val loss 6.9399
+ 4500 val perplexity 1032.6294
+ 4500 train 7.294427 (lr=4.4521e-05) (hash(x)=58646505)
+ 4700 val loss 7.1123
+ 4700 val perplexity 1226.9720
+ 4700 train 7.021177 (lr=3.0377e-05) (hash(x)=47846764)
+ 4600 val loss 6.9231
+ 4600 val perplexity 1015.4349
+ 4600 train 6.728717 (lr=4.3527e-05) (hash(x)=42554666)
+ 4800 val loss 7.1072
+ 4800 val perplexity 1220.7145
+ 4800 train 7.640239 (lr=2.9661e-05) (hash(x)=58239019)
+ 4700 val loss 6.9024
+ 4700 val perplexity 994.6642
+ 4700 train 6.794252 (lr=4.2528e-05) (hash(x)=47846764)
+ 4900 val loss 7.0912
+ 4900 val perplexity 1201.3014
+ 4900 train 7.122570 (lr=2.8942e-05) (hash(x)=50711220)
+ 4800 val loss 6.8864
+ 4800 val perplexity 978.8246
+ 4800 train 7.385535 (lr=4.1525e-05) (hash(x)=58239019)
+ 5000 val loss 7.0806
+ 5000 val perplexity 1188.6388
+ 5000 train 7.011305 (lr=2.8221e-05) (hash(x)=45994194)
+ 4900 val loss 6.8732
+ 4900 val perplexity 966.0390
+ 4900 train 6.911656 (lr=4.0518e-05) (hash(x)=50711220)
+ 5100 val loss 7.0847
+ 5100 val perplexity 1193.5157
+ 5100 train 6.948140 (lr=2.7500e-05) (hash(x)=48659050)
+ 5000 val loss 6.8794
+ 5000 val perplexity 972.0590
+ 5000 train 6.808244 (lr=3.9510e-05) (hash(x)=45994194)
+ 5200 val loss 7.0560
+ 5200 val perplexity 1159.7809
+ 5200 train 6.998636 (lr=2.6779e-05) (hash(x)=49369682)
+ 5100 val loss 6.8575
+ 5100 val perplexity 950.9432
+ 5100 train 6.727626 (lr=3.8500e-05) (hash(x)=48659050)
+ 5300 val loss 7.0524
+ 5300 val perplexity 1155.6340
+ 5300 train 7.432145 (lr=2.6058e-05) (hash(x)=57787700)
+ 5200 val loss 6.8367
+ 5200 val perplexity 931.4388
+ 5200 train 6.751755 (lr=3.7490e-05) (hash(x)=49369682)
+ 5400 val loss 7.0336
+ 5400 val perplexity 1134.0551
+ 5400 train 6.964849 (lr=2.5339e-05) (hash(x)=49365400)
+ 5300 val loss 6.8303
+ 5300 val perplexity 925.4502
+ 5300 train 7.234496 (lr=3.6482e-05) (hash(x)=57787700)
+ 5500 val loss 7.0214
+ 5500 val perplexity 1120.3920
+ 5500 train 6.973766 (lr=2.4623e-05) (hash(x)=48720412)
+ 5400 val loss 6.8203
+ 5400 val perplexity 916.2604
+ 5400 train 6.747429 (lr=3.5475e-05) (hash(x)=49365400)
+ 5600 val loss 7.0085
+ 5600 val perplexity 1106.0455
+ 5600 train 7.324727 (lr=2.3909e-05) (hash(x)=55784800)
+ 5500 val loss 6.8173
+ 5500 val perplexity 913.5103
+ 5500 train 6.778089 (lr=3.4472e-05) (hash(x)=48720412)
+ 5700 val loss 7.0040
+ 5700 val perplexity 1101.0533
+ 5700 train 6.863188 (lr=2.3199e-05) (hash(x)=50073634)
+ 5600 val loss 6.8012
+ 5600 val perplexity 898.9412
+ 5600 train 7.131571 (lr=3.3473e-05) (hash(x)=55784800)
+ 5800 val loss 6.9817
+ 5800 val perplexity 1076.7632
+ 5800 train 6.810428 (lr=2.2493e-05) (hash(x)=50170324)
+ 5700 val loss 6.7964
+ 5700 val perplexity 894.6154
+ 5700 train 6.646752 (lr=3.2479e-05) (hash(x)=50073634)
+ 5900 val loss 6.9721
+ 5900 val perplexity 1066.4613
+ 5900 train 6.721549 (lr=2.1793e-05) (hash(x)=48410268)
+ 5800 val loss 6.7835
+ 5800 val perplexity 883.1710
+ 5800 train 6.606258 (lr=3.1491e-05) (hash(x)=50170324)
+ 6000 val loss 6.9624
+ 6000 val perplexity 1056.1648
+ 6000 train 6.858176 (lr=2.1098e-05) (hash(x)=49527342)
+ 5900 val loss 6.7863
+ 5900 val perplexity 885.6093
+ 5900 train 6.532552 (lr=3.0510e-05) (hash(x)=48410268)
+ 6100 val loss 6.9515
+ 6100 val perplexity 1044.7220
+ 6100 train 6.763265 (lr=2.0410e-05) (hash(x)=49550294)
+ 6000 val loss 6.7822
+ 6000 val perplexity 881.9771
+ 6000 train 6.668611 (lr=2.9537e-05) (hash(x)=49527342)
+ 6200 val loss 6.9387
+ 6200 val perplexity 1031.4410
+ 6200 train 6.494091 (lr=1.9729e-05) (hash(x)=42126106)
+ 6100 val loss 6.7750
+ 6100 val perplexity 875.6949
+ 6100 train 6.587618 (lr=2.8574e-05) (hash(x)=49550294)
+ 6300 val loss 6.9254
+ 6300 val perplexity 1017.7720
+ 6300 train 6.696783 (lr=1.9056e-05) (hash(x)=49608772)
+ 6200 val loss 6.7762
+ 6200 val perplexity 876.6934
+ 6200 train 6.334079 (lr=2.7621e-05) (hash(x)=42126106)
+ 6400 val loss 6.9128
+ 6400 val perplexity 1005.0568
+ 6400 train 6.485902 (lr=1.8392e-05) (hash(x)=52324417)
+ 6300 val loss 6.7694
+ 6300 val perplexity 870.7931
+ 6300 train 6.520870 (lr=2.6679e-05) (hash(x)=49608772)
+ 6500 val loss 6.8815
+ 6500 val perplexity 974.0597
+ 6500 train 6.923303 (lr=1.7738e-05) (hash(x)=46207215)
+ 6400 val loss 6.7631
+ 6400 val perplexity 865.2852
+ 6400 train 6.338704 (lr=2.5749e-05) (hash(x)=52324417)
+ 6600 val loss 6.8604
+ 6600 val perplexity 953.7169
+ 6600 train 6.797771 (lr=1.7093e-05) (hash(x)=49027014)
+ 6500 val loss 6.7358
+ 6500 val perplexity 841.9837
+ 6500 train 6.789601 (lr=2.4833e-05) (hash(x)=46207215)
+ 6700 val loss 6.8459
+ 6700 val perplexity 939.9939
+ 6700 train 6.819520 (lr=1.6459e-05) (hash(x)=46232513)
+ 6800 val loss 6.8301
+ 6800 val perplexity 925.2843
+ 6800 train 6.766226 (lr=1.5836e-05) (hash(x)=47348403)
+ 6600 val loss 6.7171
+ 6600 val perplexity 826.3766
+ 6600 train 6.647032 (lr=2.3930e-05) (hash(x)=49027014)
+ 6900 val loss 6.8254
+ 6900 val perplexity 920.9736
+ 6900 train 6.799843 (lr=1.5225e-05) (hash(x)=49806647)
+ 6700 val loss 6.7033
+ 6700 val perplexity 815.0651
+ 6700 train 6.672481 (lr=2.3042e-05) (hash(x)=46232513)
+ 7000 val loss 6.8056
+ 7000 val perplexity 902.8452
+ 7000 train 6.843875 (lr=1.4627e-05) (hash(x)=50893018)
+ 6800 val loss 6.7007
+ 6800 val perplexity 812.9648
+ 6800 train 6.624757 (lr=2.2171e-05) (hash(x)=47348403)
+ 7100 val loss 6.7984
+ 7100 val perplexity 896.4370
+ 7100 train 6.822917 (lr=1.4043e-05) (hash(x)=49157639)
+ 6900 val loss 6.7069
+ 6900 val perplexity 817.9907
+ 6900 train 6.675504 (lr=2.1316e-05) (hash(x)=49806647)
+ 7200 val loss 6.7897
+ 7200 val perplexity 888.6813
+ 7200 train 6.761259 (lr=1.3471e-05) (hash(x)=47014759)
+ 7000 val loss 6.6812
+ 7000 val perplexity 797.3110
+ 7000 train 6.741058 (lr=2.0478e-05) (hash(x)=50893018)
+ 7300 val loss 6.7779
+ 7300 val perplexity 878.1854
+ 7300 train 6.779035 (lr=1.2915e-05) (hash(x)=47325591)
+ 7100 val loss 6.6752
+ 7100 val perplexity 792.4863
+ 7100 train 6.693850 (lr=1.9660e-05) (hash(x)=49157639)
+ 7400 val loss 6.7720
+ 7400 val perplexity 873.0865
+ 7400 train 6.654572 (lr=1.2373e-05) (hash(x)=49184604)
+ 7200 val loss 6.6680
+ 7200 val perplexity 786.8536
+ 7200 train 6.651248 (lr=1.8860e-05) (hash(x)=47014759)
+ 7500 val loss 6.7629
+ 7500 val perplexity 865.1734
+ 7500 train 7.052269 (lr=1.1847e-05) (hash(x)=55053584)
+ 7300 val loss 6.6610
+ 7300 val perplexity 781.3257
+ 7300 train 6.657533 (lr=1.8081e-05) (hash(x)=47325591)
+ 7600 val loss 6.7585
+ 7600 val perplexity 861.3661
+ 7600 train 6.715505 (lr=1.1337e-05) (hash(x)=48693923)
+ 7400 val loss 6.6557
+ 7400 val perplexity 777.1942
+ 7400 train 6.538068 (lr=1.7323e-05) (hash(x)=49184604)
+ 7700 val loss 6.7544
+ 7700 val perplexity 857.8153
+ 7700 train 6.298505 (lr=1.0844e-05) (hash(x)=40952882)
+ 7500 val loss 6.6487
+ 7500 val perplexity 771.8184
+ 7500 train 6.927235 (lr=1.6586e-05) (hash(x)=55053584)
+ 7800 val loss 6.7480
+ 7800 val perplexity 852.3216
+ 7800 train 6.836084 (lr=1.0367e-05) (hash(x)=52487845)
+ 7600 val loss 6.6504
+ 7600 val perplexity 773.0807
+ 7600 train 6.592214 (lr=1.5872e-05) (hash(x)=48693923)
+ 7900 val loss 6.7444
+ 7900 val perplexity 849.3254
+ 7900 train 6.821626 (lr=9.9088e-06) (hash(x)=50221547)
+ 7700 val loss 6.6482
+ 7700 val perplexity 771.4020
+ 7700 train 6.223924 (lr=1.5181e-05) (hash(x)=40952882)
+ 8000 val loss 6.7372
+ 8000 val perplexity 843.2168
+ 8000 train 7.036058 (lr=9.4682e-06) (hash(x)=62294204)
+ 7800 val loss 6.6412
+ 7800 val perplexity 765.9936
+ 7800 train 6.724183 (lr=1.4514e-05) (hash(x)=52487845)
+ 8100 val loss 6.7324
+ 8100 val perplexity 839.1255
+ 8100 train 6.425050 (lr=9.0461e-06) (hash(x)=44401967)
+ 7900 val loss 6.6360
+ 7900 val perplexity 762.0717
+ 7900 train 6.738249 (lr=1.3872e-05) (hash(x)=50221547)
+ 8200 val loss 6.7270
+ 8200 val perplexity 834.6295
+ 8200 train 6.687708 (lr=8.6430e-06) (hash(x)=52769095)
+ 8000 val loss 6.6358
+ 8000 val perplexity 761.8933
+ 8000 train 6.951126 (lr=1.3255e-05) (hash(x)=62294204)
+ 8300 val loss 6.7317
+ 8300 val perplexity 838.6075
+ 8300 train 6.673268 (lr=8.2593e-06) (hash(x)=56829883)
+ 8100 val loss 6.6309
+ 8100 val perplexity 758.1688
+ 8100 train 6.340541 (lr=1.2665e-05) (hash(x)=44401967)
+ 8400 val loss 6.7297
+ 8400 val perplexity 836.9218
+ 8400 train 6.691971 (lr=7.8953e-06) (hash(x)=52147375)
+ 8200 val loss 6.6290
+ 8200 val perplexity 756.7592
+ 8200 train 6.576173 (lr=1.2100e-05) (hash(x)=52769095)
+ 8500 val loss 6.7269
+ 8500 val perplexity 834.5331
+ 8500 train 6.963628 (lr=7.5515e-06) (hash(x)=60197820)
+ 8300 val loss 6.6358
+ 8300 val perplexity 761.8755
+ 8300 train 6.583430 (lr=1.1563e-05) (hash(x)=56829883)
+ 8600 val loss 6.7228
+ 8600 val perplexity 831.1801
+ 8600 train 6.488226 (lr=7.2282e-06) (hash(x)=49377068)
+ 8400 val loss 6.6368
+ 8400 val perplexity 762.6450
+ 8400 train 6.574768 (lr=1.1053e-05) (hash(x)=52147375)
+ 8700 val loss 6.7243
+ 8700 val perplexity 832.3748
+ 8700 train 6.683083 (lr=6.9257e-06) (hash(x)=51092724)
+ 8500 val loss 6.6341
+ 8500 val perplexity 760.6315
+ 8500 train 6.861284 (lr=1.0572e-05) (hash(x)=60197820)
+ 8800 val loss 6.7192
+ 8800 val perplexity 828.1299
+ 8800 train 6.728143 (lr=6.6444e-06) (hash(x)=48642928)
+ 8600 val loss 6.6271
+ 8600 val perplexity 755.2836
+ 8600 train 6.383428 (lr=1.0119e-05) (hash(x)=49377068)
+ 8900 val loss 6.7147
+ 8900 val perplexity 824.3984
+ 8900 train 6.926672 (lr=6.3845e-06) (hash(x)=55342246)
+ 8700 val loss 6.6293
+ 8700 val perplexity 756.9796
+ 8700 train 6.602247 (lr=9.6960e-06) (hash(x)=51092724)
+ 9000 val loss 6.6997
+ 9000 val perplexity 812.1678
+ 9000 train 6.671376 (lr=6.1462e-06) (hash(x)=48093368)
+ 8800 val loss 6.6263
+ 8800 val perplexity 754.6838
+ 8800 train 6.621269 (lr=9.3021e-06) (hash(x)=48642928)
+ 9100 val loss 6.6943
+ 9100 val perplexity 807.8050
+ 9100 train 6.789246 (lr=5.9300e-06) (hash(x)=48578183)
+ 8900 val loss 6.6234
+ 8900 val perplexity 752.5152
+ 8900 train 6.830116 (lr=8.9382e-06) (hash(x)=55342246)
+ 9200 val loss 6.6896
+ 9200 val perplexity 803.9971
+ 9200 train 6.888481 (lr=5.7359e-06) (hash(x)=50794720)
+ 9000 val loss 6.6079
+ 9000 val perplexity 740.9231
+ 9000 train 6.601912 (lr=8.6047e-06) (hash(x)=48093368)
+ 9300 val loss 6.6857
+ 9300 val perplexity 800.8321
+ 9300 train 6.454147 (lr=5.5641e-06) (hash(x)=46513190)
+ 9100 val loss 6.6025
+ 9100 val perplexity 736.9377
+ 9100 train 6.679001 (lr=8.3020e-06) (hash(x)=48578183)
+ 9400 val loss 6.6830
+ 9400 val perplexity 798.7258
+ 9400 train 6.323279 (lr=5.4149e-06) (hash(x)=43808238)
+ 9200 val loss 6.5976
+ 9200 val perplexity 733.3257
+ 9200 train 6.786611 (lr=8.0302e-06) (hash(x)=50794720)
+ 9500 val loss 6.6799
+ 9500 val perplexity 796.2480
+ 9500 train 6.484011 (lr=5.2884e-06) (hash(x)=45021888)
+ 9300 val loss 6.5947
+ 9300 val perplexity 731.1756
+ 9300 train 6.369024 (lr=7.7898e-06) (hash(x)=46513190)
+ 9600 val loss 6.6766
+ 9600 val perplexity 793.6359
+ 9600 train 6.773294 (lr=5.1847e-06) (hash(x)=56525570)
+ 9400 val loss 6.5938
+ 9400 val perplexity 730.5848
+ 9400 train 6.250168 (lr=7.5809e-06) (hash(x)=43808238)
+ 9700 val loss 6.6736
+ 9700 val perplexity 791.2244
+ 9700 train 6.871308 (lr=5.1040e-06) (hash(x)=52585913)
+ 9500 val loss 6.5891
+ 9500 val perplexity 727.1143
+ 9500 train 6.406715 (lr=7.4038e-06) (hash(x)=45021888)
+ 9800 val loss 6.6725
+ 9800 val perplexity 790.3431
+ 9800 train 6.852726 (lr=5.0462e-06) (hash(x)=52344698)
+ 9600 val loss 6.5886
+ 9600 val perplexity 726.7500
+ 9600 train 6.674443 (lr=7.2586e-06) (hash(x)=56525570)
+ 9900 val loss 6.6691
+ 9900 val perplexity 787.7227
+ 9900 train 6.682109 (lr=5.0116e-06) (hash(x)=51740945)
+ 9700 val loss 6.5857
+ 9700 val perplexity 724.6240
+ 9700 train 6.775486 (lr=7.1456e-06) (hash(x)=52585913)
+ 9999 val loss 6.6652
+ 9999 val perplexity 784.6176
+ 9800 val loss 6.5830
+ 9800 val perplexity 722.7222
+ 9800 train 6.780724 (lr=7.0647e-06) (hash(x)=52344698)
+ 9900 val loss 6.5810
+ 9900 val perplexity 721.2598
+ 9900 train 6.589230 (lr=7.0162e-06) (hash(x)=51740945)
+ 9999 val loss 6.5788
+ 9999 val perplexity 719.6898
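
A sanity check on the log format: each "val perplexity" entry is the exponential of the preceding "val loss" entry. For instance, the final new entry gives exp(6.5788) ≈ 719.70, matching the logged 719.6898 up to the 4-decimal rounding of the loss, and the final old entry gives exp(6.2893) ≈ 538.8 against 538.7606. A minimal verification sketch, assuming the diff markers have been stripped from the lines:

```python
import math
import re

LOSS = re.compile(r"^(\d+) val loss ([\d.]+)$")
PPL = re.compile(r"^(\d+) val perplexity ([\d.]+)$")

def check_log(lines):
    """Assert each 'val perplexity' is exp of the preceding 'val loss'.

    The loss is logged to 4 decimals, so allow ~0.1% relative slack.
    """
    last_loss = None
    for line in lines:
        if m := LOSS.match(line.strip()):
            last_loss = float(m[2])
        elif m := PPL.match(line.strip()):
            ppl = float(m[2])
            assert last_loss is not None
            assert abs(math.exp(last_loss) - ppl) / ppl < 1e-3

check_log(["9999 val loss 6.5788", "9999 val perplexity 719.6898"])
```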
attention_kindselective_n_heads2_seed1338/model_02500.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:bfe50e723e825bd0786c1a4596ac4cdaf796676df7fdc5da3f87bcf867afc971
+ oid sha256:88a13d00e75e1cf66932d118b2f84bb24f00babd0ee75e7f5801725ef5c13418
  size 38587970
attention_kindselective_n_heads2_seed1338/model_05000.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e2e8ba950312264226829ac705df43b9cdffaf5bcc036d9b24aef5f3e832494c
+ oid sha256:32d5afc7324a3d22f4c1218ec09c6c54149343ba625fcba9940acb396b796c11
  size 38587970
attention_kindselective_n_heads2_seed1338/model_07500.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:720ce17fda15dca0483b5bee479b09521f324cbda2f55ee89297aeb3815e7127
+ oid sha256:05a121807594d5ecb5feed74e34dfa1e69b69860960bb12751f372377bb006a6
  size 38587970
attention_kindselective_n_heads2_seed1338/model_09999.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0e1a2367f59732516e841fec0ec047482f7b40395d9a8c4e9bfd7434db667578
+ oid sha256:40e0b112f277f6cbba421303357ec83f7697b417831c9e4ff43b473f4f3e1114
  size 38587970
attention_kindselective_n_heads2_seed1338/optimizer_02500.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e8266da6cdaef57d288608149918e3d4cc507a0f73a368f4c248cc84d130fa87
+ oid sha256:8d833216fc7bfe1147572ad064c3757e6d11dab6d14b15ed9dbc193d3fe0c652
  size 70895430
attention_kindselective_n_heads2_seed1338/optimizer_05000.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b1cf4773a63f7bd4d9b9b266dfca8bb2631ec02374f0d2e94c402329e5b41680
+ oid sha256:42125ee96db8d16feb270e483c050b0f0ad3baa06ce04fb49898faaa1eb18ca6
  size 70895430
attention_kindselective_n_heads2_seed1338/optimizer_07500.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:28ffac77284f9c9c07e805be4e544c3bf52c59e2e9c317ae438e92592ac76c33
+ oid sha256:b4c2b49578a7dc4e2283cd26e5e389f35e17e5b918115b0f054f8c487f39e3d0
  size 70895430
attention_kindselective_n_heads2_seed1338/optimizer_09999.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d24ecb4199ebaa5047064c533fe7647126557157f3557829d39923f2d8fba6fd
+ oid sha256:42529f3655fc336117c939c1c8ec0fb125634901be3c730bed8da0466e4741ef
  size 70895430
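
For completeness: the model checkpoints are each 38587970 bytes and the optimizer checkpoints 70895430 bytes, a roughly 1.8× ratio, which would be consistent with an Adam-style optimizer carrying two moment tensors per trained parameter, though nothing in the diff confirms the exact format. Assuming the files are ordinary torch.save artifacts, a minimal inspection sketch:

```python
import torch

run = "attention_kindselective_n_heads2_seed1338"

# Assumes standard torch.save serialization; the actual object layout
# (a bare state_dict vs. a wrapper dict) is not visible from the LFS pointers.
model_ckpt = torch.load(f"{run}/model_09999.pt", map_location="cpu")
opt_ckpt = torch.load(f"{run}/optimizer_09999.pt", map_location="cpu")
print(type(model_ckpt), type(opt_ckpt))
```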