andrew-healey commited on
Commit
452770a
·
verified ·
1 Parent(s): a24a3d4

Upload folder using huggingface_hub

Browse files
attention_kindselective_n_heads4_seed1338/args.json CHANGED
@@ -1 +1 @@
1
- {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_4/attention_kindselective_n_heads4_seed1338", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_4", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1338, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 30720, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 3e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "3e-5_30720_4_1338", "n_embd": 256}
 
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_6/attention_kindselective_n_heads4_seed1338", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_6", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1338, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 7e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "7e-5_10240_4_1338", "n_embd": 256}
attention_kindselective_n_heads4_seed1338/dataloader_02500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db41c5e5513137877487a93451adf8ec4ed2448ab6e9471ebd5595c8e3293875
3
  size 964
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b2ea67f78ff5a7970d0db044ff7ee527b3dc065f295fd30f588df4b44b568d0
3
  size 964
attention_kindselective_n_heads4_seed1338/dataloader_05000.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6df8947c6ab773db1947914387d3db345a84828521d3a64bae9b652e1b0a410
3
  size 964
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f03ed2ebf741f15e13c79e6cc1e9a19b308450d81cc3b4d8d0338c63d77ca59
3
  size 964
attention_kindselective_n_heads4_seed1338/dataloader_07500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:169891a726a7ff746d1a7aa99f459a66d85ceb4e9f2583f790f5b8501f97b6af
3
  size 964
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82590037fb2eecbec961f7967a8dd1b8d85515d31a252f66b92b8139858a8b7c
3
  size 964
attention_kindselective_n_heads4_seed1338/dataloader_09999.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e685a568a36c792ccbe7b5fcae0b9d630955e589991190bd8902836cea6a91df
3
  size 964
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c544303717d12355a69b8ffb1eb109434e4fdccfd5a61279b6e8ba2e870d6700
3
  size 964
attention_kindselective_n_heads4_seed1338/log2.txt CHANGED
@@ -1,303 +1,605 @@
1
  max_steps: 10000
2
- 0 val loss 11.2663
3
- 0 val perplexity 78147.0000
4
- 0 train 11.263341 (lr=1.5000e-07) (hash(x)=150327452)
5
- 100 val loss 9.9038
6
- 100 val perplexity 20006.5938
7
- 100 train 9.914461 (lr=1.5150e-05) (hash(x)=166441190)
8
- 200 val loss 8.9665
9
- 200 val perplexity 7836.3984
10
- 200 train 9.079846 (lr=3.0000e-05) (hash(x)=166780046)
11
- 300 val loss 8.0400
12
- 300 val perplexity 3102.7432
13
- 300 train 7.989579 (lr=2.9993e-05) (hash(x)=159835303)
14
- 400 val loss 7.7330
15
- 400 val perplexity 2282.3311
16
- 400 train 7.532744 (lr=2.9972e-05) (hash(x)=155040610)
17
- 500 val loss 7.6145
18
- 500 val perplexity 2027.4534
19
- 500 train 7.370183 (lr=2.9938e-05) (hash(x)=130190460)
20
- 600 val loss 7.5275
21
- 600 val perplexity 1858.4928
22
- 600 train 7.548462 (lr=2.9889e-05) (hash(x)=155504036)
23
- 700 val loss 7.4642
24
- 700 val perplexity 1744.3755
25
- 700 train 7.321239 (lr=2.9827e-05) (hash(x)=137347213)
26
- 800 val loss 7.4136
27
- 800 val perplexity 1658.3201
28
- 800 train 7.269616 (lr=2.9751e-05) (hash(x)=143823248)
29
- 900 val loss 7.3688
30
- 900 val perplexity 1585.6975
31
- 900 train 7.486793 (lr=2.9662e-05) (hash(x)=156260416)
32
- 1000 val loss 7.3444
33
- 1000 val perplexity 1547.5503
34
- 1000 train 7.314602 (lr=2.9558e-05) (hash(x)=143734685)
35
- 1100 val loss 7.3103
36
- 1100 val perplexity 1495.6113
37
- 1100 train 7.249287 (lr=2.9442e-05) (hash(x)=160013925)
38
- 1200 val loss 7.2809
39
- 1200 val perplexity 1452.2426
40
- 1200 train 7.030381 (lr=2.9312e-05) (hash(x)=150678249)
41
- 1300 val loss 7.2499
42
- 1300 val perplexity 1407.9894
43
- 1300 train 7.035954 (lr=2.9169e-05) (hash(x)=149073315)
44
- 1400 val loss 7.1976
45
- 1400 val perplexity 1336.1791
46
- 1400 train 7.560995 (lr=2.9013e-05) (hash(x)=175802021)
47
- 1500 val loss 7.1596
48
- 1500 val perplexity 1286.3782
49
- 1500 train 7.414792 (lr=2.8845e-05) (hash(x)=171034639)
50
- 1600 val loss 7.1100
51
- 1600 val perplexity 1224.1091
52
- 1600 train 7.318711 (lr=2.8663e-05) (hash(x)=158681215)
53
- 1700 val loss 7.0716
54
- 1700 val perplexity 1178.0487
55
- 1700 train 7.011314 (lr=2.8469e-05) (hash(x)=152116061)
56
- 1800 val loss 7.0428
57
- 1800 val perplexity 1144.6343
58
- 1800 train 6.981174 (lr=2.8263e-05) (hash(x)=146108145)
59
- 1900 val loss 7.0107
60
- 1900 val perplexity 1108.4087
61
- 1900 train 6.845355 (lr=2.8044e-05) (hash(x)=147598108)
62
- 2000 val loss 6.9936
63
- 2000 val perplexity 1089.6827
64
- 2000 train 6.852312 (lr=2.7814e-05) (hash(x)=154996086)
65
- 2100 val loss 6.9603
66
- 2100 val perplexity 1053.9778
67
- 2100 train 6.747998 (lr=2.7572e-05) (hash(x)=153396183)
68
- 2200 val loss 6.9095
69
- 2200 val perplexity 1001.7344
70
- 2200 train 6.897384 (lr=2.7319e-05) (hash(x)=153885445)
71
- 2300 val loss 6.8842
72
- 2300 val perplexity 976.7466
73
- 2300 train 6.883799 (lr=2.7055e-05) (hash(x)=159666385)
74
- 2400 val loss 6.8381
75
- 2400 val perplexity 932.6805
76
- 2400 train 6.814809 (lr=2.6780e-05) (hash(x)=142353087)
77
- 2500 val loss 6.8053
78
- 2500 val perplexity 902.6398
79
- 2500 train 6.818474 (lr=2.6494e-05) (hash(x)=146491718)
80
- 2600 val loss 6.7792
81
- 2600 val perplexity 879.3629
82
- 2600 train 6.706415 (lr=2.6198e-05) (hash(x)=150750353)
83
- 2700 val loss 6.7435
84
- 2700 val perplexity 848.5612
85
- 2700 train 6.450558 (lr=2.5892e-05) (hash(x)=129849193)
86
- 2800 val loss 6.7261
87
- 2800 val perplexity 833.9042
88
- 2800 train 6.579156 (lr=2.5576e-05) (hash(x)=152767913)
89
- 2900 val loss 6.6977
90
- 2900 val perplexity 810.5140
91
- 2900 train 6.479834 (lr=2.5251e-05) (hash(x)=146531140)
92
- 3000 val loss 6.6654
93
- 3000 val perplexity 784.7470
94
- 3000 train 6.737319 (lr=2.4917e-05) (hash(x)=151562048)
95
- 3100 val loss 6.6276
96
- 3100 val perplexity 755.6806
97
- 3100 train 6.601431 (lr=2.4574e-05) (hash(x)=146001424)
98
- 3200 val loss 6.6050
99
- 3200 val perplexity 738.7577
100
- 3200 train 6.661371 (lr=2.4224e-05) (hash(x)=166486165)
101
- 3300 val loss 6.5734
102
- 3300 val perplexity 715.8162
103
- 3300 train 6.504717 (lr=2.3865e-05) (hash(x)=150866680)
104
- 3400 val loss 6.5584
105
- 3400 val perplexity 705.1649
106
- 3400 train 6.459944 (lr=2.3498e-05) (hash(x)=143900419)
107
- 3500 val loss 6.5325
108
- 3500 val perplexity 687.0813
109
- 3500 train 6.343013 (lr=2.3125e-05) (hash(x)=148845794)
110
- 3600 val loss 6.5133
111
- 3600 val perplexity 674.0522
112
- 3600 train 6.354141 (lr=2.2744e-05) (hash(x)=145667796)
113
- 3700 val loss 6.4988
114
- 3700 val perplexity 664.3641
115
- 3700 train 6.519285 (lr=2.2357e-05) (hash(x)=163563851)
116
- 3800 val loss 6.4812
117
- 3800 val perplexity 652.7576
118
- 3800 train 6.425587 (lr=2.1965e-05) (hash(x)=147488689)
119
- 3900 val loss 6.4513
120
- 3900 val perplexity 633.5565
121
- 3900 train 6.449866 (lr=2.1566e-05) (hash(x)=148186608)
122
- 4000 val loss 6.4286
123
- 4000 val perplexity 619.2852
124
- 4000 train 6.293235 (lr=2.1162e-05) (hash(x)=142970187)
125
- 4100 val loss 6.4098
126
- 4100 val perplexity 607.7936
127
- 4100 train 6.482577 (lr=2.0754e-05) (hash(x)=141584883)
128
- 4200 val loss 6.3939
129
- 4200 val perplexity 598.1957
130
- 4200 train 6.192812 (lr=2.0341e-05) (hash(x)=145664585)
131
- 4300 val loss 6.3816
132
- 4300 val perplexity 590.8762
133
- 4300 train 6.243621 (lr=1.9924e-05) (hash(x)=143736499)
134
- 4400 val loss 6.3786
135
- 4400 val perplexity 589.0906
136
- 4400 train 6.205530 (lr=1.9504e-05) (hash(x)=151883322)
137
- 4500 val loss 6.3650
138
- 4500 val perplexity 581.1219
139
- 4500 train 6.248657 (lr=1.9081e-05) (hash(x)=153904871)
140
- 4600 val loss 6.3499
141
- 4600 val perplexity 572.4113
142
- 4600 train 6.429447 (lr=1.8655e-05) (hash(x)=154893521)
143
- 4700 val loss 6.3303
144
- 4700 val perplexity 561.3388
145
- 4700 train 6.389878 (lr=1.8226e-05) (hash(x)=152323949)
146
- 4800 val loss 6.3137
147
- 4800 val perplexity 552.0685
148
- 4800 train 6.247524 (lr=1.7796e-05) (hash(x)=154104619)
149
- 4900 val loss 6.3041
150
- 4900 val perplexity 546.8019
151
- 4900 train 6.426771 (lr=1.7365e-05) (hash(x)=146311426)
152
- 5000 val loss 6.2913
153
- 5000 val perplexity 539.8301
154
- 5000 train 6.296994 (lr=1.6933e-05) (hash(x)=156741847)
155
- 5100 val loss 6.2820
156
- 5100 val perplexity 534.8316
157
- 5100 train 6.057541 (lr=1.6500e-05) (hash(x)=142086346)
158
- 5200 val loss 6.2774
159
- 5200 val perplexity 532.4290
160
- 5200 train 6.050554 (lr=1.6067e-05) (hash(x)=150265428)
161
- 5300 val loss 6.2747
162
- 5300 val perplexity 530.9745
163
- 5300 train 6.163742 (lr=1.5635e-05) (hash(x)=151339108)
164
- 5400 val loss 6.2586
165
- 5400 val perplexity 522.4918
166
- 5400 train 6.242078 (lr=1.5204e-05) (hash(x)=154654372)
167
- 5500 val loss 6.2442
168
- 5500 val perplexity 515.0331
169
- 5500 train 6.253026 (lr=1.4774e-05) (hash(x)=150575051)
170
- 5600 val loss 6.2343
171
- 5600 val perplexity 509.9215
172
- 5600 train 6.134394 (lr=1.4345e-05) (hash(x)=140396423)
173
- 5700 val loss 6.2225
174
- 5700 val perplexity 503.9442
175
- 5700 train 6.086608 (lr=1.3919e-05) (hash(x)=144678758)
176
- 5800 val loss 6.2157
177
- 5800 val perplexity 500.5370
178
- 5800 train 6.277523 (lr=1.3496e-05) (hash(x)=151992743)
179
- 5900 val loss 6.2089
180
- 5900 val perplexity 497.1764
181
- 5900 train 5.998001 (lr=1.3076e-05) (hash(x)=144396927)
182
- 6000 val loss 6.2031
183
- 6000 val perplexity 494.2646
184
- 6000 train 6.175484 (lr=1.2659e-05) (hash(x)=165478625)
185
- 6100 val loss 6.2044
186
- 6100 val perplexity 494.9267
187
- 6100 train 5.943242 (lr=1.2246e-05) (hash(x)=147088621)
188
- 6200 val loss 6.1952
189
- 6200 val perplexity 490.3867
190
- 6200 train 5.959939 (lr=1.1838e-05) (hash(x)=140794994)
191
- 6300 val loss 6.1857
192
- 6300 val perplexity 485.7651
193
- 6300 train 6.112713 (lr=1.1434e-05) (hash(x)=134780906)
194
- 6400 val loss 6.1726
195
- 6400 val perplexity 479.4460
196
- 6400 train 6.158383 (lr=1.1035e-05) (hash(x)=149023655)
197
- 6500 val loss 6.1676
198
- 6500 val perplexity 477.0429
199
- 6500 train 6.019427 (lr=1.0643e-05) (hash(x)=147497796)
200
- 6600 val loss 6.1663
201
- 6600 val perplexity 476.4064
202
- 6600 train 6.022311 (lr=1.0256e-05) (hash(x)=152902689)
203
- 6700 val loss 6.1598
204
- 6700 val perplexity 473.3375
205
- 6700 train 6.199059 (lr=9.8753e-06) (hash(x)=153846046)
206
- 6800 val loss 6.1487
207
- 6800 val perplexity 468.0990
208
- 6800 train 6.287767 (lr=9.5017e-06) (hash(x)=158512738)
209
- 6900 val loss 6.1438
210
- 6900 val perplexity 465.8420
211
- 6900 train 6.667380 (lr=9.1353e-06) (hash(x)=156849968)
212
- 7000 val loss 6.1404
213
- 7000 val perplexity 464.2478
214
- 7000 train 5.972945 (lr=8.7764e-06) (hash(x)=142395855)
215
- 7100 val loss 6.1347
216
- 7100 val perplexity 461.6042
217
- 7100 train 5.991301 (lr=8.4255e-06) (hash(x)=147114884)
218
- 7200 val loss 6.1303
219
- 7200 val perplexity 459.5841
220
- 7200 train 6.150269 (lr=8.0829e-06) (hash(x)=156979839)
221
- 7300 val loss 6.1279
222
- 7300 val perplexity 458.4534
223
- 7300 train 5.902767 (lr=7.7489e-06) (hash(x)=145584373)
224
- 7400 val loss 6.1273
225
- 7400 val perplexity 458.1935
226
- 7400 train 5.868430 (lr=7.4239e-06) (hash(x)=141508204)
227
- 7500 val loss 6.1193
228
- 7500 val perplexity 454.5416
229
- 7500 train 6.157444 (lr=7.1083e-06) (hash(x)=148803965)
230
- 7600 val loss 6.1136
231
- 7600 val perplexity 451.9759
232
- 7600 train 6.178225 (lr=6.8023e-06) (hash(x)=151019676)
233
- 7700 val loss 6.1085
234
- 7700 val perplexity 449.6451
235
- 7700 train 6.151628 (lr=6.5062e-06) (hash(x)=143155750)
236
- 7800 val loss 6.1058
237
- 7800 val perplexity 448.4541
238
- 7800 train 6.123940 (lr=6.2205e-06) (hash(x)=152569653)
239
- 7900 val loss 6.1027
240
- 7900 val perplexity 447.0435
241
- 7900 train 5.969548 (lr=5.9453e-06) (hash(x)=143519455)
242
- 8000 val loss 6.1011
243
- 8000 val perplexity 446.3461
244
- 8000 train 6.265728 (lr=5.6809e-06) (hash(x)=161180944)
245
- 8100 val loss 6.1003
246
- 8100 val perplexity 445.9847
247
- 8100 train 6.132887 (lr=5.4277e-06) (hash(x)=154107345)
248
- 8200 val loss 6.1007
249
- 8200 val perplexity 446.1861
250
- 8200 train 6.124985 (lr=5.1858e-06) (hash(x)=152486517)
251
- 8300 val loss 6.0919
252
- 8300 val perplexity 442.2632
253
- 8300 train 6.113282 (lr=4.9556e-06) (hash(x)=156167749)
254
- 8400 val loss 6.0884
255
- 8400 val perplexity 440.7003
256
- 8400 train 6.169589 (lr=4.7372e-06) (hash(x)=149155006)
257
- 8500 val loss 6.0843
258
- 8500 val perplexity 438.8997
259
- 8500 train 6.044793 (lr=4.5309e-06) (hash(x)=147844390)
260
- 8600 val loss 6.0825
261
- 8600 val perplexity 438.1269
262
- 8600 train 6.348199 (lr=4.3369e-06) (hash(x)=165753320)
263
- 8700 val loss 6.0821
264
- 8700 val perplexity 437.9322
265
- 8700 train 5.860587 (lr=4.1554e-06) (hash(x)=146079979)
266
- 8800 val loss 6.0813
267
- 8800 val perplexity 437.6174
268
- 8800 train 6.286478 (lr=3.9866e-06) (hash(x)=172259509)
269
- 8900 val loss 6.0796
270
- 8900 val perplexity 436.8750
271
- 8900 train 5.752324 (lr=3.8307e-06) (hash(x)=145148314)
272
- 9000 val loss 6.0798
273
- 9000 val perplexity 436.9346
274
- 9000 train 5.990164 (lr=3.6877e-06) (hash(x)=144250633)
275
- 9100 val loss 6.0743
276
- 9100 val perplexity 434.5333
277
- 9100 train 6.182425 (lr=3.5580e-06) (hash(x)=157219797)
278
- 9200 val loss 6.0697
279
- 9200 val perplexity 432.5594
280
- 9200 train 6.088066 (lr=3.4415e-06) (hash(x)=142743778)
281
- 9300 val loss 6.0686
282
- 9300 val perplexity 432.0853
283
- 9300 train 5.967726 (lr=3.3385e-06) (hash(x)=139669771)
284
- 9400 val loss 6.0656
285
- 9400 val perplexity 430.7808
286
- 9400 train 6.073001 (lr=3.2490e-06) (hash(x)=145916843)
287
- 9500 val loss 6.0646
288
- 9500 val perplexity 430.3534
289
- 9500 train 5.989102 (lr=3.1730e-06) (hash(x)=150196125)
290
- 9600 val loss 6.0638
291
- 9600 val perplexity 429.9952
292
- 9600 train 6.368902 (lr=3.1108e-06) (hash(x)=160041419)
293
- 9700 val loss 6.0641
294
- 9700 val perplexity 430.1185
295
- 9700 train 5.840649 (lr=3.0624e-06) (hash(x)=139931627)
296
- 9800 val loss 6.0617
297
- 9800 val perplexity 429.1059
298
- 9800 train 6.026587 (lr=3.0277e-06) (hash(x)=150370792)
299
- 9900 val loss 6.0607
300
- 9900 val perplexity 428.6625
301
- 9900 train 5.908560 (lr=3.0069e-06) (hash(x)=153014886)
302
- 9999 val loss 6.0601
303
- 9999 val perplexity 428.4363
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  max_steps: 10000
2
+ 0 val loss 11.2672
3
+ 0 val perplexity 78216.1172
4
+ 0 val loss 11.2672
5
+ 0 val perplexity 78216.1172
6
+ 0 train 11.243532 (lr=2.5000e-07) (hash(x)=50671684)
7
+ 0 train 11.243532 (lr=3.5000e-07) (hash(x)=50671684)
8
+ 100 val loss 9.7863
9
+ 100 val perplexity 17788.1406
10
+ 100 train 9.748065 (lr=2.5250e-05) (hash(x)=52740221)
11
+ 100 val loss 9.6966
12
+ 100 val perplexity 16261.9414
13
+ 100 train 9.656058 (lr=3.5350e-05) (hash(x)=52740221)
14
+ 200 val loss 8.6243
15
+ 200 val perplexity 5565.2393
16
+ 200 train 8.645584 (lr=5.0000e-05) (hash(x)=49034180)
17
+ 200 val loss 8.5884
18
+ 200 val perplexity 5369.2666
19
+ 200 train 8.609220 (lr=7.0000e-05) (hash(x)=49034180)
20
+ 300 val loss 7.8432
21
+ 300 val perplexity 2548.3008
22
+ 300 train 8.034177 (lr=4.9988e-05) (hash(x)=63180688)
23
+ 300 val loss 7.9184
24
+ 300 val perplexity 2747.3516
25
+ 300 train 8.088190 (lr=6.9984e-05) (hash(x)=63180688)
26
+ 400 val loss 7.6322
27
+ 400 val perplexity 2063.5647
28
+ 400 train 7.646658 (lr=4.9954e-05) (hash(x)=50373500)
29
+ 400 val loss 7.6588
30
+ 400 val perplexity 2119.1797
31
+ 400 train 7.679573 (lr=6.9935e-05) (hash(x)=50373500)
32
+ 500 val loss 7.5570
33
+ 500 val perplexity 1914.0950
34
+ 500 train 7.449134 (lr=4.9896e-05) (hash(x)=44547422)
35
+ 500 val loss 7.5919
36
+ 500 val perplexity 1982.1578
37
+ 500 train 7.476151 (lr=6.9854e-05) (hash(x)=44547422)
38
+ 600 val loss 7.5231
39
+ 600 val perplexity 1850.2341
40
+ 600 train 7.392508 (lr=4.9815e-05) (hash(x)=47184699)
41
+ 600 val loss 7.5484
42
+ 600 val perplexity 1897.6368
43
+ 600 train 7.411293 (lr=6.9741e-05) (hash(x)=47184699)
44
+ 700 val loss 7.4862
45
+ 700 val perplexity 1783.2515
46
+ 700 train 7.382556 (lr=4.9712e-05) (hash(x)=51374582)
47
+ 700 val loss 7.5192
48
+ 700 val perplexity 1843.0841
49
+ 700 train 7.419620 (lr=6.9596e-05) (hash(x)=51374582)
50
+ 800 val loss 7.4512
51
+ 800 val perplexity 1721.9939
52
+ 800 train 7.184936 (lr=4.9585e-05) (hash(x)=46264805)
53
+ 800 val loss 7.5009
54
+ 800 val perplexity 1809.6692
55
+ 800 train 7.231192 (lr=6.9419e-05) (hash(x)=46264805)
56
+ 900 val loss 7.4239
57
+ 900 val perplexity 1675.6316
58
+ 900 train 7.616142 (lr=4.9436e-05) (hash(x)=61178712)
59
+ 900 val loss 7.4928
60
+ 900 val perplexity 1795.0725
61
+ 900 train 7.696317 (lr=6.9210e-05) (hash(x)=61178712)
62
+ 1000 val loss 7.3880
63
+ 1000 val perplexity 1616.4591
64
+ 1000 train 7.320238 (lr=4.9264e-05) (hash(x)=50886520)
65
+ 1000 val loss 7.4503
66
+ 1000 val perplexity 1720.3885
67
+ 1000 train 7.406900 (lr=6.8970e-05) (hash(x)=50886520)
68
+ 1100 val loss 7.3679
69
+ 1100 val perplexity 1584.2887
70
+ 1100 train 7.124536 (lr=4.9070e-05) (hash(x)=48600099)
71
+ 1100 val loss 7.4293
72
+ 1100 val perplexity 1684.7103
73
+ 1100 train 7.190936 (lr=6.8698e-05) (hash(x)=48600099)
74
+ 1200 val loss 7.3450
75
+ 1200 val perplexity 1548.3999
76
+ 1200 train 7.066013 (lr=4.8854e-05) (hash(x)=50146792)
77
+ 1200 val loss 7.4227
78
+ 1200 val perplexity 1673.5076
79
+ 1200 train 7.144311 (lr=6.8395e-05) (hash(x)=50146792)
80
+ 1300 val loss 7.3218
81
+ 1300 val perplexity 1512.8927
82
+ 1300 train 7.209438 (lr=4.8616e-05) (hash(x)=52617313)
83
+ 1300 val loss 7.4052
84
+ 1300 val perplexity 1644.5828
85
+ 1300 train 7.299134 (lr=6.8062e-05) (hash(x)=52617313)
86
+ 1400 val loss 7.2835
87
+ 1400 val perplexity 1456.0742
88
+ 1400 train 6.942636 (lr=4.8356e-05) (hash(x)=49794446)
89
+ 1400 val loss 7.3959
90
+ 1400 val perplexity 1629.2686
91
+ 1400 train 7.067560 (lr=6.7698e-05) (hash(x)=49794446)
92
+ 1500 val loss 7.2511
93
+ 1500 val perplexity 1409.6560
94
+ 1500 train 6.871803 (lr=4.8074e-05) (hash(x)=50766317)
95
+ 1500 val loss 7.3682
96
+ 1500 val perplexity 1584.7428
97
+ 1500 train 7.009883 (lr=6.7304e-05) (hash(x)=50766317)
98
+ 1600 val loss 7.2362
99
+ 1600 val perplexity 1388.8374
100
+ 1600 train 7.086946 (lr=4.7772e-05) (hash(x)=55551175)
101
+ 1600 val loss 7.3618
102
+ 1600 val perplexity 1574.6808
103
+ 1600 train 7.228473 (lr=6.6881e-05) (hash(x)=55551175)
104
+ 1700 val loss 7.1878
105
+ 1700 val perplexity 1323.1958
106
+ 1700 train 7.377913 (lr=4.7448e-05) (hash(x)=56717172)
107
+ 1700 val loss 7.3199
108
+ 1700 val perplexity 1510.0026
109
+ 1700 train 7.497116 (lr=6.6428e-05) (hash(x)=56717172)
110
+ 1800 val loss 7.1270
111
+ 1800 val perplexity 1245.1523
112
+ 1800 train 7.316638 (lr=4.7105e-05) (hash(x)=55376447)
113
+ 1800 val loss 7.2595
114
+ 1800 val perplexity 1421.4961
115
+ 1800 train 7.487352 (lr=6.5947e-05) (hash(x)=55376447)
116
+ 1900 val loss 7.1062
117
+ 1900 val perplexity 1219.4828
118
+ 1900 train 6.827673 (lr=4.6741e-05) (hash(x)=43810837)
119
+ 1900 val loss 7.2502
120
+ 1900 val perplexity 1408.3661
121
+ 1900 train 6.984783 (lr=6.5437e-05) (hash(x)=43810837)
122
+ 2000 val loss 7.0607
123
+ 2000 val perplexity 1165.2810
124
+ 2000 train 7.099247 (lr=4.6357e-05) (hash(x)=50881655)
125
+ 2000 val loss 7.2211
126
+ 2000 val perplexity 1368.0502
127
+ 2000 train 7.268182 (lr=6.4900e-05) (hash(x)=50881655)
128
+ 2100 val loss 7.0425
129
+ 2100 val perplexity 1144.2305
130
+ 2100 train 6.927150 (lr=4.5954e-05) (hash(x)=49386015)
131
+ 2100 val loss 7.1989
132
+ 2100 val perplexity 1337.9802
133
+ 2100 train 7.097069 (lr=6.4335e-05) (hash(x)=49386015)
134
+ 2200 val loss 7.0278
135
+ 2200 val perplexity 1127.5027
136
+ 2200 train 6.943182 (lr=4.5532e-05) (hash(x)=48572079)
137
+ 2200 val loss 7.1965
138
+ 2200 val perplexity 1334.7056
139
+ 2200 train 7.109676 (lr=6.3745e-05) (hash(x)=48572079)
140
+ 2300 val loss 7.0047
141
+ 2300 val perplexity 1101.8270
142
+ 2300 train 7.066078 (lr=4.5091e-05) (hash(x)=54950719)
143
+ 2300 val loss 7.1867
144
+ 2300 val perplexity 1321.7347
145
+ 2300 train 7.240222 (lr=6.3128e-05) (hash(x)=54950719)
146
+ 2400 val loss 6.9956
147
+ 2400 val perplexity 1091.7751
148
+ 2400 train 6.643163 (lr=4.4633e-05) (hash(x)=42190240)
149
+ 2400 val loss 7.1732
150
+ 2400 val perplexity 1303.9774
151
+ 2400 train 6.820462 (lr=6.2486e-05) (hash(x)=42190240)
152
+ 2500 val loss 6.9921
153
+ 2500 val perplexity 1088.0327
154
+ 2500 train 7.077521 (lr=4.4156e-05) (hash(x)=45223539)
155
+ 2500 val loss 7.1472
156
+ 2500 val perplexity 1270.5563
157
+ 2500 train 7.217351 (lr=6.1819e-05) (hash(x)=45223539)
158
+ 2600 val loss 6.9545
159
+ 2600 val perplexity 1047.8682
160
+ 2600 train 6.976867 (lr=4.3663e-05) (hash(x)=54037353)
161
+ 2600 val loss 7.1369
162
+ 2600 val perplexity 1257.5035
163
+ 2600 train 7.166002 (lr=6.1128e-05) (hash(x)=54037353)
164
+ 2700 val loss 6.9415
165
+ 2700 val perplexity 1034.3434
166
+ 2700 train 7.436725 (lr=4.3153e-05) (hash(x)=59131616)
167
+ 2700 val loss 7.1107
168
+ 2700 val perplexity 1224.9740
169
+ 2700 train 7.566201 (lr=6.0414e-05) (hash(x)=59131616)
170
+ 2800 val loss 6.9349
171
+ 2800 val perplexity 1027.5295
172
+ 2800 train 6.777809 (lr=4.2627e-05) (hash(x)=45882743)
173
+ 2800 val loss 7.1161
174
+ 2800 val perplexity 1231.6880
175
+ 2800 train 6.949253 (lr=5.9677e-05) (hash(x)=45882743)
176
+ 2900 val loss 6.9195
177
+ 2900 val perplexity 1011.8037
178
+ 2900 train 6.530170 (lr=4.2085e-05) (hash(x)=43758910)
179
+ 2900 val loss 7.1009
180
+ 2900 val perplexity 1213.0134
181
+ 2900 train 6.715117 (lr=5.8919e-05) (hash(x)=43758910)
182
+ 3000 val loss 6.9221
183
+ 3000 val perplexity 1014.4709
184
+ 3000 train 6.823209 (lr=4.1529e-05) (hash(x)=47965974)
185
+ 3000 val loss 7.1030
186
+ 3000 val perplexity 1215.6672
187
+ 3000 train 7.019185 (lr=5.8140e-05) (hash(x)=47965974)
188
+ 3100 val loss 6.8896
189
+ 3100 val perplexity 982.0345
190
+ 3100 train 6.756184 (lr=4.0957e-05) (hash(x)=48205243)
191
+ 3100 val loss 7.0986
192
+ 3100 val perplexity 1210.2229
193
+ 3100 train 6.971920 (lr=5.7340e-05) (hash(x)=48205243)
194
+ 3200 val loss 6.8666
195
+ 3200 val perplexity 959.7057
196
+ 3200 train 6.943860 (lr=4.0373e-05) (hash(x)=54511383)
197
+ 3200 val loss 7.0723
198
+ 3200 val perplexity 1178.8484
199
+ 3200 train 7.163098 (lr=5.6522e-05) (hash(x)=54511383)
200
+ 3300 val loss 6.8515
201
+ 3300 val perplexity 945.2843
202
+ 3300 train 6.816091 (lr=3.9775e-05) (hash(x)=54428388)
203
+ 3300 val loss 7.0479
204
+ 3300 val perplexity 1150.4756
205
+ 3300 train 7.016342 (lr=5.5684e-05) (hash(x)=54428388)
206
+ 3400 val loss 6.8383
207
+ 3400 val perplexity 932.9412
208
+ 3400 train 6.837810 (lr=3.9164e-05) (hash(x)=48115990)
209
+ 3400 val loss 7.0391
210
+ 3400 val perplexity 1140.3251
211
+ 3400 train 7.041271 (lr=5.4829e-05) (hash(x)=48115990)
212
+ 3500 val loss 6.8256
213
+ 3500 val perplexity 921.1580
214
+ 3500 train 6.416183 (lr=3.8541e-05) (hash(x)=41137345)
215
+ 3500 val loss 7.0312
216
+ 3500 val perplexity 1131.3469
217
+ 3500 train 6.639562 (lr=5.3958e-05) (hash(x)=41137345)
218
+ 3600 val loss 6.8099
219
+ 3600 val perplexity 906.7393
220
+ 3600 train 6.700560 (lr=3.7907e-05) (hash(x)=55186224)
221
+ 3600 val loss 7.0239
222
+ 3600 val perplexity 1123.1450
223
+ 3600 train 6.912842 (lr=5.3070e-05) (hash(x)=55186224)
224
+ 3700 val loss 6.8071
225
+ 3700 val perplexity 904.2829
226
+ 3700 train 6.704582 (lr=3.7262e-05) (hash(x)=54990049)
227
+ 3700 val loss 7.0245
228
+ 3700 val perplexity 1123.8029
229
+ 3700 train 6.886618 (lr=5.2167e-05) (hash(x)=54990049)
230
+ 3800 val loss 6.7851
231
+ 3800 val perplexity 884.5947
232
+ 3800 train 6.510171 (lr=3.6608e-05) (hash(x)=46288812)
233
+ 3800 val loss 7.0206
234
+ 3800 val perplexity 1119.4424
235
+ 3800 train 6.773824 (lr=5.1251e-05) (hash(x)=46288812)
236
+ 3900 val loss 6.7655
237
+ 3900 val perplexity 867.3606
238
+ 3900 train 6.406036 (lr=3.5944e-05) (hash(x)=45829773)
239
+ 3900 val loss 7.0133
240
+ 3900 val perplexity 1111.2819
241
+ 3900 train 6.649640 (lr=5.0321e-05) (hash(x)=45829773)
242
+ 4000 val loss 6.7527
243
+ 4000 val perplexity 856.3824
244
+ 4000 train 6.514423 (lr=3.5271e-05) (hash(x)=52499943)
245
+ 4000 val loss 6.9932
246
+ 4000 val perplexity 1089.1742
247
+ 4000 train 6.779234 (lr=4.9379e-05) (hash(x)=52499943)
248
+ 4100 val loss 6.7186
249
+ 4100 val perplexity 827.6704
250
+ 4100 train 6.617018 (lr=3.4590e-05) (hash(x)=48563796)
251
+ 4100 val loss 6.9636
252
+ 4100 val perplexity 1057.4326
253
+ 4100 train 6.877402 (lr=4.8426e-05) (hash(x)=48563796)
254
+ 4200 val loss 6.6921
255
+ 4200 val perplexity 806.0439
256
+ 4200 train 6.658999 (lr=3.3902e-05) (hash(x)=49165143)
257
+ 4200 val loss 6.9332
258
+ 4200 val perplexity 1025.7916
259
+ 4200 train 6.899343 (lr=4.7463e-05) (hash(x)=49165143)
260
+ 4300 val loss 6.6656
261
+ 4300 val perplexity 784.9682
262
+ 4300 train 6.739098 (lr=3.3207e-05) (hash(x)=50973176)
263
+ 4300 val loss 6.9141
264
+ 4300 val perplexity 1006.3737
265
+ 4300 train 6.983593 (lr=4.6490e-05) (hash(x)=50973176)
266
+ 4400 val loss 6.6427
267
+ 4400 val perplexity 767.1820
268
+ 4400 train 6.686252 (lr=3.2507e-05) (hash(x)=55275124)
269
+ 4400 val loss 6.8895
270
+ 4400 val perplexity 981.9423
271
+ 4400 train 6.929132 (lr=4.5509e-05) (hash(x)=55275124)
272
+ 4500 val loss 6.6312
273
+ 4500 val perplexity 758.4039
274
+ 4500 train 6.999173 (lr=3.1801e-05) (hash(x)=58646505)
275
+ 4500 val loss 6.8767
276
+ 4500 val perplexity 969.4566
277
+ 4500 train 7.242509 (lr=4.4521e-05) (hash(x)=58646505)
278
+ 4600 val loss 6.6130
279
+ 4600 val perplexity 744.7347
280
+ 4600 train 6.412494 (lr=3.1091e-05) (hash(x)=42554666)
281
+ 4600 val loss 6.8488
282
+ 4600 val perplexity 942.7289
283
+ 4600 train 6.660697 (lr=4.3527e-05) (hash(x)=42554666)
284
+ 4700 val loss 6.6027
285
+ 4700 val perplexity 737.1144
286
+ 4700 train 6.528410 (lr=3.0377e-05) (hash(x)=47846764)
287
+ 4700 val loss 6.8299
288
+ 4700 val perplexity 925.0769
289
+ 4700 train 6.747003 (lr=4.2528e-05) (hash(x)=47846764)
290
+ 4800 val loss 6.5888
291
+ 4800 val perplexity 726.8744
292
+ 4800 train 7.141794 (lr=2.9661e-05) (hash(x)=58239019)
293
+ 4800 val loss 6.8121
294
+ 4800 val perplexity 908.7399
295
+ 4800 train 7.329013 (lr=4.1525e-05) (hash(x)=58239019)
296
+ 4900 val loss 6.5791
297
+ 4900 val perplexity 719.8779
298
+ 4900 train 6.642244 (lr=2.8942e-05) (hash(x)=50711220)
299
+ 4900 val loss 6.7941
300
+ 4900 val perplexity 892.5339
301
+ 4900 train 6.849504 (lr=4.0518e-05) (hash(x)=50711220)
302
+ 5000 val loss 6.5743
303
+ 5000 val perplexity 716.4384
304
+ 5000 train 6.507086 (lr=2.8221e-05) (hash(x)=45994194)
305
+ 5000 val loss 6.7812
306
+ 5000 val perplexity 881.1363
307
+ 5000 train 6.706810 (lr=3.9510e-05) (hash(x)=45994194)
308
+ 5100 val loss 6.5611
309
+ 5100 val perplexity 707.0463
310
+ 5100 train 6.430910 (lr=2.7500e-05) (hash(x)=48659050)
311
+ 5100 val loss 6.7622
312
+ 5100 val perplexity 864.5593
313
+ 5100 train 6.633748 (lr=3.8500e-05) (hash(x)=48659050)
314
+ 5200 val loss 6.5536
315
+ 5200 val perplexity 701.7662
316
+ 5200 train 6.456882 (lr=2.6779e-05) (hash(x)=49369682)
317
+ 5200 val loss 6.7529
318
+ 5200 val perplexity 856.5147
319
+ 5200 train 6.673459 (lr=3.7490e-05) (hash(x)=49369682)
320
+ 5300 val loss 6.5406
321
+ 5300 val perplexity 692.6973
322
+ 5300 train 6.962357 (lr=2.6058e-05) (hash(x)=57787700)
323
+ 5300 val loss 6.7406
324
+ 5300 val perplexity 846.0259
325
+ 5300 train 7.159966 (lr=3.6482e-05) (hash(x)=57787700)
326
+ 5400 val loss 6.5395
327
+ 5400 val perplexity 691.9344
328
+ 5400 train 6.458861 (lr=2.5339e-05) (hash(x)=49365400)
329
+ 5400 val loss 6.7261
330
+ 5400 val perplexity 833.8752
331
+ 5500 val loss 6.5243
332
+ 5500 val perplexity 681.5267
333
+ 5400 train 6.657549 (lr=3.5475e-05) (hash(x)=49365400)
334
+ 5500 train 6.480797 (lr=2.4623e-05) (hash(x)=48720412)
335
+ 5500 val loss 6.7196
336
+ 5500 val perplexity 828.4913
337
+ 5600 val loss 6.5153
338
+ 5600 val perplexity 675.3881
339
+ 5500 train 6.674323 (lr=3.4472e-05) (hash(x)=48720412)
340
+ 5600 train 6.858240 (lr=2.3909e-05) (hash(x)=55784800)
341
+ 5700 val loss 6.5099
342
+ 5700 val perplexity 671.7321
343
+ 5700 train 6.328226 (lr=2.3199e-05) (hash(x)=50073634)
344
+ 5600 val loss 6.7044
345
+ 5600 val perplexity 815.9836
346
+ 5600 train 7.043695 (lr=3.3473e-05) (hash(x)=55784800)
347
+ 5800 val loss 6.5062
348
+ 5800 val perplexity 669.2766
349
+ 5800 train 6.321751 (lr=2.2493e-05) (hash(x)=50170324)
350
+ 5700 val loss 6.6907
351
+ 5700 val perplexity 804.8875
352
+ 5700 train 6.527936 (lr=3.2479e-05) (hash(x)=50073634)
353
+ 5900 val loss 6.5102
354
+ 5900 val perplexity 671.9464
355
+ 5900 train 6.234162 (lr=2.1793e-05) (hash(x)=48410268)
356
+ 5800 val loss 6.6803
357
+ 5800 val perplexity 796.5750
358
+ 5800 train 6.504085 (lr=3.1491e-05) (hash(x)=50170324)
359
+ 6000 val loss 6.5055
360
+ 6000 val perplexity 668.7971
361
+ 6000 train 6.368552 (lr=2.1098e-05) (hash(x)=49527342)
362
+ 5900 val loss 6.6904
363
+ 5900 val perplexity 804.6039
364
+ 5900 train 6.427372 (lr=3.0510e-05) (hash(x)=48410268)
365
+ 6100 val loss 6.4971
366
+ 6100 val perplexity 663.2069
367
+ 6100 train 6.286675 (lr=2.0410e-05) (hash(x)=49550294)
368
+ 6000 val loss 6.6704
369
+ 6000 val perplexity 788.7175
370
+ 6000 train 6.565147 (lr=2.9537e-05) (hash(x)=49527342)
371
+ 6200 val loss 6.4929
372
+ 6200 val perplexity 660.4078
373
+ 6200 train 6.008740 (lr=1.9729e-05) (hash(x)=42126106)
374
+ 6100 val loss 6.6618
375
+ 6100 val perplexity 781.9240
376
+ 6100 train 6.465837 (lr=2.8574e-05) (hash(x)=49550294)
377
+ 6300 val loss 6.4897
378
+ 6300 val perplexity 658.3044
379
+ 6300 train 6.224653 (lr=1.9056e-05) (hash(x)=49608772)
380
+ 6200 val loss 6.6584
381
+ 6200 val perplexity 779.2789
382
+ 6200 train 6.193086 (lr=2.7621e-05) (hash(x)=42126106)
383
+ 6400 val loss 6.4802
384
+ 6400 val perplexity 652.1339
385
+ 6400 train 6.016216 (lr=1.8392e-05) (hash(x)=52324417)
386
+ 6300 val loss 6.6522
387
+ 6300 val perplexity 774.4972
388
+ 6300 train 6.393860 (lr=2.6679e-05) (hash(x)=49608772)
389
+ 6500 val loss 6.4586
390
+ 6500 val perplexity 638.1494
391
+ 6500 train 6.503439 (lr=1.7738e-05) (hash(x)=46207215)
392
+ 6400 val loss 6.6367
393
+ 6400 val perplexity 762.5599
394
+ 6400 train 6.191780 (lr=2.5749e-05) (hash(x)=52324417)
395
+ 6600 val loss 6.4430
396
+ 6600 val perplexity 628.3019
397
+ 6600 train 6.401751 (lr=1.7093e-05) (hash(x)=49027014)
398
+ 6500 val loss 6.6079
399
+ 6500 val perplexity 740.8976
400
+ 6500 train 6.642453 (lr=2.4833e-05) (hash(x)=46207215)
401
+ 6700 val loss 6.4321
402
+ 6700 val perplexity 621.4651
403
+ 6700 train 6.459781 (lr=1.6459e-05) (hash(x)=46232513)
404
+ 6600 val loss 6.5917
405
+ 6600 val perplexity 729.0474
406
+ 6600 train 6.547888 (lr=2.3930e-05) (hash(x)=49027014)
407
+ 6800 val loss 6.4318
408
+ 6800 val perplexity 621.2779
409
+ 6800 train 6.339231 (lr=1.5836e-05) (hash(x)=47348403)
410
+ 6700 val loss 6.5817
411
+ 6700 val perplexity 721.7724
412
+ 6700 train 6.570748 (lr=2.3042e-05) (hash(x)=46232513)
413
+ 6900 val loss 6.4355
414
+ 6900 val perplexity 623.5914
415
+ 6900 train 6.444074 (lr=1.5225e-05) (hash(x)=49806647)
416
+ 6800 val loss 6.5738
417
+ 6800 val perplexity 716.0906
418
+ 6800 train 6.504720 (lr=2.2171e-05) (hash(x)=47348403)
419
+ 7000 val loss 6.4125
420
+ 7000 val perplexity 609.4304
421
+ 7000 train 6.457445 (lr=1.4627e-05) (hash(x)=50893018)
422
+ 6900 val loss 6.5837
423
+ 6900 val perplexity 723.2028
424
+ 6900 train 6.578163 (lr=2.1316e-05) (hash(x)=49806647)
425
+ 7100 val loss 6.4138
426
+ 7100 val perplexity 610.1981
427
+ 7100 train 6.440397 (lr=1.4043e-05) (hash(x)=49157639)
428
+ 7000 val loss 6.5469
429
+ 7000 val perplexity 697.0728
430
+ 7000 train 6.610025 (lr=2.0478e-05) (hash(x)=50893018)
431
+ 7200 val loss 6.4047
432
+ 7200 val perplexity 604.6961
433
+ 7200 train 6.387370 (lr=1.3471e-05) (hash(x)=47014759)
434
+ 7100 val loss 6.5489
435
+ 7100 val perplexity 698.4415
436
+ 7100 train 6.581534 (lr=1.9660e-05) (hash(x)=49157639)
437
+ 7300 val loss 6.4001
438
+ 7300 val perplexity 601.9218
439
+ 7300 train 6.383959 (lr=1.2915e-05) (hash(x)=47325591)
440
+ 7200 val loss 6.5344
441
+ 7200 val perplexity 688.4330
442
+ 7200 train 6.524500 (lr=1.8860e-05) (hash(x)=47014759)
443
+ 7400 val loss 6.3930
444
+ 7400 val perplexity 597.6287
445
+ 7400 train 6.285221 (lr=1.2373e-05) (hash(x)=49184604)
446
+ 7300 val loss 6.5278
447
+ 7300 val perplexity 683.9073
448
+ 7300 train 6.516582 (lr=1.8081e-05) (hash(x)=47325591)
449
+ 7500 val loss 6.3856
450
+ 7500 val perplexity 593.2499
451
+ 7500 train 6.649272 (lr=1.1847e-05) (hash(x)=55053584)
452
+ 7400 val loss 6.5201
453
+ 7400 val perplexity 678.6690
454
+ 7400 train 6.414679 (lr=1.7323e-05) (hash(x)=49184604)
455
+ 7600 val loss 6.3827
456
+ 7600 val perplexity 591.5508
457
+ 7600 train 6.325932 (lr=1.1337e-05) (hash(x)=48693923)
458
+ 7500 val loss 6.5144
459
+ 7500 val perplexity 674.8160
460
+ 7500 train 6.811224 (lr=1.6586e-05) (hash(x)=55053584)
461
+ 7700 val loss 6.3809
462
+ 7700 val perplexity 590.4791
463
+ 7700 train 5.916291 (lr=1.0844e-05) (hash(x)=40952882)
464
+ 7600 val loss 6.5111
465
+ 7600 val perplexity 672.5692
466
+ 7600 train 6.471277 (lr=1.5872e-05) (hash(x)=48693923)
467
+ 7800 val loss 6.3779
468
+ 7800 val perplexity 588.6711
469
+ 7800 train 6.476369 (lr=1.0367e-05) (hash(x)=52487845)
470
+ 7700 val loss 6.5067
471
+ 7700 val perplexity 669.5911
472
+ 7700 train 6.060650 (lr=1.5181e-05) (hash(x)=40952882)
473
+ 7900 val loss 6.3770
474
+ 7900 val perplexity 588.1319
475
+ 7900 train 6.461136 (lr=9.9088e-06) (hash(x)=50221547)
476
+ 7800 val loss 6.5008
477
+ 7800 val perplexity 665.6864
478
+ 7800 train 6.590390 (lr=1.4514e-05) (hash(x)=52487845)
479
+ 8000 val loss 6.3713
480
+ 8000 val perplexity 584.8088
481
+ 8000 train 6.648645 (lr=9.4682e-06) (hash(x)=62294204)
482
+ 7900 val loss 6.4972
483
+ 7900 val perplexity 663.2847
484
+ 7900 train 6.574942 (lr=1.3872e-05) (hash(x)=50221547)
485
+ 8100 val loss 6.3669
486
+ 8100 val perplexity 582.2661
487
+ 8100 train 6.040726 (lr=9.0461e-06) (hash(x)=44401967)
488
+ 8000 val loss 6.4938
489
+ 8000 val perplexity 661.0527
490
+ 8000 train 6.781792 (lr=1.3255e-05) (hash(x)=62294204)
491
+ 8200 val loss 6.3676
492
+ 8200 val perplexity 582.6852
493
+ 8200 train 6.291852 (lr=8.6430e-06) (hash(x)=52769095)
494
+ 8100 val loss 6.4890
495
+ 8100 val perplexity 657.8419
496
+ 8100 train 6.187840 (lr=1.2665e-05) (hash(x)=44401967)
497
+ 8300 val loss 6.3675
498
+ 8300 val perplexity 582.5966
499
+ 8300 train 6.317422 (lr=8.2593e-06) (hash(x)=56829883)
500
+ 8200 val loss 6.4881
501
+ 8200 val perplexity 657.2552
502
+ 8200 train 6.420963 (lr=1.2100e-05) (hash(x)=52769095)
503
+ 8400 val loss 6.3669
504
+ 8400 val perplexity 582.2473
505
+ 8400 train 6.269656 (lr=7.8953e-06) (hash(x)=52147375)
506
+ 8300 val loss 6.4923
507
+ 8300 val perplexity 660.0215
508
+ 8300 train 6.443340 (lr=1.1563e-05) (hash(x)=56829883)
509
+ 8500 val loss 6.3666
510
+ 8500 val perplexity 582.0471
511
+ 8500 train 6.658837 (lr=7.5515e-06) (hash(x)=60197820)
512
+ 8400 val loss 6.4939
513
+ 8400 val perplexity 661.1094
514
+ 8400 train 6.411846 (lr=1.1053e-05) (hash(x)=52147375)
515
+ 8600 val loss 6.3644
516
+ 8600 val perplexity 580.8034
517
+ 8600 train 6.102660 (lr=7.2282e-06) (hash(x)=49377068)
518
+ 8700 val loss 6.3617
519
+ 8700 val perplexity 579.2385
520
+ 8700 train 6.333673 (lr=6.9257e-06) (hash(x)=51092724)
521
+ 8500 val loss 6.4915
522
+ 8500 val perplexity 659.5043
523
+ 8500 train 6.751138 (lr=1.0572e-05) (hash(x)=60197820)
524
+ 8800 val loss 6.3632
525
+ 8800 val perplexity 580.0929
526
+ 8800 train 6.343732 (lr=6.6444e-06) (hash(x)=48642928)
527
+ 8600 val loss 6.4865
528
+ 8600 val perplexity 656.2393
529
+ 8600 train 6.233767 (lr=1.0119e-05) (hash(x)=49377068)
530
+ 8900 val loss 6.3662
531
+ 8900 val perplexity 581.8434
532
+ 8900 train 6.577977 (lr=6.3845e-06) (hash(x)=55342246)
533
+ 8700 val loss 6.4884
534
+ 8700 val perplexity 657.4505
535
+ 8700 train 6.452022 (lr=9.6960e-06) (hash(x)=51092724)
536
+ 9000 val loss 6.3454
537
+ 9000 val perplexity 569.8459
538
+ 9000 train 6.334527 (lr=6.1462e-06) (hash(x)=48093368)
539
+ 8800 val loss 6.4842
540
+ 8800 val perplexity 654.6934
541
+ 8800 train 6.479820 (lr=9.3021e-06) (hash(x)=48642928)
542
+ 9100 val loss 6.3400
543
+ 9100 val perplexity 566.8240
544
+ 9100 train 6.413877 (lr=5.9300e-06) (hash(x)=48578183)
545
+ 8900 val loss 6.4770
546
+ 8900 val perplexity 650.0265
547
+ 8900 train 6.673745 (lr=8.9382e-06) (hash(x)=55342246)
548
+ 9200 val loss 6.3371
549
+ 9200 val perplexity 565.1450
550
+ 9200 train 6.528768 (lr=5.7359e-06) (hash(x)=50794720)
551
+ 9000 val loss 6.4624
552
+ 9000 val perplexity 640.5951
553
+ 9000 train 6.451552 (lr=8.6047e-06) (hash(x)=48093368)
554
+ 9300 val loss 6.3341
555
+ 9300 val perplexity 563.4738
556
+ 9300 train 6.083742 (lr=5.5641e-06) (hash(x)=46513190)
557
+ 9100 val loss 6.4584
558
+ 9100 val perplexity 638.0627
559
+ 9100 train 6.534726 (lr=8.3020e-06) (hash(x)=48578183)
560
+ 9400 val loss 6.3325
561
+ 9400 val perplexity 562.5811
562
+ 9400 train 6.010540 (lr=5.4149e-06) (hash(x)=43808238)
563
+ 9500 val loss 6.3304
564
+ 9500 val perplexity 561.3873
565
+ 9500 train 6.091447 (lr=5.2884e-06) (hash(x)=45021888)
566
+ 9200 val loss 6.4552
567
+ 9200 val perplexity 636.0129
568
+ 9200 train 6.642209 (lr=8.0302e-06) (hash(x)=50794720)
569
+ 9600 val loss 6.3272
570
+ 9600 val perplexity 559.5969
571
+ 9600 train 6.422736 (lr=5.1847e-06) (hash(x)=56525570)
572
+ 9300 val loss 6.4515
573
+ 9300 val perplexity 633.6569
574
+ 9300 train 6.224042 (lr=7.7898e-06) (hash(x)=46513190)
575
+ 9700 val loss 6.3240
576
+ 9700 val perplexity 557.8226
577
+ 9700 train 6.512197 (lr=5.1040e-06) (hash(x)=52585913)
578
+ 9400 val loss 6.4492
579
+ 9400 val perplexity 632.1708
580
+ 9400 train 6.119009 (lr=7.5809e-06) (hash(x)=43808238)
581
+ 9800 val loss 6.3230
582
+ 9800 val perplexity 557.2422
583
+ 9800 train 6.537199 (lr=5.0462e-06) (hash(x)=52344698)
584
+ 9500 val loss 6.4458
585
+ 9500 val perplexity 630.0528
586
+ 9900 val loss 6.3207
587
+ 9900 val perplexity 555.9839
588
+ 9500 train 6.227046 (lr=7.4038e-06) (hash(x)=45021888)
589
+ 9900 train 6.319592 (lr=5.0116e-06) (hash(x)=51740945)
590
+ 9999 val loss 6.3184
591
+ 9999 val perplexity 554.7078
592
+ 9600 val loss 6.4448
593
+ 9600 val perplexity 629.4426
594
+ 9600 train 6.529235 (lr=7.2586e-06) (hash(x)=56525570)
595
+ 9700 val loss 6.4413
596
+ 9700 val perplexity 627.2043
597
+ 9700 train 6.629733 (lr=7.1456e-06) (hash(x)=52585913)
598
+ 9800 val loss 6.4397
599
+ 9800 val perplexity 626.2005
600
+ 9800 train 6.633550 (lr=7.0647e-06) (hash(x)=52344698)
601
+ 9900 val loss 6.4373
602
+ 9900 val perplexity 624.7111
603
+ 9900 train 6.438586 (lr=7.0162e-06) (hash(x)=51740945)
604
+ 9999 val loss 6.4359
605
+ 9999 val perplexity 623.8198
attention_kindselective_n_heads4_seed1338/model_02500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cfb2160933da3c9bd20cc75e7b5e3e17efe7597e8f4ceb7e2131d2db5fe2a7be
3
  size 92843394
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5555ffb4b2cceb8de7a85935af9bde78a8e12f00bfe537f9a6700433d8845a8
3
  size 92843394
attention_kindselective_n_heads4_seed1338/model_05000.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1a68456ab6dd22f9e0fa8d9d98fa5890dc9184894790e74af0dbd1de1e7eabb
3
  size 92843394
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c04035190bd46f9b0ed7808da2d4a12edcfe48d98592d3f25b51cfa5cebc7f75
3
  size 92843394
attention_kindselective_n_heads4_seed1338/model_07500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ad3c1940f30f39a067179a398d7645ba2a216761c016c202a7bd2ff59ad9146
3
  size 92843394
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:935d9743a04892562e7641457ff0d4005ceb841e7c4c4235482eaeed288493f0
3
  size 92843394
attention_kindselective_n_heads4_seed1338/model_09999.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:48cf363ca99835cf2690a96b5b649ff230b1bd43022ac3c5cc864f3dbc4ad02b
3
  size 92843394
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee20cfea6b75b11ea0b76d6461126f89256d5bbb4f082ae38fe67c7d47377126
3
  size 92843394
attention_kindselective_n_heads4_seed1338/optimizer_02500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4b9a2b4aab41d39c6c30b36ca11463d9a9c59401a5856681c6f004e3eb2b08b
3
  size 179406214
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43d0cb2f3cfea37b5079b99029c5c9bd80a6d7bb4746e206f8cec47096fd7833
3
  size 179406214
attention_kindselective_n_heads4_seed1338/optimizer_05000.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:85d3d0e3997291682cfc03365e6569ad60b2cc96084e2f466df903968cb7dad4
3
  size 179406214
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38cc2f6202bc33ddc3a918860a93e0d1dc9ef15836049b642b28e22df331fa4c
3
  size 179406214
attention_kindselective_n_heads4_seed1338/optimizer_07500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6720b9370b3c247560dd03bca887c5fd03cca9f0f8935c8f0e32f2b2d0bc882
3
  size 179406214
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14cbf8ccb5561be3b3741e5cf9b629cfbcb429c960a08e1b812c4786d2ef9cf9
3
  size 179406214
attention_kindselective_n_heads4_seed1338/optimizer_09999.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f09307ada0be1dc0ce77e7e2f58b55e8fcbcd5554b4db8268178906ff5dee43
3
  size 179406214
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7c734804fea06b9be0d6722d9437244ede6c63f1ca46b939296e687e528677e
3
  size 179406214