andrew-healey commited on
Commit
295810e
·
verified ·
1 Parent(s): 7b93a31

Upload folder using huggingface_hub

Browse files
attention_kindselective_n_heads4_seed1345/args.json CHANGED
@@ -1 +1 @@
1
- {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_9/attention_kindselective_n_heads4_seed1345", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_9", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1345, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 3.5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "3.5e-5_61440_4_1345", "n_embd": 256}
 
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_9/attention_kindselective_n_heads4_seed1345", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_9", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1345, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 4.5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "4.5e-5_61440_4_1345", "n_embd": 256}
attention_kindselective_n_heads4_seed1345/log2.txt CHANGED
@@ -1,534 +1,267 @@
1
  max_steps: 8750
2
- max_steps: 8750
3
- 0 val loss 11.2669
4
- 0 val perplexity 78187.7734
5
  0 val loss 11.2669
6
  0 val perplexity 78187.7734
7
- 0 train 11.258430 (lr=7.0000e-08) (hash(x)=134751525)
8
- 0 train 11.258430 (lr=6.0000e-08) (hash(x)=134751525)
9
- 100 val loss 10.1750
10
- 100 val perplexity 26239.3242
11
- 100 val loss 10.2733
12
- 100 val perplexity 28949.7637
13
- 100 train 10.194183 (lr=7.0700e-06) (hash(x)=150358957)
14
- 100 train 10.292044 (lr=6.0600e-06) (hash(x)=150358957)
15
- 200 val loss 9.7819
16
- 200 val perplexity 17709.6523
17
- 200 val loss 9.6434
18
- 200 val perplexity 15420.2432
19
- 200 train 9.817169 (lr=1.2060e-05) (hash(x)=126319983)
20
- 200 train 9.672717 (lr=1.4070e-05) (hash(x)=126319983)
21
- 300 val loss 9.0849
22
- 300 val perplexity 8821.1250
23
- 300 train 9.013100 (lr=1.8060e-05) (hash(x)=159305874)
24
- 300 val loss 8.6685
25
- 300 val perplexity 5817.0503
26
- 300 train 8.598497 (lr=2.1070e-05) (hash(x)=159305874)
27
- 400 val loss 8.2047
28
- 400 val perplexity 3658.1746
29
- 400 train 8.218742 (lr=2.4060e-05) (hash(x)=149855631)
30
- 400 val loss 7.8251
31
- 400 val perplexity 2502.6160
32
- 400 train 7.823033 (lr=2.8070e-05) (hash(x)=149855631)
33
- 500 val loss 7.6848
34
- 500 val perplexity 2175.0186
35
- 500 train 7.707771 (lr=3.0000e-05) (hash(x)=150706760)
36
- 500 val loss 7.5264
37
- 500 val perplexity 1856.4717
38
- 500 train 7.555436 (lr=3.5000e-05) (hash(x)=150706760)
39
- 600 val loss 7.4936
40
- 600 val perplexity 1796.5043
41
- 600 train 7.436448 (lr=2.9990e-05) (hash(x)=146858163)
42
- 600 val loss 7.4167
43
- 600 val perplexity 1663.5187
44
- 600 train 7.352398 (lr=3.4989e-05) (hash(x)=146858163)
45
- 700 val loss 7.4029
46
- 700 val perplexity 1640.6820
47
- 700 train 7.286075 (lr=2.9961e-05) (hash(x)=144262072)
48
- 700 val loss 7.3377
49
- 700 val perplexity 1537.2355
50
- 700 train 7.216905 (lr=3.4954e-05) (hash(x)=144262072)
51
- 800 val loss 7.3213
52
- 800 val perplexity 1512.1823
53
- 800 train 7.288542 (lr=2.9912e-05) (hash(x)=135443880)
54
- 800 val loss 7.2552
55
- 800 val perplexity 1415.4203
56
- 800 train 7.228300 (lr=3.4897e-05) (hash(x)=135443880)
57
- 900 val loss 7.2423
58
- 900 val perplexity 1397.3021
59
- 900 train 7.185520 (lr=2.9844e-05) (hash(x)=153147388)
60
- 900 val loss 7.1680
61
- 900 val perplexity 1297.2410
62
- 900 train 7.111306 (lr=3.4818e-05) (hash(x)=153147388)
63
- 1000 val loss 7.1730
64
- 1000 val perplexity 1303.8014
65
- 1000 train 7.101679 (lr=2.9756e-05) (hash(x)=151448445)
66
- 1000 val loss 7.1006
67
- 1000 val perplexity 1212.7179
68
- 1000 train 7.030866 (lr=3.4715e-05) (hash(x)=151448445)
69
- 1100 val loss 7.1041
70
- 1100 val perplexity 1216.9623
71
- 1100 train 7.102648 (lr=2.9649e-05) (hash(x)=153161010)
72
- 1100 val loss 7.0359
73
- 1100 val perplexity 1136.6917
74
- 1100 train 7.039263 (lr=3.4591e-05) (hash(x)=153161010)
75
- 1200 val loss 7.0362
76
- 1200 val perplexity 1137.0972
77
- 1200 train 6.867702 (lr=2.9523e-05) (hash(x)=143227423)
78
- 1200 val loss 6.9704
79
- 1200 val perplexity 1064.5956
80
- 1200 train 6.798764 (lr=3.4444e-05) (hash(x)=143227423)
81
- 1300 val loss 6.9623
82
- 1300 val perplexity 1056.0319
83
- 1300 train 7.441185 (lr=2.9378e-05) (hash(x)=176373796)
84
- 1300 val loss 6.9014
85
- 1300 val perplexity 993.7034
86
- 1300 train 7.385946 (lr=3.4275e-05) (hash(x)=176373796)
87
- 1400 val loss 6.8869
88
- 1400 val perplexity 979.3471
89
- 1400 train 6.808840 (lr=2.9215e-05) (hash(x)=155989503)
90
- 1400 val loss 6.8330
91
- 1400 val perplexity 927.9659
92
- 1400 train 6.754021 (lr=3.4084e-05) (hash(x)=155989503)
93
- 1500 val loss 6.8189
94
- 1500 val perplexity 914.9921
95
- 1500 train 6.790832 (lr=2.9033e-05) (hash(x)=156507542)
96
- 1500 val loss 6.7613
97
- 1500 val perplexity 863.7879
98
- 1500 train 6.730458 (lr=3.3872e-05) (hash(x)=156507542)
99
- 1600 val loss 6.7582
100
- 1600 val perplexity 861.0930
101
- 1600 train 6.778030 (lr=2.8833e-05) (hash(x)=156078901)
102
- 1600 val loss 6.6964
103
- 1600 val perplexity 809.4827
104
- 1600 train 6.710325 (lr=3.3638e-05) (hash(x)=156078901)
105
- 1700 val loss 6.6933
106
- 1700 val perplexity 807.0126
107
- 1700 train 6.597588 (lr=2.8615e-05) (hash(x)=156889457)
108
- 1700 val loss 6.6403
109
- 1700 val perplexity 765.3299
110
- 1700 train 6.545618 (lr=3.3384e-05) (hash(x)=156889457)
111
- 1800 val loss 6.6458
112
- 1800 val perplexity 769.5268
113
- 1800 train 6.895873 (lr=2.8379e-05) (hash(x)=162477906)
114
- 1800 val loss 6.5875
115
- 1800 val perplexity 725.9478
116
- 1800 train 6.827469 (lr=3.3109e-05) (hash(x)=162477906)
117
- 1900 val loss 6.5819
118
- 1900 val perplexity 721.9397
119
- 1900 train 6.625430 (lr=2.8127e-05) (hash(x)=158648033)
120
- 1900 val loss 6.5338
121
- 1900 val perplexity 688.0337
122
- 1900 train 6.575150 (lr=3.2814e-05) (hash(x)=158648033)
123
- 2000 val loss 6.5412
124
- 2000 val perplexity 693.0895
125
- 2000 train 6.461535 (lr=2.7857e-05) (hash(x)=156732586)
126
- 2000 val loss 6.4917
127
- 2000 val perplexity 659.6571
128
- 2000 train 6.414240 (lr=3.2500e-05) (hash(x)=156732586)
129
- 2100 val loss 6.4977
130
- 2100 val perplexity 663.5922
131
- 2100 train 6.521001 (lr=2.7571e-05) (hash(x)=159721084)
132
- 2100 val loss 6.4501
133
- 2100 val perplexity 632.7357
134
- 2100 train 6.469040 (lr=3.2166e-05) (hash(x)=159721084)
135
- 2200 val loss 6.4529
136
- 2200 val perplexity 634.5237
137
- 2200 train 6.426077 (lr=2.7269e-05) (hash(x)=144907001)
138
- 2200 val loss 6.4076
139
- 2200 val perplexity 606.4177
140
- 2200 train 6.382673 (lr=3.1813e-05) (hash(x)=144907001)
141
- 2300 val loss 6.4275
142
- 2300 val perplexity 618.5964
143
- 2300 train 6.370883 (lr=2.6951e-05) (hash(x)=154541837)
144
- 2300 val loss 6.3895
145
- 2300 val perplexity 595.5529
146
- 2300 train 6.327733 (lr=3.1443e-05) (hash(x)=154541837)
147
- 2400 val loss 6.3938
148
- 2400 val perplexity 598.1364
149
- 2400 train 6.478797 (lr=2.6618e-05) (hash(x)=161706790)
150
- 2400 val loss 6.3556
151
- 2400 val perplexity 575.6793
152
- 2400 train 6.445687 (lr=3.1054e-05) (hash(x)=161706790)
153
- 2500 val loss 6.3581
154
- 2500 val perplexity 577.1680
155
- 2500 train 6.277150 (lr=2.6270e-05) (hash(x)=143556162)
156
- 2500 val loss 6.3157
157
- 2500 val perplexity 553.1826
158
- 2500 train 6.235758 (lr=3.0649e-05) (hash(x)=143556162)
159
- 2600 val loss 6.3334
160
- 2600 val perplexity 563.0918
161
- 2600 train 6.187194 (lr=2.5909e-05) (hash(x)=142425078)
162
- 2600 val loss 6.2915
163
- 2600 val perplexity 539.9864
164
- 2600 train 6.143201 (lr=3.0227e-05) (hash(x)=142425078)
165
- 2700 val loss 6.3056
166
- 2700 val perplexity 547.6417
167
- 2700 train 6.324026 (lr=2.5533e-05) (hash(x)=163195606)
168
- 2700 val loss 6.2703
169
- 2700 val perplexity 528.6313
170
- 2700 train 6.290276 (lr=2.9789e-05) (hash(x)=163195606)
171
- 2800 val loss 6.2788
172
- 2800 val perplexity 533.1338
173
- 2800 train 6.179240 (lr=2.5145e-05) (hash(x)=154078337)
174
- 2800 val loss 6.2392
175
- 2800 val perplexity 512.4443
176
- 2800 train 6.143046 (lr=2.9336e-05) (hash(x)=154078337)
177
- 2900 val loss 6.2577
178
- 2900 val perplexity 522.0040
179
- 2900 train 6.118522 (lr=2.4744e-05) (hash(x)=145186687)
180
- 2900 val loss 6.2222
181
- 2900 val perplexity 503.8161
182
- 2900 train 6.084497 (lr=2.8868e-05) (hash(x)=145186687)
183
- 3000 val loss 6.2310
184
- 3000 val perplexity 508.2703
185
- 3000 train 6.183014 (lr=2.4331e-05) (hash(x)=149726716)
186
- 3000 val loss 6.1939
187
- 3000 val perplexity 489.7742
188
- 3000 train 6.147453 (lr=2.8386e-05) (hash(x)=149726716)
189
- 3100 val loss 6.2127
190
- 3100 val perplexity 499.0278
191
- 3100 train 6.136366 (lr=2.3906e-05) (hash(x)=151122509)
192
- 3100 val loss 6.1761
193
- 3100 val perplexity 481.1284
194
- 3100 train 6.095349 (lr=2.7891e-05) (hash(x)=151122509)
195
- 3200 val loss 6.1962
196
- 3200 val perplexity 490.8775
197
- 3200 train 6.167598 (lr=2.3471e-05) (hash(x)=152172187)
198
- 3200 val loss 6.1624
199
- 3200 val perplexity 474.5640
200
- 3200 train 6.133832 (lr=2.7383e-05) (hash(x)=152172187)
201
- 3300 val loss 6.1759
202
- 3300 val perplexity 481.0361
203
- 3300 train 6.112210 (lr=2.3026e-05) (hash(x)=150581974)
204
- 3300 val loss 6.1415
205
- 3300 val perplexity 464.7546
206
- 3300 train 6.078421 (lr=2.6864e-05) (hash(x)=150581974)
207
- 3400 val loss 6.1620
208
- 3400 val perplexity 474.3841
209
- 3400 train 6.091798 (lr=2.2572e-05) (hash(x)=160061666)
210
- 3400 val loss 6.1262
211
- 3400 val perplexity 457.6865
212
- 3400 train 6.052583 (lr=2.6333e-05) (hash(x)=160061666)
213
- 3500 val loss 6.1390
214
- 3500 val perplexity 463.6089
215
- 3500 train 6.089871 (lr=2.2108e-05) (hash(x)=150556913)
216
- 3500 val loss 6.1030
217
- 3500 val perplexity 447.1866
218
- 3500 train 6.052739 (lr=2.5793e-05) (hash(x)=150556913)
219
- 3600 val loss 6.1277
220
- 3600 val perplexity 458.3880
221
- 3600 train 6.037222 (lr=2.1637e-05) (hash(x)=148311961)
222
- 3600 val loss 6.0914
223
- 3600 val perplexity 442.0209
224
- 3600 train 5.994712 (lr=2.5243e-05) (hash(x)=148311961)
225
- 3700 val loss 6.1109
226
- 3700 val perplexity 450.7532
227
- 3700 train 6.010225 (lr=2.1158e-05) (hash(x)=150305284)
228
- 3700 val loss 6.0769
229
- 3700 val perplexity 435.6559
230
- 3700 train 5.972827 (lr=2.4684e-05) (hash(x)=150305284)
231
- 3800 val loss 6.0935
232
- 3800 val perplexity 442.9523
233
- 3800 train 6.107690 (lr=2.0672e-05) (hash(x)=148428531)
234
- 3800 val loss 6.0581
235
- 3800 val perplexity 427.5569
236
- 3800 train 6.070702 (lr=2.4117e-05) (hash(x)=148428531)
237
- 3900 val loss 6.0789
238
- 3900 val perplexity 436.5287
239
- 3900 train 5.950237 (lr=2.0180e-05) (hash(x)=142448374)
240
- 3900 val loss 6.0463
241
- 3900 val perplexity 422.5590
242
- 3900 train 5.920741 (lr=2.3543e-05) (hash(x)=142448374)
243
- 4000 val loss 6.0688
244
- 4000 val perplexity 432.1684
245
- 4000 train 6.003921 (lr=1.9683e-05) (hash(x)=158002288)
246
- 4000 val loss 6.0329
247
- 4000 val perplexity 416.9308
248
- 4000 train 5.960671 (lr=2.2963e-05) (hash(x)=158002288)
249
- 4100 val loss 6.0503
250
- 4100 val perplexity 424.2229
251
- 4100 train 5.992175 (lr=1.9181e-05) (hash(x)=151388626)
252
- 4100 val loss 6.0160
253
- 4100 val perplexity 409.9351
254
- 4100 train 5.953603 (lr=2.2378e-05) (hash(x)=151388626)
255
- 4200 val loss 6.0385
256
- 4200 val perplexity 419.2610
257
- 4200 train 5.956691 (lr=1.8675e-05) (hash(x)=145123294)
258
- 4200 val loss 6.0044
259
- 4200 val perplexity 405.1890
260
- 4200 train 5.921735 (lr=2.1788e-05) (hash(x)=145123294)
261
- 4300 val loss 6.0281
262
- 4300 val perplexity 414.9109
263
- 4300 train 5.895347 (lr=1.8166e-05) (hash(x)=145869150)
264
- 4300 val loss 5.9918
265
- 4300 val perplexity 400.1511
266
- 4300 train 5.864752 (lr=2.1194e-05) (hash(x)=145869150)
267
- 4400 val loss 6.0184
268
- 4400 val perplexity 410.9097
269
- 4400 train 6.043102 (lr=1.7655e-05) (hash(x)=146923320)
270
- 4400 val loss 5.9831
271
- 4400 val perplexity 396.6707
272
- 4400 train 6.006795 (lr=2.0598e-05) (hash(x)=146923320)
273
- 4500 val loss 6.0034
274
- 4500 val perplexity 404.8162
275
- 4500 train 5.822377 (lr=1.7142e-05) (hash(x)=146720819)
276
- 4500 val loss 5.9655
277
- 4500 val perplexity 389.7443
278
- 4500 train 5.786932 (lr=1.9999e-05) (hash(x)=146720819)
279
- 4600 val loss 5.9952
280
- 4600 val perplexity 401.5007
281
- 4600 train 5.732059 (lr=1.6629e-05) (hash(x)=140342909)
282
- 4600 val loss 5.9578
283
- 4600 val perplexity 386.7601
284
- 4600 train 5.701501 (lr=1.9400e-05) (hash(x)=140342909)
285
- 4700 val loss 5.9897
286
- 4700 val perplexity 399.2866
287
- 4700 train 6.129215 (lr=1.6114e-05) (hash(x)=163866463)
288
- 4700 val loss 5.9490
289
- 4700 val perplexity 383.3703
290
- 4700 train 6.082926 (lr=1.8800e-05) (hash(x)=163866463)
291
- 4800 val loss 5.9734
292
- 4800 val perplexity 392.8430
293
- 4800 train 5.674410 (lr=1.5601e-05) (hash(x)=133459145)
294
- 4800 val loss 5.9356
295
- 4800 val perplexity 378.2767
296
- 4800 train 5.632190 (lr=1.8201e-05) (hash(x)=133459145)
297
- 4900 val loss 5.9669
298
- 4900 val perplexity 390.2767
299
- 4900 train 5.767529 (lr=1.5089e-05) (hash(x)=143144356)
300
- 4900 val loss 5.9261
301
- 4900 val perplexity 374.6805
302
- 4900 train 5.728006 (lr=1.7604e-05) (hash(x)=143144356)
303
- 5000 val loss 5.9762
304
- 5000 val perplexity 393.9481
305
- 5000 train 5.873638 (lr=1.4579e-05) (hash(x)=132636494)
306
- 5000 val loss 5.9323
307
- 5000 val perplexity 377.0263
308
- 5000 train 5.834423 (lr=1.7009e-05) (hash(x)=132636494)
309
- 5100 val loss 5.9468
310
- 5100 val perplexity 382.5381
311
- 5100 train 5.866047 (lr=1.4071e-05) (hash(x)=157278728)
312
- 5100 val loss 5.9048
313
- 5100 val perplexity 366.7968
314
- 5100 train 5.822343 (lr=1.6417e-05) (hash(x)=157278728)
315
- 5200 val loss 5.9387
316
- 5200 val perplexity 379.4555
317
- 5200 train 5.798069 (lr=1.3568e-05) (hash(x)=148198434)
318
- 5200 val loss 5.8980
319
- 5200 val perplexity 364.3023
320
- 5200 train 5.757902 (lr=1.5829e-05) (hash(x)=148198434)
321
- 5300 val loss 5.9282
322
- 5300 val perplexity 375.4887
323
- 5300 train 5.859479 (lr=1.3068e-05) (hash(x)=138380906)
324
- 5300 val loss 5.8857
325
- 5300 val perplexity 359.8555
326
- 5300 train 5.814668 (lr=1.5246e-05) (hash(x)=138380906)
327
- 5400 val loss 5.9193
328
- 5400 val perplexity 372.1389
329
- 5400 train 6.013505 (lr=1.2573e-05) (hash(x)=163441464)
330
- 5400 val loss 5.8765
331
- 5400 val perplexity 356.5496
332
- 5400 train 5.965582 (lr=1.4669e-05) (hash(x)=163441464)
333
- 5500 val loss 5.9173
334
- 5500 val perplexity 371.4005
335
- 5500 train 5.873360 (lr=1.2085e-05) (hash(x)=154347714)
336
- 5500 val loss 5.8715
337
- 5500 val perplexity 354.7765
338
- 5500 train 5.825165 (lr=1.4099e-05) (hash(x)=154347714)
339
- 5600 val loss 5.9056
340
- 5600 val perplexity 367.1030
341
- 5600 train 5.935726 (lr=1.1602e-05) (hash(x)=148449981)
342
- 5600 val loss 5.8623
343
- 5600 val perplexity 351.5228
344
- 5600 train 5.894027 (lr=1.3536e-05) (hash(x)=148449981)
345
- 5700 val loss 5.8968
346
- 5700 val perplexity 363.8591
347
- 5700 train 5.787348 (lr=1.1127e-05) (hash(x)=139300274)
348
- 5700 val loss 5.8525
349
- 5700 val perplexity 348.0916
350
- 5700 train 5.742023 (lr=1.2981e-05) (hash(x)=139300274)
351
- 5800 val loss 5.8947
352
- 5800 val perplexity 363.1082
353
- 5800 train 5.880175 (lr=1.0659e-05) (hash(x)=154857144)
354
- 5800 val loss 5.8471
355
- 5800 val perplexity 346.2293
356
- 5800 train 5.833060 (lr=1.2436e-05) (hash(x)=154857144)
357
- 5900 val loss 5.8848
358
- 5900 val perplexity 359.5408
359
- 5900 train 5.955789 (lr=1.0200e-05) (hash(x)=151756013)
360
- 5900 val loss 5.8394
361
- 5900 val perplexity 343.5686
362
- 5900 train 5.909068 (lr=1.1900e-05) (hash(x)=151756013)
363
- 6000 val loss 5.8803
364
- 6000 val perplexity 357.8994
365
- 6000 train 5.779049 (lr=9.7500e-06) (hash(x)=145414657)
366
- 6000 val loss 5.8321
367
- 6000 val perplexity 341.0894
368
- 6000 train 5.732763 (lr=1.1375e-05) (hash(x)=145414657)
369
- 6100 val loss 5.8751
370
- 6100 val perplexity 356.0493
371
- 6100 train 5.700992 (lr=9.3098e-06) (hash(x)=139501217)
372
- 6100 val loss 5.8282
373
- 6100 val perplexity 339.7401
374
- 6100 train 5.652315 (lr=1.0861e-05) (hash(x)=139501217)
375
- 6200 val loss 5.8671
376
- 6200 val perplexity 353.2263
377
- 6200 train 5.908251 (lr=8.8800e-06) (hash(x)=155783358)
378
- 6200 val loss 5.8197
379
- 6200 val perplexity 336.8552
380
- 6200 train 5.859098 (lr=1.0360e-05) (hash(x)=155783358)
381
- 6300 val loss 5.8604
382
- 6300 val perplexity 350.8642
383
- 6300 train 5.921547 (lr=8.4613e-06) (hash(x)=157033091)
384
- 6300 val loss 5.8135
385
- 6300 val perplexity 334.7826
386
- 6300 train 5.880195 (lr=9.8715e-06) (hash(x)=157033091)
387
- 6400 val loss 5.8582
388
- 6400 val perplexity 350.0984
389
- 6400 train 5.662329 (lr=8.0542e-06) (hash(x)=144475330)
390
- 6400 val loss 5.8101
391
- 6400 val perplexity 333.6645
392
- 6400 train 5.613411 (lr=9.3966e-06) (hash(x)=144475330)
393
- 6500 val loss 5.8514
394
- 6500 val perplexity 347.7374
395
- 6500 train 5.894106 (lr=7.6594e-06) (hash(x)=124041822)
396
- 6500 val loss 5.8043
397
- 6500 val perplexity 331.7292
398
- 6500 train 5.851692 (lr=8.9359e-06) (hash(x)=124041822)
399
- 6600 val loss 5.8488
400
- 6600 val perplexity 346.8195
401
- 6600 train 5.777397 (lr=7.2774e-06) (hash(x)=142131981)
402
- 6600 val loss 5.8006
403
- 6600 val perplexity 330.5139
404
- 6600 train 5.729760 (lr=8.4903e-06) (hash(x)=142131981)
405
- 6700 val loss 5.8397
406
- 6700 val perplexity 343.6647
407
- 6700 train 5.932328 (lr=6.9087e-06) (hash(x)=154916248)
408
- 6700 val loss 5.7911
409
- 6700 val perplexity 327.3885
410
- 6700 train 5.880747 (lr=8.0602e-06) (hash(x)=154916248)
411
- 6800 val loss 5.8399
412
- 6800 val perplexity 343.7483
413
- 6800 train 5.837266 (lr=6.5540e-06) (hash(x)=147996387)
414
- 6800 val loss 5.7908
415
- 6800 val perplexity 327.2812
416
- 6800 train 5.789515 (lr=7.6463e-06) (hash(x)=147996387)
417
- 6900 val loss 5.8349
418
- 6900 val perplexity 342.0426
419
- 6900 train 5.866730 (lr=6.2137e-06) (hash(x)=153422428)
420
- 6900 val loss 5.7859
421
- 6900 val perplexity 325.6611
422
- 6900 train 5.821322 (lr=7.2493e-06) (hash(x)=153422428)
423
- 7000 val loss 5.8287
424
- 7000 val perplexity 339.9290
425
- 7000 train 5.845691 (lr=5.8883e-06) (hash(x)=174135078)
426
- 7000 val loss 5.7791
427
- 7000 val perplexity 323.4625
428
- 7000 train 5.799284 (lr=6.8697e-06) (hash(x)=174135078)
429
- 7100 val loss 5.8261
430
- 7100 val perplexity 339.0508
431
- 7100 train 5.890498 (lr=5.5783e-06) (hash(x)=160382475)
432
- 7100 val loss 5.7757
433
- 7100 val perplexity 322.3593
434
- 7100 train 5.846087 (lr=6.5080e-06) (hash(x)=160382475)
435
- 7200 val loss 5.8238
436
- 7200 val perplexity 338.2447
437
- 7200 train 5.911463 (lr=5.2841e-06) (hash(x)=155310085)
438
- 7200 val loss 5.7736
439
- 7200 val perplexity 321.6892
440
- 7200 train 5.862586 (lr=6.1648e-06) (hash(x)=155310085)
441
- 7300 val loss 5.8176
442
- 7300 val perplexity 336.1636
443
- 7300 train 5.947505 (lr=5.0062e-06) (hash(x)=159329031)
444
- 7300 val loss 5.7693
445
- 7300 val perplexity 320.3102
446
- 7300 train 5.899338 (lr=5.8405e-06) (hash(x)=159329031)
447
- 7400 val loss 5.8165
448
- 7400 val perplexity 335.7898
449
- 7400 train 5.755105 (lr=4.7449e-06) (hash(x)=151577269)
450
- 7400 val loss 5.7669
451
- 7400 val perplexity 319.5350
452
- 7400 train 5.706599 (lr=5.5357e-06) (hash(x)=151577269)
453
- 7500 val loss 5.8145
454
- 7500 val perplexity 335.1223
455
- 7500 train 5.860515 (lr=4.5007e-06) (hash(x)=160620287)
456
- 7500 val loss 5.7642
457
- 7500 val perplexity 318.6920
458
- 7500 train 5.815320 (lr=5.2508e-06) (hash(x)=160620287)
459
- 7600 val loss 5.8093
460
- 7600 val perplexity 333.3877
461
- 7600 train 5.698301 (lr=4.2739e-06) (hash(x)=143711597)
462
- 7600 val loss 5.7591
463
- 7600 val perplexity 317.0681
464
- 7600 train 5.647112 (lr=4.9862e-06) (hash(x)=143711597)
465
- 7700 val loss 5.8077
466
- 7700 val perplexity 332.8600
467
- 7700 train 5.814168 (lr=4.0648e-06) (hash(x)=149243577)
468
- 7700 val loss 5.7580
469
- 7700 val perplexity 316.6993
470
- 7700 train 5.764244 (lr=4.7423e-06) (hash(x)=149243577)
471
- 7800 val loss 5.8074
472
- 7800 val perplexity 332.7407
473
- 7800 train 5.800000 (lr=3.8738e-06) (hash(x)=147677247)
474
- 7800 val loss 5.7568
475
- 7800 val perplexity 316.3229
476
- 7800 train 5.748062 (lr=4.5194e-06) (hash(x)=147677247)
477
- 7900 val loss 5.8000
478
- 7900 val perplexity 330.2837
479
- 7900 train 5.754653 (lr=3.7010e-06) (hash(x)=145801236)
480
- 7900 val loss 5.7508
481
- 7900 val perplexity 314.4332
482
- 7900 train 5.701422 (lr=4.3179e-06) (hash(x)=145801236)
483
- 8000 val loss 5.7995
484
- 8000 val perplexity 330.1450
485
- 8000 train 5.912129 (lr=3.5468e-06) (hash(x)=158007655)
486
- 8000 val loss 5.7496
487
- 8000 val perplexity 314.0717
488
- 8000 train 5.863448 (lr=4.1380e-06) (hash(x)=158007655)
489
- 8100 val loss 5.7986
490
- 8100 val perplexity 329.8300
491
- 8100 train 5.867946 (lr=3.4114e-06) (hash(x)=158030048)
492
- 8100 val loss 5.7484
493
- 8100 val perplexity 313.6838
494
- 8100 train 5.815113 (lr=3.9800e-06) (hash(x)=158030048)
495
- 8200 val loss 5.7953
496
- 8200 val perplexity 328.7345
497
- 8200 train 5.643704 (lr=3.2950e-06) (hash(x)=150674725)
498
- 8200 val loss 5.7446
499
- 8200 val perplexity 312.4954
500
- 8200 train 5.588871 (lr=3.8442e-06) (hash(x)=150674725)
501
- 8300 val loss 5.7946
502
- 8300 val perplexity 328.5292
503
- 8300 train 5.713874 (lr=3.1977e-06) (hash(x)=150805711)
504
- 8300 val loss 5.7437
505
- 8300 val perplexity 312.2120
506
- 8300 train 5.659549 (lr=3.7307e-06) (hash(x)=150805711)
507
- 8400 val loss 5.7908
508
- 8400 val perplexity 327.2678
509
- 8400 train 5.666281 (lr=3.1197e-06) (hash(x)=138396210)
510
- 8400 val loss 5.7395
511
- 8400 val perplexity 310.9045
512
- 8400 train 5.610754 (lr=3.6397e-06) (hash(x)=138396210)
513
- 8500 val loss 5.7905
514
- 8500 val perplexity 327.1677
515
- 8500 train 5.663531 (lr=3.0611e-06) (hash(x)=156172740)
516
- 8500 val loss 5.7396
517
- 8500 val perplexity 310.9279
518
- 8500 train 5.616183 (lr=3.5713e-06) (hash(x)=156172740)
519
- 8600 val loss 5.7883
520
- 8600 val perplexity 326.4504
521
- 8600 train 5.838070 (lr=3.0220e-06) (hash(x)=154911352)
522
- 8600 val loss 5.7373
523
- 8600 val perplexity 310.2140
524
- 8600 train 5.787480 (lr=3.5257e-06) (hash(x)=154911352)
525
- 8700 val loss 5.7860
526
- 8700 val perplexity 325.7220
527
- 8700 train 5.850677 (lr=3.0024e-06) (hash(x)=153446789)
528
- 8749 val loss 5.7843
529
- 8749 val perplexity 325.1580
530
- 8700 val loss 5.7350
531
- 8700 val perplexity 309.5193
532
- 8700 train 5.797166 (lr=3.5029e-06) (hash(x)=153446789)
533
- 8749 val loss 5.7334
534
- 8749 val perplexity 309.0074
 
1
  max_steps: 8750
 
 
 
2
  0 val loss 11.2669
3
  0 val perplexity 78187.7734
4
+ 0 train 11.258421 (lr=9.0000e-08) (hash(x)=134751525)
5
+ 100 val loss 10.0591
6
+ 100 val perplexity 23368.2949
7
+ 100 train 10.084015 (lr=9.0900e-06) (hash(x)=150358957)
8
+ 200 val loss 9.4623
9
+ 200 val perplexity 12865.2119
10
+ 200 train 9.482208 (lr=1.8090e-05) (hash(x)=126319983)
11
+ 300 val loss 8.2846
12
+ 300 val perplexity 3962.3010
13
+ 300 train 8.218966 (lr=2.7090e-05) (hash(x)=159305874)
14
+ 400 val loss 7.6156
15
+ 400 val perplexity 2029.5901
16
+ 400 train 7.596677 (lr=3.6090e-05) (hash(x)=149855631)
17
+ 500 val loss 7.4200
18
+ 500 val perplexity 1669.0735
19
+ 500 train 7.448377 (lr=4.5000e-05) (hash(x)=150706760)
20
+ 600 val loss 7.3022
21
+ 600 val perplexity 1483.5238
22
+ 600 train 7.233404 (lr=4.4985e-05) (hash(x)=146858163)
23
+ 700 val loss 7.1946
24
+ 700 val perplexity 1332.2479
25
+ 700 train 7.066607 (lr=4.4941e-05) (hash(x)=144262072)
26
+ 800 val loss 7.0810
27
+ 800 val perplexity 1189.1654
28
+ 800 train 7.053042 (lr=4.4868e-05) (hash(x)=135443880)
29
+ 900 val loss 6.9834
30
+ 900 val perplexity 1078.5498
31
+ 900 train 6.929296 (lr=4.4766e-05) (hash(x)=153147388)
32
+ 1000 val loss 6.9159
33
+ 1000 val perplexity 1008.2147
34
+ 1000 train 6.843293 (lr=4.4634e-05) (hash(x)=151448445)
35
+ 1100 val loss 6.8259
36
+ 1100 val perplexity 921.3812
37
+ 1100 train 6.837877 (lr=4.4474e-05) (hash(x)=153161010)
38
+ 1200 val loss 6.7506
39
+ 1200 val perplexity 854.6096
40
+ 1200 train 6.578292 (lr=4.4285e-05) (hash(x)=143227423)
41
+ 1300 val loss 6.6868
42
+ 1300 val perplexity 801.7579
43
+ 1300 train 7.172554 (lr=4.4068e-05) (hash(x)=176373796)
44
+ 1400 val loss 6.6160
45
+ 1400 val perplexity 746.9671
46
+ 1400 train 6.540329 (lr=4.3822e-05) (hash(x)=155989503)
47
+ 1500 val loss 6.5556
48
+ 1500 val perplexity 703.1462
49
+ 1500 train 6.532267 (lr=4.3549e-05) (hash(x)=156507542)
50
+ 1600 val loss 6.5053
51
+ 1600 val perplexity 668.6906
52
+ 1600 train 6.507467 (lr=4.3249e-05) (hash(x)=156078901)
53
+ 1700 val loss 6.4618
54
+ 1700 val perplexity 640.2064
55
+ 1700 train 6.365796 (lr=4.2922e-05) (hash(x)=156889457)
56
+ 1800 val loss 6.4338
57
+ 1800 val perplexity 622.5391
58
+ 1800 train 6.691582 (lr=4.2569e-05) (hash(x)=162477906)
59
+ 1900 val loss 6.3811
60
+ 1900 val perplexity 590.5602
61
+ 1900 train 6.422180 (lr=4.2190e-05) (hash(x)=158648033)
62
+ 2000 val loss 6.3547
63
+ 2000 val perplexity 575.2073
64
+ 2000 train 6.281474 (lr=4.1785e-05) (hash(x)=156732586)
65
+ 2100 val loss 6.3244
66
+ 2100 val perplexity 558.0005
67
+ 2100 train 6.340858 (lr=4.1356e-05) (hash(x)=159721084)
68
+ 2200 val loss 6.2931
69
+ 2200 val perplexity 540.8015
70
+ 2200 train 6.273417 (lr=4.0903e-05) (hash(x)=144907001)
71
+ 2300 val loss 6.2683
72
+ 2300 val perplexity 527.5813
73
+ 2300 train 6.208333 (lr=4.0426e-05) (hash(x)=154541837)
74
+ 2400 val loss 6.2421
75
+ 2400 val perplexity 513.9549
76
+ 2400 train 6.331748 (lr=3.9927e-05) (hash(x)=161706790)
77
+ 2500 val loss 6.2153
78
+ 2500 val perplexity 500.3525
79
+ 2500 train 6.141446 (lr=3.9406e-05) (hash(x)=143556162)
80
+ 2600 val loss 6.1963
81
+ 2600 val perplexity 490.9108
82
+ 2600 train 6.050023 (lr=3.8863e-05) (hash(x)=142425078)
83
+ 2700 val loss 6.1650
84
+ 2700 val perplexity 475.8134
85
+ 2700 train 6.189079 (lr=3.8300e-05) (hash(x)=163195606)
86
+ 2800 val loss 6.1348
87
+ 2800 val perplexity 461.6595
88
+ 2800 train 6.041891 (lr=3.7717e-05) (hash(x)=154078337)
89
+ 2900 val loss 6.1197
90
+ 2900 val perplexity 454.7504
91
+ 2900 train 5.981575 (lr=3.7116e-05) (hash(x)=145186687)
92
+ 3000 val loss 6.0966
93
+ 3000 val perplexity 444.3332
94
+ 3000 train 6.058323 (lr=3.6496e-05) (hash(x)=149726716)
95
+ 3100 val loss 6.0758
96
+ 3100 val perplexity 435.2021
97
+ 3100 train 6.000450 (lr=3.5860e-05) (hash(x)=151122509)
98
+ 3200 val loss 6.0591
99
+ 3200 val perplexity 427.9922
100
+ 3200 train 6.028750 (lr=3.5207e-05) (hash(x)=152172187)
101
+ 3300 val loss 6.0385
102
+ 3300 val perplexity 419.2482
103
+ 3300 train 5.972092 (lr=3.4539e-05) (hash(x)=150581974)
104
+ 3400 val loss 6.0271
105
+ 3400 val perplexity 414.4937
106
+ 3400 train 5.958289 (lr=3.3857e-05) (hash(x)=160061666)
107
+ 3500 val loss 6.0029
108
+ 3500 val perplexity 404.6082
109
+ 3500 train 5.951034 (lr=3.3162e-05) (hash(x)=150556913)
110
+ 3600 val loss 5.9885
111
+ 3600 val perplexity 398.8278
112
+ 3600 train 5.896544 (lr=3.2455e-05) (hash(x)=148311961)
113
+ 3700 val loss 5.9747
114
+ 3700 val perplexity 393.3694
115
+ 3700 train 5.866710 (lr=3.1736e-05) (hash(x)=150305284)
116
+ 3800 val loss 5.9561
117
+ 3800 val perplexity 386.0872
118
+ 3800 train 5.966131 (lr=3.1008e-05) (hash(x)=148428531)
119
+ 3900 val loss 5.9435
120
+ 3900 val perplexity 381.2489
121
+ 3900 train 5.814651 (lr=3.0270e-05) (hash(x)=142448374)
122
+ 4000 val loss 5.9283
123
+ 4000 val perplexity 375.5012
124
+ 4000 train 5.862759 (lr=2.9524e-05) (hash(x)=158002288)
125
+ 4100 val loss 5.9091
126
+ 4100 val perplexity 368.3666
127
+ 4100 train 5.850963 (lr=2.8771e-05) (hash(x)=151388626)
128
+ 4200 val loss 5.8978
129
+ 4200 val perplexity 364.2400
130
+ 4200 train 5.821980 (lr=2.8013e-05) (hash(x)=145123294)
131
+ 4300 val loss 5.8837
132
+ 4300 val perplexity 359.1506
133
+ 4300 train 5.765946 (lr=2.7250e-05) (hash(x)=145869150)
134
+ 4400 val loss 5.8761
135
+ 4400 val perplexity 356.4175
136
+ 4400 train 5.911475 (lr=2.6483e-05) (hash(x)=146923320)
137
+ 4500 val loss 5.8595
138
+ 4500 val perplexity 350.5503
139
+ 4500 train 5.688518 (lr=2.5714e-05) (hash(x)=146720819)
140
+ 4600 val loss 5.8489
141
+ 4600 val perplexity 346.8638
142
+ 4600 train 5.593976 (lr=2.4943e-05) (hash(x)=140342909)
143
+ 4700 val loss 5.8383
144
+ 4700 val perplexity 343.2022
145
+ 4700 train 5.974589 (lr=2.4172e-05) (hash(x)=163866463)
146
+ 4800 val loss 5.8217
147
+ 4800 val perplexity 337.5406
148
+ 4800 train 5.515379 (lr=2.3402e-05) (hash(x)=133459145)
149
+ 4900 val loss 5.8140
150
+ 4900 val perplexity 334.9486
151
+ 4900 train 5.611025 (lr=2.2633e-05) (hash(x)=143144356)
152
+ 5000 val loss 5.8290
153
+ 5000 val perplexity 340.0303
154
+ 5000 train 5.728374 (lr=2.1868e-05) (hash(x)=132636494)
155
+ 5100 val loss 5.7898
156
+ 5100 val perplexity 326.9575
157
+ 5100 train 5.715200 (lr=2.1107e-05) (hash(x)=157278728)
158
+ 5200 val loss 5.7817
159
+ 5200 val perplexity 324.3232
160
+ 5200 train 5.637352 (lr=2.0351e-05) (hash(x)=148198434)
161
+ 5300 val loss 5.7699
162
+ 5300 val perplexity 320.5101
163
+ 5300 train 5.696755 (lr=1.9602e-05) (hash(x)=138380906)
164
+ 5400 val loss 5.7576
165
+ 5400 val perplexity 316.5839
166
+ 5400 train 5.838938 (lr=1.8860e-05) (hash(x)=163441464)
167
+ 5500 val loss 5.7551
168
+ 5500 val perplexity 315.7874
169
+ 5500 train 5.701233 (lr=1.8127e-05) (hash(x)=154347714)
170
+ 5600 val loss 5.7429
171
+ 5600 val perplexity 311.9672
172
+ 5600 train 5.776368 (lr=1.7403e-05) (hash(x)=148449981)
173
+ 5700 val loss 5.7320
174
+ 5700 val perplexity 308.5851
175
+ 5700 train 5.623940 (lr=1.6690e-05) (hash(x)=139300274)
176
+ 5800 val loss 5.7271
177
+ 5800 val perplexity 307.0853
178
+ 5800 train 5.716210 (lr=1.5989e-05) (hash(x)=154857144)
179
+ 5900 val loss 5.7195
180
+ 5900 val perplexity 304.7662
181
+ 5900 train 5.779558 (lr=1.5300e-05) (hash(x)=151756013)
182
+ 6000 val loss 5.7133
183
+ 6000 val perplexity 302.8814
184
+ 6000 train 5.619116 (lr=1.4625e-05) (hash(x)=145414657)
185
+ 6100 val loss 5.7073
186
+ 6100 val perplexity 301.0667
187
+ 6100 train 5.534505 (lr=1.3965e-05) (hash(x)=139501217)
188
+ 6200 val loss 5.6980
189
+ 6200 val perplexity 298.2719
190
+ 6200 train 5.732094 (lr=1.3320e-05) (hash(x)=155783358)
191
+ 6300 val loss 5.6917
192
+ 6300 val perplexity 296.4115
193
+ 6300 train 5.751144 (lr=1.2692e-05) (hash(x)=157033091)
194
+ 6400 val loss 5.6877
195
+ 6400 val perplexity 295.2075
196
+ 6400 train 5.495005 (lr=1.2081e-05) (hash(x)=144475330)
197
+ 6500 val loss 5.6813
198
+ 6500 val perplexity 293.3268
199
+ 6500 train 5.727438 (lr=1.1489e-05) (hash(x)=124041822)
200
+ 6600 val loss 5.6776
201
+ 6600 val perplexity 292.2538
202
+ 6600 train 5.614210 (lr=1.0916e-05) (hash(x)=142131981)
203
+ 6700 val loss 5.6687
204
+ 6700 val perplexity 289.6444
205
+ 6700 train 5.769709 (lr=1.0363e-05) (hash(x)=154916248)
206
+ 6800 val loss 5.6677
207
+ 6800 val perplexity 289.3799
208
+ 6800 train 5.666794 (lr=9.8310e-06) (hash(x)=147996387)
209
+ 6900 val loss 5.6623
210
+ 6900 val perplexity 287.8063
211
+ 6900 train 5.690933 (lr=9.3205e-06) (hash(x)=153422428)
212
+ 7000 val loss 5.6558
213
+ 7000 val perplexity 285.9500
214
+ 7000 train 5.677885 (lr=8.8324e-06) (hash(x)=174135078)
215
+ 7100 val loss 5.6527
216
+ 7100 val perplexity 285.0608
217
+ 7100 train 5.732208 (lr=8.3674e-06) (hash(x)=160382475)
218
+ 7200 val loss 5.6492
219
+ 7200 val perplexity 284.0699
220
+ 7200 train 5.743230 (lr=7.9261e-06) (hash(x)=155310085)
221
+ 7300 val loss 5.6438
222
+ 7300 val perplexity 282.5283
223
+ 7300 train 5.782816 (lr=7.5093e-06) (hash(x)=159329031)
224
+ 7400 val loss 5.6415
225
+ 7400 val perplexity 281.8864
226
+ 7400 train 5.581890 (lr=7.1174e-06) (hash(x)=151577269)
227
+ 7500 val loss 5.6393
228
+ 7500 val perplexity 281.2724
229
+ 7500 train 5.690196 (lr=6.7511e-06) (hash(x)=160620287)
230
+ 7600 val loss 5.6345
231
+ 7600 val perplexity 279.9241
232
+ 7600 train 5.519984 (lr=6.4109e-06) (hash(x)=143711597)
233
+ 7700 val loss 5.6324
234
+ 7700 val perplexity 279.3272
235
+ 7700 train 5.632235 (lr=6.0972e-06) (hash(x)=149243577)
236
+ 7800 val loss 5.6312
237
+ 7800 val perplexity 278.9935
238
+ 7800 train 5.617033 (lr=5.8107e-06) (hash(x)=147677247)
239
+ 7900 val loss 5.6240
240
+ 7900 val perplexity 276.9847
241
+ 7900 train 5.574734 (lr=5.5515e-06) (hash(x)=145801236)
242
+ 8000 val loss 5.6234
243
+ 8000 val perplexity 276.8374
244
+ 8000 train 5.743947 (lr=5.3203e-06) (hash(x)=158007655)
245
+ 8100 val loss 5.6221
246
+ 8100 val perplexity 276.4651
247
+ 8100 train 5.681450 (lr=5.1172e-06) (hash(x)=158030048)
248
+ 8200 val loss 5.6184
249
+ 8200 val perplexity 275.4516
250
+ 8200 train 5.468052 (lr=4.9425e-06) (hash(x)=150674725)
251
+ 8300 val loss 5.6173
252
+ 8300 val perplexity 275.1565
253
+ 8300 train 5.523213 (lr=4.7966e-06) (hash(x)=150805711)
254
+ 8400 val loss 5.6133
255
+ 8400 val perplexity 274.0571
256
+ 8400 train 5.488098 (lr=4.6796e-06) (hash(x)=138396210)
257
+ 8500 val loss 5.6127
258
+ 8500 val perplexity 273.8857
259
+ 8500 train 5.489274 (lr=4.5917e-06) (hash(x)=156172740)
260
+ 8600 val loss 5.6100
261
+ 8600 val perplexity 273.1501
262
+ 8600 train 5.649384 (lr=4.5330e-06) (hash(x)=154911352)
263
+ 8700 val loss 5.6074
264
+ 8700 val perplexity 272.4272
265
+ 8700 train 5.664300 (lr=4.5037e-06) (hash(x)=153446789)
266
+ 8749 val loss 5.6057
267
+ 8749 val perplexity 271.9800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
attention_kindselective_n_heads4_seed1345/model_08749.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a375128a9f17c31f33b98216bcc6b05316b464fb3b587a6e156429f7a49eb32b
3
  size 92843394
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63a5d75f3d50c1e38bd49d0b3ffde6667deba50fb9d33f7f876d2ce34040e7cc
3
  size 92843394
attention_kindselective_n_heads4_seed1345/optimizer_08749.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee5a8b46cb4807643d5e88ff5502c2e4b92b4746a4e2dd6d59ed8a639804c3bb
3
  size 179406214
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48e14c02fa70197c42cfb86a6c654015643809abd3d0f2330ae92703b10052ff
3
  size 179406214