andrew-healey committed
Commit 50bcb04 · verified · Parent: 563f706

Upload folder using huggingface_hub

attention_kindselective_n_heads4_seed1338/args.json CHANGED
@@ -1 +1 @@
- {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_9/attention_kindselective_n_heads4_seed1338", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_9", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1338, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 3e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "3e-5_61440_4_1338", "n_embd": 256}
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_9/attention_kindselective_n_heads4_seed1338", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_9", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1338, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 5.5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "5.5e-5_61440_4_1338", "n_embd": 256}
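The only substantive change in this config is the peak learning rate, raised from 3e-05 to 5.5e-05; the `key` field (which encodes max_lr, total batch size, head count, and seed) is updated to match. A minimal sketch of reading those fields back out of the uploaded `args.json`, assuming only the Python standard library and the repo-relative path shown in this diff:

```python
import json

# Load the run configuration that ships alongside the checkpoint.
with open("attention_kindselective_n_heads4_seed1338/args.json") as f:
    args = json.load(f)

# After this commit the peak LR is 5.5e-05 and the sweep key encodes it.
print(args["max_lr"])                   # 5.5e-05
print(args["key"])                      # 5.5e-5_61440_4_1338
print(args["n_heads"], args["n_embd"])  # 4 256
```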
attention_kindselective_n_heads4_seed1338/log2.txt CHANGED
@@ -1,533 +1,267 @@
  max_steps: 8750
  0 val loss 11.2646
  0 val perplexity 78014.3047
- 0 val loss 11.2646
- 0 val perplexity 78014.3047
- 0 train 11.267765 (lr=6.0000e-08) (hash(x)=150327452)
- 0 train 11.267765 (lr=7.0000e-08) (hash(x)=150327452)
- 100 val loss 10.0785
- 100 val perplexity 23826.3750
- 100 val loss 10.1480
- 100 val perplexity 25539.9902
- 100 train 10.130300 (lr=7.0700e-06) (hash(x)=166780046)
- 100 train 10.197817 (lr=6.0600e-06) (hash(x)=166780046)
- 200 val loss 9.5828
- 200 val perplexity 14512.7627
- 200 train 9.549955 (lr=1.4070e-05) (hash(x)=155040610)
- 200 val loss 9.6799
- 200 val perplexity 15992.7021
- 200 train 9.651791 (lr=1.2060e-05) (hash(x)=155040610)
- 300 val loss 8.7068
- 300 val perplexity 6043.8276
- 300 train 8.732620 (lr=2.1070e-05) (hash(x)=155504036)
- 300 val loss 8.8988
- 300 val perplexity 7323.0049
- 300 train 8.930824 (lr=1.8060e-05) (hash(x)=155504036)
- 400 val loss 7.9358
- 400 val perplexity 2795.6145
- 400 train 7.833813 (lr=2.8070e-05) (hash(x)=143823248)
- 400 val loss 8.0672
- 400 val perplexity 3188.0217
- 400 train 7.977346 (lr=2.4060e-05) (hash(x)=143823248)
- 500 val loss 7.6097
- 500 val perplexity 2017.7271
- 500 train 7.582412 (lr=3.5000e-05) (hash(x)=143734685)
- 500 val loss 7.6500
- 500 val perplexity 2100.6929
- 500 train 7.625039 (lr=3.0000e-05) (hash(x)=143734685)
- 600 val loss 7.4992
- 600 val perplexity 1806.5713
- 600 train 7.291920 (lr=3.4989e-05) (hash(x)=150678249)
- 600 val loss 7.5211
- 600 val perplexity 1846.5060
- 600 train 7.316416 (lr=2.9990e-05) (hash(x)=150678249)
- 700 val loss 7.4154
- 700 val perplexity 1661.3982
- 700 train 7.564420 (lr=3.4954e-05) (hash(x)=175802021)
- 700 val loss 7.4274
- 700 val perplexity 1681.4119
- 700 train 7.571382 (lr=2.9961e-05) (hash(x)=175802021)
- 800 val loss 7.3271
- 800 val perplexity 1520.9635
- 800 train 7.331609 (lr=3.4897e-05) (hash(x)=158681215)
- 800 val loss 7.3369
- 800 val perplexity 1535.9385
- 800 train 7.345709 (lr=2.9912e-05) (hash(x)=158681215)
- 900 val loss 7.2689
- 900 val perplexity 1434.9026
- 900 train 7.165131 (lr=3.4818e-05) (hash(x)=146108145)
- 900 val loss 7.2753
- 900 val perplexity 1444.1128
- 900 train 7.172338 (lr=2.9844e-05) (hash(x)=146108145)
- 1000 val loss 7.2140
- 1000 val perplexity 1358.2812
- 1000 train 7.060000 (lr=3.4715e-05) (hash(x)=154996086)
- 1000 val loss 7.2104
- 1000 val perplexity 1353.4595
- 1000 train 7.056768 (lr=2.9756e-05) (hash(x)=154996086)
- 1100 val loss 7.1419
- 1100 val perplexity 1263.8793
- 1100 train 7.124806 (lr=3.4591e-05) (hash(x)=153885445)
- 1100 val loss 7.1242
- 1100 val perplexity 1241.7123
- 1100 train 7.107671 (lr=2.9649e-05) (hash(x)=153885445)
- 1200 val loss 7.0668
- 1200 val perplexity 1172.3687
- 1200 train 7.020203 (lr=3.4444e-05) (hash(x)=142353087)
- 1200 val loss 7.0451
- 1200 val perplexity 1147.1823
- 1200 train 6.998048 (lr=2.9523e-05) (hash(x)=142353087)
- 1300 val loss 7.0154
- 1300 val perplexity 1113.6349
- 1300 train 6.930557 (lr=3.4275e-05) (hash(x)=150750353)
- 1300 val loss 6.9831
- 1300 val perplexity 1078.2115
- 1300 train 6.896412 (lr=2.9378e-05) (hash(x)=150750353)
- 1400 val loss 6.9752
- 1400 val perplexity 1069.7871
- 1400 train 6.799040 (lr=3.4084e-05) (hash(x)=152767913)
- 1400 val loss 6.9322
- 1400 val perplexity 1024.7649
- 1400 train 6.756323 (lr=2.9215e-05) (hash(x)=152767913)
- 1500 val loss 6.9097
- 1500 val perplexity 1001.9031
- 1500 train 6.945250 (lr=3.3872e-05) (hash(x)=151562048)
- 1500 val loss 6.8590
- 1500 val perplexity 952.3980
- 1500 train 6.893571 (lr=2.9033e-05) (hash(x)=151562048)
- 1600 val loss 6.8390
- 1600 val perplexity 933.5735
- 1600 train 6.832361 (lr=3.3638e-05) (hash(x)=166486165)
- 1600 val loss 6.7795
- 1600 val perplexity 879.6003
- 1600 train 6.775093 (lr=2.8833e-05) (hash(x)=166486165)
- 1700 val loss 6.7954
- 1700 val perplexity 893.7272
- 1700 train 6.446530 (lr=3.3384e-05) (hash(x)=130835396)
- 1700 val loss 6.7200
- 1700 val perplexity 828.8434
- 1700 train 6.368818 (lr=2.8615e-05) (hash(x)=130835396)
- 1800 val loss 6.7493
- 1800 val perplexity 853.4929
- 1800 train 6.801020 (lr=3.3109e-05) (hash(x)=158851816)
- 1800 val loss 6.6636
- 1800 val perplexity 783.3477
- 1800 train 6.714691 (lr=2.8379e-05) (hash(x)=158851816)
- 1900 val loss 6.7007
- 1900 val perplexity 812.9660
- 1900 train 6.740700 (lr=3.2814e-05) (hash(x)=153313879)
- 1900 val loss 6.6132
- 1900 val perplexity 744.8441
- 1900 train 6.650075 (lr=2.8127e-05) (hash(x)=153313879)
- 2000 val loss 6.6387
- 2000 val perplexity 764.0649
- 2000 train 6.572113 (lr=3.2500e-05) (hash(x)=158245023)
- 2000 val loss 6.5498
- 2000 val perplexity 699.1063
- 2000 train 6.483957 (lr=2.7857e-05) (hash(x)=158245023)
- 2100 val loss 6.5899
- 2100 val perplexity 727.7206
- 2100 train 6.421700 (lr=3.2166e-05) (hash(x)=157204896)
- 2100 val loss 6.5033
- 2100 val perplexity 667.3728
- 2100 train 6.327139 (lr=2.7571e-05) (hash(x)=157204896)
- 2200 val loss 6.5629
- 2200 val perplexity 708.3510
- 2200 train 6.407813 (lr=3.1813e-05) (hash(x)=137541932)
- 2200 val loss 6.4726
- 2200 val perplexity 647.1771
- 2200 train 6.316986 (lr=2.7269e-05) (hash(x)=137541932)
- 2300 val loss 6.5292
- 2300 val perplexity 684.8694
- 2300 train 6.481870 (lr=3.1443e-05) (hash(x)=150149692)
- 2300 val loss 6.4415
- 2300 val perplexity 627.3774
- 2300 train 6.394355 (lr=2.6951e-05) (hash(x)=150149692)
- 2400 val loss 6.4797
- 2400 val perplexity 651.7456
- 2400 train 6.469861 (lr=3.1054e-05) (hash(x)=151730720)
- 2400 val loss 6.3974
- 2400 val perplexity 600.2633
- 2400 train 6.381145 (lr=2.6618e-05) (hash(x)=151730720)
- 2500 val loss 6.4501
- 2500 val perplexity 632.7767
- 2500 train 6.218892 (lr=3.0649e-05) (hash(x)=143406752)
- 2500 val loss 6.3688
- 2500 val perplexity 583.3505
- 2500 train 6.147860 (lr=2.6270e-05) (hash(x)=143406752)
- 2600 val loss 6.4203
- 2600 val perplexity 614.1707
- 2600 train 6.254517 (lr=3.0227e-05) (hash(x)=157272496)
- 2600 val loss 6.3524
- 2600 val perplexity 573.8466
- 2600 train 6.186916 (lr=2.5909e-05) (hash(x)=157272496)
- 2700 val loss 6.3945
- 2700 val perplexity 598.5561
- 2700 train 6.410750 (lr=2.9789e-05) (hash(x)=155342327)
- 2700 val loss 6.3262
- 2700 val perplexity 559.0328
- 2700 train 6.334075 (lr=2.5533e-05) (hash(x)=155342327)
- 2800 val loss 6.3544
- 2800 val perplexity 575.0129
- 2800 train 6.225688 (lr=2.9336e-05) (hash(x)=140626679)
- 2800 val loss 6.2908
- 2800 val perplexity 539.5964
- 2800 train 6.168956 (lr=2.5145e-05) (hash(x)=140626679)
- 2900 val loss 6.3277
- 2900 val perplexity 559.8731
- 2900 train 6.188353 (lr=2.8868e-05) (hash(x)=144953350)
- 2900 val loss 6.2653
- 2900 val perplexity 526.0153
- 2900 train 6.125029 (lr=2.4744e-05) (hash(x)=144953350)
- 3000 val loss 6.3080
- 3000 val perplexity 548.9646
- 3000 train 6.216789 (lr=2.8386e-05) (hash(x)=172449837)
- 3000 val loss 6.2493
- 3000 val perplexity 517.6444
- 3000 train 6.155919 (lr=2.4331e-05) (hash(x)=172449837)
- 3100 val loss 6.2954
- 3100 val perplexity 542.0637
- 3100 train 6.085942 (lr=2.7891e-05) (hash(x)=141710086)
- 3100 val loss 6.2444
- 3100 val perplexity 515.1201
- 3100 train 6.041224 (lr=2.3906e-05) (hash(x)=141710086)
- 3200 val loss 6.2588
- 3200 val perplexity 522.5837
- 3200 train 6.189919 (lr=2.7383e-05) (hash(x)=151299772)
- 3200 val loss 6.2087
- 3200 val perplexity 497.0476
- 3200 train 6.142445 (lr=2.3471e-05) (hash(x)=151299772)
- 3300 val loss 6.2412
- 3300 val perplexity 513.4838
- 3300 train 6.105106 (lr=2.6864e-05) (hash(x)=146473110)
- 3300 val loss 6.1987
- 3300 val perplexity 492.0873
- 3300 train 6.058009 (lr=2.3026e-05) (hash(x)=146473110)
- 3400 val loss 6.2167
- 3400 val perplexity 501.0693
- 3400 train 6.293976 (lr=2.6333e-05) (hash(x)=153954157)
- 3400 val loss 6.1735
- 3400 val perplexity 479.8751
- 3400 train 6.249593 (lr=2.2572e-05) (hash(x)=153954157)
- 3500 val loss 6.2010
- 3500 val perplexity 493.2191
- 3500 train 6.127666 (lr=2.5793e-05) (hash(x)=153717336)
- 3500 val loss 6.1570
- 3500 val perplexity 471.9926
- 3500 train 6.084462 (lr=2.2108e-05) (hash(x)=153717336)
- 3600 val loss 6.1805
- 3600 val perplexity 483.2234
- 3600 train 5.934934 (lr=2.5243e-05) (hash(x)=144965161)
- 3600 val loss 6.1449
- 3600 val perplexity 466.3374
- 3600 train 5.896233 (lr=2.1637e-05) (hash(x)=144965161)
- 3700 val loss 6.1731
- 3700 val perplexity 479.6843
- 3700 train 5.946197 (lr=2.4684e-05) (hash(x)=125969741)
- 3700 val loss 6.1403
- 3700 val perplexity 464.2038
- 3700 train 5.915022 (lr=2.1158e-05) (hash(x)=125969741)
- 3800 val loss 6.1465
- 3800 val perplexity 467.0876
- 3800 train 6.011291 (lr=2.4117e-05) (hash(x)=155070487)
- 3800 val loss 6.1142
- 3800 val perplexity 452.2469
- 3800 train 5.975008 (lr=2.0672e-05) (hash(x)=155070487)
- 3900 val loss 6.1278
- 3900 val perplexity 458.4420
- 3900 train 6.005039 (lr=2.3543e-05) (hash(x)=149444644)
- 3900 val loss 6.1017
- 3900 val perplexity 446.5967
- 3900 train 5.975531 (lr=2.0180e-05) (hash(x)=149444644)
- 4000 val loss 6.1155
- 4000 val perplexity 452.8252
- 4000 train 5.972252 (lr=2.2963e-05) (hash(x)=151663033)
- 4000 val loss 6.0910
- 4000 val perplexity 441.8456
- 4000 train 5.942688 (lr=1.9683e-05) (hash(x)=151663033)
- 4100 val loss 6.1077
- 4100 val perplexity 449.3011
- 4100 train 6.060684 (lr=2.2378e-05) (hash(x)=143688282)
- 4100 val loss 6.0840
- 4100 val perplexity 438.7943
- 4100 train 6.035672 (lr=1.9181e-05) (hash(x)=143688282)
- 4200 val loss 6.0846
- 4200 val perplexity 439.0362
- 4200 train 6.078417 (lr=2.1788e-05) (hash(x)=163361651)
- 4200 val loss 6.0626
- 4200 val perplexity 429.4702
- 4200 train 6.058801 (lr=1.8675e-05) (hash(x)=163361651)
- 4300 val loss 6.0690
- 4300 val perplexity 432.2372
- 4300 train 6.133219 (lr=2.1194e-05) (hash(x)=153619361)
- 4300 val loss 6.0485
- 4300 val perplexity 423.4790
- 4300 train 6.109252 (lr=1.8166e-05) (hash(x)=153619361)
- 4400 val loss 6.0636
- 4400 val perplexity 429.9185
- 4400 train 6.355730 (lr=2.0598e-05) (hash(x)=168527064)
- 4400 val loss 6.0480
- 4400 val perplexity 423.2458
- 4400 train 6.335233 (lr=1.7655e-05) (hash(x)=168527064)
- 4500 val loss 6.0544
- 4500 val perplexity 425.9846
- 4500 train 5.854112 (lr=1.9999e-05) (hash(x)=125588037)
- 4500 val loss 6.0369
- 4500 val perplexity 418.6092
- 4500 train 5.830371 (lr=1.7142e-05) (hash(x)=125588037)
- 4600 val loss 6.0342
- 4600 val perplexity 417.4467
- 4600 train 5.998290 (lr=1.9400e-05) (hash(x)=143710941)
- 4600 val loss 6.0200
- 4600 val perplexity 411.5641
- 4600 train 5.977354 (lr=1.6629e-05) (hash(x)=143710941)
- 4700 val loss 6.0192
- 4700 val perplexity 411.2676
- 4700 train 5.844175 (lr=1.8800e-05) (hash(x)=150952742)
- 4700 val loss 6.0082
- 4700 val perplexity 406.7620
- 4700 train 5.829061 (lr=1.6114e-05) (hash(x)=150952742)
- 4800 val loss 6.0118
- 4800 val perplexity 408.2310
- 4800 train 5.876356 (lr=1.8201e-05) (hash(x)=145323659)
- 4800 val loss 6.0027
- 4800 val perplexity 404.5156
- 4800 train 5.861185 (lr=1.5601e-05) (hash(x)=145323659)
- 4900 val loss 6.0072
- 4900 val perplexity 406.3518
- 4900 train 6.022637 (lr=1.7604e-05) (hash(x)=153151397)
- 4900 val loss 5.9937
- 4900 val perplexity 400.8849
- 4900 train 6.009444 (lr=1.5089e-05) (hash(x)=153151397)
- 5000 val loss 5.9942
- 5000 val perplexity 401.1065
- 5000 train 5.895233 (lr=1.7009e-05) (hash(x)=143182059)
- 5000 val loss 5.9870
- 5000 val perplexity 398.2089
- 5000 train 5.890842 (lr=1.4579e-05) (hash(x)=143182059)
- 5100 val loss 5.9844
- 5100 val perplexity 397.1822
- 5100 train 6.032875 (lr=1.6417e-05) (hash(x)=170083586)
- 5100 val loss 5.9762
- 5100 val perplexity 393.9346
- 5100 train 6.027209 (lr=1.4071e-05) (hash(x)=170083586)
- 5200 val loss 5.9702
- 5200 val perplexity 391.5924
- 5200 train 5.865292 (lr=1.5829e-05) (hash(x)=149363919)
- 5200 val loss 5.9649
- 5200 val perplexity 389.5172
- 5200 train 5.861600 (lr=1.3568e-05) (hash(x)=149363919)
- 5300 val loss 5.9669
- 5300 val perplexity 390.3069
- 5300 train 5.877049 (lr=1.5246e-05) (hash(x)=152033784)
- 5300 val loss 5.9612
- 5300 val perplexity 388.0795
- 5300 train 5.867223 (lr=1.3068e-05) (hash(x)=152033784)
- 5400 val loss 5.9501
- 5400 val perplexity 383.8001
- 5400 train 6.054345 (lr=1.4669e-05) (hash(x)=154614289)
- 5400 val loss 5.9471
- 5400 val perplexity 382.6569
- 5400 train 6.046638 (lr=1.2573e-05) (hash(x)=154614289)
- 5500 val loss 5.9431
- 5500 val perplexity 381.0967
- 5500 train 6.028569 (lr=1.4099e-05) (hash(x)=157745174)
- 5500 val loss 5.9410
- 5500 val perplexity 380.3125
- 5500 train 6.025530 (lr=1.2085e-05) (hash(x)=157745174)
- 5600 val loss 5.9373
- 5600 val perplexity 378.9089
- 5600 train 5.829021 (lr=1.3536e-05) (hash(x)=147693222)
- 5600 val loss 5.9377
- 5600 val perplexity 379.0604
- 5600 train 5.827706 (lr=1.1602e-05) (hash(x)=147693222)
- 5700 val loss 5.9251
- 5700 val perplexity 374.3267
- 5700 train 5.866521 (lr=1.2981e-05) (hash(x)=149784627)
- 5700 val loss 5.9253
- 5700 val perplexity 374.3747
- 5700 train 5.866199 (lr=1.1127e-05) (hash(x)=149784627)
- 5800 val loss 5.9247
- 5800 val perplexity 374.1822
- 5800 train 5.834700 (lr=1.2436e-05) (hash(x)=158620729)
- 5800 val loss 5.9247
- 5800 val perplexity 374.1743
- 5800 train 5.826473 (lr=1.0659e-05) (hash(x)=158620729)
- 5900 val loss 5.9146
- 5900 val perplexity 370.3900
- 5900 train 5.858362 (lr=1.1900e-05) (hash(x)=159763910)
- 5900 val loss 5.9180
- 5900 val perplexity 371.6826
- 5900 train 5.861243 (lr=1.0200e-05) (hash(x)=159763910)
- 6000 val loss 5.9043
- 6000 val perplexity 366.6024
- 6000 train 5.836517 (lr=1.1375e-05) (hash(x)=147640561)
- 6000 val loss 5.9081
- 6000 val perplexity 367.9886
- 6000 train 5.841051 (lr=9.7500e-06) (hash(x)=147640561)
- 6100 val loss 5.8966
- 6100 val perplexity 363.7880
- 6100 train 5.878016 (lr=1.0861e-05) (hash(x)=156613394)
- 6100 val loss 5.9015
- 6100 val perplexity 365.5672
- 6100 train 5.885981 (lr=9.3098e-06) (hash(x)=156613394)
- 6200 val loss 5.8921
- 6200 val perplexity 362.1524
- 6200 train 5.853373 (lr=1.0360e-05) (hash(x)=186221290)
- 6200 val loss 5.8976
- 6200 val perplexity 364.1776
- 6200 train 5.857404 (lr=8.8800e-06) (hash(x)=186221290)
- 6300 val loss 5.8814
- 6300 val perplexity 358.3256
- 6300 train 5.760697 (lr=9.8715e-06) (hash(x)=152081419)
- 6300 val loss 5.8885
- 6300 val perplexity 360.8471
- 6300 train 5.763207 (lr=8.4613e-06) (hash(x)=152081419)
- 6400 val loss 5.8804
- 6400 val perplexity 357.9507
- 6400 train 5.825439 (lr=9.3966e-06) (hash(x)=154808349)
- 6400 val loss 5.8885
- 6400 val perplexity 360.8786
- 6400 train 5.834856 (lr=8.0542e-06) (hash(x)=154808349)
- 6500 val loss 5.8733
- 6500 val perplexity 355.4034
- 6500 train 5.840936 (lr=8.9359e-06) (hash(x)=159437208)
- 6500 val loss 5.8832
- 6500 val perplexity 358.9690
- 6500 train 5.854115 (lr=7.6594e-06) (hash(x)=159437208)
- 6600 val loss 5.8716
- 6600 val perplexity 354.8281
- 6600 train 5.681755 (lr=8.4903e-06) (hash(x)=157933074)
- 6600 val loss 5.8805
- 6600 val perplexity 358.0042
- 6600 train 5.683859 (lr=7.2774e-06) (hash(x)=157933074)
- 6700 val loss 5.8631
- 6700 val perplexity 351.8213
- 6700 train 5.846561 (lr=8.0602e-06) (hash(x)=161560240)
- 6700 val loss 5.8725
- 6700 val perplexity 355.1426
- 6700 train 5.855772 (lr=6.9087e-06) (hash(x)=161560240)
- 6800 val loss 5.8610
- 6800 val perplexity 351.0881
- 6800 train 5.835786 (lr=7.6463e-06) (hash(x)=155424292)
- 6800 val loss 5.8702
- 6800 val perplexity 354.3057
- 6800 train 5.845886 (lr=6.5540e-06) (hash(x)=155424292)
- 6900 val loss 5.8587
- 6900 val perplexity 350.2836
- 6900 train 5.851926 (lr=7.2493e-06) (hash(x)=148561470)
- 6900 val loss 5.8693
- 6900 val perplexity 354.0038
- 6900 train 5.859293 (lr=6.2137e-06) (hash(x)=148561470)
- 7000 val loss 5.8522
- 7000 val perplexity 348.0006
- 7000 train 5.749753 (lr=6.8697e-06) (hash(x)=141527450)
- 7000 val loss 5.8612
- 7000 val perplexity 351.1305
- 7000 train 5.753252 (lr=5.8883e-06) (hash(x)=141527450)
- 7100 val loss 5.8464
- 7100 val perplexity 345.9703
- 7100 train 5.800970 (lr=6.5080e-06) (hash(x)=151066339)
- 7100 val loss 5.8581
- 7100 val perplexity 350.0529
- 7100 train 5.809224 (lr=5.5783e-06) (hash(x)=151066339)
- 7200 val loss 5.8468
- 7200 val perplexity 346.1266
- 7200 train 5.652260 (lr=6.1648e-06) (hash(x)=155231264)
- 7200 val loss 5.8590
- 7200 val perplexity 350.3668
- 7200 train 5.663008 (lr=5.2841e-06) (hash(x)=155231264)
- 7300 val loss 5.8382
- 7300 val perplexity 343.1572
- 7300 train 5.854438 (lr=5.8405e-06) (hash(x)=150281149)
- 7300 val loss 5.8514
- 7300 val perplexity 347.7130
- 7300 train 5.861528 (lr=5.0062e-06) (hash(x)=150281149)
- 7400 val loss 5.8352
- 7400 val perplexity 342.1290
- 7400 train 5.717260 (lr=5.5357e-06) (hash(x)=148421717)
- 7400 val loss 5.8498
- 7400 val perplexity 347.1809
- 7400 train 5.727182 (lr=4.7449e-06) (hash(x)=148421717)
- 7500 val loss 5.8349
- 7500 val perplexity 342.0240
- 7500 train 5.600042 (lr=5.2508e-06) (hash(x)=146921118)
- 7500 val loss 5.8495
- 7500 val perplexity 347.0692
- 7500 train 5.604131 (lr=4.5007e-06) (hash(x)=146921118)
- 7600 val loss 5.8285
- 7600 val perplexity 339.8405
- 7600 train 5.879191 (lr=4.9862e-06) (hash(x)=150660048)
- 7600 val loss 5.8426
- 7600 val perplexity 344.6835
- 7600 train 5.886945 (lr=4.2739e-06) (hash(x)=150660048)
- 7700 val loss 5.8274
- 7700 val perplexity 339.4743
- 7700 train 5.699795 (lr=4.7423e-06) (hash(x)=148059852)
- 7700 val loss 5.8417
- 7700 val perplexity 344.3644
- 7700 train 5.711916 (lr=4.0648e-06) (hash(x)=148059852)
- 7800 val loss 5.8256
- 7800 val perplexity 338.8620
- 7800 train 5.527908 (lr=4.5194e-06) (hash(x)=148331002)
- 7800 val loss 5.8412
- 7800 val perplexity 344.1924
- 7800 train 5.541048 (lr=3.8738e-06) (hash(x)=148331002)
- 7900 val loss 5.8206
- 7900 val perplexity 337.1732
- 7900 train 5.777098 (lr=4.3179e-06) (hash(x)=164923883)
- 7900 val loss 5.8355
- 7900 val perplexity 342.2498
- 7900 train 5.794849 (lr=3.7010e-06) (hash(x)=164923883)
- 8000 val loss 5.8183
- 8000 val perplexity 336.3914
- 8000 train 5.699238 (lr=4.1380e-06) (hash(x)=143545384)
- 8000 val loss 5.8344
- 8000 val perplexity 341.8687
- 8000 train 5.713335 (lr=3.5468e-06) (hash(x)=143545384)
- 8100 val loss 5.8173
- 8100 val perplexity 336.0612
- 8100 train 5.634319 (lr=3.9800e-06) (hash(x)=160686959)
- 8100 val loss 5.8336
- 8100 val perplexity 341.5852
- 8100 train 5.655682 (lr=3.4114e-06) (hash(x)=160686959)
- 8200 val loss 5.8150
- 8200 val perplexity 335.3045
- 8200 train 5.773064 (lr=3.8442e-06) (hash(x)=156501889)
- 8200 val loss 5.8326
- 8200 val perplexity 341.2524
- 8200 train 5.789817 (lr=3.2950e-06) (hash(x)=156501889)
- 8300 val loss 5.8130
- 8300 val perplexity 334.6131
- 8300 train 5.656518 (lr=3.7307e-06) (hash(x)=142716875)
- 8300 val loss 5.8295
- 8300 val perplexity 340.2042
- 8300 train 5.667881 (lr=3.1977e-06) (hash(x)=142716875)
- 8400 val loss 5.8083
- 8400 val perplexity 333.0655
- 8400 train 5.715852 (lr=3.6397e-06) (hash(x)=154436684)
- 8400 val loss 5.8256
- 8400 val perplexity 338.8665
- 8400 train 5.732801 (lr=3.1197e-06) (hash(x)=154436684)
- 8500 val loss 5.8068
- 8500 val perplexity 332.5389
- 8500 train 5.878377 (lr=3.5713e-06) (hash(x)=147965839)
- 8500 val loss 5.8247
- 8500 val perplexity 338.5674
- 8500 train 5.897286 (lr=3.0611e-06) (hash(x)=147965839)
- 8600 val loss 5.8056
- 8600 val perplexity 332.1485
- 8600 train 5.608908 (lr=3.5257e-06) (hash(x)=145228097)
- 8600 val loss 5.8239
- 8600 val perplexity 338.2750
- 8600 train 5.626842 (lr=3.0220e-06) (hash(x)=145228097)
- 8700 val loss 5.8016
- 8700 val perplexity 330.8403
- 8700 train 5.983572 (lr=3.5029e-06) (hash(x)=152910357)
- 8700 val loss 5.8198
- 8700 val perplexity 336.9131
- 8700 train 5.998275 (lr=3.0024e-06) (hash(x)=152910357)
- 8749 val loss 5.8021
- 8749 val perplexity 330.9829
- 8749 val loss 5.8203
- 8749 val perplexity 337.0700
+ 0 train 11.267765 (lr=1.1000e-07) (hash(x)=150327452)
+ 100 val loss 9.9457
+ 100 val perplexity 20862.5527
+ 100 train 9.998234 (lr=1.1110e-05) (hash(x)=166780046)
+ 200 val loss 9.1876
+ 200 val perplexity 9775.3037
+ 200 train 9.133502 (lr=2.2110e-05) (hash(x)=155040610)
+ 300 val loss 8.2995
+ 300 val perplexity 4022.0583
+ 300 train 8.305970 (lr=3.3110e-05) (hash(x)=155504036)
+ 400 val loss 7.7026
+ 400 val perplexity 2214.0981
+ 400 train 7.576903 (lr=4.4110e-05) (hash(x)=143823248)
+ 500 val loss 7.5018
+ 500 val perplexity 1811.3724
+ 500 train 7.471624 (lr=5.5000e-05) (hash(x)=143734685)
+ 600 val loss 7.4303
+ 600 val perplexity 1686.3419
+ 600 train 7.209157 (lr=5.4982e-05) (hash(x)=150678249)
+ 700 val loss 7.3549
+ 700 val perplexity 1563.7889
+ 700 train 7.492709 (lr=5.4928e-05) (hash(x)=175802021)
+ 800 val loss 7.2668
+ 800 val perplexity 1431.9143
+ 800 train 7.272984 (lr=5.4839e-05) (hash(x)=158681215)
+ 900 val loss 7.2057
+ 900 val perplexity 1347.0262
+ 900 train 7.102057 (lr=5.4713e-05) (hash(x)=146108145)
+ 1000 val loss 7.1370
+ 1000 val perplexity 1257.6271
+ 1000 train 6.979178 (lr=5.4553e-05) (hash(x)=154996086)
+ 1100 val loss 7.0562
+ 1100 val perplexity 1160.0608
+ 1100 train 7.036383 (lr=5.4357e-05) (hash(x)=153885445)
+ 1200 val loss 6.9739
+ 1200 val perplexity 1068.3485
+ 1200 train 6.932178 (lr=5.4126e-05) (hash(x)=142353087)
+ 1300 val loss 6.9191
+ 1300 val perplexity 1011.4178
+ 1300 train 6.830047 (lr=5.3860e-05) (hash(x)=150750353)
+ 1400 val loss 6.8690
+ 1400 val perplexity 961.9484
+ 1400 train 6.692386 (lr=5.3561e-05) (hash(x)=152767913)
+ 1500 val loss 6.7984
+ 1500 val perplexity 896.3729
+ 1500 train 6.837613 (lr=5.3227e-05) (hash(x)=151562048)
+ 1600 val loss 6.7206
+ 1600 val perplexity 829.3372
+ 1600 train 6.723626 (lr=5.2860e-05) (hash(x)=166486165)
+ 1700 val loss 6.6838
+ 1700 val perplexity 799.3312
+ 1700 train 6.325279 (lr=5.2461e-05) (hash(x)=130835396)
+ 1800 val loss 6.6342
+ 1800 val perplexity 760.6326
+ 1800 train 6.681156 (lr=5.2029e-05) (hash(x)=158851816)
+ 1900 val loss 6.5986
+ 1900 val perplexity 734.0478
+ 1900 train 6.630829 (lr=5.1565e-05) (hash(x)=153313879)
+ 2000 val loss 6.5331
+ 2000 val perplexity 687.5047
+ 2000 train 6.463948 (lr=5.1071e-05) (hash(x)=158245023)
+ 2100 val loss 6.4869
+ 2100 val perplexity 656.4935
+ 2100 train 6.311565 (lr=5.0547e-05) (hash(x)=157204896)
+ 2200 val loss 6.4565
+ 2200 val perplexity 636.8082
+ 2200 train 6.298766 (lr=4.9993e-05) (hash(x)=137541932)
+ 2300 val loss 6.4247
+ 2300 val perplexity 616.9203
+ 2300 train 6.379897 (lr=4.9410e-05) (hash(x)=150149692)
+ 2400 val loss 6.3730
+ 2400 val perplexity 585.8328
+ 2400 train 6.363482 (lr=4.8800e-05) (hash(x)=151730720)
+ 2500 val loss 6.3399
+ 2500 val perplexity 566.7245
+ 2500 train 6.111226 (lr=4.8162e-05) (hash(x)=143406752)
+ 2600 val loss 6.3211
+ 2600 val perplexity 556.1671
+ 2600 train 6.154155 (lr=4.7499e-05) (hash(x)=157272496)
+ 2700 val loss 6.2973
+ 2700 val perplexity 543.0967
+ 2700 train 6.291989 (lr=4.6811e-05) (hash(x)=155342327)
+ 2800 val loss 6.2554
+ 2800 val perplexity 520.8175
+ 2800 train 6.133845 (lr=4.6099e-05) (hash(x)=140626679)
+ 2900 val loss 6.2355
+ 2900 val perplexity 510.5799
+ 2900 train 6.095793 (lr=4.5364e-05) (hash(x)=144953350)
+ 3000 val loss 6.2132
+ 3000 val perplexity 499.2762
+ 3000 train 6.112416 (lr=4.4606e-05) (hash(x)=172449837)
+ 3100 val loss 6.2019
+ 3100 val perplexity 493.6857
+ 3100 train 5.997821 (lr=4.3828e-05) (hash(x)=141710086)
+ 3200 val loss 6.1629
+ 3200 val perplexity 474.8234
+ 3200 train 6.093801 (lr=4.3031e-05) (hash(x)=151299772)
+ 3300 val loss 6.1521
+ 3300 val perplexity 469.6945
+ 3300 train 6.020457 (lr=4.2215e-05) (hash(x)=146473110)
+ 3400 val loss 6.1303
+ 3400 val perplexity 459.5718
+ 3400 train 6.210660 (lr=4.1381e-05) (hash(x)=153954157)
+ 3500 val loss 6.1080
+ 3500 val perplexity 449.4359
+ 3500 train 6.042493 (lr=4.0532e-05) (hash(x)=153717336)
+ 3600 val loss 6.0886
+ 3600 val perplexity 440.8094
+ 3600 train 5.844422 (lr=3.9667e-05) (hash(x)=144965161)
+ 3700 val loss 6.0836
+ 3700 val perplexity 438.5945
+ 3700 train 5.863056 (lr=3.8789e-05) (hash(x)=125969741)
+ 3800 val loss 6.0621
+ 3800 val perplexity 429.2794
+ 3800 train 5.925856 (lr=3.7898e-05) (hash(x)=155070487)
+ 3900 val loss 6.0450
+ 3900 val perplexity 421.9990
+ 3900 train 5.920032 (lr=3.6996e-05) (hash(x)=149444644)
+ 4000 val loss 6.0358
+ 4000 val perplexity 418.1515
+ 4000 train 5.892958 (lr=3.6085e-05) (hash(x)=151663033)
+ 4100 val loss 6.0280
+ 4100 val perplexity 414.8808
+ 4100 train 5.983856 (lr=3.5165e-05) (hash(x)=143688282)
+ 4200 val loss 6.0011
+ 4200 val perplexity 403.8761
+ 4200 train 6.000255 (lr=3.4238e-05) (hash(x)=163361651)
+ 4300 val loss 5.9869
+ 4300 val perplexity 398.1633
+ 4300 train 6.021540 (lr=3.3305e-05) (hash(x)=153619361)
+ 4400 val loss 5.9853
+ 4400 val perplexity 397.5516
+ 4400 train 6.296560 (lr=3.2368e-05) (hash(x)=168527064)
+ 4500 val loss 5.9708
+ 4500 val perplexity 391.8027
+ 4500 train 5.778735 (lr=3.1428e-05) (hash(x)=125588037)
+ 4600 val loss 5.9559
+ 4600 val perplexity 386.0312
+ 4600 train 5.915372 (lr=3.0486e-05) (hash(x)=143710941)
+ 4700 val loss 5.9398
+ 4700 val perplexity 379.8761
+ 4700 train 5.773179 (lr=2.9543e-05) (hash(x)=150952742)
+ 4800 val loss 5.9353
+ 4800 val perplexity 378.1379
+ 4800 train 5.802930 (lr=2.8602e-05) (hash(x)=145323659)
+ 4900 val loss 5.9280
+ 4900 val perplexity 375.4010
+ 4900 train 5.946818 (lr=2.7663e-05) (hash(x)=153151397)
+ 5000 val loss 5.9167
+ 5000 val perplexity 371.1711
+ 5000 train 5.814476 (lr=2.6728e-05) (hash(x)=143182059)
+ 5100 val loss 5.9067
+ 5100 val perplexity 367.5034
+ 5100 train 5.954238 (lr=2.5798e-05) (hash(x)=170083586)
+ 5200 val loss 5.8899
+ 5200 val perplexity 361.3606
+ 5200 train 5.788404 (lr=2.4874e-05) (hash(x)=149363919)
+ 5300 val loss 5.8822
+ 5300 val perplexity 358.5796
+ 5300 train 5.769784 (lr=2.3958e-05) (hash(x)=152033784)
+ 5400 val loss 5.8687
+ 5400 val perplexity 353.7852
+ 5400 train 5.975514 (lr=2.3051e-05) (hash(x)=154614289)
+ 5500 val loss 5.8615
+ 5500 val perplexity 351.2599
+ 5500 train 5.953315 (lr=2.2155e-05) (hash(x)=157745174)
+ 5600 val loss 5.8554
+ 5600 val perplexity 349.1059
+ 5600 train 5.747090 (lr=2.1271e-05) (hash(x)=147693222)
+ 5700 val loss 5.8457
+ 5700 val perplexity 345.7462
+ 5700 train 5.784711 (lr=2.0399e-05) (hash(x)=149784627)
+ 5800 val loss 5.8401
+ 5800 val perplexity 343.8195
+ 5800 train 5.752527 (lr=1.9542e-05) (hash(x)=158620729)
+ 5900 val loss 5.8340
+ 5900 val perplexity 341.7084
+ 5900 train 5.779388 (lr=1.8700e-05) (hash(x)=159763910)
+ 6000 val loss 5.8213
+ 6000 val perplexity 337.4167
+ 6000 train 5.758483 (lr=1.7875e-05) (hash(x)=147640561)
+ 6100 val loss 5.8147
+ 6100 val perplexity 335.1861
+ 6100 train 5.795366 (lr=1.7068e-05) (hash(x)=156613394)
+ 6200 val loss 5.8099
+ 6200 val perplexity 333.6008
+ 6200 train 5.747422 (lr=1.6280e-05) (hash(x)=186221290)
+ 6300 val loss 5.7978
+ 6300 val perplexity 329.5759
+ 6300 train 5.679901 (lr=1.5512e-05) (hash(x)=152081419)
+ 6400 val loss 5.8018
+ 6400 val perplexity 330.8954
+ 6400 train 5.755892 (lr=1.4766e-05) (hash(x)=154808349)
+ 6500 val loss 5.7942
+ 6500 val perplexity 328.4028
+ 6500 train 5.763316 (lr=1.4042e-05) (hash(x)=159437208)
+ 6600 val loss 5.7910
+ 6600 val perplexity 327.3300
+ 6600 train 5.604477 (lr=1.3342e-05) (hash(x)=157933074)
+ 6700 val loss 5.7805
+ 6700 val perplexity 323.9347
+ 6700 train 5.757157 (lr=1.2666e-05) (hash(x)=161560240)
+ 6800 val loss 5.7777
+ 6800 val perplexity 323.0242
+ 6800 train 5.759155 (lr=1.2016e-05) (hash(x)=155424292)
+ 6900 val loss 5.7755
+ 6900 val perplexity 322.2903
+ 6900 train 5.768492 (lr=1.1392e-05) (hash(x)=148561470)
+ 7000 val loss 5.7678
+ 7000 val perplexity 319.8350
+ 7000 train 5.664386 (lr=1.0795e-05) (hash(x)=141527450)
+ 7100 val loss 5.7617
+ 7100 val perplexity 317.8992
+ 7100 train 5.717986 (lr=1.0227e-05) (hash(x)=151066339)
+ 7200 val loss 5.7629
+ 7200 val perplexity 318.2719
+ 7200 train 5.559781 (lr=9.6875e-06) (hash(x)=155231264)
+ 7300 val loss 5.7547
+ 7300 val perplexity 315.6577
+ 7300 train 5.772898 (lr=9.1780e-06) (hash(x)=150281149)
+ 7400 val loss 5.7514
+ 7400 val perplexity 314.6422
+ 7400 train 5.640527 (lr=8.6990e-06) (hash(x)=148421717)
+ 7500 val loss 5.7514
+ 7500 val perplexity 314.6400
+ 7500 train 5.513526 (lr=8.2513e-06) (hash(x)=146921118)
+ 7600 val loss 5.7441
+ 7600 val perplexity 312.3449
+ 7600 train 5.794578 (lr=7.8355e-06) (hash(x)=150660048)
+ 7700 val loss 5.7411
+ 7700 val perplexity 311.4137
+ 7700 train 5.619473 (lr=7.4522e-06) (hash(x)=148059852)
+ 7800 val loss 5.7406
+ 7800 val perplexity 311.2433
+ 7800 train 5.437676 (lr=7.1019e-06) (hash(x)=148331002)
+ 7900 val loss 5.7347
+ 7900 val perplexity 309.4308
+ 7900 train 5.687338 (lr=6.7852e-06) (hash(x)=164923883)
+ 8000 val loss 5.7323
+ 8000 val perplexity 308.6885
+ 8000 train 5.618847 (lr=6.5025e-06) (hash(x)=143545384)
+ 8100 val loss 5.7315
+ 8100 val perplexity 308.4400
+ 8100 train 5.545605 (lr=6.2543e-06) (hash(x)=160686959)
+ 8200 val loss 5.7291
+ 8200 val perplexity 307.6797
+ 8200 train 5.678798 (lr=6.0408e-06) (hash(x)=156501889)
+ 8300 val loss 5.7280
+ 8300 val perplexity 307.3556
+ 8300 train 5.569695 (lr=5.8625e-06) (hash(x)=142716875)
+ 8400 val loss 5.7214
+ 8400 val perplexity 305.3186
+ 8400 train 5.635721 (lr=5.7195e-06) (hash(x)=154436684)
+ 8500 val loss 5.7204
+ 8500 val perplexity 305.0224
+ 8500 train 5.795388 (lr=5.6121e-06) (hash(x)=147965839)
+ 8600 val loss 5.7193
+ 8600 val perplexity 304.6931
+ 8600 train 5.511634 (lr=5.5404e-06) (hash(x)=145228097)
+ 8700 val loss 5.7141
+ 8700 val perplexity 303.1104
+ 8700 train 5.900846 (lr=5.5045e-06) (hash(x)=152910357)
+ 8749 val loss 5.7145
+ 8749 val perplexity 303.2289
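In both the old and new logs, each `val perplexity` line is simply the exponential of the preceding `val loss` (for example, exp(11.2646) ≈ 78014 at step 0 and exp(5.7145) ≈ 303.23 at the final step of the new run). A quick self-contained check of that relationship, using only Python's `math` module and values copied from the log above:

```python
import math

# (step, val loss, val perplexity) pairs taken from the new log.
log_points = [
    (0, 11.2646, 78014.3047),
    (8749, 5.7145, 303.2289),
]

for step, loss, ppl in log_points:
    # Perplexity is exp(loss); allow slack for the 4-decimal rounding above.
    assert abs(math.exp(loss) - ppl) / ppl < 1e-3
    print(f"step {step}: exp({loss:.4f}) = {math.exp(loss):.2f} ~ {ppl}")
```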
attention_kindselective_n_heads4_seed1338/model_08749.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f0ca05b3344aa66408b045c0838e43612e6fde6f89e7dfea8f7fe75e490896e0
+ oid sha256:a131c48954674e396c86cde25dd3f7fc64149914014049b28ac85daeaa29f8e4
  size 92843394
attention_kindselective_n_heads4_seed1338/optimizer_08749.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:70de7c3fffdc7ca40ec0743f8447c426842a43ad5a3f91f2f8dfdeb161f89769
+ oid sha256:cc5c4c2a91810fed0268ecc62c25629b3dbf30047fa030d380998253c008ba02
  size 179406214
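Per the commit message, the folder was uploaded with `huggingface_hub`, so the two `.pt` files above are stored as Git LFS pointers rather than raw bytes. A hedged sketch of fetching the updated checkpoint pinned to this commit with `hf_hub_download`; the repo id is a placeholder, since it is not shown in this diff:

```python
import torch
from huggingface_hub import hf_hub_download

# Download the model checkpoint at this commit. The repo_id below is a
# placeholder -- substitute the actual namespace/name of this repository.
path = hf_hub_download(
    repo_id="<namespace>/<repo>",
    filename="attention_kindselective_n_heads4_seed1338/model_08749.pt",
    revision="50bcb04",  # commit shown above; the full hash may be required
)
state_dict = torch.load(path, map_location="cpu")
```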