andrew-healey committed
Commit e5ba3f9 · verified · 1 Parent(s): 8f0b6c9

Upload folder using huggingface_hub
attention_kindselective_n_heads2_seed1341/args.json CHANGED
@@ -1 +1 @@
- {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_6/attention_kindselective_n_heads2_seed1341", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 2, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_6", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1341, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 0.0001, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "10e-5_10240_2_1341", "n_embd": 128}
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_6/attention_kindselective_n_heads2_seed1341", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 2, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_6", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1341, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 0.00015, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "15e-5_10240_2_1341", "n_embd": 128}
attention_kindselective_n_heads2_seed1341/log2.txt CHANGED
@@ -1,3 +1,495 @@
1
  max_steps: 10000
2
+ 1500 val loss 7.3014
3
+ 1500 val perplexity 1482.3606
4
+ 1500 train 7.226598 (lr=9.6149e-05) (hash(x)=49016270)
5
+ 1600 val loss 7.2945
6
+ 1600 val perplexity 1472.1656
7
+ 1600 train 7.086260 (lr=9.5544e-05) (hash(x)=46100488)
8
  0 val loss 11.7712
9
  0 val perplexity 129466.1094
10
+ 1700 val loss 7.2754
11
+ 1700 val perplexity 1444.2773
12
+ 1700 train 7.331821 (lr=9.4897e-05) (hash(x)=49185350)
13
+ 1800 val loss 7.2540
14
+ 1800 val perplexity 1413.7415
15
+ 1800 train 7.142362 (lr=9.4209e-05) (hash(x)=48024574)
16
+ 1900 val loss 7.2813
17
+ 1900 val perplexity 1452.8243
18
+ 1900 train 7.092393 (lr=9.3481e-05) (hash(x)=45823189)
19
+ 0 train 11.770625 (lr=7.5000e-07) (hash(x)=47078120)
20
+ 2000 val loss 7.2275
21
+ 2000 val perplexity 1376.7460
22
+ 2000 train 7.047328 (lr=9.2714e-05) (hash(x)=45703932)
23
+ 100 val loss 9.4343
24
+ 100 val perplexity 12510.2705
25
+ 100 train 9.394177 (lr=7.5750e-05) (hash(x)=43429388)
26
+ 2100 val loss 7.1969
27
+ 2100 val perplexity 1335.3459
28
+ 2100 train 7.812517 (lr=9.1908e-05) (hash(x)=58570170)
29
+ 200 val loss 7.8396
30
+ 200 val perplexity 2539.0632
31
+ 200 train 8.055985 (lr=1.5000e-04) (hash(x)=52929681)
32
+ 2200 val loss 7.1753
33
+ 2200 val perplexity 1306.7522
34
+ 2200 train 7.213409 (lr=9.1064e-05) (hash(x)=55262880)
35
+ 300 val loss 7.6860
36
+ 300 val perplexity 2177.5486
37
+ 300 train 7.698088 (lr=1.4997e-04) (hash(x)=49930367)
38
+ 2300 val loss 7.1819
39
+ 2300 val perplexity 1315.4077
40
+ 2300 train 6.962734 (lr=9.0182e-05) (hash(x)=46415497)
41
+ 400 val loss 7.6539
42
+ 400 val perplexity 2108.8242
43
+ 2400 val loss 7.1252
44
+ 2400 val perplexity 1242.9492
45
+ 400 train 7.473444 (lr=1.4986e-04) (hash(x)=48542946)
46
+ 2400 train 7.056499 (lr=8.9265e-05) (hash(x)=49272278)
47
+ 2500 val loss 7.0784
48
+ 2500 val perplexity 1186.0667
49
+ 2500 train 6.929870 (lr=8.8313e-05) (hash(x)=48390803)
50
+ 500 val loss 7.5836
51
+ 500 val perplexity 1965.7809
52
+ 500 train 7.809551 (lr=1.4969e-04) (hash(x)=55286048)
53
+ 2600 val loss 7.0476
54
+ 2600 val perplexity 1150.1053
55
+ 2600 train 6.941715 (lr=8.7326e-05) (hash(x)=47450116)
56
+ 600 val loss 7.5162
57
+ 600 val perplexity 1837.5521
58
+ 600 train 7.756805 (lr=1.4945e-04) (hash(x)=51577760)
59
+ 2700 val loss 7.0309
60
+ 2700 val perplexity 1131.0837
61
+ 2700 train 7.128078 (lr=8.6306e-05) (hash(x)=52681152)
62
+ 700 val loss 7.4236
63
+ 700 val perplexity 1675.0077
64
+ 700 train 7.572020 (lr=1.4913e-04) (hash(x)=57433471)
65
+ 2800 val loss 6.9949
66
+ 2800 val perplexity 1091.0746
67
+ 2800 train 7.043819 (lr=8.5254e-05) (hash(x)=50664094)
68
+ 800 val loss 7.3456
69
+ 800 val perplexity 1549.4303
70
+ 800 train 7.255013 (lr=1.4876e-04) (hash(x)=49799291)
71
+ 2900 val loss 6.9631
72
+ 2900 val perplexity 1056.9043
73
+ 2900 train 6.776724 (lr=8.4170e-05) (hash(x)=47067144)
74
+ 900 val loss 7.2976
75
+ 900 val perplexity 1476.7948
76
+ 900 train 7.272717 (lr=1.4831e-04) (hash(x)=49502839)
77
+ 3000 val loss 6.9730
78
+ 3000 val perplexity 1067.3795
79
+ 3000 train 6.783483 (lr=8.3057e-05) (hash(x)=45015009)
80
+ 1000 val loss 7.2546
81
+ 1000 val perplexity 1414.6605
82
+ 1000 train 7.555295 (lr=1.4779e-04) (hash(x)=51142904)
83
+ 3100 val loss 6.9279
84
+ 3100 val perplexity 1020.3347
85
+ 3100 train 6.763106 (lr=8.1915e-05) (hash(x)=45245896)
86
+ 1100 val loss 7.2519
87
+ 1100 val perplexity 1410.7919
88
+ 1100 train 7.300293 (lr=1.4721e-04) (hash(x)=52751086)
89
+ 3200 val loss 6.9114
90
+ 3200 val perplexity 1003.6149
91
+ 3200 train 6.848857 (lr=8.0745e-05) (hash(x)=49995942)
92
+ 1200 val loss 7.2147
93
+ 1200 val perplexity 1359.3206
94
+ 1200 train 7.214197 (lr=1.4656e-04) (hash(x)=51538621)
95
+ 3300 val loss 6.8949
96
+ 3300 val perplexity 987.2419
97
+ 3300 train 6.779411 (lr=7.9549e-05) (hash(x)=52311504)
98
+ 3400 val loss 6.8761
99
+ 3400 val perplexity 968.8881
100
+ 3400 train 6.842221 (lr=7.8328e-05) (hash(x)=44332917)
101
+ 1300 val loss 7.1409
102
+ 1300 val perplexity 1262.5270
103
+ 1300 train 7.198671 (lr=1.4585e-04) (hash(x)=52034040)
104
+ 3500 val loss 6.8513
105
+ 3500 val perplexity 945.1112
106
+ 3500 train 6.955316 (lr=7.7082e-05) (hash(x)=56517159)
107
+ 1400 val loss 7.1240
108
+ 1400 val perplexity 1241.3577
109
+ 1400 train 7.189587 (lr=1.4507e-04) (hash(x)=50640105)
110
+ 3600 val loss 6.8735
111
+ 3600 val perplexity 966.3168
112
+ 3600 train 6.768483 (lr=7.5814e-05) (hash(x)=50720920)
113
+ 1500 val loss 7.0398
114
+ 1500 val perplexity 1141.1345
115
+ 1500 train 6.955029 (lr=1.4422e-04) (hash(x)=49016270)
116
+ 3700 val loss 6.8349
117
+ 3700 val perplexity 929.7078
118
+ 3700 train 7.343882 (lr=7.4525e-05) (hash(x)=62727701)
119
+ 1600 val loss 6.9730
120
+ 1600 val perplexity 1067.4009
121
+ 1600 train 6.752039 (lr=1.4332e-04) (hash(x)=46100488)
122
+ 3800 val loss 6.8337
123
+ 3800 val perplexity 928.6059
124
+ 3800 train 6.693852 (lr=7.3215e-05) (hash(x)=54772539)
125
+ 3900 val loss 6.8087
126
+ 3900 val perplexity 905.6780
127
+ 3900 train 6.816524 (lr=7.1887e-05) (hash(x)=52274485)
128
+ 1700 val loss 6.9253
129
+ 1700 val perplexity 1017.7491
130
+ 1700 train 6.972584 (lr=1.4235e-04) (hash(x)=49185350)
131
+ 4000 val loss 6.7848
132
+ 4000 val perplexity 884.2670
133
+ 4000 train 6.490056 (lr=7.0541e-05) (hash(x)=50118307)
134
+ 1800 val loss 6.9016
135
+ 1800 val perplexity 993.8886
136
+ 1800 train 6.798162 (lr=1.4131e-04) (hash(x)=48024574)
137
+ 4100 val loss 6.7726
138
+ 4100 val perplexity 873.5496
139
+ 4100 train 6.365652 (lr=6.9180e-05) (hash(x)=42771647)
140
+ 1900 val loss 6.8735
141
+ 1900 val perplexity 966.3652
142
+ 1900 train 6.644013 (lr=1.4022e-04) (hash(x)=45823189)
143
+ 4200 val loss 6.7686
144
+ 4200 val perplexity 870.0539
145
+ 4200 train 6.847145 (lr=6.7804e-05) (hash(x)=51748836)
146
+ 2000 val loss 6.8751
147
+ 2000 val perplexity 967.8617
148
+ 2000 train 6.675347 (lr=1.3907e-04) (hash(x)=45703932)
149
+ 4300 val loss 6.7499
150
+ 4300 val perplexity 853.9985
151
+ 4300 train 6.751280 (lr=6.6414e-05) (hash(x)=49021280)
152
+ 2100 val loss 6.8334
153
+ 2100 val perplexity 928.3230
154
+ 2100 train 7.513886 (lr=1.3786e-04) (hash(x)=58570170)
155
+ 4400 val loss 6.7124
156
+ 4400 val perplexity 822.5333
157
+ 4400 train 6.766258 (lr=6.5013e-05) (hash(x)=55200309)
158
+ 2200 val loss 6.8604
159
+ 2200 val perplexity 953.7519
160
+ 2200 train 6.891486 (lr=1.3660e-04) (hash(x)=55262880)
161
+ 4500 val loss 6.7050
162
+ 4500 val perplexity 816.5079
163
+ 4500 train 6.752545 (lr=6.3602e-05) (hash(x)=52085049)
164
+ 2300 val loss 6.8071
165
+ 2300 val perplexity 904.2579
166
+ 2300 train 6.533747 (lr=1.3527e-04) (hash(x)=46415497)
167
+ 4600 val loss 6.6968
168
+ 4600 val perplexity 809.8085
169
+ 4600 train 6.673974 (lr=6.2182e-05) (hash(x)=48935595)
170
+ 2400 val loss 6.8048
171
+ 2400 val perplexity 902.1807
172
+ 2400 train 6.739892 (lr=1.3390e-04) (hash(x)=49272278)
173
+ 4700 val loss 6.6875
174
+ 4700 val perplexity 802.3204
175
+ 4700 train 6.886833 (lr=6.0754e-05) (hash(x)=49182380)
176
+ 2500 val loss 6.8094
177
+ 2500 val perplexity 906.3088
178
+ 2500 train 6.669321 (lr=1.3247e-04) (hash(x)=48390803)
179
+ 4800 val loss 6.6605
180
+ 4800 val perplexity 780.9518
181
+ 4800 train 6.491550 (lr=5.9321e-05) (hash(x)=43941929)
182
+ 2600 val loss 6.7963
183
+ 2600 val perplexity 894.5168
184
+ 2600 train 6.670294 (lr=1.3099e-04) (hash(x)=47450116)
185
+ 4900 val loss 6.6521
186
+ 4900 val perplexity 774.4189
187
+ 4900 train 6.824918 (lr=5.7883e-05) (hash(x)=51852773)
188
+ 2700 val loss 6.7732
189
+ 2700 val perplexity 874.0683
190
+ 2700 train 6.866146 (lr=1.2946e-04) (hash(x)=52681152)
191
+ 5000 val loss 6.6418
192
+ 5000 val perplexity 766.5018
193
+ 5000 train 6.268013 (lr=5.6442e-05) (hash(x)=40509616)
194
+ 2800 val loss 6.7278
195
+ 2800 val perplexity 835.3218
196
+ 2800 train 6.812080 (lr=1.2788e-04) (hash(x)=50664094)
197
+ 5100 val loss 6.6423
198
+ 5100 val perplexity 766.8707
199
+ 5100 train 6.882291 (lr=5.5000e-05) (hash(x)=57585369)
200
+ 2900 val loss 6.7396
201
+ 2900 val perplexity 845.1917
202
+ 2900 train 6.555730 (lr=1.2626e-04) (hash(x)=47067144)
203
+ 5200 val loss 6.6396
204
+ 5200 val perplexity 764.7626
205
+ 5200 train 6.626321 (lr=5.3558e-05) (hash(x)=51042313)
206
+ 3000 val loss 6.7224
207
+ 3000 val perplexity 830.8346
208
+ 3000 train 6.528443 (lr=1.2459e-04) (hash(x)=45015009)
209
+ 5300 val loss 6.6180
210
+ 5300 val perplexity 748.4346
211
+ 5300 train 6.767743 (lr=5.2117e-05) (hash(x)=52001684)
212
+ 3100 val loss 6.7166
213
+ 3100 val perplexity 826.0220
214
+ 3100 train 6.519020 (lr=1.2287e-04) (hash(x)=45245896)
215
+ 5400 val loss 6.6201
216
+ 5400 val perplexity 750.0352
217
+ 5400 train 6.522018 (lr=5.0679e-05) (hash(x)=48831647)
218
+ 3200 val loss 6.7114
219
+ 3200 val perplexity 821.6827
220
+ 3200 train 6.646773 (lr=1.2112e-04) (hash(x)=49995942)
221
+ 5500 val loss 6.6321
222
+ 5500 val perplexity 759.0754
223
+ 5500 train 7.037599 (lr=4.9246e-05) (hash(x)=50192069)
224
+ 3300 val loss 6.6904
225
+ 3300 val perplexity 804.6768
226
+ 3300 train 6.555431 (lr=1.1932e-04) (hash(x)=52311504)
227
+ 5600 val loss 6.6034
228
+ 5600 val perplexity 737.5775
229
+ 5600 train 6.437107 (lr=4.7818e-05) (hash(x)=47208852)
230
+ 3400 val loss 6.6894
231
+ 3400 val perplexity 803.8519
232
+ 3400 train 6.636265 (lr=1.1749e-04) (hash(x)=44332917)
233
+ 5700 val loss 6.6001
234
+ 5700 val perplexity 735.1389
235
+ 5700 train 6.192906 (lr=4.6398e-05) (hash(x)=44061694)
236
+ 3500 val loss 6.6821
237
+ 3500 val perplexity 798.0287
238
+ 3500 train 6.763896 (lr=1.1562e-04) (hash(x)=56517159)
239
+ 5800 val loss 6.5920
240
+ 5800 val perplexity 729.2386
241
+ 5800 train 6.948812 (lr=4.4987e-05) (hash(x)=56513279)
242
+ 5900 val loss 6.5843
243
+ 5900 val perplexity 723.6619
244
+ 3600 val loss 6.6827
245
+ 3600 val perplexity 798.4920
246
+ 5900 train 6.729167 (lr=4.3586e-05) (hash(x)=50412818)
247
+ 3600 train 6.536693 (lr=1.1372e-04) (hash(x)=50720920)
248
+ 6000 val loss 6.5736
249
+ 6000 val perplexity 715.9265
250
+ 6000 train 6.400131 (lr=4.2196e-05) (hash(x)=47159634)
251
+ 3700 val loss 6.6724
252
+ 3700 val perplexity 790.3077
253
+ 3700 train 7.203786 (lr=1.1179e-04) (hash(x)=62727701)
254
+ 6100 val loss 6.5569
255
+ 6100 val perplexity 704.1142
256
+ 6100 train 6.635352 (lr=4.0820e-05) (hash(x)=54312795)
257
+ 3800 val loss 6.6596
258
+ 3800 val perplexity 780.2100
259
+ 3800 train 6.530156 (lr=1.0982e-04) (hash(x)=54772539)
260
+ 6200 val loss 6.5587
261
+ 6200 val perplexity 705.3464
262
+ 6200 train 6.674115 (lr=3.9459e-05) (hash(x)=54187587)
263
+ 3900 val loss 6.6374
264
+ 3900 val perplexity 763.1033
265
+ 3900 train 6.637521 (lr=1.0783e-04) (hash(x)=52274485)
266
+ 6300 val loss 6.5444
267
+ 6300 val perplexity 695.3438
268
+ 6300 train 6.631743 (lr=3.8113e-05) (hash(x)=53620387)
269
+ 4000 val loss 6.6360
270
+ 4000 val perplexity 762.0699
271
+ 4000 train 6.343906 (lr=1.0581e-04) (hash(x)=50118307)
272
+ 6400 val loss 6.5333
273
+ 6400 val perplexity 687.6434
274
+ 6400 train 6.477994 (lr=3.6785e-05) (hash(x)=48761774)
275
+ 4100 val loss 6.6389
276
+ 4100 val perplexity 764.2654
277
+ 4100 train 6.220299 (lr=1.0377e-04) (hash(x)=42771647)
278
+ 6500 val loss 6.5243
279
+ 6500 val perplexity 681.5228
280
+ 6500 train 6.738882 (lr=3.5475e-05) (hash(x)=56690281)
281
+ 4200 val loss 6.6299
282
+ 4200 val perplexity 757.4057
283
+ 4200 train 6.721921 (lr=1.0171e-04) (hash(x)=51748836)
284
+ 6600 val loss 6.5221
285
+ 6600 val perplexity 679.9893
286
+ 6600 train 6.334319 (lr=3.4186e-05) (hash(x)=42985269)
287
+ 4300 val loss 6.6093
288
+ 4300 val perplexity 741.9653
289
+ 4300 train 6.632129 (lr=9.9622e-05) (hash(x)=49021280)
290
+ 6700 val loss 6.5310
291
+ 6700 val perplexity 686.0870
292
+ 6700 train 6.588527 (lr=3.2918e-05) (hash(x)=53315447)
293
+ 4400 val loss 6.5829
294
+ 4400 val perplexity 722.6357
295
+ 4400 train 6.619546 (lr=9.7520e-05) (hash(x)=55200309)
296
+ 6800 val loss 6.5043
297
+ 6800 val perplexity 667.9774
298
+ 6800 train 6.952129 (lr=3.1672e-05) (hash(x)=61577166)
299
+ 4500 val loss 6.5829
300
+ 4500 val perplexity 722.6595
301
+ 4500 train 6.642871 (lr=9.5403e-05) (hash(x)=52085049)
302
+ 6900 val loss 6.5010
303
+ 6900 val perplexity 665.8378
304
+ 6900 train 6.649751 (lr=3.0451e-05) (hash(x)=54641005)
305
+ 4600 val loss 6.5709
306
+ 4600 val perplexity 714.0108
307
+ 4600 train 6.567044 (lr=9.3273e-05) (hash(x)=48935595)
308
+ 7000 val loss 6.4974
309
+ 7000 val perplexity 663.4460
310
+ 7000 train 7.009356 (lr=2.9255e-05) (hash(x)=60579512)
311
+ 4700 val loss 6.5487
312
+ 4700 val perplexity 698.3546
313
+ 4700 train 6.793222 (lr=9.1132e-05) (hash(x)=49182380)
314
+ 7100 val loss 6.4923
315
+ 7100 val perplexity 660.0073
316
+ 7100 train 6.391061 (lr=2.8085e-05) (hash(x)=53151549)
317
+ 4800 val loss 6.5477
318
+ 4800 val perplexity 697.6464
319
+ 4800 train 6.381069 (lr=8.8982e-05) (hash(x)=43941929)
320
+ 7200 val loss 6.4895
321
+ 7200 val perplexity 658.1876
322
+ 7200 train 7.420212 (lr=2.6943e-05) (hash(x)=71842455)
323
+ 4900 val loss 6.5418
324
+ 4900 val perplexity 693.5468
325
+ 4900 train 6.709642 (lr=8.6825e-05) (hash(x)=51852773)
326
+ 7300 val loss 6.4850
327
+ 7300 val perplexity 655.2300
328
+ 7300 train 6.246530 (lr=2.5830e-05) (hash(x)=44516452)
329
+ 5000 val loss 6.5309
330
+ 5000 val perplexity 686.0157
331
+ 5000 train 6.152505 (lr=8.4663e-05) (hash(x)=40509616)
332
+ 7400 val loss 6.4924
333
+ 7400 val perplexity 660.0942
334
+ 7400 train 6.132852 (lr=2.4746e-05) (hash(x)=42667710)
335
+ 5100 val loss 6.5262
336
+ 5100 val perplexity 682.7995
337
+ 5100 train 6.765361 (lr=8.2500e-05) (hash(x)=57585369)
338
+ 7500 val loss 6.4816
339
+ 7500 val perplexity 653.0245
340
+ 7500 train 6.229507 (lr=2.3694e-05) (hash(x)=47050797)
341
+ 5200 val loss 6.5454
342
+ 5200 val perplexity 696.0334
343
+ 5200 train 6.526903 (lr=8.0337e-05) (hash(x)=51042313)
344
+ 7600 val loss 6.4780
345
+ 7600 val perplexity 650.6663
346
+ 7600 train 6.379726 (lr=2.2674e-05) (hash(x)=49785056)
347
+ 5300 val loss 6.5289
348
+ 5300 val perplexity 684.6187
349
+ 5300 train 6.668643 (lr=7.8175e-05) (hash(x)=52001684)
350
+ 7700 val loss 6.4767
351
+ 7700 val perplexity 649.8347
352
+ 7700 train 6.284160 (lr=2.1687e-05) (hash(x)=53232030)
353
+ 5400 val loss 6.5403
354
+ 5400 val perplexity 692.4857
355
+ 5400 train 6.453519 (lr=7.6018e-05) (hash(x)=48831647)
356
+ 7800 val loss 6.4654
357
+ 7800 val perplexity 642.5144
358
+ 7800 train 6.285869 (lr=2.0735e-05) (hash(x)=48049749)
359
+ 5500 val loss 6.5345
360
+ 5500 val perplexity 688.4803
361
+ 5500 train 6.949864 (lr=7.3868e-05) (hash(x)=50192069)
362
+ 7900 val loss 6.4628
363
+ 7900 val perplexity 640.8224
364
+ 7900 train 6.296690 (lr=1.9818e-05) (hash(x)=44768513)
365
+ 5600 val loss 6.5215
366
+ 5600 val perplexity 679.6085
367
+ 5600 train 6.368748 (lr=7.1727e-05) (hash(x)=47208852)
368
+ 8000 val loss 6.4540
369
+ 8000 val perplexity 635.2330
370
+ 8000 train 6.295214 (lr=1.8936e-05) (hash(x)=46228039)
371
+ 5700 val loss 6.5209
372
+ 5700 val perplexity 679.1575
373
+ 5700 train 6.097320 (lr=6.9597e-05) (hash(x)=44061694)
374
+ 8100 val loss 6.4509
375
+ 8100 val perplexity 633.2829
376
+ 8100 train 6.780039 (lr=1.8092e-05) (hash(x)=60017091)
377
+ 5800 val loss 6.5331
378
+ 5800 val perplexity 687.5231
379
+ 5800 train 6.887940 (lr=6.7480e-05) (hash(x)=56513279)
380
+ 8200 val loss 6.4417
381
+ 8200 val perplexity 627.4558
382
+ 8200 train 6.451499 (lr=1.7286e-05) (hash(x)=49910198)
383
+ 5900 val loss 6.5202
384
+ 5900 val perplexity 678.6848
385
+ 8300 val loss 6.4402
386
+ 8300 val perplexity 626.5251
387
+ 5900 train 6.650718 (lr=6.5378e-05) (hash(x)=50412818)
388
+ 8300 train 6.745976 (lr=1.6519e-05) (hash(x)=57919055)
389
+ 8400 val loss 6.4405
390
+ 8400 val perplexity 626.7328
391
+ 8400 train 6.515765 (lr=1.5791e-05) (hash(x)=49694964)
392
+ 6000 val loss 6.5113
393
+ 6000 val perplexity 672.6966
394
+ 6000 train 6.349458 (lr=6.3294e-05) (hash(x)=47159634)
395
+ 8500 val loss 6.4337
396
+ 8500 val perplexity 622.4640
397
+ 8500 train 6.469025 (lr=1.5103e-05) (hash(x)=53762585)
398
+ 6100 val loss 6.4893
399
+ 6100 val perplexity 658.0756
400
+ 6100 train 6.571654 (lr=6.1230e-05) (hash(x)=54312795)
401
+ 8600 val loss 6.4334
402
+ 8600 val perplexity 622.3082
403
+ 8600 train 6.465587 (lr=1.4456e-05) (hash(x)=51166973)
404
+ 6200 val loss 6.4849
405
+ 6200 val perplexity 655.2000
406
+ 6200 train 6.599206 (lr=5.9188e-05) (hash(x)=54187587)
407
+ 8700 val loss 6.4311
408
+ 8700 val perplexity 620.8644
409
+ 8700 train 6.505373 (lr=1.3851e-05) (hash(x)=53968049)
410
+ 6300 val loss 6.4821
411
+ 6300 val perplexity 653.3412
412
+ 6300 train 6.581801 (lr=5.7169e-05) (hash(x)=53620387)
413
+ 8800 val loss 6.4265
414
+ 8800 val perplexity 617.9882
415
+ 8800 train 6.509236 (lr=1.3289e-05) (hash(x)=59231056)
416
+ 6400 val loss 6.4599
417
+ 6400 val perplexity 638.9672
418
+ 6400 train 6.395986 (lr=5.5177e-05) (hash(x)=48761774)
419
+ 8900 val loss 6.4245
420
+ 8900 val perplexity 616.7817
421
+ 8900 train 6.301777 (lr=1.2769e-05) (hash(x)=50488048)
422
+ 6500 val loss 6.4580
423
+ 6500 val perplexity 637.7895
424
+ 6500 train 6.647865 (lr=5.3213e-05) (hash(x)=56690281)
425
+ 9000 val loss 6.4237
426
+ 9000 val perplexity 616.2996
427
+ 9000 train 6.162522 (lr=1.2292e-05) (hash(x)=44492956)
428
+ 6600 val loss 6.4526
429
+ 6600 val perplexity 634.3764
430
+ 6600 train 6.277388 (lr=5.1279e-05) (hash(x)=42985269)
431
+ 9100 val loss 6.4321
432
+ 9100 val perplexity 621.4722
433
+ 9100 train 6.460136 (lr=1.1860e-05) (hash(x)=51134989)
434
+ 6700 val loss 6.4537
435
+ 6700 val perplexity 635.0168
436
+ 6700 train 6.495444 (lr=4.9377e-05) (hash(x)=53315447)
437
+ 9200 val loss 6.4255
438
+ 9200 val perplexity 617.3776
439
+ 9200 train 6.232402 (lr=1.1472e-05) (hash(x)=48636056)
440
+ 6800 val loss 6.4483
441
+ 6800 val perplexity 631.6550
442
+ 6800 train 6.872110 (lr=4.7509e-05) (hash(x)=61577166)
443
+ 9300 val loss 6.4248
444
+ 9300 val perplexity 616.9818
445
+ 9300 train 6.371828 (lr=1.1128e-05) (hash(x)=50200551)
446
+ 6900 val loss 6.4408
447
+ 6900 val perplexity 626.9297
448
+ 6900 train 6.579897 (lr=4.5676e-05) (hash(x)=54641005)
449
+ 9400 val loss 6.4224
450
+ 9400 val perplexity 615.4691
451
+ 9400 train 6.256755 (lr=1.0830e-05) (hash(x)=48057228)
452
+ 7000 val loss 6.4378
453
+ 7000 val perplexity 625.0102
454
+ 7000 train 6.968445 (lr=4.3882e-05) (hash(x)=60579512)
455
+ 9500 val loss 6.4180
456
+ 9500 val perplexity 612.8030
457
+ 9500 train 6.189470 (lr=1.0577e-05) (hash(x)=48125171)
458
+ 7100 val loss 6.4326
459
+ 7100 val perplexity 621.8027
460
+ 7100 train 6.320305 (lr=4.2128e-05) (hash(x)=53151549)
461
+ 9600 val loss 6.4210
462
+ 9600 val perplexity 614.6295
463
+ 9600 train 6.340978 (lr=1.0369e-05) (hash(x)=53375853)
464
+ 7200 val loss 6.4300
465
+ 7200 val perplexity 620.1948
466
+ 7200 train 7.380653 (lr=4.0414e-05) (hash(x)=71842455)
467
+ 9700 val loss 6.4135
468
+ 9700 val perplexity 610.0555
469
+ 9700 train 7.223229 (lr=1.0208e-05) (hash(x)=53924631)
470
+ 7300 val loss 6.4297
471
+ 7300 val perplexity 620.0109
472
+ 7300 train 6.191623 (lr=3.8745e-05) (hash(x)=44516452)
473
+ 9800 val loss 6.4102
474
+ 9800 val perplexity 608.0391
475
+ 9800 train 6.454649 (lr=1.0092e-05) (hash(x)=48895047)
476
+ 7400 val loss 6.4289
477
+ 7400 val perplexity 619.4763
478
+ 7400 train 6.074793 (lr=3.7120e-05) (hash(x)=42667710)
479
+ 9900 val loss 6.4071
480
+ 9900 val perplexity 606.1632
481
+ 9900 train 6.266374 (lr=1.0023e-05) (hash(x)=44269923)
482
+ 7500 val loss 6.4316
483
+ 7500 val perplexity 621.1490
484
+ 7500 train 6.169018 (lr=3.5541e-05) (hash(x)=47050797)
485
+ 9999 val loss 6.4119
486
+ 9999 val perplexity 609.0312
487
+ 7600 val loss 6.4237
488
+ 7600 val perplexity 616.2653
489
+ 7600 train 6.329450 (lr=3.4011e-05) (hash(x)=49785056)
490
+ 7700 val loss 6.4194
491
+ 7700 val perplexity 613.6569
492
+ 7700 train 6.231349 (lr=3.2531e-05) (hash(x)=53232030)
493
+ 7800 val loss 6.4186
494
+ 7800 val perplexity 613.1287
495
+ 7800 train 6.243061 (lr=3.1102e-05) (hash(x)=48049749)
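Throughout the appended log, each reported val perplexity tracks the exponential of the corresponding val loss: exp(11.7712) ≈ 129466 at step 0 and exp(6.4119) ≈ 609 at step 9999, matching the logged values up to rounding. A minimal sketch for checking that relationship over a plain local copy of log2.txt (assuming, as the numbers suggest, that perplexity is computed as exp(loss)):

```python
import math
import re

pattern = re.compile(r"(\d+) val (loss|perplexity) ([\d.]+)")
pending = {}   # step -> most recently seen val loss for that step
checked = 0

with open("log2.txt") as f:  # plain local copy of the log shown above
    for line in f:
        m = pattern.match(line)
        if not m:
            continue  # skip "max_steps" and "train" lines
        step, kind, value = int(m.group(1)), m.group(2), float(m.group(3))
        if kind == "loss":
            pending[step] = value
        elif step in pending:
            # Perplexity should equal exp(loss) up to the 4-decimal rounding in the log.
            assert math.isclose(math.exp(pending.pop(step)), value, rel_tol=1e-3), step
            checked += 1

print(f"exp(val loss) matches val perplexity at {checked} evaluation points")
```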
attention_kindselective_n_heads2_seed1341/model_02500.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:770772d03e0266a5c13e9dd699e1d8542d7e85d6fc0ca58c6198e7ffb1fd50d5
+ oid sha256:f81c5305e28766ffe17b41d892c4195cc93a4d18969824c3a0f91c8db6cf9c61
  size 38587970
attention_kindselective_n_heads2_seed1341/model_05000.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:eae1983e9e16868647435651d905f85497a66f8a826b7be5246cb3a2f33b91d6
+ oid sha256:f1981d296a42527ba71384319070268006aa51ed1ce3f34d492953367c7d791d
  size 38587970
attention_kindselective_n_heads2_seed1341/model_07500.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cb6c178c3fdcbf43ee518d9bd77f7a3e5de3634668cee9c5492f260f40c7ec9c
+ oid sha256:1a38e1ba00e798ca0b0a8d8e9524e6b25782fda42839c6c43fc8de932eaaa2f8
  size 38587970
attention_kindselective_n_heads2_seed1341/model_09999.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8a9c526ea9748d4265486e8e5cf4f2e92786d25f160ec94c33eaf1d85b3fb394
+ oid sha256:9a259b68f4b982bba61e3791426a03173b6b6d7cf182ef648db0331096494d40
  size 38587970
attention_kindselective_n_heads2_seed1341/optimizer_02500.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5da108cf7f292280f70d1ac7c1ea29e4f23fdd5281ff47d321797c160fd27319
+ oid sha256:fbe352394c98f7aeb470becf95bc3d2fadc1b26eaace9a300ecb094670cdc944
  size 70895430
attention_kindselective_n_heads2_seed1341/optimizer_05000.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d305362d87fc48b343eba618199eed878c187e6eeeaf203e0d43d014df56b50d
+ oid sha256:1b8de7fcbdc412e17dba8f4280f46014e7504b0ac36b8318f82ce0046bc68161
  size 70895430
attention_kindselective_n_heads2_seed1341/optimizer_07500.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:25ca1280e09913927209fd304d196bcd3f957063396e0e06ee1f2b6e0d41af19
+ oid sha256:9d1fdfd4cbdb5969528ced4c2cc8e4fab4c83f4ad557140cab2cdaecb053a8fc
  size 70895430
attention_kindselective_n_heads2_seed1341/optimizer_09999.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1134e0ac55901d13e7ee3307ec72f3f8085485b36581ad3855f9598d346b1fd4
+ oid sha256:dcc01240e1af4fab2e549d468dffd2af1d74e3d805146895fa6e76f4aea91cae
  size 70895430