andrew-healey commited on
Commit
31410e6
·
verified ·
1 Parent(s): d66c2e9

Upload folder using huggingface_hub

Browse files
attention_kindselective_n_heads4_seed1339/args.json CHANGED
@@ -1 +1 @@
1
- {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_9/attention_kindselective_n_heads4_seed1339", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_9", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1339, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 6e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "6e-5_61440_4_1339", "n_embd": 256}
 
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_9/attention_kindselective_n_heads4_seed1339", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_9", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1339, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 4e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "4e-5_61440_4_1339", "n_embd": 256}
attention_kindselective_n_heads4_seed1339/log2.txt CHANGED
@@ -1,267 +1,441 @@
1
  max_steps: 8750
2
  0 val loss 11.2808
3
  0 val perplexity 79287.6797
4
- 0 train 11.289339 (lr=1.2000e-07) (hash(x)=150724848)
5
- 100 val loss 9.9708
6
- 100 val perplexity 21393.6426
7
- 100 train 9.984427 (lr=1.2120e-05) (hash(x)=149910534)
8
- 200 val loss 8.9782
9
- 200 val perplexity 7928.1284
10
- 200 train 9.005048 (lr=2.4120e-05) (hash(x)=148123706)
11
- 300 val loss 7.7778
12
- 300 val perplexity 2386.9246
13
- 300 train 7.620437 (lr=3.6120e-05) (hash(x)=146678221)
14
- 400 val loss 7.4544
15
- 400 val perplexity 1727.4391
16
- 400 train 7.335504 (lr=4.8120e-05) (hash(x)=151700982)
17
- 500 val loss 7.2946
18
- 500 val perplexity 1472.3665
19
- 500 train 7.381680 (lr=6.0000e-05) (hash(x)=156182087)
20
- 600 val loss 7.1667
21
- 600 val perplexity 1295.6157
22
- 600 train 7.135485 (lr=5.9980e-05) (hash(x)=149318660)
23
- 700 val loss 7.0523
24
- 700 val perplexity 1155.4626
25
- 700 train 7.087841 (lr=5.9922e-05) (hash(x)=150482428)
26
- 800 val loss 6.9016
27
- 800 val perplexity 993.8256
28
- 800 train 6.779600 (lr=5.9824e-05) (hash(x)=143268605)
29
- 900 val loss 6.7801
30
- 900 val perplexity 880.1960
31
- 900 train 6.748869 (lr=5.9687e-05) (hash(x)=152322423)
32
- 1000 val loss 6.6820
33
- 1000 val perplexity 797.9325
34
- 1000 train 6.548263 (lr=5.9512e-05) (hash(x)=147904298)
35
- 1100 val loss 6.5889
36
- 1100 val perplexity 726.9638
37
- 1100 train 6.798097 (lr=5.9298e-05) (hash(x)=154343147)
38
- 1200 val loss 6.4981
39
- 1200 val perplexity 663.8964
40
- 1200 train 6.485293 (lr=5.9046e-05) (hash(x)=141843115)
41
- 1300 val loss 6.4315
42
- 1300 val perplexity 621.1253
43
- 1300 train 6.326127 (lr=5.8757e-05) (hash(x)=145279030)
44
- 1400 val loss 6.4002
45
- 1400 val perplexity 601.9705
46
- 1400 train 6.344112 (lr=5.8430e-05) (hash(x)=152507639)
47
- 1500 val loss 6.3592
48
- 1500 val perplexity 577.7610
49
- 1500 train 6.397813 (lr=5.8066e-05) (hash(x)=148473774)
50
- 1600 val loss 6.2977
51
- 1600 val perplexity 543.3112
52
- 1600 train 6.342596 (lr=5.7666e-05) (hash(x)=151117002)
53
- 1700 val loss 6.2554
54
- 1700 val perplexity 520.8115
55
- 1700 train 6.206963 (lr=5.7230e-05) (hash(x)=138011335)
56
- 1800 val loss 6.2183
57
- 1800 val perplexity 501.8493
58
- 1800 train 6.337492 (lr=5.6759e-05) (hash(x)=171180926)
59
- 1900 val loss 6.1788
60
- 1900 val perplexity 482.4208
61
- 1900 train 6.123930 (lr=5.6253e-05) (hash(x)=141769419)
62
- 2000 val loss 6.1479
63
- 2000 val perplexity 467.7460
64
- 2000 train 6.107029 (lr=5.5714e-05) (hash(x)=151963443)
65
- 2100 val loss 6.1301
66
- 2100 val perplexity 459.4699
67
- 2100 train 6.177304 (lr=5.5142e-05) (hash(x)=162947470)
68
- 2200 val loss 6.0852
69
- 2200 val perplexity 439.3070
70
- 2200 train 6.288651 (lr=5.4537e-05) (hash(x)=154954810)
71
- 2300 val loss 6.0757
72
- 2300 val perplexity 435.1639
73
- 2300 train 6.050261 (lr=5.3902e-05) (hash(x)=151878111)
74
- 2400 val loss 6.0571
75
- 2400 val perplexity 427.1534
76
- 2400 train 5.953463 (lr=5.3236e-05) (hash(x)=158661057)
77
- 2500 val loss 6.0207
78
- 2500 val perplexity 411.8731
79
- 2500 train 6.041787 (lr=5.2541e-05) (hash(x)=150925584)
80
- 2600 val loss 5.9995
81
- 2600 val perplexity 403.2292
82
- 2600 train 5.913114 (lr=5.1817e-05) (hash(x)=144515755)
83
- 2700 val loss 5.9761
84
- 2700 val perplexity 393.9177
85
- 2700 train 5.959197 (lr=5.1067e-05) (hash(x)=153109144)
86
- 2800 val loss 5.9473
87
- 2800 val perplexity 382.7029
88
- 2800 train 5.855392 (lr=5.0290e-05) (hash(x)=151152897)
89
- 2900 val loss 5.9254
90
- 2900 val perplexity 374.4294
91
- 2900 train 5.869029 (lr=4.9487e-05) (hash(x)=145800210)
92
- 3000 val loss 5.9079
93
- 3000 val perplexity 367.9445
94
- 3000 train 5.716622 (lr=4.8662e-05) (hash(x)=141997485)
95
- 3100 val loss 5.8772
96
- 3100 val perplexity 356.8146
97
- 3100 train 5.803132 (lr=4.7813e-05) (hash(x)=154049740)
98
- 3200 val loss 5.8575
99
- 3200 val perplexity 349.8481
100
- 3200 train 5.783772 (lr=4.6943e-05) (hash(x)=150471842)
101
- 3300 val loss 5.8503
102
- 3300 val perplexity 347.3422
103
- 3300 train 5.798738 (lr=4.6052e-05) (hash(x)=149048126)
104
- 3400 val loss 5.8291
105
- 3400 val perplexity 340.0508
106
- 3400 train 5.949715 (lr=4.5143e-05) (hash(x)=161261339)
107
- 3500 val loss 5.8089
108
- 3500 val perplexity 333.2634
109
- 3500 train 5.763816 (lr=4.4216e-05) (hash(x)=157495564)
110
- 3600 val loss 5.7943
111
- 3600 val perplexity 328.4197
112
- 3600 train 5.711343 (lr=4.3273e-05) (hash(x)=144352932)
113
- 3700 val loss 5.7753
114
- 3700 val perplexity 322.2256
115
- 3700 train 5.737307 (lr=4.2315e-05) (hash(x)=149389012)
116
- 3800 val loss 5.7588
117
- 3800 val perplexity 316.9792
118
- 3800 train 5.704242 (lr=4.1343e-05) (hash(x)=146607620)
119
- 3900 val loss 5.7450
120
- 3900 val perplexity 312.6133
121
- 3900 train 5.664580 (lr=4.0360e-05) (hash(x)=143754617)
122
- 4000 val loss 5.7298
123
- 4000 val perplexity 307.9217
124
- 4000 train 5.700660 (lr=3.9365e-05) (hash(x)=156930722)
125
- 4100 val loss 5.7148
126
- 4100 val perplexity 303.3377
127
- 4100 train 5.554720 (lr=3.8362e-05) (hash(x)=147500519)
128
- 4200 val loss 5.7047
129
- 4200 val perplexity 300.2655
130
- 4200 train 5.543267 (lr=3.7351e-05) (hash(x)=143232237)
131
- 4300 val loss 5.6875
132
- 4300 val perplexity 295.1630
133
- 4300 train 5.596540 (lr=3.6333e-05) (hash(x)=146811670)
134
- 4400 val loss 5.6755
135
- 4400 val perplexity 291.6416
136
- 4400 train 5.703570 (lr=3.5311e-05) (hash(x)=158418746)
137
- 4500 val loss 5.6685
138
- 4500 val perplexity 289.5956
139
- 4500 train 5.627564 (lr=3.4285e-05) (hash(x)=156695778)
140
- 4600 val loss 5.6525
141
- 4600 val perplexity 284.9904
142
- 4600 train 5.663891 (lr=3.3257e-05) (hash(x)=147791497)
143
- 4700 val loss 5.6407
144
- 4700 val perplexity 281.6539
145
- 4700 train 5.535069 (lr=3.2229e-05) (hash(x)=155533088)
146
- 4800 val loss 5.6311
147
- 4800 val perplexity 278.9583
148
- 4800 train 5.483893 (lr=3.1202e-05) (hash(x)=138350044)
149
- 4900 val loss 5.6160
150
- 4900 val perplexity 274.7812
151
- 4900 train 5.606287 (lr=3.0178e-05) (hash(x)=143735284)
152
- 5000 val loss 5.6043
153
- 5000 val perplexity 271.5802
154
- 5000 train 5.561104 (lr=2.9157e-05) (hash(x)=154976463)
155
- 5100 val loss 5.6031
156
- 5100 val perplexity 271.2580
157
- 5100 train 5.495253 (lr=2.8143e-05) (hash(x)=149894982)
158
- 5200 val loss 5.5930
159
- 5200 val perplexity 268.5366
160
- 5200 train 5.358963 (lr=2.7135e-05) (hash(x)=159326689)
161
- 5300 val loss 5.5795
162
- 5300 val perplexity 264.9523
163
- 5300 train 5.596076 (lr=2.6136e-05) (hash(x)=159484800)
164
- 5400 val loss 5.5655
165
- 5400 val perplexity 261.2596
166
- 5400 train 5.476205 (lr=2.5147e-05) (hash(x)=140385615)
167
- 5500 val loss 5.5596
168
- 5500 val perplexity 259.7308
169
- 5500 train 5.390671 (lr=2.4169e-05) (hash(x)=148498335)
170
- 5600 val loss 5.5589
171
- 5600 val perplexity 259.5334
172
- 5600 train 5.302035 (lr=2.3204e-05) (hash(x)=151907614)
173
- 5700 val loss 5.5491
174
- 5700 val perplexity 257.0125
175
- 5700 train 5.521362 (lr=2.2253e-05) (hash(x)=155192267)
176
- 5800 val loss 5.5356
177
- 5800 val perplexity 253.5545
178
- 5800 train 5.586134 (lr=2.1318e-05) (hash(x)=153132158)
179
- 5900 val loss 5.5336
180
- 5900 val perplexity 253.0408
181
- 5900 train 5.508742 (lr=2.0400e-05) (hash(x)=161446764)
182
- 6000 val loss 5.5270
183
- 6000 val perplexity 251.3763
184
- 6000 train 5.310840 (lr=1.9500e-05) (hash(x)=151512446)
185
- 6100 val loss 5.5166
186
- 6100 val perplexity 248.7991
187
- 6100 train 5.594731 (lr=1.8620e-05) (hash(x)=188094053)
188
- 6200 val loss 5.5064
189
- 6200 val perplexity 246.2617
190
- 6200 train 5.382390 (lr=1.7760e-05) (hash(x)=149389789)
191
- 6300 val loss 5.5008
192
- 6300 val perplexity 244.8927
193
- 6300 train 5.318002 (lr=1.6923e-05) (hash(x)=138212820)
194
- 6400 val loss 5.4985
195
- 6400 val perplexity 244.3206
196
- 6400 train 5.281260 (lr=1.6108e-05) (hash(x)=146535423)
197
- 6500 val loss 5.4905
198
- 6500 val perplexity 242.3754
199
- 6500 train 5.364082 (lr=1.5319e-05) (hash(x)=145950843)
200
- 6600 val loss 5.4853
201
- 6600 val perplexity 241.1134
202
- 6600 train 5.395124 (lr=1.4555e-05) (hash(x)=141162902)
203
- 6700 val loss 5.4824
204
- 6700 val perplexity 240.4124
205
- 6700 train 5.401274 (lr=1.3817e-05) (hash(x)=153018737)
206
- 6800 val loss 5.4738
207
- 6800 val perplexity 238.3549
208
- 6800 train 5.385234 (lr=1.3108e-05) (hash(x)=155640155)
209
- 6900 val loss 5.4705
210
- 6900 val perplexity 237.5830
211
- 6900 train 5.447811 (lr=1.2427e-05) (hash(x)=153722115)
212
- 7000 val loss 5.4645
213
- 7000 val perplexity 236.1570
214
- 7000 train 5.380439 (lr=1.1777e-05) (hash(x)=146953450)
215
- 7100 val loss 5.4627
216
- 7100 val perplexity 235.7310
217
- 7100 train 5.315409 (lr=1.1157e-05) (hash(x)=137663885)
218
- 7200 val loss 5.4541
219
- 7200 val perplexity 233.7043
220
- 7200 train 5.542728 (lr=1.0568e-05) (hash(x)=146172950)
221
- 7300 val loss 5.4505
222
- 7300 val perplexity 232.8709
223
- 7300 train 5.346293 (lr=1.0012e-05) (hash(x)=150018163)
224
- 7400 val loss 5.4469
225
- 7400 val perplexity 232.0418
226
- 7400 train 5.477500 (lr=9.4899e-06) (hash(x)=145351166)
227
- 7500 val loss 5.4470
228
- 7500 val perplexity 232.0578
229
- 7500 train 5.224993 (lr=9.0014e-06) (hash(x)=145292116)
230
- 7600 val loss 5.4397
231
- 7600 val perplexity 230.3639
232
- 7600 train 5.361894 (lr=8.5478e-06) (hash(x)=150235132)
233
- 7700 val loss 5.4359
234
- 7700 val perplexity 229.5044
235
- 7700 train 5.321193 (lr=8.1297e-06) (hash(x)=154543455)
236
- 7800 val loss 5.4347
237
- 7800 val perplexity 229.2233
238
- 7800 train 5.339501 (lr=7.7476e-06) (hash(x)=142456852)
239
- 7900 val loss 5.4337
240
- 7900 val perplexity 229.0003
241
- 7900 train 5.211949 (lr=7.4021e-06) (hash(x)=147363479)
242
- 8000 val loss 5.4269
243
- 8000 val perplexity 227.4454
244
- 8000 train 5.460956 (lr=7.0937e-06) (hash(x)=156122973)
245
- 8100 val loss 5.4243
246
- 8100 val perplexity 226.8484
247
- 8100 train 5.472881 (lr=6.8229e-06) (hash(x)=156153179)
248
- 8200 val loss 5.4227
249
- 8200 val perplexity 226.4853
250
- 8200 train 5.486504 (lr=6.5900e-06) (hash(x)=146430698)
251
- 8300 val loss 5.4212
252
- 8300 val perplexity 226.1495
253
- 8300 train 5.301962 (lr=6.3954e-06) (hash(x)=143507257)
254
- 8400 val loss 5.4153
255
- 8400 val perplexity 224.8256
256
- 8400 train 5.358385 (lr=6.2395e-06) (hash(x)=166272643)
257
- 8500 val loss 5.4145
258
- 8500 val perplexity 224.6475
259
- 8500 train 5.319081 (lr=6.1223e-06) (hash(x)=143887848)
260
- 8600 val loss 5.4134
261
- 8600 val perplexity 224.3872
262
- 8600 train 5.440458 (lr=6.0440e-06) (hash(x)=156900341)
263
- 8700 val loss 5.4085
264
- 8700 val perplexity 223.2952
265
- 8700 train 5.625781 (lr=6.0049e-06) (hash(x)=146417632)
266
- 8749 val loss 5.4078
267
- 8749 val perplexity 223.1394
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  max_steps: 8750
2
  0 val loss 11.2808
3
  0 val perplexity 79287.6797
4
+ 1000 val loss 6.8485
5
+ 1000 val perplexity 942.5023
6
+ 1000 train 6.714951 (lr=4.9593e-05) (hash(x)=147904298)
7
+ 0 train 11.289339 (lr=8.0000e-08) (hash(x)=150724848)
8
+ 1100 val loss 6.7624
9
+ 1100 val perplexity 864.6933
10
+ 1100 train 6.960951 (lr=4.9415e-05) (hash(x)=154343147)
11
+ 100 val loss 10.0778
12
+ 100 val perplexity 23808.3164
13
+ 100 train 10.091505 (lr=8.0800e-06) (hash(x)=149910534)
14
+ 1200 val loss 6.6617
15
+ 1200 val perplexity 781.8986
16
+ 1200 train 6.648478 (lr=4.9205e-05) (hash(x)=141843115)
17
+ 200 val loss 9.4757
18
+ 200 val perplexity 13038.8037
19
+ 200 train 9.504941 (lr=1.6080e-05) (hash(x)=148123706)
20
+ 1300 val loss 6.5919
21
+ 1300 val perplexity 729.1332
22
+ 1300 train 6.484201 (lr=4.8964e-05) (hash(x)=145279030)
23
+ 1400 val loss 6.5438
24
+ 1400 val perplexity 694.8960
25
+ 1400 train 6.487101 (lr=4.8691e-05) (hash(x)=152507639)
26
+ 300 val loss 8.3222
27
+ 300 val perplexity 4114.3809
28
+ 300 train 8.230344 (lr=2.4080e-05) (hash(x)=146678221)
29
+ 1500 val loss 6.4918
30
+ 1500 val perplexity 659.7304
31
+ 1500 train 6.522910 (lr=4.8388e-05) (hash(x)=148473774)
32
+ 400 val loss 7.6551
33
+ 400 val perplexity 2111.4595
34
+ 400 train 7.555830 (lr=3.2080e-05) (hash(x)=151700982)
35
+ 1600 val loss 6.4234
36
+ 1600 val perplexity 616.1007
37
+ 1600 train 6.467561 (lr=4.8055e-05) (hash(x)=151117002)
38
+ 500 val loss 7.4756
39
+ 500 val perplexity 1764.5110
40
+ 500 train 7.555832 (lr=4.0000e-05) (hash(x)=156182087)
41
+ 1700 val loss 6.3840
42
+ 1700 val perplexity 592.2697
43
+ 1700 train 6.331007 (lr=4.7691e-05) (hash(x)=138011335)
44
+ 600 val loss 7.3637
45
+ 600 val perplexity 1577.7322
46
+ 600 train 7.348773 (lr=3.9987e-05) (hash(x)=149318660)
47
+ 1800 val loss 6.3464
48
+ 1800 val perplexity 570.4446
49
+ 1800 train 6.484773 (lr=4.7299e-05) (hash(x)=171180926)
50
+ 700 val loss 7.2805
51
+ 700 val perplexity 1451.7614
52
+ 700 train 7.319925 (lr=3.9948e-05) (hash(x)=150482428)
53
+ 1900 val loss 6.3032
54
+ 1900 val perplexity 546.3349
55
+ 1900 train 6.237638 (lr=4.6878e-05) (hash(x)=141769419)
56
+ 800 val loss 7.1764
57
+ 800 val perplexity 1308.1818
58
+ 800 train 7.046847 (lr=3.9883e-05) (hash(x)=143268605)
59
+ 2000 val loss 6.2670
60
+ 2000 val perplexity 526.8969
61
+ 2000 train 6.216142 (lr=4.6428e-05) (hash(x)=151963443)
62
+ 900 val loss 7.0949
63
+ 900 val perplexity 1205.8468
64
+ 900 train 7.057036 (lr=3.9792e-05) (hash(x)=152322423)
65
+ 2100 val loss 6.2615
66
+ 2100 val perplexity 524.0168
67
+ 2100 train 6.314044 (lr=4.5951e-05) (hash(x)=162947470)
68
+ 2200 val loss 6.2071
69
+ 2200 val perplexity 496.2846
70
+ 2200 train 6.429786 (lr=4.5448e-05) (hash(x)=154954810)
71
+ 1000 val loss 7.0331
72
+ 1000 val perplexity 1133.5393
73
+ 1000 train 6.903413 (lr=3.9675e-05) (hash(x)=147904298)
74
+ 2300 val loss 6.1801
75
+ 2300 val perplexity 483.0446
76
+ 2300 train 6.156929 (lr=4.4918e-05) (hash(x)=151878111)
77
+ 1100 val loss 6.9628
78
+ 1100 val perplexity 1056.5919
79
+ 1100 train 7.155293 (lr=3.9532e-05) (hash(x)=154343147)
80
+ 2400 val loss 6.1673
81
+ 2400 val perplexity 476.9055
82
+ 2400 train 6.058043 (lr=4.4363e-05) (hash(x)=158661057)
83
+ 1200 val loss 6.8789
84
+ 1200 val perplexity 971.5381
85
+ 1200 train 6.864298 (lr=3.9364e-05) (hash(x)=141843115)
86
+ 2500 val loss 6.1249
87
+ 2500 val perplexity 457.0798
88
+ 2500 train 6.145768 (lr=4.3784e-05) (hash(x)=150925584)
89
+ 1300 val loss 6.8076
90
+ 1300 val perplexity 904.6672
91
+ 1300 train 6.690363 (lr=3.9171e-05) (hash(x)=145279030)
92
+ 2600 val loss 6.1045
93
+ 2600 val perplexity 447.8694
94
+ 2600 train 6.016246 (lr=4.3181e-05) (hash(x)=144515755)
95
+ 1400 val loss 6.7498
96
+ 1400 val perplexity 853.9032
97
+ 1400 train 6.700829 (lr=3.8953e-05) (hash(x)=152507639)
98
+ 2700 val loss 6.0887
99
+ 2700 val perplexity 440.8371
100
+ 2700 train 6.080718 (lr=4.2555e-05) (hash(x)=153109144)
101
+ 1500 val loss 6.6907
102
+ 1500 val perplexity 804.8594
103
+ 1500 train 6.725736 (lr=3.8711e-05) (hash(x)=148473774)
104
+ 2800 val loss 6.0568
105
+ 2800 val perplexity 427.0019
106
+ 2800 train 5.960973 (lr=4.1908e-05) (hash(x)=151152897)
107
+ 1600 val loss 6.6138
108
+ 1600 val perplexity 745.3262
109
+ 1600 train 6.649402 (lr=3.8444e-05) (hash(x)=151117002)
110
+ 2900 val loss 6.0378
111
+ 2900 val perplexity 418.9864
112
+ 2900 train 5.981798 (lr=4.1240e-05) (hash(x)=145800210)
113
+ 3000 val loss 6.0244
114
+ 3000 val perplexity 413.3761
115
+ 3000 train 5.830175 (lr=4.0551e-05) (hash(x)=141997485)
116
+ 1700 val loss 6.5669
117
+ 1700 val perplexity 711.1898
118
+ 1700 train 6.498591 (lr=3.8153e-05) (hash(x)=138011335)
119
+ 3100 val loss 5.9956
120
+ 3100 val perplexity 401.6418
121
+ 3100 train 5.917317 (lr=3.9844e-05) (hash(x)=154049740)
122
+ 1800 val loss 6.5198
123
+ 1800 val perplexity 678.4590
124
+ 1800 train 6.672019 (lr=3.7839e-05) (hash(x)=171180926)
125
+ 3200 val loss 5.9811
126
+ 3200 val perplexity 395.8601
127
+ 3200 train 5.912686 (lr=3.9119e-05) (hash(x)=150471842)
128
+ 1900 val loss 6.4677
129
+ 1900 val perplexity 644.0254
130
+ 1900 train 6.386144 (lr=3.7502e-05) (hash(x)=141769419)
131
+ 3300 val loss 5.9656
132
+ 3300 val perplexity 389.7935
133
+ 3300 train 5.912046 (lr=3.8377e-05) (hash(x)=149048126)
134
+ 2000 val loss 6.4305
135
+ 2000 val perplexity 620.4788
136
+ 2000 train 6.365761 (lr=3.7143e-05) (hash(x)=151963443)
137
+ 3400 val loss 5.9430
138
+ 3400 val perplexity 381.0898
139
+ 3400 train 6.063977 (lr=3.7619e-05) (hash(x)=161261339)
140
+ 2100 val loss 6.4172
141
+ 2100 val perplexity 612.2800
142
+ 2100 train 6.476425 (lr=3.6761e-05) (hash(x)=162947470)
143
+ 3500 val loss 5.9293
144
+ 3500 val perplexity 375.8853
145
+ 3500 train 5.890701 (lr=3.6847e-05) (hash(x)=157495564)
146
+ 2200 val loss 6.3587
147
+ 2200 val perplexity 577.4716
148
+ 2200 train 6.609720 (lr=3.6358e-05) (hash(x)=154954810)
149
+ 3600 val loss 5.9167
150
+ 3600 val perplexity 371.1994
151
+ 3600 train 5.823642 (lr=3.6061e-05) (hash(x)=144352932)
152
+ 3700 val loss 5.9019
153
+ 3700 val perplexity 365.7276
154
+ 3700 train 5.865877 (lr=3.5263e-05) (hash(x)=149389012)
155
+ 2300 val loss 6.3315
156
+ 2300 val perplexity 562.0033
157
+ 2300 train 6.319509 (lr=3.5935e-05) (hash(x)=151878111)
158
+ 3800 val loss 5.8795
159
+ 3800 val perplexity 357.6370
160
+ 3800 train 5.826324 (lr=3.4453e-05) (hash(x)=146607620)
161
+ 2400 val loss 6.3046
162
+ 2400 val perplexity 547.0805
163
+ 2400 train 6.197392 (lr=3.5491e-05) (hash(x)=158661057)
164
+ 3900 val loss 5.8674
165
+ 3900 val perplexity 353.3137
166
+ 3900 train 5.783121 (lr=3.3633e-05) (hash(x)=143754617)
167
+ 2500 val loss 6.2686
168
+ 2500 val perplexity 527.7335
169
+ 2500 train 6.292732 (lr=3.5027e-05) (hash(x)=150925584)
170
+ 4000 val loss 5.8476
171
+ 4000 val perplexity 346.4010
172
+ 4000 train 5.822481 (lr=3.2805e-05) (hash(x)=156930722)
173
+ 2600 val loss 6.2464
174
+ 2600 val perplexity 516.1537
175
+ 2600 train 6.152393 (lr=3.4545e-05) (hash(x)=144515755)
176
+ 4100 val loss 5.8330
177
+ 4100 val perplexity 341.3765
178
+ 4100 train 5.661245 (lr=3.1968e-05) (hash(x)=147500519)
179
+ 2700 val loss 6.2261
180
+ 2700 val perplexity 505.7803
181
+ 2700 train 6.210160 (lr=3.4044e-05) (hash(x)=153109144)
182
+ 4200 val loss 5.8236
183
+ 4200 val perplexity 338.1779
184
+ 4200 train 5.663101 (lr=3.1126e-05) (hash(x)=143232237)
185
+ 2800 val loss 6.1942
186
+ 2800 val perplexity 489.9097
187
+ 2800 train 6.096017 (lr=3.3526e-05) (hash(x)=151152897)
188
+ 4300 val loss 5.8086
189
+ 4300 val perplexity 333.1535
190
+ 4300 train 5.714221 (lr=3.0277e-05) (hash(x)=146811670)
191
+ 2900 val loss 6.1729
192
+ 2900 val perplexity 479.5571
193
+ 2900 train 6.113084 (lr=3.2992e-05) (hash(x)=145800210)
194
+ 4400 val loss 5.7936
195
+ 4400 val perplexity 328.2065
196
+ 4400 train 5.824045 (lr=2.9425e-05) (hash(x)=158418746)
197
+ 3000 val loss 6.1580
198
+ 3000 val perplexity 472.4610
199
+ 3000 train 5.950809 (lr=3.2441e-05) (hash(x)=141997485)
200
+ 4500 val loss 5.7892
201
+ 4500 val perplexity 326.7647
202
+ 4500 train 5.755840 (lr=2.8571e-05) (hash(x)=156695778)
203
+ 4600 val loss 5.7734
204
+ 4600 val perplexity 321.6264
205
+ 4600 train 5.791765 (lr=2.7714e-05) (hash(x)=147791497)
206
+ 3100 val loss 6.1296
207
+ 3100 val perplexity 459.2465
208
+ 3100 train 6.037282 (lr=3.1875e-05) (hash(x)=154049740)
209
+ 4700 val loss 5.7574
210
+ 4700 val perplexity 316.5383
211
+ 4700 train 5.646436 (lr=2.6857e-05) (hash(x)=155533088)
212
+ 3200 val loss 6.1167
213
+ 3200 val perplexity 453.3483
214
+ 3200 train 6.055669 (lr=3.1295e-05) (hash(x)=150471842)
215
+ 4800 val loss 5.7500
216
+ 4800 val perplexity 314.1985
217
+ 4800 train 5.606719 (lr=2.6002e-05) (hash(x)=138350044)
218
+ 3300 val loss 6.1048
219
+ 3300 val perplexity 447.9950
220
+ 3300 train 6.045102 (lr=3.0702e-05) (hash(x)=149048126)
221
+ 4900 val loss 5.7358
222
+ 4900 val perplexity 309.7656
223
+ 4900 train 5.729972 (lr=2.5148e-05) (hash(x)=143735284)
224
+ 3400 val loss 6.0825
225
+ 3400 val perplexity 438.1187
226
+ 3400 train 6.207899 (lr=3.0095e-05) (hash(x)=161261339)
227
+ 5000 val loss 5.7237
228
+ 5000 val perplexity 306.0336
229
+ 5000 train 5.682995 (lr=2.4298e-05) (hash(x)=154976463)
230
+ 3500 val loss 6.0666
231
+ 3500 val perplexity 431.2192
232
+ 3500 train 6.035659 (lr=2.9477e-05) (hash(x)=157495564)
233
+ 5100 val loss 5.7224
234
+ 5100 val perplexity 305.6486
235
+ 5100 train 5.606878 (lr=2.3452e-05) (hash(x)=149894982)
236
+ 3600 val loss 6.0552
237
+ 3600 val perplexity 426.3279
238
+ 3600 train 5.949899 (lr=2.8849e-05) (hash(x)=144352932)
239
+ 5200 val loss 5.7127
240
+ 5200 val perplexity 302.6869
241
+ 5200 train 5.487281 (lr=2.2613e-05) (hash(x)=159326689)
242
+ 3700 val loss 6.0350
243
+ 3700 val perplexity 417.7983
244
+ 5300 val loss 5.7008
245
+ 5300 val perplexity 299.1067
246
+ 3700 train 6.012226 (lr=2.8210e-05) (hash(x)=149389012)
247
+ 5300 train 5.708265 (lr=2.1780e-05) (hash(x)=159484800)
248
+ 5400 val loss 5.6879
249
+ 5400 val perplexity 295.2607
250
+ 5400 train 5.592360 (lr=2.0956e-05) (hash(x)=140385615)
251
+ 3800 val loss 6.0172
252
+ 3800 val perplexity 410.4391
253
+ 3800 train 5.954288 (lr=2.7562e-05) (hash(x)=146607620)
254
+ 5500 val loss 5.6801
255
+ 5500 val perplexity 292.9758
256
+ 5500 train 5.510561 (lr=2.0141e-05) (hash(x)=148498335)
257
+ 3900 val loss 6.0051
258
+ 3900 val perplexity 405.5009
259
+ 3900 train 5.917937 (lr=2.6907e-05) (hash(x)=143754617)
260
+ 5600 val loss 5.6781
261
+ 5600 val perplexity 292.3898
262
+ 5600 train 5.416596 (lr=1.9337e-05) (hash(x)=151907614)
263
+ 4000 val loss 5.9909
264
+ 4000 val perplexity 399.7809
265
+ 4000 train 5.969647 (lr=2.6244e-05) (hash(x)=156930722)
266
+ 5700 val loss 5.6678
267
+ 5700 val perplexity 289.3985
268
+ 5700 train 5.640378 (lr=1.8545e-05) (hash(x)=155192267)
269
+ 4100 val loss 5.9793
270
+ 4100 val perplexity 395.1578
271
+ 4100 train 5.796338 (lr=2.5575e-05) (hash(x)=147500519)
272
+ 5800 val loss 5.6571
273
+ 5800 val perplexity 286.3109
274
+ 5800 train 5.709635 (lr=1.7765e-05) (hash(x)=153132158)
275
+ 4200 val loss 5.9705
276
+ 4200 val perplexity 391.6935
277
+ 4200 train 5.811824 (lr=2.4900e-05) (hash(x)=143232237)
278
+ 5900 val loss 5.6528
279
+ 5900 val perplexity 285.0764
280
+ 5900 train 5.620965 (lr=1.7000e-05) (hash(x)=161446764)
281
+ 4300 val loss 5.9548
282
+ 4300 val perplexity 385.6090
283
+ 4300 train 5.864467 (lr=2.4222e-05) (hash(x)=146811670)
284
+ 6000 val loss 5.6491
285
+ 6000 val perplexity 284.0327
286
+ 6000 train 5.444342 (lr=1.6250e-05) (hash(x)=151512446)
287
+ 4400 val loss 5.9395
288
+ 4400 val perplexity 379.7272
289
+ 4400 train 5.971974 (lr=2.3540e-05) (hash(x)=158418746)
290
+ 6100 val loss 5.6369
291
+ 6100 val perplexity 280.5792
292
+ 6100 train 5.744586 (lr=1.5516e-05) (hash(x)=188094053)
293
+ 6200 val loss 5.6278
294
+ 6200 val perplexity 278.0362
295
+ 6200 train 5.510383 (lr=1.4800e-05) (hash(x)=149389789)
296
+ 4500 val loss 5.9340
297
+ 4500 val perplexity 377.6807
298
+ 4500 train 5.896394 (lr=2.2856e-05) (hash(x)=156695778)
299
+ 6300 val loss 5.6223
300
+ 6300 val perplexity 276.5230
301
+ 6300 train 5.444717 (lr=1.4102e-05) (hash(x)=138212820)
302
+ 4600 val loss 5.9184
303
+ 4600 val perplexity 371.8292
304
+ 4600 train 5.946771 (lr=2.2171e-05) (hash(x)=147791497)
305
+ 6400 val loss 5.6209
306
+ 6400 val perplexity 276.1318
307
+ 6400 train 5.406941 (lr=1.3424e-05) (hash(x)=146535423)
308
+ 4700 val loss 5.9067
309
+ 4700 val perplexity 367.4766
310
+ 4700 train 5.782763 (lr=2.1486e-05) (hash(x)=155533088)
311
+ 6500 val loss 5.6110
312
+ 6500 val perplexity 273.4212
313
+ 6500 train 5.487208 (lr=1.2766e-05) (hash(x)=145950843)
314
+ 4800 val loss 5.9002
315
+ 4800 val perplexity 365.1052
316
+ 4800 train 5.750734 (lr=2.0801e-05) (hash(x)=138350044)
317
+ 6600 val loss 5.6068
318
+ 6600 val perplexity 272.2652
319
+ 6600 train 5.508886 (lr=1.2129e-05) (hash(x)=141162902)
320
+ 4900 val loss 5.8856
321
+ 4900 val perplexity 359.8099
322
+ 4900 train 5.886899 (lr=2.0118e-05) (hash(x)=143735284)
323
+ 6700 val loss 5.6044
324
+ 6700 val perplexity 271.6056
325
+ 6700 train 5.520892 (lr=1.1515e-05) (hash(x)=153018737)
326
+ 5000 val loss 5.8735
327
+ 5000 val perplexity 355.4951
328
+ 5000 train 5.836714 (lr=1.9438e-05) (hash(x)=154976463)
329
+ 6800 val loss 5.5954
330
+ 6800 val perplexity 269.1932
331
+ 6800 train 5.514775 (lr=1.0923e-05) (hash(x)=155640155)
332
+ 5100 val loss 5.8707
333
+ 5100 val perplexity 354.4862
334
+ 5100 train 5.748594 (lr=1.8762e-05) (hash(x)=149894982)
335
+ 6900 val loss 5.5929
336
+ 6900 val perplexity 268.5220
337
+ 6900 train 5.559921 (lr=1.0356e-05) (hash(x)=153722115)
338
+ 5200 val loss 5.8617
339
+ 5200 val perplexity 351.3219
340
+ 5200 train 5.652061 (lr=1.8090e-05) (hash(x)=159326689)
341
+ 7000 val loss 5.5867
342
+ 7000 val perplexity 266.8477
343
+ 7000 train 5.499893 (lr=9.8138e-06) (hash(x)=146953450)
344
+ 7100 val loss 5.5882
345
+ 7100 val perplexity 267.2558
346
+ 7100 train 5.430435 (lr=9.2971e-06) (hash(x)=137663885)
347
+ 5300 val loss 5.8517
348
+ 5300 val perplexity 347.8377
349
+ 5300 train 5.855162 (lr=1.7424e-05) (hash(x)=159484800)
350
+ 7200 val loss 5.5763
351
+ 7200 val perplexity 264.1055
352
+ 7200 train 5.650030 (lr=8.8068e-06) (hash(x)=146172950)
353
+ 5400 val loss 5.8359
354
+ 5400 val perplexity 342.3702
355
+ 5400 train 5.735655 (lr=1.6765e-05) (hash(x)=140385615)
356
+ 7300 val loss 5.5728
357
+ 7300 val perplexity 263.1799
358
+ 7300 train 5.464963 (lr=8.3436e-06) (hash(x)=150018163)
359
+ 5500 val loss 5.8327
360
+ 5500 val perplexity 341.2769
361
+ 5500 train 5.665972 (lr=1.6113e-05) (hash(x)=148498335)
362
+ 7400 val loss 5.5708
363
+ 7400 val perplexity 262.6541
364
+ 7400 train 5.579797 (lr=7.9082e-06) (hash(x)=145351166)
365
+ 5600 val loss 5.8304
366
+ 5600 val perplexity 340.4807
367
+ 5600 train 5.565631 (lr=1.5469e-05) (hash(x)=151907614)
368
+ 7500 val loss 5.5705
369
+ 7500 val perplexity 262.5565
370
+ 7500 train 5.350385 (lr=7.5012e-06) (hash(x)=145292116)
371
+ 5700 val loss 5.8211
372
+ 5700 val perplexity 337.3530
373
+ 5700 train 5.784693 (lr=1.4836e-05) (hash(x)=155192267)
374
+ 7600 val loss 5.5629
375
+ 7600 val perplexity 260.5808
376
+ 7600 train 5.482468 (lr=7.1232e-06) (hash(x)=150235132)
377
+ 5800 val loss 5.8096
378
+ 5800 val perplexity 333.4767
379
+ 5800 train 5.863295 (lr=1.4212e-05) (hash(x)=153132158)
380
+ 7700 val loss 5.5596
381
+ 7700 val perplexity 259.7070
382
+ 7700 train 5.454187 (lr=6.7747e-06) (hash(x)=154543455)
383
+ 5900 val loss 5.8050
384
+ 5900 val perplexity 331.9670
385
+ 5900 train 5.777120 (lr=1.3600e-05) (hash(x)=161446764)
386
+ 7800 val loss 5.5585
387
+ 7800 val perplexity 259.4284
388
+ 7800 train 5.460950 (lr=6.4563e-06) (hash(x)=142456852)
389
+ 6000 val loss 5.8010
390
+ 6000 val perplexity 330.6328
391
+ 6000 train 5.614186 (lr=1.3000e-05) (hash(x)=151512446)
392
+ 7900 val loss 5.5591
393
+ 7900 val perplexity 259.5926
394
+ 7900 train 5.338410 (lr=6.1684e-06) (hash(x)=147363479)
395
+ 8000 val loss 5.5506
396
+ 8000 val perplexity 257.3806
397
+ 8000 train 5.585436 (lr=5.9114e-06) (hash(x)=156122973)
398
+ 6100 val loss 5.7907
399
+ 6100 val perplexity 327.2283
400
+ 6100 train 5.928180 (lr=1.2413e-05) (hash(x)=188094053)
401
+ 8100 val loss 5.5496
402
+ 8100 val perplexity 257.1343
403
+ 8100 train 5.594352 (lr=5.6857e-06) (hash(x)=156153179)
404
+ 6200 val loss 5.7821
405
+ 6200 val perplexity 324.4371
406
+ 6200 train 5.670990 (lr=1.1840e-05) (hash(x)=149389789)
407
+ 8200 val loss 5.5474
408
+ 8200 val perplexity 256.5680
409
+ 8200 train 5.598216 (lr=5.4917e-06) (hash(x)=146430698)
410
+ 6300 val loss 5.7771
411
+ 6300 val perplexity 322.8166
412
+ 6300 train 5.596047 (lr=1.1282e-05) (hash(x)=138212820)
413
+ 8300 val loss 5.5461
414
+ 8300 val perplexity 256.2249
415
+ 8300 train 5.414905 (lr=5.3295e-06) (hash(x)=143507257)
416
+ 6400 val loss 5.7752
417
+ 6400 val perplexity 322.2122
418
+ 6400 train 5.569002 (lr=1.0739e-05) (hash(x)=146535423)
419
+ 8400 val loss 5.5406
420
+ 8400 val perplexity 254.8408
421
+ 8400 train 5.536300 (lr=5.1995e-06) (hash(x)=166272643)
422
+ 6500 val loss 5.7667
423
+ 6500 val perplexity 319.4798
424
+ 6500 train 5.635542 (lr=1.0213e-05) (hash(x)=145950843)
425
+ 8500 val loss 5.5396
426
+ 8500 val perplexity 254.5714
427
+ 8500 train 5.443057 (lr=5.1019e-06) (hash(x)=143887848)
428
+ 6600 val loss 5.7620
429
+ 6600 val perplexity 317.9725
430
+ 6600 train 5.657029 (lr=9.7032e-06) (hash(x)=141162902)
431
+ 8600 val loss 5.5384
432
+ 8600 val perplexity 254.2664
433
+ 8600 train 5.575128 (lr=5.0367e-06) (hash(x)=156900341)
434
+ 6700 val loss 5.7599
435
+ 6700 val perplexity 317.3207
436
+ 6700 train 5.672526 (lr=9.2116e-06) (hash(x)=153018737)
437
+ 8700 val loss 5.5339
438
+ 8700 val perplexity 253.1181
439
+ 8700 train 5.756435 (lr=5.0041e-06) (hash(x)=146417632)
440
+ 8749 val loss 5.5336
441
+ 8749 val perplexity 253.0482
attention_kindselective_n_heads4_seed1339/model_08749.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:670863be01d7591c43c5c574c64109eb1a0e950c101c5d0b4496599d38dd0bd9
3
  size 92843394
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60ed2548fcfa7fffe8d4b22ec9f8b7995758e15c929311acbe1a883e5373605f
3
  size 92843394
attention_kindselective_n_heads4_seed1339/optimizer_08749.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:92d1b73e300d1a0662b1eec7fad7b3088041dc4bda11603e2d39b3a02feba96e
3
  size 179406214
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f65d1d60081c55216179bc3356d012509461de93333d3ddfdd692c3fe5f2144
3
  size 179406214