andrew-healey committed
Commit 245cd66 · verified · 1 Parent(s): 48b4f15

Upload folder using huggingface_hub
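The commit message indicates the folder was pushed with huggingface_hub's folder-upload API. A minimal sketch of a call that would produce a commit like this one, using a placeholder repo id (the target repo is not named in the commit itself):

from huggingface_hub import upload_folder

RUN_DIR = "lr5.5e-5_total_batch_size61440_two_masks_4_heads_seed1338"

upload_folder(
    repo_id="your-username/your-repo",   # placeholder; the actual repo id is not shown here
    folder_path=RUN_DIR,                 # local run directory with args.json, logs, checkpoints
    path_in_repo=RUN_DIR,                # keep the same layout in the repo, as in this commit
    commit_message="Upload folder using huggingface_hub",
    repo_type="model",
)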

lr5.5e-5_total_batch_size61440_two_masks_4_heads_seed1338/args.json ADDED
@@ -0,0 +1 @@
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_10/lr5.5e-5_total_batch_size61440_two_masks_4_heads_seed1338", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_10", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1338, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "two_masks", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 5.5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "5.5e-5_61440_1338", "n_embd": 256}
lr5.5e-5_total_batch_size61440_two_masks_4_heads_seed1338/dataloader_08749.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:953385078aa3787b69fc6857dfd48b0a2cd2f4d27c6f8892e01211aca53d07f5
+ size 964
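The .pt entries in this commit are stored through Git LFS, so the diff shows only a three-line pointer file: the pointer spec version, the payload's SHA-256 (oid), and its size in bytes; the payload itself lives in LFS storage. Once the real file has been downloaded, its hash can be checked against the pointer. A sketch, assuming the payload has been saved locally under the same filename:

import hashlib

def sha256_of(path: str) -> str:
    # Hash in 1 MiB chunks so large checkpoints need not fit in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

digest = sha256_of("dataloader_08749.pt")  # path to the downloaded payload, not the pointer file
assert digest == "953385078aa3787b69fc6857dfd48b0a2cd2f4d27c6f8892e01211aca53d07f5"
print(digest)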
lr5.5e-5_total_batch_size61440_two_masks_4_heads_seed1338/log2.txt ADDED
@@ -0,0 +1,267 @@
+ max_steps: 8750
+ 0 val loss 11.3155
+ 0 val perplexity 82083.8438
+ 0 train 11.313581 (lr=1.1000e-07) (hash(x)=150327452)
+ 100 val loss 9.9128
+ 100 val perplexity 20187.4805
+ 100 train 9.943237 (lr=1.1110e-05) (hash(x)=166780046)
+ 200 val loss 9.2663
+ 200 val perplexity 10575.0576
+ 200 train 9.231794 (lr=2.2110e-05) (hash(x)=155040610)
+ 300 val loss 8.1565
+ 300 val perplexity 3485.8176
+ 300 train 8.171751 (lr=3.3110e-05) (hash(x)=155504036)
+ 400 val loss 7.6849
+ 400 val perplexity 2175.2903
+ 400 train 7.557720 (lr=4.4110e-05) (hash(x)=143823248)
+ 500 val loss 7.5077
+ 500 val perplexity 1821.9685
+ 500 train 7.480737 (lr=5.5000e-05) (hash(x)=143734685)
+ 600 val loss 7.4258
+ 600 val perplexity 1678.8130
+ 600 train 7.210340 (lr=5.4982e-05) (hash(x)=150678249)
+ 700 val loss 7.3397
+ 700 val perplexity 1540.1981
+ 700 train 7.492605 (lr=5.4928e-05) (hash(x)=175802021)
+ 800 val loss 7.2282
+ 800 val perplexity 1377.8000
+ 800 train 7.236165 (lr=5.4839e-05) (hash(x)=158681215)
+ 900 val loss 7.1335
+ 900 val perplexity 1253.2415
+ 900 train 7.028633 (lr=5.4713e-05) (hash(x)=146108145)
+ 1000 val loss 7.0430
+ 1000 val perplexity 1144.8357
+ 1000 train 6.876802 (lr=5.4553e-05) (hash(x)=154996086)
+ 1100 val loss 6.9323
+ 1100 val perplexity 1024.8070
+ 1100 train 6.923301 (lr=5.4357e-05) (hash(x)=153885445)
+ 1200 val loss 6.8277
+ 1200 val perplexity 923.0887
+ 1200 train 6.778232 (lr=5.4126e-05) (hash(x)=142353087)
+ 1300 val loss 6.7412
+ 1300 val perplexity 846.5897
+ 1300 train 6.648426 (lr=5.3860e-05) (hash(x)=150750353)
+ 1400 val loss 6.6735
+ 1400 val perplexity 791.1674
+ 1400 train 6.484898 (lr=5.3561e-05) (hash(x)=152767913)
+ 1500 val loss 6.5877
+ 1500 val perplexity 726.0728
+ 1500 train 6.641434 (lr=5.3227e-05) (hash(x)=151562048)
+ 1600 val loss 6.5119
+ 1600 val perplexity 673.1092
+ 1600 train 6.518369 (lr=5.2860e-05) (hash(x)=166486165)
+ 1700 val loss 6.4746
+ 1700 val perplexity 648.4517
+ 1700 train 6.117424 (lr=5.2461e-05) (hash(x)=130835396)
+ 1800 val loss 6.4193
+ 1800 val perplexity 613.5454
+ 1800 train 6.476211 (lr=5.2029e-05) (hash(x)=158851816)
+ 1900 val loss 6.3907
+ 1900 val perplexity 596.2837
+ 1900 train 6.418846 (lr=5.1565e-05) (hash(x)=153313879)
+ 2000 val loss 6.3350
+ 2000 val perplexity 563.9431
+ 2000 train 6.263374 (lr=5.1071e-05) (hash(x)=158245023)
+ 2100 val loss 6.2939
+ 2100 val perplexity 541.2620
+ 2100 train 6.122134 (lr=5.0547e-05) (hash(x)=157204896)
+ 2200 val loss 6.2733
+ 2200 val perplexity 530.2380
+ 2200 train 6.114381 (lr=4.9993e-05) (hash(x)=137541932)
+ 2300 val loss 6.2499
+ 2300 val perplexity 517.9681
+ 2300 train 6.202806 (lr=4.9410e-05) (hash(x)=150149692)
+ 2400 val loss 6.2105
+ 2400 val perplexity 497.9619
+ 2400 train 6.195385 (lr=4.8800e-05) (hash(x)=151730720)
+ 2500 val loss 6.1812
+ 2500 val perplexity 483.5811
+ 2500 train 5.974617 (lr=4.8162e-05) (hash(x)=143406752)
+ 2600 val loss 6.1656
+ 2600 val perplexity 476.0889
+ 2600 train 6.002447 (lr=4.7499e-05) (hash(x)=157272496)
+ 2700 val loss 6.1443
+ 2700 val perplexity 466.0529
+ 2700 train 6.119094 (lr=4.6811e-05) (hash(x)=155342327)
+ 2800 val loss 6.1101
+ 2800 val perplexity 450.3995
+ 2800 train 5.991596 (lr=4.6099e-05) (hash(x)=140626679)
+ 2900 val loss 6.0887
+ 2900 val perplexity 440.8409
+ 2900 train 5.951784 (lr=4.5364e-05) (hash(x)=144953350)
+ 3000 val loss 6.0731
+ 3000 val perplexity 434.0351
+ 3000 train 5.966191 (lr=4.4606e-05) (hash(x)=172449837)
+ 3100 val loss 6.0628
+ 3100 val perplexity 429.5563
+ 3100 train 5.846348 (lr=4.3828e-05) (hash(x)=141710086)
+ 3200 val loss 6.0319
+ 3200 val perplexity 416.5006
+ 3200 train 5.970057 (lr=4.3031e-05) (hash(x)=151299772)
+ 3300 val loss 6.0187
+ 3300 val perplexity 411.0641
+ 3300 train 5.889454 (lr=4.2215e-05) (hash(x)=146473110)
+ 3400 val loss 5.9937
+ 3400 val perplexity 400.9143
+ 3400 train 6.079772 (lr=4.1381e-05) (hash(x)=153954157)
+ 3500 val loss 5.9736
+ 3500 val perplexity 392.9279
+ 3500 train 5.919667 (lr=4.0532e-05) (hash(x)=153717336)
+ 3600 val loss 5.9564
+ 3600 val perplexity 386.2058
+ 3600 train 5.711346 (lr=3.9667e-05) (hash(x)=144965161)
+ 3700 val loss 5.9486
+ 3700 val perplexity 383.2021
+ 3700 train 5.729526 (lr=3.8789e-05) (hash(x)=125969741)
+ 3800 val loss 5.9299
+ 3800 val perplexity 376.1209
+ 3800 train 5.796446 (lr=3.7898e-05) (hash(x)=155070487)
+ 3900 val loss 5.9091
+ 3900 val perplexity 368.3831
+ 3900 train 5.779882 (lr=3.6996e-05) (hash(x)=149444644)
+ 4000 val loss 5.8943
+ 4000 val perplexity 362.9653
+ 4000 train 5.750712 (lr=3.6085e-05) (hash(x)=151663033)
+ 4100 val loss 5.8838
+ 4100 val perplexity 359.1650
+ 4100 train 5.844155 (lr=3.5165e-05) (hash(x)=143688282)
+ 4200 val loss 5.8648
+ 4200 val perplexity 352.4201
+ 4200 train 5.866295 (lr=3.4238e-05) (hash(x)=163361651)
+ 4300 val loss 5.8471
+ 4300 val perplexity 346.2207
+ 4300 train 5.879082 (lr=3.3305e-05) (hash(x)=153619361)
+ 4400 val loss 5.8360
+ 4400 val perplexity 342.4117
+ 4400 train 6.163618 (lr=3.2368e-05) (hash(x)=168527064)
+ 4500 val loss 5.8258
+ 4500 val perplexity 338.9414
+ 4500 train 5.661376 (lr=3.1428e-05) (hash(x)=125588037)
+ 4600 val loss 5.8062
+ 4600 val perplexity 332.3514
+ 4600 train 5.764658 (lr=3.0486e-05) (hash(x)=143710941)
+ 4700 val loss 5.7933
+ 4700 val perplexity 328.0804
+ 4700 train 5.635021 (lr=2.9543e-05) (hash(x)=150952742)
+ 4800 val loss 5.7829
+ 4800 val perplexity 324.6836
+ 4800 train 5.671600 (lr=2.8602e-05) (hash(x)=145323659)
+ 4900 val loss 5.7763
+ 4900 val perplexity 322.5778
+ 4900 train 5.801414 (lr=2.7663e-05) (hash(x)=153151397)
+ 5000 val loss 5.7645
+ 5000 val perplexity 318.7687
+ 5000 train 5.678666 (lr=2.6728e-05) (hash(x)=143182059)
+ 5100 val loss 5.7535
+ 5100 val perplexity 315.2778
+ 5100 train 5.775939 (lr=2.5798e-05) (hash(x)=170083586)
+ 5200 val loss 5.7402
+ 5200 val perplexity 311.1232
+ 5200 train 5.620294 (lr=2.4874e-05) (hash(x)=149363919)
+ 5300 val loss 5.7335
+ 5300 val perplexity 309.0508
+ 5300 train 5.614397 (lr=2.3958e-05) (hash(x)=152033784)
+ 5400 val loss 5.7199
+ 5400 val perplexity 304.8666
+ 5400 train 5.843496 (lr=2.3051e-05) (hash(x)=154614289)
+ 5500 val loss 5.7092
+ 5500 val perplexity 301.6268
+ 5500 train 5.813334 (lr=2.2155e-05) (hash(x)=157745174)
+ 5600 val loss 5.7030
+ 5600 val perplexity 299.7621
+ 5600 train 5.604224 (lr=2.1271e-05) (hash(x)=147693222)
+ 5700 val loss 5.6927
+ 5700 val perplexity 296.6961
+ 5700 train 5.626872 (lr=2.0399e-05) (hash(x)=149784627)
+ 5800 val loss 5.6928
+ 5800 val perplexity 296.7305
+ 5800 train 5.625709 (lr=1.9542e-05) (hash(x)=158620729)
+ 5900 val loss 5.6774
+ 5900 val perplexity 292.1962
+ 5900 train 5.628016 (lr=1.8700e-05) (hash(x)=159763910)
+ 6000 val loss 5.6683
+ 6000 val perplexity 289.5501
+ 6000 train 5.601182 (lr=1.7875e-05) (hash(x)=147640561)
+ 6100 val loss 5.6617
+ 6100 val perplexity 287.6270
+ 6100 train 5.651609 (lr=1.7068e-05) (hash(x)=156613394)
+ 6200 val loss 5.6572
+ 6200 val perplexity 286.3442
+ 6200 train 5.583783 (lr=1.6280e-05) (hash(x)=186221290)
+ 6300 val loss 5.6450
+ 6300 val perplexity 282.8679
+ 6300 train 5.524905 (lr=1.5512e-05) (hash(x)=152081419)
+ 6400 val loss 5.6425
+ 6400 val perplexity 282.1548
+ 6400 train 5.596219 (lr=1.4766e-05) (hash(x)=154808349)
+ 6500 val loss 5.6376
+ 6500 val perplexity 280.8000
+ 6500 train 5.603078 (lr=1.4042e-05) (hash(x)=159437208)
+ 6600 val loss 5.6334
+ 6600 val perplexity 279.6216
+ 6600 train 5.452362 (lr=1.3342e-05) (hash(x)=157933074)
+ 6700 val loss 5.6265
+ 6700 val perplexity 277.6952
+ 6700 train 5.604262 (lr=1.2666e-05) (hash(x)=161560240)
+ 6800 val loss 5.6225
+ 6800 val perplexity 276.5717
+ 6800 train 5.598015 (lr=1.2016e-05) (hash(x)=155424292)
+ 6900 val loss 5.6213
+ 6900 val perplexity 276.2486
+ 6900 train 5.624365 (lr=1.1392e-05) (hash(x)=148561470)
+ 7000 val loss 5.6141
+ 7000 val perplexity 274.2659
+ 7000 train 5.514448 (lr=1.0795e-05) (hash(x)=141527450)
+ 7100 val loss 5.6083
+ 7100 val perplexity 272.6889
+ 7100 train 5.558328 (lr=1.0227e-05) (hash(x)=151066339)
+ 7200 val loss 5.6086
+ 7200 val perplexity 272.7735
+ 7200 train 5.406475 (lr=9.6875e-06) (hash(x)=155231264)
+ 7300 val loss 5.5995
+ 7300 val perplexity 270.2829
+ 7300 train 5.626328 (lr=9.1780e-06) (hash(x)=150281149)
+ 7400 val loss 5.5969
+ 7400 val perplexity 269.5853
+ 7400 train 5.487207 (lr=8.6990e-06) (hash(x)=148421717)
+ 7500 val loss 5.5960
+ 7500 val perplexity 269.3396
+ 7500 train 5.359210 (lr=8.2513e-06) (hash(x)=146921118)
+ 7600 val loss 5.5897
+ 7600 val perplexity 267.6464
+ 7600 train 5.632684 (lr=7.8355e-06) (hash(x)=150660048)
+ 7700 val loss 5.5878
+ 7700 val perplexity 267.1529
+ 7700 train 5.470635 (lr=7.4522e-06) (hash(x)=148059852)
+ 7800 val loss 5.5863
+ 7800 val perplexity 266.7471
+ 7800 train 5.278240 (lr=7.1019e-06) (hash(x)=148331002)
+ 7900 val loss 5.5804
+ 7900 val perplexity 265.1806
+ 7900 train 5.522924 (lr=6.7852e-06) (hash(x)=164923883)
+ 8000 val loss 5.5783
+ 8000 val perplexity 264.6336
+ 8000 train 5.464546 (lr=6.5025e-06) (hash(x)=143545384)
+ 8100 val loss 5.5780
+ 8100 val perplexity 264.5465
+ 8100 train 5.397294 (lr=6.2543e-06) (hash(x)=160686959)
+ 8200 val loss 5.5748
+ 8200 val perplexity 263.7010
+ 8200 train 5.522984 (lr=6.0408e-06) (hash(x)=156501889)
+ 8300 val loss 5.5727
+ 8300 val perplexity 263.1430
+ 8300 train 5.416498 (lr=5.8625e-06) (hash(x)=142716875)
+ 8400 val loss 5.5677
+ 8400 val perplexity 261.8307
+ 8400 train 5.489812 (lr=5.7195e-06) (hash(x)=154436684)
+ 8500 val loss 5.5661
+ 8500 val perplexity 261.4184
+ 8500 train 5.648222 (lr=5.6121e-06) (hash(x)=147965839)
+ 8600 val loss 5.5657
+ 8600 val perplexity 261.3042
+ 8600 train 5.360355 (lr=5.5404e-06) (hash(x)=145228097)
+ 8700 val loss 5.5602
+ 8700 val perplexity 259.8835
+ 8700 train 5.757755 (lr=5.5045e-06) (hash(x)=152910357)
+ 8749 val loss 5.5604
+ 8749 val perplexity 259.9335
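Two consistency checks on the log above: the reported perplexity is simply exp(val loss), e.g. exp(5.5604) ≈ 259.93 at the final step, and the logged learning rates match a nanoGPT-style schedule of linear warmup over warmup_steps followed by cosine decay to a floor of 0.1 * max_lr. The schedule is inferred from the logged values rather than from the (not included) training script; a sketch that reproduces them:

import math

MAX_LR = 5.5e-05        # max_lr from args.json
WARMUP = 500            # warmup_steps from args.json
MAX_STEPS = 8750        # max_steps from args.json
MIN_LR = 0.1 * MAX_LR   # assumption: decay floor at 10% of max_lr

def lr_at(step: int) -> float:
    # Linear warmup, then cosine decay from MAX_LR down to MIN_LR.
    if step < WARMUP:
        return MAX_LR * (step + 1) / WARMUP
    progress = (step - WARMUP) / (MAX_STEPS - WARMUP)
    coeff = 0.5 * (1.0 + math.cos(math.pi * progress))
    return MIN_LR + coeff * (MAX_LR - MIN_LR)

print(f"{lr_at(0):.4e}")     # 1.1000e-07, matches the step-0 line
print(f"{lr_at(600):.4e}")   # 5.4982e-05, matches the step-600 line
print(f"{lr_at(8700):.4e}")  # 5.5045e-06, matches the step-8700 line

print(math.exp(5.5604))      # ~259.93, the final logged val perplexity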
lr5.5e-5_total_batch_size61440_two_masks_4_heads_seed1338/model_08749.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f8d4e9621f4fc185b2befb751d7a28f6879b8d963ac35a7f64cc39062a23e308
+ size 97580418
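model_08749.pt is the final checkpoint (step 8749 of 8750, zero-indexed; roughly 97.6 MB). A sketch of fetching it through the Hub and opening it on CPU; the repo id is a placeholder, and the structure of the saved object (a bare state_dict vs. a wrapper dict) is an assumption to check after loading:

import torch
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="your-username/your-repo",  # placeholder; not named in this commit
    filename="lr5.5e-5_total_batch_size61440_two_masks_4_heads_seed1338/model_08749.pt",
)

# Inspect what was saved before wiring it into a model class.
# On recent PyTorch you may need weights_only=False if custom objects were pickled.
checkpoint = torch.load(path, map_location="cpu")
if isinstance(checkpoint, dict):
    print(list(checkpoint.keys())[:10])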
lr5.5e-5_total_batch_size61440_two_masks_4_heads_seed1338/optimizer_08749.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cad669547207480940d6bacfef3dc2a5692c0635868b8d4c50d9cfa56378cc67
+ size 188880262
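optimizer_08749.pt is the matching optimizer state for resumed runs (the config exposes resume_checkpoint and resume_optimizer for this). The real model class is not part of this commit, so the sketch below only shows the generic save/restore pattern for an optimizer state dict, with a stand-in module:

import torch

# Stand-in model and optimizer; the actual run trained a 4-head selective-attention
# model with n_embd=256, whose code is not included here.
model = torch.nn.Linear(256, 256)
optimizer = torch.optim.AdamW(model.parameters(), lr=5.5e-05)

# How a file like optimizer_08749.pt is typically written...
torch.save(optimizer.state_dict(), "optimizer_ckpt.pt")

# ...and how a resumed run restores it (requires the same parameter layout).
optimizer.load_state_dict(torch.load("optimizer_ckpt.pt", map_location="cpu"))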