andrew-healey committed
Commit f402ab3 · verified · 1 Parent(s): 42b6520

Upload folder using huggingface_hub

lr6.5e-5_total_batch_size61440_two_masks_4_heads_seed1341/args.json ADDED
@@ -0,0 +1 @@
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_10/lr6.5e-5_total_batch_size61440_two_masks_4_heads_seed1341", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_10", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1341, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "two_masks", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 6.5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "6.5e-5_61440_1341", "n_embd": 256}
lr6.5e-5_total_batch_size61440_two_masks_4_heads_seed1341/dataloader_08749.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:953385078aa3787b69fc6857dfd48b0a2cd2f4d27c6f8892e01211aca53d07f5
+ size 964
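The .pt entries in this commit are Git LFS pointer files (spec version, sha256 oid, byte size); the tensors themselves live in LFS storage. A sketch of materializing one of them with huggingface_hub — the repo_id below is a placeholder, since the repository name is not shown on this page:

from huggingface_hub import hf_hub_download

# repo_id is a placeholder; substitute the repository this commit belongs to.
local_path = hf_hub_download(
    repo_id="<user>/<repo>",
    filename="lr6.5e-5_total_batch_size61440_two_masks_4_heads_seed1341/dataloader_08749.pt",
)
print(local_path)  # cached path to the resolved 964-byte file, not the pointer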
lr6.5e-5_total_batch_size61440_two_masks_4_heads_seed1341/log2.txt ADDED
@@ -0,0 +1,267 @@
+ max_steps: 8750
+ 0 val loss 11.3043
+ 0 val perplexity 81169.7891
+ 0 train 11.302660 (lr=1.3000e-07) (hash(x)=145079536)
+ 100 val loss 9.9721
+ 100 val perplexity 21420.5293
+ 100 train 9.973979 (lr=1.3130e-05) (hash(x)=155800595)
+ 200 val loss 9.2564
+ 200 val perplexity 10471.1104
+ 200 train 9.232339 (lr=2.6130e-05) (hash(x)=145606733)
+ 300 val loss 8.2573
+ 300 val perplexity 3855.5793
+ 300 train 8.243223 (lr=3.9130e-05) (hash(x)=150367139)
+ 400 val loss 7.7358
+ 400 val perplexity 2288.7698
+ 400 train 8.050438 (lr=5.2130e-05) (hash(x)=155747374)
+ 500 val loss 7.4841
+ 500 val perplexity 1779.5862
+ 500 train 7.362129 (lr=6.5000e-05) (hash(x)=140604760)
+ 600 val loss 7.3726
+ 600 val perplexity 1591.7557
+ 600 train 7.307734 (lr=6.4979e-05) (hash(x)=148404734)
+ 700 val loss 7.3043
+ 700 val perplexity 1486.7474
+ 700 train 7.247661 (lr=6.4915e-05) (hash(x)=148115934)
+ 800 val loss 7.2357
+ 800 val perplexity 1388.1753
+ 800 train 7.118206 (lr=6.4809e-05) (hash(x)=137464699)
+ 900 val loss 7.1791
+ 900 val perplexity 1311.7572
+ 900 train 6.993539 (lr=6.4661e-05) (hash(x)=143886042)
+ 1000 val loss 7.1310
+ 1000 val perplexity 1250.1498
+ 1000 train 7.133737 (lr=6.4471e-05) (hash(x)=163799796)
+ 1100 val loss 7.0560
+ 1100 val perplexity 1159.7477
+ 1100 train 7.015691 (lr=6.4240e-05) (hash(x)=144592844)
+ 1200 val loss 7.0007
+ 1200 val perplexity 1097.4194
+ 1200 train 7.344011 (lr=6.3967e-05) (hash(x)=204706354)
+ 1300 val loss 6.9319
+ 1300 val perplexity 1024.4816
+ 1300 train 6.869189 (lr=6.3653e-05) (hash(x)=150862210)
+ 1400 val loss 6.8622
+ 1400 val perplexity 955.4597
+ 1400 train 6.846242 (lr=6.3299e-05) (hash(x)=147766811)
+ 1500 val loss 6.7958
+ 1500 val perplexity 894.0648
+ 1500 train 6.608245 (lr=6.2905e-05) (hash(x)=135925327)
+ 1600 val loss 6.7352
+ 1600 val perplexity 841.4756
+ 1600 train 6.711575 (lr=6.2471e-05) (hash(x)=160440642)
+ 1700 val loss 6.6554
+ 1700 val perplexity 776.9311
+ 1700 train 6.757433 (lr=6.1999e-05) (hash(x)=151184106)
+ 1800 val loss 6.5967
+ 1800 val perplexity 732.6487
+ 1800 train 6.483405 (lr=6.1489e-05) (hash(x)=148100580)
+ 1900 val loss 6.5602
+ 1900 val perplexity 706.4397
+ 1900 train 6.384830 (lr=6.0941e-05) (hash(x)=149434659)
+ 2000 val loss 6.4969
+ 2000 val perplexity 663.0624
+ 2000 train 6.392022 (lr=6.0357e-05) (hash(x)=152285486)
+ 2100 val loss 6.4514
+ 2100 val perplexity 633.5876
+ 2100 train 6.196590 (lr=5.9737e-05) (hash(x)=144294295)
+ 2200 val loss 6.4209
+ 2200 val perplexity 614.5562
+ 2200 train 6.446634 (lr=5.9082e-05) (hash(x)=175030215)
+ 2300 val loss 6.4052
+ 2300 val perplexity 604.9811
+ 2300 train 6.216255 (lr=5.8394e-05) (hash(x)=150831428)
+ 2400 val loss 6.3549
+ 2400 val perplexity 575.3108
+ 2400 train 6.521008 (lr=5.7672e-05) (hash(x)=140808297)
+ 2500 val loss 6.3279
+ 2500 val perplexity 559.9970
+ 2500 train 6.309899 (lr=5.6919e-05) (hash(x)=153160275)
+ 2600 val loss 6.3013
+ 2600 val perplexity 545.3009
+ 2600 train 6.082554 (lr=5.6135e-05) (hash(x)=133990623)
+ 2700 val loss 6.2872
+ 2700 val perplexity 537.6653
+ 2700 train 6.260504 (lr=5.5322e-05) (hash(x)=142860944)
+ 2800 val loss 6.2514
+ 2800 val perplexity 518.7549
+ 2800 train 6.259696 (lr=5.4480e-05) (hash(x)=137959511)
+ 2900 val loss 6.2276
+ 2900 val perplexity 506.5560
+ 2900 train 6.174878 (lr=5.3611e-05) (hash(x)=147009873)
+ 3000 val loss 6.2098
+ 3000 val perplexity 497.6184
+ 3000 train 6.063195 (lr=5.2717e-05) (hash(x)=158264841)
+ 3100 val loss 6.1953
+ 3100 val perplexity 490.4456
+ 3100 train 6.045856 (lr=5.1797e-05) (hash(x)=139232251)
+ 3200 val loss 6.1620
+ 3200 val perplexity 474.3690
+ 3200 train 6.203135 (lr=5.0855e-05) (hash(x)=153436104)
+ 3300 val loss 6.1390
+ 3300 val perplexity 463.6094
+ 3300 train 5.959480 (lr=4.9890e-05) (hash(x)=149681831)
+ 3400 val loss 6.1321
+ 3400 val perplexity 460.4156
+ 3400 train 6.011100 (lr=4.8905e-05) (hash(x)=168425516)
+ 3500 val loss 6.0999
+ 3500 val perplexity 445.8005
+ 3500 train 6.128118 (lr=4.7901e-05) (hash(x)=163104338)
+ 3600 val loss 6.0817
+ 3600 val perplexity 437.7570
+ 3600 train 6.056296 (lr=4.6879e-05) (hash(x)=165109772)
+ 3700 val loss 6.0798
+ 3700 val perplexity 436.9445
+ 3700 train 5.810435 (lr=4.5841e-05) (hash(x)=153420306)
+ 3800 val loss 6.0445
+ 3800 val perplexity 421.7922
+ 3800 train 6.060985 (lr=4.4789e-05) (hash(x)=160168863)
+ 3900 val loss 6.0278
+ 3900 val perplexity 414.8092
+ 3900 train 5.938770 (lr=4.3723e-05) (hash(x)=153906073)
+ 4000 val loss 6.0148
+ 4000 val perplexity 409.4471
+ 4000 train 5.955269 (lr=4.2646e-05) (hash(x)=151055067)
+ 4100 val loss 5.9999
+ 4100 val perplexity 403.3711
+ 4100 train 5.976469 (lr=4.1559e-05) (hash(x)=149629830)
+ 4200 val loss 5.9821
+ 4200 val perplexity 396.2775
+ 4200 train 5.867581 (lr=4.0463e-05) (hash(x)=143101381)
+ 4300 val loss 5.9691
+ 4300 val perplexity 391.1471
+ 4300 train 5.874200 (lr=3.9361e-05) (hash(x)=149712044)
+ 4400 val loss 5.9528
+ 4400 val perplexity 384.8230
+ 4400 train 5.876326 (lr=3.8253e-05) (hash(x)=153446449)
+ 4500 val loss 5.9403
+ 4500 val perplexity 380.0366
+ 4500 train 5.888470 (lr=3.7142e-05) (hash(x)=146086947)
+ 4600 val loss 5.9251
+ 4600 val perplexity 374.3124
+ 4600 train 5.770452 (lr=3.6028e-05) (hash(x)=153800173)
+ 4700 val loss 5.9120
+ 4700 val perplexity 369.4611
+ 4700 train 5.979842 (lr=3.4915e-05) (hash(x)=155962726)
+ 4800 val loss 5.9002
+ 4800 val perplexity 365.1265
+ 4800 train 6.128624 (lr=3.3802e-05) (hash(x)=142045616)
+ 4900 val loss 5.8833
+ 4900 val perplexity 358.9994
+ 4900 train 5.847193 (lr=3.2693e-05) (hash(x)=143418248)
+ 5000 val loss 5.8712
+ 5000 val perplexity 354.6879
+ 5000 train 5.795837 (lr=3.1587e-05) (hash(x)=145789790)
+ 5100 val loss 5.8630
+ 5100 val perplexity 351.7876
+ 5100 train 5.843873 (lr=3.0488e-05) (hash(x)=137795633)
+ 5200 val loss 5.8526
+ 5200 val perplexity 348.1467
+ 5200 train 6.007545 (lr=2.9396e-05) (hash(x)=148907132)
+ 5300 val loss 5.8412
+ 5300 val perplexity 344.1901
+ 5300 train 5.714124 (lr=2.8314e-05) (hash(x)=152343580)
+ 5400 val loss 5.8304
+ 5400 val perplexity 340.4836
+ 5400 train 5.692177 (lr=2.7243e-05) (hash(x)=148578264)
+ 5500 val loss 5.8161
+ 5500 val perplexity 335.6520
+ 5500 train 5.971477 (lr=2.6183e-05) (hash(x)=145635833)
+ 5600 val loss 5.8086
+ 5600 val perplexity 333.1555
+ 5600 train 5.783332 (lr=2.5138e-05) (hash(x)=156337844)
+ 5700 val loss 5.8008
+ 5700 val perplexity 330.5612
+ 5700 train 5.671460 (lr=2.4108e-05) (hash(x)=147168506)
+ 5800 val loss 5.7893
+ 5800 val perplexity 326.7686
+ 5800 train 5.834176 (lr=2.3095e-05) (hash(x)=159566920)
+ 5900 val loss 5.7812
+ 5900 val perplexity 324.1459
+ 5900 train 5.837295 (lr=2.2100e-05) (hash(x)=158273929)
+ 6000 val loss 5.7772
+ 6000 val perplexity 322.8699
+ 6000 train 5.746500 (lr=2.1125e-05) (hash(x)=156649749)
+ 6100 val loss 5.7627
+ 6100 val perplexity 318.2118
+ 6100 train 5.677865 (lr=2.0171e-05) (hash(x)=146812388)
+ 6200 val loss 5.7575
+ 6200 val perplexity 316.5652
+ 6200 train 5.588995 (lr=1.9240e-05) (hash(x)=143522146)
+ 6300 val loss 5.7504
+ 6300 val perplexity 314.3252
+ 6300 train 5.577026 (lr=1.8333e-05) (hash(x)=150124474)
+ 6400 val loss 5.7411
+ 6400 val perplexity 311.3962
+ 6400 train 5.765630 (lr=1.7451e-05) (hash(x)=141242117)
+ 6500 val loss 5.7331
+ 6500 val perplexity 308.9381
+ 6500 train 5.613358 (lr=1.6595e-05) (hash(x)=143529762)
+ 6600 val loss 5.7277
+ 6600 val perplexity 307.2555
+ 6600 train 5.516946 (lr=1.5768e-05) (hash(x)=136948374)
+ 6700 val loss 5.7246
+ 6700 val perplexity 306.3026
+ 6700 train 5.523249 (lr=1.4969e-05) (hash(x)=146268592)
+ 6800 val loss 5.7164
+ 6800 val perplexity 303.8000
+ 6800 train 5.664418 (lr=1.4200e-05) (hash(x)=152676836)
+ 6900 val loss 5.7071
+ 6900 val perplexity 300.9868
+ 6900 train 5.625043 (lr=1.3463e-05) (hash(x)=134657776)
+ 7000 val loss 5.7045
+ 7000 val perplexity 300.2240
+ 7000 train 5.654274 (lr=1.2758e-05) (hash(x)=166721861)
+ 7100 val loss 5.7011
+ 7100 val perplexity 299.1987
+ 7100 train 5.504437 (lr=1.2086e-05) (hash(x)=135496702)
+ 7200 val loss 5.6966
+ 7200 val perplexity 297.8449
+ 7200 train 5.805260 (lr=1.1449e-05) (hash(x)=155567461)
+ 7300 val loss 5.6878
+ 7300 val perplexity 295.2473
+ 7300 train 5.495393 (lr=1.0847e-05) (hash(x)=142803829)
+ 7400 val loss 5.6863
+ 7400 val perplexity 294.7995
+ 7400 train 5.506249 (lr=1.0281e-05) (hash(x)=145294178)
+ 7500 val loss 5.6839
+ 7500 val perplexity 294.0865
+ 7500 train 5.445815 (lr=9.7516e-06) (hash(x)=150573713)
+ 7600 val loss 5.6800
+ 7600 val perplexity 292.9466
+ 7600 train 5.685305 (lr=9.2601e-06) (hash(x)=142771511)
+ 7700 val loss 5.6735
+ 7700 val perplexity 291.0613
+ 7700 train 5.577940 (lr=8.8071e-06) (hash(x)=143602175)
+ 7800 val loss 5.6693
+ 7800 val perplexity 289.8175
+ 7800 train 5.726068 (lr=8.3932e-06) (hash(x)=152379862)
+ 7900 val loss 5.6682
+ 7900 val perplexity 289.5185
+ 7900 train 5.493036 (lr=8.0189e-06) (hash(x)=146655921)
+ 8000 val loss 5.6682
+ 8000 val perplexity 289.5221
+ 8000 train 5.793446 (lr=7.6848e-06) (hash(x)=148262482)
+ 8100 val loss 5.6594
+ 8100 val perplexity 286.9648
+ 8100 train 5.603802 (lr=7.3914e-06) (hash(x)=147683655)
+ 8200 val loss 5.6591
+ 8200 val perplexity 286.8977
+ 8200 train 5.768569 (lr=7.1392e-06) (hash(x)=157312987)
+ 8300 val loss 5.6551
+ 8300 val perplexity 285.7556
+ 8300 train 5.615245 (lr=6.9284e-06) (hash(x)=141107543)
+ 8400 val loss 5.6504
+ 8400 val perplexity 284.4099
+ 8400 train 5.646900 (lr=6.7594e-06) (hash(x)=141323024)
+ 8500 val loss 5.6493
+ 8500 val perplexity 284.0821
+ 8500 train 5.620936 (lr=6.6324e-06) (hash(x)=150696521)
+ 8600 val loss 5.6473
+ 8600 val perplexity 283.5228
+ 8600 train 5.590692 (lr=6.5477e-06) (hash(x)=162288191)
+ 8700 val loss 5.6436
+ 8700 val perplexity 282.4887
+ 8700 train 5.516526 (lr=6.5053e-06) (hash(x)=152860941)
+ 8749 val loss 5.6418
+ 8749 val perplexity 281.9832
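The log interleaves "val loss", "val perplexity", and "train" lines every 100 steps; the perplexity column is exp(val loss) (e.g. exp(5.6418) ≈ 282, matching the final 281.9832 up to rounding). A small sketch, assuming the line format shown above, that extracts the validation curve:

import math
import re

val_loss = {}
with open("lr6.5e-5_total_batch_size61440_two_masks_4_heads_seed1341/log2.txt") as f:
    for line in f:
        m = re.match(r"(\d+) val loss ([\d.]+)", line)
        if m:
            val_loss[int(m.group(1))] = float(m.group(2))

last = max(val_loss)
print(last, val_loss[last], math.exp(val_loss[last]))  # 8749 5.6418 ~282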
lr6.5e-5_total_batch_size61440_two_masks_4_heads_seed1341/model_08749.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f1ed26a739e65ee7234a60dd520733c75a1db9699f699dd26e275616d4d2418
+ size 97580418
lr6.5e-5_total_batch_size61440_two_masks_4_heads_seed1341/optimizer_08749.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c532789fc8f30b321eb846ee841526000a09d63acb13116df3f6c21718f886c
+ size 188880262
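model_08749.pt and optimizer_08749.pt are the final-step checkpoints of this 8750-step run. A hedged sketch of loading them for inspection or resumption, assuming each file is a plain state_dict saved with torch.save (the exact on-disk layout is not documented in this commit, so adjust the key access accordingly):

import torch

run = "lr6.5e-5_total_batch_size61440_two_masks_4_heads_seed1341"

# Assumes plain state_dicts; adjust if the checkpoints are wrapped
# (e.g. {"model": ..., "step": ...}).
model_state = torch.load(f"{run}/model_08749.pt", map_location="cpu")
optim_state = torch.load(f"{run}/optimizer_08749.pt", map_location="cpu")

# model.load_state_dict(model_state)      # model must first be built from args.json
# optimizer.load_state_dict(optim_state)  # pairs with resume_optimizer in args.json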