andrew-healey commited on
Commit
2af8691
·
verified ·
1 Parent(s): 7b7c4ac

Upload folder using huggingface_hub

Browse files
lr5.5e-5_total_batch_size61440_one_mask_per_head_2_latent_vectors_seed1339/args.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_10/lr5.5e-5_total_batch_size61440_one_mask_per_head_2_latent_vectors_seed1339", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_10", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1339, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "n_latent_masks", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": 2, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 5.5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "5.5e-5_61440_1339", "n_embd": 256}
lr5.5e-5_total_batch_size61440_one_mask_per_head_2_latent_vectors_seed1339/dataloader_08749.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:953385078aa3787b69fc6857dfd48b0a2cd2f4d27c6f8892e01211aca53d07f5
3
+ size 964
lr5.5e-5_total_batch_size61440_one_mask_per_head_2_latent_vectors_seed1339/log2.txt ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ max_steps: 8750
2
+ 0 val loss 11.2920
3
+ 0 val perplexity 80178.2422
4
+ 0 train 11.285019 (lr=1.1000e-07) (hash(x)=150724848)
5
+ 100 val loss 10.2608
6
+ 100 val perplexity 28591.0488
7
+ 100 train 10.258900 (lr=1.1110e-05) (hash(x)=149910534)
8
+ 200 val loss 9.9900
9
+ 200 val perplexity 21806.6914
10
+ 200 train 9.993454 (lr=2.2110e-05) (hash(x)=148123706)
11
+ 300 val loss 9.7780
12
+ 300 val perplexity 17640.6270
13
+ 300 train 9.743733 (lr=3.3110e-05) (hash(x)=146678221)
14
+ 400 val loss 9.4123
15
+ 400 val perplexity 12237.6143
16
+ 400 train 9.373756 (lr=4.4110e-05) (hash(x)=151700982)
17
+ 500 val loss 9.1703
18
+ 500 val perplexity 9607.4648
19
+ 500 train 9.226584 (lr=5.5000e-05) (hash(x)=156182087)
20
+ 600 val loss 8.9973
21
+ 600 val perplexity 8080.9355
22
+ 600 train 9.065878 (lr=5.4982e-05) (hash(x)=149318660)
23
+ 700 val loss 8.8351
24
+ 700 val perplexity 6871.5503
25
+ 700 train 8.877204 (lr=5.4928e-05) (hash(x)=150482428)
26
+ 800 val loss 8.4219
27
+ 800 val perplexity 4545.4097
28
+ 800 train 8.374897 (lr=5.4839e-05) (hash(x)=143268605)
29
+ 900 val loss 8.1240
30
+ 900 val perplexity 3374.3938
31
+ 900 train 8.122694 (lr=5.4713e-05) (hash(x)=152322423)
32
+ 1000 val loss 7.9393
33
+ 1000 val perplexity 2805.3359
34
+ 1000 train 7.797348 (lr=5.4553e-05) (hash(x)=147904298)
35
+ 1100 val loss 7.8577
36
+ 1100 val perplexity 2585.5923
37
+ 1100 train 7.963579 (lr=5.4357e-05) (hash(x)=154343147)
38
+ 1200 val loss 7.8121
39
+ 1200 val perplexity 2470.2959
40
+ 1200 train 7.828610 (lr=5.4126e-05) (hash(x)=141843115)
41
+ 1300 val loss 8.6889
42
+ 1300 val perplexity 5936.3994
43
+ 1300 train 8.634224 (lr=5.3860e-05) (hash(x)=145279030)
44
+ 1400 val loss 7.8415
45
+ 1400 val perplexity 2544.1401
46
+ 1400 train 7.878081 (lr=5.3561e-05) (hash(x)=152507639)
47
+ 1500 val loss 7.8430
48
+ 1500 val perplexity 2547.9570
49
+ 1500 train 7.861599 (lr=5.3227e-05) (hash(x)=148473774)
50
+ 1600 val loss 7.8155
51
+ 1600 val perplexity 2478.7881
52
+ 1600 train 7.821124 (lr=5.2860e-05) (hash(x)=151117002)
53
+ 1700 val loss 7.8875
54
+ 1700 val perplexity 2663.7922
55
+ 1700 train 7.843746 (lr=5.2461e-05) (hash(x)=138011335)
56
+ 1800 val loss 8.8606
57
+ 1800 val perplexity 7048.6533
58
+ 1800 train 9.002642 (lr=5.2029e-05) (hash(x)=171180926)
59
+ 1900 val loss 7.9528
60
+ 1900 val perplexity 2843.4111
61
+ 1900 train 7.869714 (lr=5.1565e-05) (hash(x)=141769419)
62
+ 2000 val loss 7.8816
63
+ 2000 val perplexity 2647.9783
64
+ 2000 train 7.855038 (lr=5.1071e-05) (hash(x)=151963443)
65
+ 2100 val loss 7.8688
66
+ 2100 val perplexity 2614.4954
67
+ 2100 train 7.953882 (lr=5.0547e-05) (hash(x)=162947470)
68
+ 2200 val loss 7.8535
69
+ 2200 val perplexity 2574.7532
70
+ 2200 train 8.085760 (lr=4.9993e-05) (hash(x)=154954810)
71
+ 2300 val loss 7.8731
72
+ 2300 val perplexity 2625.7310
73
+ 2300 train 7.908112 (lr=4.9410e-05) (hash(x)=151878111)
74
+ 2400 val loss 7.8939
75
+ 2400 val perplexity 2680.9504
76
+ 2400 train 7.835692 (lr=4.8800e-05) (hash(x)=158661057)
77
+ 2500 val loss 7.8550
78
+ 2500 val perplexity 2578.7207
79
+ 2500 train 7.856666 (lr=4.8162e-05) (hash(x)=150925584)
80
+ 2600 val loss 8.4848
81
+ 2600 val perplexity 4840.8525
82
+ 2600 train 8.432496 (lr=4.7499e-05) (hash(x)=144515755)
83
+ 2700 val loss 7.8524
84
+ 2700 val perplexity 2571.8635
85
+ 2700 train 7.836430 (lr=4.6811e-05) (hash(x)=153109144)
86
+ 2800 val loss 7.8860
87
+ 2800 val perplexity 2659.7458
88
+ 2800 train 7.798884 (lr=4.6099e-05) (hash(x)=151152897)
89
+ 2900 val loss 7.8625
90
+ 2900 val perplexity 2597.9656
91
+ 2900 train 7.815040 (lr=4.5364e-05) (hash(x)=145800210)
92
+ 3000 val loss 7.8458
93
+ 3000 val perplexity 2554.8699
94
+ 3000 train 7.708117 (lr=4.4606e-05) (hash(x)=141997485)
95
+ 3100 val loss 8.5311
96
+ 3100 val perplexity 5070.0986
97
+ 3100 train 8.447453 (lr=4.3828e-05) (hash(x)=154049740)
98
+ 3200 val loss 7.8850
99
+ 3200 val perplexity 2657.1155
100
+ 3200 train 7.870227 (lr=4.3031e-05) (hash(x)=150471842)
101
+ 3300 val loss 7.8587
102
+ 3300 val perplexity 2588.0569
103
+ 3300 train 7.817213 (lr=4.2215e-05) (hash(x)=149048126)
104
+ 3400 val loss 7.8637
105
+ 3400 val perplexity 2601.2356
106
+ 3400 train 7.964822 (lr=4.1381e-05) (hash(x)=161261339)
107
+ 3500 val loss 7.8450
108
+ 3500 val perplexity 2552.8652
109
+ 3500 train 7.844921 (lr=4.0532e-05) (hash(x)=157495564)
110
+ 3600 val loss 7.8408
111
+ 3600 val perplexity 2542.2930
112
+ 3600 train 7.771780 (lr=3.9667e-05) (hash(x)=144352932)
113
+ 3700 val loss 8.0624
114
+ 3700 val perplexity 3173.0195
115
+ 3700 train 8.105776 (lr=3.8789e-05) (hash(x)=149389012)
116
+ 3800 val loss 7.8637
117
+ 3800 val perplexity 2601.0918
118
+ 3800 train 7.800966 (lr=3.7898e-05) (hash(x)=146607620)
119
+ 3900 val loss 7.8185
120
+ 3900 val perplexity 2486.1604
121
+ 3900 train 7.755822 (lr=3.6996e-05) (hash(x)=143754617)
122
+ 4000 val loss 8.4180
123
+ 4000 val perplexity 4527.7144
124
+ 4000 train 8.453739 (lr=3.6085e-05) (hash(x)=156930722)
125
+ 4100 val loss 7.8077
126
+ 4100 val perplexity 2459.5415
127
+ 4100 train 7.667881 (lr=3.5165e-05) (hash(x)=147500519)
128
+ 4200 val loss 7.7921
129
+ 4200 val perplexity 2421.4041
130
+ 4200 train 7.709275 (lr=3.4238e-05) (hash(x)=143232237)
131
+ 4300 val loss 7.7906
132
+ 4300 val perplexity 2417.6555
133
+ 4300 train 7.771226 (lr=3.3305e-05) (hash(x)=146811670)
134
+ 4400 val loss 8.0326
135
+ 4400 val perplexity 3079.8545
136
+ 4400 train 8.087058 (lr=3.2368e-05) (hash(x)=158418746)
137
+ 4500 val loss 7.7882
138
+ 4500 val perplexity 2411.9189
139
+ 4500 train 7.766870 (lr=3.1428e-05) (hash(x)=156695778)
140
+ 4600 val loss 7.7835
141
+ 4600 val perplexity 2400.6045
142
+ 4600 train 7.873107 (lr=3.0486e-05) (hash(x)=147791497)
143
+ 4700 val loss 7.7862
144
+ 4700 val perplexity 2407.1348
145
+ 4700 train 7.645937 (lr=2.9543e-05) (hash(x)=155533088)
146
+ 4800 val loss 7.8588
147
+ 4800 val perplexity 2588.4270
148
+ 4800 train 7.751788 (lr=2.8602e-05) (hash(x)=138350044)
149
+ 4900 val loss 7.9716
150
+ 4900 val perplexity 2897.5920
151
+ 4900 train 7.992471 (lr=2.7663e-05) (hash(x)=143735284)
152
+ 5000 val loss 7.8255
153
+ 5000 val perplexity 2503.7129
154
+ 5000 train 7.797208 (lr=2.6728e-05) (hash(x)=154976463)
155
+ 5100 val loss 7.8164
156
+ 5100 val perplexity 2480.9829
157
+ 5100 train 7.736232 (lr=2.5798e-05) (hash(x)=149894982)
158
+ 5200 val loss 7.7971
159
+ 5200 val perplexity 2433.5532
160
+ 5200 train 7.773500 (lr=2.4874e-05) (hash(x)=159326689)
161
+ 5300 val loss 7.7868
162
+ 5300 val perplexity 2408.6169
163
+ 5300 train 7.749965 (lr=2.3958e-05) (hash(x)=159484800)
164
+ 5400 val loss 7.7676
165
+ 5400 val perplexity 2362.7634
166
+ 5400 train 7.651670 (lr=2.3051e-05) (hash(x)=140385615)
167
+ 5500 val loss 7.7672
168
+ 5500 val perplexity 2361.8455
169
+ 5500 train 7.658051 (lr=2.2155e-05) (hash(x)=148498335)
170
+ 5600 val loss 7.7606
171
+ 5600 val perplexity 2346.2495
172
+ 5600 train 7.604352 (lr=2.1271e-05) (hash(x)=151907614)
173
+ 5700 val loss 7.7543
174
+ 5700 val perplexity 2331.6748
175
+ 5700 train 7.715114 (lr=2.0399e-05) (hash(x)=155192267)
176
+ 5800 val loss 7.7598
177
+ 5800 val perplexity 2344.5452
178
+ 5800 train 7.834521 (lr=1.9542e-05) (hash(x)=153132158)
179
+ 5900 val loss 7.7550
180
+ 5900 val perplexity 2333.1206
181
+ 5900 train 7.782391 (lr=1.8700e-05) (hash(x)=161446764)
182
+ 6000 val loss 7.7529
183
+ 6000 val perplexity 2328.2063
184
+ 6000 train 7.670533 (lr=1.7875e-05) (hash(x)=151512446)
185
+ 6100 val loss 7.7478
186
+ 6100 val perplexity 2316.3940
187
+ 6100 train 7.902486 (lr=1.7068e-05) (hash(x)=188094053)
188
+ 6200 val loss 7.7423
189
+ 6200 val perplexity 2303.8662
190
+ 6200 train 7.683421 (lr=1.6280e-05) (hash(x)=149389789)
191
+ 6300 val loss 7.7421
192
+ 6300 val perplexity 2303.2522
193
+ 6300 train 7.589477 (lr=1.5512e-05) (hash(x)=138212820)
194
+ 6400 val loss 7.7406
195
+ 6400 val perplexity 2299.8247
196
+ 6400 train 7.607221 (lr=1.4766e-05) (hash(x)=146535423)
197
+ 6500 val loss 7.7366
198
+ 6500 val perplexity 2290.7712
199
+ 6500 train 7.649750 (lr=1.4042e-05) (hash(x)=145950843)
200
+ 6600 val loss 7.7349
201
+ 6600 val perplexity 2286.8442
202
+ 6600 train 7.632418 (lr=1.3342e-05) (hash(x)=141162902)
203
+ 6700 val loss 7.7339
204
+ 6700 val perplexity 2284.6079
205
+ 6700 train 7.655795 (lr=1.2666e-05) (hash(x)=153018737)
206
+ 6800 val loss 7.7317
207
+ 6800 val perplexity 2279.5457
208
+ 6800 train 7.737814 (lr=1.2016e-05) (hash(x)=155640155)
209
+ 6900 val loss 7.7308
210
+ 6900 val perplexity 2277.3293
211
+ 6900 train 7.683605 (lr=1.1392e-05) (hash(x)=153722115)
212
+ 7000 val loss 7.7304
213
+ 7000 val perplexity 2276.5693
214
+ 7000 train 7.648378 (lr=1.0795e-05) (hash(x)=146953450)
215
+ 7100 val loss 7.7445
216
+ 7100 val perplexity 2308.8767
217
+ 7100 train 7.623209 (lr=1.0227e-05) (hash(x)=137663885)
218
+ 7200 val loss 7.7381
219
+ 7200 val perplexity 2294.0723
220
+ 7200 train 7.747644 (lr=9.6875e-06) (hash(x)=146172950)
221
+ 7300 val loss 7.7362
222
+ 7300 val perplexity 2289.7336
223
+ 7300 train 7.648952 (lr=9.1780e-06) (hash(x)=150018163)
224
+ 7400 val loss 7.7301
225
+ 7400 val perplexity 2275.7217
226
+ 7400 train 7.649073 (lr=8.6990e-06) (hash(x)=145351166)
227
+ 7500 val loss 7.7292
228
+ 7500 val perplexity 2273.7791
229
+ 7500 train 7.578366 (lr=8.2513e-06) (hash(x)=145292116)
230
+ 7600 val loss 7.7288
231
+ 7600 val perplexity 2272.8857
232
+ 7600 train 7.662086 (lr=7.8355e-06) (hash(x)=150235132)
233
+ 7700 val loss 7.7275
234
+ 7700 val perplexity 2269.9656
235
+ 7700 train 7.666578 (lr=7.4522e-06) (hash(x)=154543455)
236
+ 7800 val loss 7.7335
237
+ 7800 val perplexity 2283.5623
238
+ 7800 train 7.615070 (lr=7.1019e-06) (hash(x)=142456852)
239
+ 7900 val loss 7.7393
240
+ 7900 val perplexity 2296.9709
241
+ 7900 train 7.624690 (lr=6.7852e-06) (hash(x)=147363479)
242
+ 8000 val loss 7.7376
243
+ 8000 val perplexity 2292.9438
244
+ 8000 train 7.765561 (lr=6.5025e-06) (hash(x)=156122973)
245
+ 8100 val loss 7.7261
246
+ 8100 val perplexity 2266.7705
247
+ 8100 train 7.699772 (lr=6.2543e-06) (hash(x)=156153179)
248
+ 8200 val loss 7.7249
249
+ 8200 val perplexity 2264.1238
250
+ 8200 train 7.703266 (lr=6.0408e-06) (hash(x)=146430698)
251
+ 8300 val loss 7.7298
252
+ 8300 val perplexity 2275.2605
253
+ 8300 train 7.635367 (lr=5.8625e-06) (hash(x)=143507257)
254
+ 8400 val loss 7.7226
255
+ 8400 val perplexity 2258.8550
256
+ 8400 train 7.948477 (lr=5.7195e-06) (hash(x)=166272643)
257
+ 8500 val loss 7.7224
258
+ 8500 val perplexity 2258.4243
259
+ 8500 train 7.649069 (lr=5.6121e-06) (hash(x)=143887848)
260
+ 8600 val loss 7.7242
261
+ 8600 val perplexity 2262.3799
262
+ 8600 train 7.830824 (lr=5.5404e-06) (hash(x)=156900341)
263
+ 8700 val loss 7.7243
264
+ 8700 val perplexity 2262.7166
265
+ 8700 train 7.869930 (lr=5.5045e-06) (hash(x)=146417632)
266
+ 8749 val loss 7.7224
267
+ 8749 val perplexity 2258.3315
lr5.5e-5_total_batch_size61440_one_mask_per_head_2_latent_vectors_seed1339/model_08749.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3d5a9a525b7af43301651a05b78b314211e8b59c8d7bebcc5ded3290e9fda67
3
+ size 95220594
lr5.5e-5_total_batch_size61440_one_mask_per_head_2_latent_vectors_seed1339/optimizer_08749.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cf45d6641ce19d3f9ed005521a1e3e0d01c89955ae85836c1fed8e6ca63271e
3
+ size 184163894