andrew-healey commited on
Commit
d52dac4
·
verified ·
1 Parent(s): 0f45f8d

Upload folder using huggingface_hub

Browse files
lr6.5e-5_total_batch_size61440_two_masks_4_heads_seed1339/args.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_10/lr6.5e-5_total_batch_size61440_two_masks_4_heads_seed1339", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_10", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1339, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "two_masks", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 6.5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "6.5e-5_61440_1339", "n_embd": 256}
lr6.5e-5_total_batch_size61440_two_masks_4_heads_seed1339/dataloader_08749.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:953385078aa3787b69fc6857dfd48b0a2cd2f4d27c6f8892e01211aca53d07f5
3
+ size 964
lr6.5e-5_total_batch_size61440_two_masks_4_heads_seed1339/log2.txt ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ max_steps: 8750
2
+ 0 val loss 11.2561
3
+ 0 val perplexity 77351.1797
4
+ 0 train 11.256043 (lr=1.3000e-07) (hash(x)=150724848)
5
+ 100 val loss 9.9940
6
+ 100 val perplexity 21894.0645
7
+ 100 train 10.001432 (lr=1.3130e-05) (hash(x)=149910534)
8
+ 200 val loss 9.3788
9
+ 200 val perplexity 11835.3730
10
+ 200 train 9.411184 (lr=2.6130e-05) (hash(x)=148123706)
11
+ 300 val loss 8.2814
12
+ 300 val perplexity 3949.5344
13
+ 300 train 8.180637 (lr=3.9130e-05) (hash(x)=146678221)
14
+ 400 val loss 7.6893
15
+ 400 val perplexity 2184.9395
16
+ 400 train 7.581092 (lr=5.2130e-05) (hash(x)=151700982)
17
+ 500 val loss 7.4754
18
+ 500 val perplexity 1764.0533
19
+ 500 train 7.569458 (lr=6.5000e-05) (hash(x)=156182087)
20
+ 600 val loss 7.3755
21
+ 600 val perplexity 1596.3346
22
+ 600 train 7.365134 (lr=6.4979e-05) (hash(x)=149318660)
23
+ 700 val loss 7.3022
24
+ 700 val perplexity 1483.6108
25
+ 700 train 7.335571 (lr=6.4915e-05) (hash(x)=150482428)
26
+ 800 val loss 7.1834
27
+ 800 val perplexity 1317.3447
28
+ 800 train 7.061414 (lr=6.4809e-05) (hash(x)=143268605)
29
+ 900 val loss 7.0888
30
+ 900 val perplexity 1198.5144
31
+ 900 train 7.052664 (lr=6.4661e-05) (hash(x)=152322423)
32
+ 1000 val loss 7.0036
33
+ 1000 val perplexity 1100.5641
34
+ 1000 train 6.867215 (lr=6.4471e-05) (hash(x)=147904298)
35
+ 1100 val loss 6.9240
36
+ 1100 val perplexity 1016.3733
37
+ 1100 train 7.126366 (lr=6.4240e-05) (hash(x)=154343147)
38
+ 1200 val loss 6.8338
39
+ 1200 val perplexity 928.6927
40
+ 1200 train 6.813411 (lr=6.3967e-05) (hash(x)=141843115)
41
+ 1300 val loss 6.7634
42
+ 1300 val perplexity 865.6054
43
+ 1300 train 6.645342 (lr=6.3653e-05) (hash(x)=145279030)
44
+ 1400 val loss 6.7080
45
+ 1400 val perplexity 818.9465
46
+ 1400 train 6.654157 (lr=6.3299e-05) (hash(x)=152507639)
47
+ 1500 val loss 6.6482
48
+ 1500 val perplexity 771.4277
49
+ 1500 train 6.674627 (lr=6.2905e-05) (hash(x)=148473774)
50
+ 1600 val loss 6.5691
51
+ 1600 val perplexity 712.6937
52
+ 1600 train 6.607936 (lr=6.2471e-05) (hash(x)=151117002)
53
+ 1700 val loss 6.5181
54
+ 1700 val perplexity 677.3141
55
+ 1700 train 6.450616 (lr=6.1999e-05) (hash(x)=138011335)
56
+ 1800 val loss 6.4773
57
+ 1800 val perplexity 650.1862
58
+ 1800 train 6.614837 (lr=6.1489e-05) (hash(x)=171180926)
59
+ 1900 val loss 6.4332
60
+ 1900 val perplexity 622.1785
61
+ 1900 train 6.356093 (lr=6.0941e-05) (hash(x)=141769419)
62
+ 2000 val loss 6.3952
63
+ 2000 val perplexity 598.9396
64
+ 2000 train 6.341516 (lr=6.0357e-05) (hash(x)=151963443)
65
+ 2100 val loss 6.3691
66
+ 2100 val perplexity 583.5094
67
+ 2100 train 6.421648 (lr=5.9737e-05) (hash(x)=162947470)
68
+ 2200 val loss 6.3235
69
+ 2200 val perplexity 557.4987
70
+ 2200 train 6.556474 (lr=5.9082e-05) (hash(x)=154954810)
71
+ 2300 val loss 6.3013
72
+ 2300 val perplexity 545.2720
73
+ 2300 train 6.285894 (lr=5.8394e-05) (hash(x)=151878111)
74
+ 2400 val loss 6.2742
75
+ 2400 val perplexity 530.7079
76
+ 2400 train 6.169062 (lr=5.7672e-05) (hash(x)=158661057)
77
+ 2500 val loss 6.2334
78
+ 2500 val perplexity 509.4879
79
+ 2500 train 6.254779 (lr=5.6919e-05) (hash(x)=150925584)
80
+ 2600 val loss 6.2089
81
+ 2600 val perplexity 497.1681
82
+ 2600 train 6.116327 (lr=5.6135e-05) (hash(x)=144515755)
83
+ 2700 val loss 6.1824
84
+ 2700 val perplexity 484.1358
85
+ 2700 train 6.166272 (lr=5.5322e-05) (hash(x)=153109144)
86
+ 2800 val loss 6.1581
87
+ 2800 val perplexity 472.5223
88
+ 2800 train 6.062727 (lr=5.4480e-05) (hash(x)=151152897)
89
+ 2900 val loss 6.1326
90
+ 2900 val perplexity 460.6337
91
+ 2900 train 6.071653 (lr=5.3611e-05) (hash(x)=145800210)
92
+ 3000 val loss 6.1203
93
+ 3000 val perplexity 455.0056
94
+ 3000 train 5.917623 (lr=5.2717e-05) (hash(x)=141997485)
95
+ 3100 val loss 6.0916
96
+ 3100 val perplexity 442.1487
97
+ 3100 train 6.010593 (lr=5.1797e-05) (hash(x)=154049740)
98
+ 3200 val loss 6.0688
99
+ 3200 val perplexity 432.1403
100
+ 3200 train 6.014441 (lr=5.0855e-05) (hash(x)=150471842)
101
+ 3300 val loss 6.0532
102
+ 3300 val perplexity 425.4731
103
+ 3300 train 5.996249 (lr=4.9890e-05) (hash(x)=149048126)
104
+ 3400 val loss 6.0263
105
+ 3400 val perplexity 414.1900
106
+ 3400 train 6.153189 (lr=4.8905e-05) (hash(x)=161261339)
107
+ 3500 val loss 6.0080
108
+ 3500 val perplexity 406.6802
109
+ 3500 train 5.975532 (lr=4.7901e-05) (hash(x)=157495564)
110
+ 3600 val loss 5.9915
111
+ 3600 val perplexity 400.0105
112
+ 3600 train 5.901582 (lr=4.6879e-05) (hash(x)=144352932)
113
+ 3700 val loss 5.9676
114
+ 3700 val perplexity 390.5811
115
+ 3700 train 5.941074 (lr=4.5841e-05) (hash(x)=149389012)
116
+ 3800 val loss 5.9490
117
+ 3800 val perplexity 383.3586
118
+ 3800 train 5.890903 (lr=4.4789e-05) (hash(x)=146607620)
119
+ 3900 val loss 5.9335
120
+ 3900 val perplexity 377.4571
121
+ 3900 train 5.847597 (lr=4.3723e-05) (hash(x)=143754617)
122
+ 4000 val loss 5.9129
123
+ 4000 val perplexity 369.7672
124
+ 4000 train 5.886193 (lr=4.2646e-05) (hash(x)=156930722)
125
+ 4100 val loss 5.8985
126
+ 4100 val perplexity 364.5070
127
+ 4100 train 5.728021 (lr=4.1559e-05) (hash(x)=147500519)
128
+ 4200 val loss 5.8877
129
+ 4200 val perplexity 360.5778
130
+ 4200 train 5.732339 (lr=4.0463e-05) (hash(x)=143232237)
131
+ 4300 val loss 5.8692
132
+ 4300 val perplexity 353.9501
133
+ 4300 train 5.783058 (lr=3.9361e-05) (hash(x)=146811670)
134
+ 4400 val loss 5.8488
135
+ 4400 val perplexity 346.8104
136
+ 4400 train 5.880957 (lr=3.8253e-05) (hash(x)=158418746)
137
+ 4500 val loss 5.8428
138
+ 4500 val perplexity 344.7361
139
+ 4500 train 5.802954 (lr=3.7142e-05) (hash(x)=156695778)
140
+ 4600 val loss 5.8267
141
+ 4600 val perplexity 339.2403
142
+ 4600 train 5.847702 (lr=3.6028e-05) (hash(x)=147791497)
143
+ 4700 val loss 5.8119
144
+ 4700 val perplexity 334.2588
145
+ 4700 train 5.703734 (lr=3.4915e-05) (hash(x)=155533088)
146
+ 4800 val loss 5.8020
147
+ 4800 val perplexity 330.9605
148
+ 4800 train 5.651344 (lr=3.3802e-05) (hash(x)=138350044)
149
+ 4900 val loss 5.7851
150
+ 4900 val perplexity 325.4107
151
+ 4900 train 5.791443 (lr=3.2693e-05) (hash(x)=143735284)
152
+ 5000 val loss 5.7695
153
+ 5000 val perplexity 320.3835
154
+ 5000 train 5.727303 (lr=3.1587e-05) (hash(x)=154976463)
155
+ 5100 val loss 5.7662
156
+ 5100 val perplexity 319.3217
157
+ 5100 train 5.655946 (lr=3.0488e-05) (hash(x)=149894982)
158
+ 5200 val loss 5.7550
159
+ 5200 val perplexity 315.7560
160
+ 5200 train 5.539772 (lr=2.9396e-05) (hash(x)=159326689)
161
+ 5300 val loss 5.7431
162
+ 5300 val perplexity 312.0416
163
+ 5300 train 5.753727 (lr=2.8314e-05) (hash(x)=159484800)
164
+ 5400 val loss 5.7275
165
+ 5400 val perplexity 307.2091
166
+ 5400 train 5.637153 (lr=2.7243e-05) (hash(x)=140385615)
167
+ 5500 val loss 5.7180
168
+ 5500 val perplexity 304.2932
169
+ 5500 train 5.546675 (lr=2.6183e-05) (hash(x)=148498335)
170
+ 5600 val loss 5.7139
171
+ 5600 val perplexity 303.0530
172
+ 5600 train 5.459474 (lr=2.5138e-05) (hash(x)=151907614)
173
+ 5700 val loss 5.7060
174
+ 5700 val perplexity 300.6536
175
+ 5700 train 5.670681 (lr=2.4108e-05) (hash(x)=155192267)
176
+ 5800 val loss 5.6896
177
+ 5800 val perplexity 295.7684
178
+ 5800 train 5.737008 (lr=2.3095e-05) (hash(x)=153132158)
179
+ 5900 val loss 5.6892
180
+ 5900 val perplexity 295.6611
181
+ 5900 train 5.656782 (lr=2.2100e-05) (hash(x)=161446764)
182
+ 6000 val loss 5.6824
183
+ 6000 val perplexity 293.6534
184
+ 6000 train 5.483084 (lr=2.1125e-05) (hash(x)=151512446)
185
+ 6100 val loss 5.6690
186
+ 6100 val perplexity 289.7578
187
+ 6100 train 5.780985 (lr=2.0171e-05) (hash(x)=188094053)
188
+ 6200 val loss 5.6586
189
+ 6200 val perplexity 286.7401
190
+ 6200 train 5.547750 (lr=1.9240e-05) (hash(x)=149389789)
191
+ 6300 val loss 5.6536
192
+ 6300 val perplexity 285.3286
193
+ 6300 train 5.473921 (lr=1.8333e-05) (hash(x)=138212820)
194
+ 6400 val loss 5.6504
195
+ 6400 val perplexity 284.4080
196
+ 6400 train 5.440686 (lr=1.7451e-05) (hash(x)=146535423)
197
+ 6500 val loss 5.6402
198
+ 6500 val perplexity 281.5299
199
+ 6500 train 5.515539 (lr=1.6595e-05) (hash(x)=145950843)
200
+ 6600 val loss 5.6335
201
+ 6600 val perplexity 279.6408
202
+ 6600 train 5.541010 (lr=1.5768e-05) (hash(x)=141162902)
203
+ 6700 val loss 5.6328
204
+ 6700 val perplexity 279.4510
205
+ 6700 train 5.544148 (lr=1.4969e-05) (hash(x)=153018737)
206
+ 6800 val loss 5.6232
207
+ 6800 val perplexity 276.7654
208
+ 6800 train 5.542342 (lr=1.4200e-05) (hash(x)=155640155)
209
+ 6900 val loss 5.6179
210
+ 6900 val perplexity 275.2988
211
+ 6900 train 5.582727 (lr=1.3463e-05) (hash(x)=153722115)
212
+ 7000 val loss 5.6117
213
+ 7000 val perplexity 273.6027
214
+ 7000 train 5.529258 (lr=1.2758e-05) (hash(x)=146953450)
215
+ 7100 val loss 5.6115
216
+ 7100 val perplexity 273.5477
217
+ 7100 train 5.453711 (lr=1.2086e-05) (hash(x)=137663885)
218
+ 7200 val loss 5.6003
219
+ 7200 val perplexity 270.5173
220
+ 7200 train 5.675512 (lr=1.1449e-05) (hash(x)=146172950)
221
+ 7300 val loss 5.5982
222
+ 7300 val perplexity 269.9449
223
+ 7300 train 5.488262 (lr=1.0847e-05) (hash(x)=150018163)
224
+ 7400 val loss 5.5945
225
+ 7400 val perplexity 268.9428
226
+ 7400 train 5.604101 (lr=1.0281e-05) (hash(x)=145351166)
227
+ 7500 val loss 5.5956
228
+ 7500 val perplexity 269.2258
229
+ 7500 train 5.382076 (lr=9.7516e-06) (hash(x)=145292116)
230
+ 7600 val loss 5.5859
231
+ 7600 val perplexity 266.6435
232
+ 7600 train 5.510931 (lr=9.2601e-06) (hash(x)=150235132)
233
+ 7700 val loss 5.5818
234
+ 7700 val perplexity 265.5468
235
+ 7700 train 5.491075 (lr=8.8071e-06) (hash(x)=154543455)
236
+ 7800 val loss 5.5803
237
+ 7800 val perplexity 265.1616
238
+ 7800 train 5.484360 (lr=8.3932e-06) (hash(x)=142456852)
239
+ 7900 val loss 5.5795
240
+ 7900 val perplexity 264.9517
241
+ 7900 train 5.357276 (lr=8.0189e-06) (hash(x)=147363479)
242
+ 8000 val loss 5.5727
243
+ 8000 val perplexity 263.1382
244
+ 8000 train 5.601532 (lr=7.6848e-06) (hash(x)=156122973)
245
+ 8100 val loss 5.5701
246
+ 8100 val perplexity 262.4608
247
+ 8100 train 5.620697 (lr=7.3914e-06) (hash(x)=156153179)
248
+ 8200 val loss 5.5676
249
+ 8200 val perplexity 261.8176
250
+ 8200 train 5.613224 (lr=7.1392e-06) (hash(x)=146430698)
251
+ 8300 val loss 5.5657
252
+ 8300 val perplexity 261.2977
253
+ 8300 train 5.435045 (lr=6.9284e-06) (hash(x)=143507257)
254
+ 8400 val loss 5.5601
255
+ 8400 val perplexity 259.8415
256
+ 8400 train 5.550940 (lr=6.7594e-06) (hash(x)=166272643)
257
+ 8500 val loss 5.5586
258
+ 8500 val perplexity 259.4559
259
+ 8500 train 5.463182 (lr=6.6324e-06) (hash(x)=143887848)
260
+ 8600 val loss 5.5584
261
+ 8600 val perplexity 259.4134
262
+ 8600 train 5.595049 (lr=6.5477e-06) (hash(x)=156900341)
263
+ 8700 val loss 5.5520
264
+ 8700 val perplexity 257.7443
265
+ 8700 train 5.773742 (lr=6.5053e-06) (hash(x)=146417632)
266
+ 8749 val loss 5.5515
267
+ 8749 val perplexity 257.6334
lr6.5e-5_total_batch_size61440_two_masks_4_heads_seed1339/model_08749.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:262c33df25a685fd0cabbe18f1adb39bce01861741de8525b47362a680b5c7a8
3
+ size 97580418
lr6.5e-5_total_batch_size61440_two_masks_4_heads_seed1339/optimizer_08749.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fa510b02761d06875237230b2803617c4fc1b43eecd3b31ad7283bf30fb3d36
3
+ size 188880262