andrew-healey committed
Commit 2edc727 · verified · Parent: 7fe88cc

Upload folder using huggingface_hub

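The commit message indicates the folder was pushed with huggingface_hub's upload_folder helper. A minimal sketch of such a call is below; the repo id is a placeholder, since the target repository is not named on this page:

from huggingface_hub import upload_folder

# Placeholder repo id -- the actual target repository is not named on this page.
upload_folder(
    repo_id="andrew-healey/<repo-name>",
    folder_path="attention_kindselective_n_heads4_seed1339",
    path_in_repo="attention_kindselective_n_heads4_seed1339",
    commit_message="Upload folder using huggingface_hub",
)
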
attention_kindselective_n_heads4_seed1339/args.json CHANGED
@@ -1 +1 @@
- {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_9/attention_kindselective_n_heads4_seed1339", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_9", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1339, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 4.5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "4.5e-5_61440_4_1339", "n_embd": 256}
 
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_9/attention_kindselective_n_heads4_seed1339", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_9", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1339, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 5.5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "5.5e-5_61440_4_1339", "n_embd": 256}
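
The only values that change in args.json are max_lr (4.5e-05 -> 5.5e-05) and the derived key string; every other field is identical. A small sketch for confirming that, assuming the two revisions have been saved locally as args_old.json and args_new.json (hypothetical filenames):

import json

# Hypothetical local copies of the old and new args.json revisions.
with open("args_old.json") as f:
    old = json.load(f)
with open("args_new.json") as f:
    new = json.load(f)

# Report every key whose value differs between the two revisions.
for k in sorted(set(old) | set(new)):
    if old.get(k) != new.get(k):
        print(k, old.get(k), "->", new.get(k))
# Per the diff above, this should report only max_lr and key.
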
attention_kindselective_n_heads4_seed1339/log2.txt CHANGED
@@ -1,267 +1,267 @@
1
  max_steps: 8750
2
  0 val loss 11.2808
3
- 0 val perplexity 79287.8281
4
- 0 train 11.289343 (lr=9.0000e-08) (hash(x)=150724848)
5
- 100 val loss 10.0387
6
- 100 val perplexity 22894.7715
7
- 100 train 10.053072 (lr=9.0900e-06) (hash(x)=149910534)
8
- 200 val loss 9.3866
9
- 200 val perplexity 11926.9707
10
- 200 train 9.415236 (lr=1.8090e-05) (hash(x)=148123706)
11
- 300 val loss 8.2246
12
- 300 val perplexity 3731.5139
13
- 300 train 8.125541 (lr=2.7090e-05) (hash(x)=146678221)
14
- 400 val loss 7.6337
15
- 400 val perplexity 2066.6184
16
- 400 train 7.529495 (lr=3.6090e-05) (hash(x)=151700982)
17
- 500 val loss 7.4691
18
- 500 val perplexity 1752.9563
19
- 500 train 7.553552 (lr=4.5000e-05) (hash(x)=156182087)
20
- 600 val loss 7.3525
21
- 600 val perplexity 1560.1439
22
- 600 train 7.332888 (lr=4.4985e-05) (hash(x)=149318660)
23
- 700 val loss 7.2674
24
- 700 val perplexity 1432.8541
25
- 700 train 7.306005 (lr=4.4941e-05) (hash(x)=150482428)
26
- 800 val loss 7.1371
27
- 800 val perplexity 1257.8130
28
- 800 train 7.013755 (lr=4.4868e-05) (hash(x)=143268605)
29
- 900 val loss 7.0395
30
- 900 val perplexity 1140.7983
31
- 900 train 7.003427 (lr=4.4766e-05) (hash(x)=152322423)
32
- 1000 val loss 6.9457
33
- 1000 val perplexity 1038.6819
34
- 1000 train 6.811792 (lr=4.4634e-05) (hash(x)=147904298)
35
- 1100 val loss 6.8599
36
- 1100 val perplexity 953.2381
37
- 1100 train 7.054464 (lr=4.4474e-05) (hash(x)=154343147)
38
- 1200 val loss 6.7650
39
- 1200 val perplexity 866.9272
40
- 1200 train 6.752739 (lr=4.4285e-05) (hash(x)=141843115)
41
- 1300 val loss 6.6892
42
- 1300 val perplexity 803.6893
43
- 1300 train 6.579344 (lr=4.4068e-05) (hash(x)=145279030)
44
- 1400 val loss 6.6278
45
- 1400 val perplexity 755.7952
46
- 1400 train 6.572786 (lr=4.3822e-05) (hash(x)=152507639)
47
- 1500 val loss 6.5699
48
- 1500 val perplexity 713.3077
49
- 1500 train 6.604732 (lr=4.3549e-05) (hash(x)=148473774)
50
- 1600 val loss 6.4935
51
- 1600 val perplexity 660.8264
52
- 1600 train 6.535246 (lr=4.3249e-05) (hash(x)=151117002)
53
- 1700 val loss 6.4515
54
- 1700 val perplexity 633.6206
55
- 1700 train 6.387179 (lr=4.2922e-05) (hash(x)=138011335)
56
- 1800 val loss 6.4102
57
- 1800 val perplexity 607.9948
58
- 1800 train 6.550987 (lr=4.2569e-05) (hash(x)=171180926)
59
- 1900 val loss 6.3649
60
- 1900 val perplexity 581.0685
61
- 1900 train 6.291069 (lr=4.2190e-05) (hash(x)=141769419)
62
- 2000 val loss 6.3258
63
- 2000 val perplexity 558.8239
64
- 2000 train 6.267239 (lr=4.1785e-05) (hash(x)=151963443)
65
- 2100 val loss 6.3109
66
- 2100 val perplexity 550.5495
67
- 2100 train 6.360897 (lr=4.1356e-05) (hash(x)=162947470)
68
- 2200 val loss 6.2599
69
- 2200 val perplexity 523.1447
70
- 2200 train 6.505328 (lr=4.0903e-05) (hash(x)=154954810)
71
- 2300 val loss 6.2322
72
- 2300 val perplexity 508.8639
73
- 2300 train 6.212578 (lr=4.0426e-05) (hash(x)=151878111)
74
- 2400 val loss 6.2105
75
- 2400 val perplexity 497.9667
76
- 2400 train 6.103368 (lr=3.9927e-05) (hash(x)=158661057)
77
- 2500 val loss 6.1755
78
- 2500 val perplexity 480.8314
79
- 2500 train 6.202045 (lr=3.9406e-05) (hash(x)=150925584)
80
- 2600 val loss 6.1552
81
- 2600 val perplexity 471.1487
82
- 2600 train 6.061382 (lr=3.8863e-05) (hash(x)=144515755)
83
- 2700 val loss 6.1338
84
- 2700 val perplexity 461.1937
85
- 2700 train 6.129019 (lr=3.8300e-05) (hash(x)=153109144)
86
- 2800 val loss 6.1062
87
- 2800 val perplexity 448.6117
88
- 2800 train 6.009483 (lr=3.7717e-05) (hash(x)=151152897)
89
- 2900 val loss 6.0859
90
- 2900 val perplexity 439.6326
91
- 2900 train 6.025854 (lr=3.7116e-05) (hash(x)=145800210)
92
- 3000 val loss 6.0743
93
- 3000 val perplexity 434.5630
94
- 3000 train 5.871215 (lr=3.6496e-05) (hash(x)=141997485)
95
- 3100 val loss 6.0462
96
- 3100 val perplexity 422.4893
97
- 3100 train 5.963703 (lr=3.5860e-05) (hash(x)=154049740)
98
- 3200 val loss 6.0270
99
- 3200 val perplexity 414.4612
100
- 3200 train 5.956031 (lr=3.5207e-05) (hash(x)=150471842)
101
- 3300 val loss 6.0160
102
- 3300 val perplexity 409.9505
103
- 3300 train 5.961557 (lr=3.4539e-05) (hash(x)=149048126)
104
- 3400 val loss 5.9919
105
- 3400 val perplexity 400.1670
106
- 3400 train 6.120944 (lr=3.3857e-05) (hash(x)=161261339)
107
- 3500 val loss 5.9792
108
- 3500 val perplexity 395.1056
109
- 3500 train 5.945442 (lr=3.3162e-05) (hash(x)=157495564)
110
- 3600 val loss 5.9652
111
- 3600 val perplexity 389.6129
112
- 3600 train 5.867484 (lr=3.2455e-05) (hash(x)=144352932)
113
- 3700 val loss 5.9466
114
- 3700 val perplexity 382.4526
115
- 3700 train 5.919501 (lr=3.1736e-05) (hash(x)=149389012)
116
- 3800 val loss 5.9294
117
- 3800 val perplexity 375.9380
118
- 3800 train 5.871775 (lr=3.1008e-05) (hash(x)=146607620)
119
- 3900 val loss 5.9150
120
- 3900 val perplexity 370.5585
121
- 3900 train 5.833241 (lr=3.0270e-05) (hash(x)=143754617)
122
- 4000 val loss 5.8975
123
- 4000 val perplexity 364.1153
124
- 4000 train 5.870857 (lr=2.9524e-05) (hash(x)=156930722)
125
- 4100 val loss 5.8849
126
- 4100 val perplexity 359.5775
127
- 4100 train 5.711125 (lr=2.8771e-05) (hash(x)=147500519)
128
- 4200 val loss 5.8724
129
- 4200 val perplexity 355.0967
130
- 4200 train 5.711286 (lr=2.8013e-05) (hash(x)=143232237)
131
- 4300 val loss 5.8596
132
- 4300 val perplexity 350.5836
133
- 4300 train 5.764656 (lr=2.7250e-05) (hash(x)=146811670)
134
- 4400 val loss 5.8431
135
- 4400 val perplexity 344.8451
136
- 4400 train 5.873563 (lr=2.6483e-05) (hash(x)=158418746)
137
- 4500 val loss 5.8401
138
- 4500 val perplexity 343.8246
139
- 4500 train 5.804694 (lr=2.5714e-05) (hash(x)=156695778)
140
- 4600 val loss 5.8231
141
- 4600 val perplexity 338.0122
142
- 4600 train 5.847899 (lr=2.4943e-05) (hash(x)=147791497)
143
- 4700 val loss 5.8089
144
- 4700 val perplexity 333.2647
145
- 4700 train 5.697839 (lr=2.4172e-05) (hash(x)=155533088)
146
- 4800 val loss 5.8027
147
- 4800 val perplexity 331.2058
148
- 4800 train 5.654629 (lr=2.3402e-05) (hash(x)=138350044)
149
- 4900 val loss 5.7873
150
- 4900 val perplexity 326.1176
151
- 4900 train 5.788656 (lr=2.2633e-05) (hash(x)=143735284)
152
- 5000 val loss 5.7758
153
- 5000 val perplexity 322.3993
154
- 5000 train 5.734388 (lr=2.1868e-05) (hash(x)=154976463)
155
- 5100 val loss 5.7729
156
- 5100 val perplexity 321.4783
157
- 5100 train 5.654710 (lr=2.1107e-05) (hash(x)=149894982)
158
- 5200 val loss 5.7619
159
- 5200 val perplexity 317.9573
160
- 5200 train 5.548586 (lr=2.0351e-05) (hash(x)=159326689)
161
- 5300 val loss 5.7530
162
- 5300 val perplexity 315.1419
163
- 5300 train 5.758868 (lr=1.9602e-05) (hash(x)=159484800)
164
- 5400 val loss 5.7385
165
- 5400 val perplexity 310.5921
166
- 5400 train 5.640297 (lr=1.8860e-05) (hash(x)=140385615)
167
- 5500 val loss 5.7310
168
- 5500 val perplexity 308.2803
169
- 5500 train 5.561502 (lr=1.8127e-05) (hash(x)=148498335)
170
- 5600 val loss 5.7311
171
- 5600 val perplexity 308.3099
172
- 5600 train 5.465002 (lr=1.7403e-05) (hash(x)=151907614)
173
- 5700 val loss 5.7218
174
- 5700 val perplexity 305.4481
175
- 5700 train 5.691004 (lr=1.6690e-05) (hash(x)=155192267)
176
- 5800 val loss 5.7090
177
- 5800 val perplexity 301.5787
178
- 5800 train 5.762886 (lr=1.5989e-05) (hash(x)=153132158)
179
- 5900 val loss 5.7053
180
- 5900 val perplexity 300.4576
181
- 5900 train 5.679949 (lr=1.5300e-05) (hash(x)=161446764)
182
- 6000 val loss 5.7017
183
- 6000 val perplexity 299.3758
184
- 6000 train 5.506324 (lr=1.4625e-05) (hash(x)=151512446)
185
- 6100 val loss 5.6905
186
- 6100 val perplexity 296.0348
187
- 6100 train 5.810744 (lr=1.3965e-05) (hash(x)=188094053)
188
- 6200 val loss 5.6813
189
- 6200 val perplexity 293.3209
190
- 6200 train 5.564333 (lr=1.3320e-05) (hash(x)=149389789)
191
- 6300 val loss 5.6772
192
- 6300 val perplexity 292.1324
193
- 6300 train 5.496711 (lr=1.2692e-05) (hash(x)=138212820)
194
- 6400 val loss 5.6753
195
- 6400 val perplexity 291.5690
196
- 6400 train 5.467338 (lr=1.2081e-05) (hash(x)=146535423)
197
- 6500 val loss 5.6654
198
- 6500 val perplexity 288.7062
199
- 6500 train 5.533089 (lr=1.1489e-05) (hash(x)=145950843)
200
- 6600 val loss 5.6610
201
- 6600 val perplexity 287.4485
202
- 6600 train 5.562228 (lr=1.0916e-05) (hash(x)=141162902)
203
- 6700 val loss 5.6593
204
- 6700 val perplexity 286.9351
205
- 6700 train 5.572202 (lr=1.0363e-05) (hash(x)=153018737)
206
- 6800 val loss 5.6498
207
- 6800 val perplexity 284.2420
208
- 6800 train 5.566178 (lr=9.8310e-06) (hash(x)=155640155)
209
- 6900 val loss 5.6472
210
- 6900 val perplexity 283.4846
211
- 6900 train 5.618537 (lr=9.3205e-06) (hash(x)=153722115)
212
- 7000 val loss 5.6406
213
- 7000 val perplexity 281.6339
214
- 7000 train 5.552649 (lr=8.8324e-06) (hash(x)=146953450)
215
- 7100 val loss 5.6403
216
- 7100 val perplexity 281.5345
217
- 7100 train 5.483966 (lr=8.3674e-06) (hash(x)=137663885)
218
- 7200 val loss 5.6309
219
- 7200 val perplexity 278.9171
220
- 7200 train 5.704070 (lr=7.9261e-06) (hash(x)=146172950)
221
- 7300 val loss 5.6273
222
- 7300 val perplexity 277.9214
223
- 7300 train 5.517547 (lr=7.5093e-06) (hash(x)=150018163)
224
- 7400 val loss 5.6247
225
- 7400 val perplexity 277.1830
226
- 7400 train 5.627498 (lr=7.1174e-06) (hash(x)=145351166)
227
- 7500 val loss 5.6248
228
- 7500 val perplexity 277.2080
229
- 7500 train 5.407495 (lr=6.7511e-06) (hash(x)=145292116)
230
- 7600 val loss 5.6174
231
- 7600 val perplexity 275.1786
232
- 7600 train 5.533510 (lr=6.4109e-06) (hash(x)=150235132)
233
- 7700 val loss 5.6147
234
- 7700 val perplexity 274.4358
235
- 7700 train 5.519080 (lr=6.0972e-06) (hash(x)=154543455)
236
- 7800 val loss 5.6131
237
- 7800 val perplexity 273.9905
238
- 7800 train 5.514878 (lr=5.8107e-06) (hash(x)=142456852)
239
- 7900 val loss 5.6123
240
- 7900 val perplexity 273.7810
241
- 7900 train 5.390297 (lr=5.5515e-06) (hash(x)=147363479)
242
- 8000 val loss 5.6059
243
- 8000 val perplexity 272.0219
244
- 8000 train 5.638661 (lr=5.3203e-06) (hash(x)=156122973)
245
- 8100 val loss 5.6041
246
- 8100 val perplexity 271.5290
247
- 8100 train 5.650138 (lr=5.1172e-06) (hash(x)=156153179)
248
- 8200 val loss 5.6029
249
- 8200 val perplexity 271.2119
250
- 8200 train 5.651247 (lr=4.9425e-06) (hash(x)=146430698)
251
- 8300 val loss 5.6011
252
- 8300 val perplexity 270.7372
253
- 8300 train 5.470097 (lr=4.7966e-06) (hash(x)=143507257)
254
- 8400 val loss 5.5964
255
- 8400 val perplexity 269.4478
256
- 8400 train 5.624218 (lr=4.6796e-06) (hash(x)=166272643)
257
- 8500 val loss 5.5950
258
- 8500 val perplexity 269.0811
259
- 8500 train 5.498229 (lr=4.5917e-06) (hash(x)=143887848)
260
- 8600 val loss 5.5944
261
- 8600 val perplexity 268.9143
262
- 8600 train 5.633615 (lr=4.5330e-06) (hash(x)=156900341)
263
- 8700 val loss 5.5894
264
- 8700 val perplexity 267.5721
265
- 8700 train 5.830091 (lr=4.5037e-06) (hash(x)=146417632)
266
- 8749 val loss 5.5894
267
- 8749 val perplexity 267.5743
 
1
  max_steps: 8750
2
  0 val loss 11.2808
3
+ 0 val perplexity 79287.6797
4
+ 0 train 11.289297 (lr=1.1000e-07) (hash(x)=150724848)
5
+ 100 val loss 9.9926
6
+ 100 val perplexity 21863.0391
7
+ 100 train 10.007814 (lr=1.1110e-05) (hash(x)=149910534)
8
+ 200 val loss 9.1223
9
+ 200 val perplexity 9157.1182
10
+ 200 train 9.148949 (lr=2.2110e-05) (hash(x)=148123706)
11
+ 300 val loss 7.9004
12
+ 300 val perplexity 2698.2358
13
+ 300 train 7.756021 (lr=3.3110e-05) (hash(x)=146678221)
14
+ 400 val loss 7.5196
15
+ 400 val perplexity 1843.8901
16
+ 400 train 7.404154 (lr=4.4110e-05) (hash(x)=151700982)
17
+ 500 val loss 7.3670
18
+ 500 val perplexity 1582.9152
19
+ 500 train 7.450415 (lr=5.5000e-05) (hash(x)=156182087)
20
+ 600 val loss 7.2367
21
+ 600 val perplexity 1389.4574
22
+ 600 train 7.212444 (lr=5.4982e-05) (hash(x)=149318660)
23
+ 700 val loss 7.1474
24
+ 700 val perplexity 1270.7859
25
+ 700 train 7.189300 (lr=5.4928e-05) (hash(x)=150482428)
26
+ 800 val loss 7.0296
27
+ 800 val perplexity 1129.5702
28
+ 800 train 6.903787 (lr=5.4839e-05) (hash(x)=143268605)
29
+ 900 val loss 6.9340
30
+ 900 val perplexity 1026.5736
31
+ 900 train 6.897569 (lr=5.4713e-05) (hash(x)=152322423)
32
+ 1000 val loss 6.8391
33
+ 1000 val perplexity 933.6412
34
+ 1000 train 6.705533 (lr=5.4553e-05) (hash(x)=147904298)
35
+ 1100 val loss 6.7474
36
+ 1100 val perplexity 851.8519
37
+ 1100 train 6.950415 (lr=5.4357e-05) (hash(x)=154343147)
38
+ 1200 val loss 6.6458
39
+ 1200 val perplexity 769.5319
40
+ 1200 train 6.633022 (lr=5.4126e-05) (hash(x)=141843115)
41
+ 1300 val loss 6.5715
42
+ 1300 val perplexity 714.4750
43
+ 1300 train 6.459829 (lr=5.3860e-05) (hash(x)=145279030)
44
+ 1400 val loss 6.5204
45
+ 1400 val perplexity 678.8162
46
+ 1400 train 6.464448 (lr=5.3561e-05) (hash(x)=152507639)
47
+ 1500 val loss 6.4720
48
+ 1500 val perplexity 646.7606
49
+ 1500 train 6.506417 (lr=5.3227e-05) (hash(x)=148473774)
50
+ 1600 val loss 6.4094
51
+ 1600 val perplexity 607.5082
52
+ 1600 train 6.450625 (lr=5.2860e-05) (hash(x)=151117002)
53
+ 1700 val loss 6.3770
54
+ 1700 val perplexity 588.1442
55
+ 1700 train 6.323629 (lr=5.2461e-05) (hash(x)=138011335)
56
+ 1800 val loss 6.3356
57
+ 1800 val perplexity 564.3019
58
+ 1800 train 6.464371 (lr=5.2029e-05) (hash(x)=171180926)
59
+ 1900 val loss 6.2924
60
+ 1900 val perplexity 540.4733
61
+ 1900 train 6.228439 (lr=5.1565e-05) (hash(x)=141769419)
62
+ 2000 val loss 6.2590
63
+ 2000 val perplexity 522.6740
64
+ 2000 train 6.211834 (lr=5.1071e-05) (hash(x)=151963443)
65
+ 2100 val loss 6.2432
66
+ 2100 val perplexity 514.5130
67
+ 2100 train 6.295941 (lr=5.0547e-05) (hash(x)=162947470)
68
+ 2200 val loss 6.1986
69
+ 2200 val perplexity 492.0483
70
+ 2200 train 6.424762 (lr=4.9993e-05) (hash(x)=154954810)
71
+ 2300 val loss 6.1768
72
+ 2300 val perplexity 481.4570
73
+ 2300 train 6.154170 (lr=4.9410e-05) (hash(x)=151878111)
74
+ 2400 val loss 6.1559
75
+ 2400 val perplexity 471.4818
76
+ 2400 train 6.050483 (lr=4.8800e-05) (hash(x)=158661057)
77
+ 2500 val loss 6.1194
78
+ 2500 val perplexity 454.5914
79
+ 2500 train 6.141537 (lr=4.8162e-05) (hash(x)=150925584)
80
+ 2600 val loss 6.0965
81
+ 2600 val perplexity 444.3190
82
+ 2600 train 6.004150 (lr=4.7499e-05) (hash(x)=144515755)
83
+ 2700 val loss 6.0754
84
+ 2700 val perplexity 435.0298
85
+ 2700 train 6.059896 (lr=4.6811e-05) (hash(x)=153109144)
86
+ 2800 val loss 6.0466
87
+ 2800 val perplexity 422.6543
88
+ 2800 train 5.952908 (lr=4.6099e-05) (hash(x)=151152897)
89
+ 2900 val loss 6.0262
90
+ 2900 val perplexity 414.1284
91
+ 2900 train 5.969955 (lr=4.5364e-05) (hash(x)=145800210)
92
+ 3000 val loss 6.0130
93
+ 3000 val perplexity 408.7055
94
+ 3000 train 5.818487 (lr=4.4606e-05) (hash(x)=141997485)
95
+ 3100 val loss 5.9902
96
+ 3100 val perplexity 399.5090
97
+ 3100 train 5.910625 (lr=4.3828e-05) (hash(x)=154049740)
98
+ 3200 val loss 5.9697
99
+ 3200 val perplexity 391.3949
100
+ 3200 train 5.898226 (lr=4.3031e-05) (hash(x)=150471842)
101
+ 3300 val loss 5.9616
102
+ 3300 val perplexity 388.2483
103
+ 3300 train 5.906397 (lr=4.2215e-05) (hash(x)=149048126)
104
+ 3400 val loss 5.9343
105
+ 3400 val perplexity 377.7688
106
+ 3400 train 6.056663 (lr=4.1381e-05) (hash(x)=161261339)
107
+ 3500 val loss 5.9248
108
+ 3500 val perplexity 374.2032
109
+ 3500 train 5.890368 (lr=4.0532e-05) (hash(x)=157495564)
110
+ 3600 val loss 5.9106
111
+ 3600 val perplexity 368.9154
112
+ 3600 train 5.813707 (lr=3.9667e-05) (hash(x)=144352932)
113
+ 3700 val loss 5.8902
114
+ 3700 val perplexity 361.4862
115
+ 3700 train 5.860925 (lr=3.8789e-05) (hash(x)=149389012)
116
+ 3800 val loss 5.8681
117
+ 3800 val perplexity 353.5889
118
+ 3800 train 5.809790 (lr=3.7898e-05) (hash(x)=146607620)
119
+ 3900 val loss 5.8539
120
+ 3900 val perplexity 348.6085
121
+ 3900 train 5.772669 (lr=3.6996e-05) (hash(x)=143754617)
122
+ 4000 val loss 5.8410
123
+ 4000 val perplexity 344.1350
124
+ 4000 train 5.813826 (lr=3.6085e-05) (hash(x)=156930722)
125
+ 4100 val loss 5.8236
126
+ 4100 val perplexity 338.1763
127
+ 4100 train 5.653216 (lr=3.5165e-05) (hash(x)=147500519)
128
+ 4200 val loss 5.8148
129
+ 4200 val perplexity 335.2222
130
+ 4200 train 5.652660 (lr=3.4238e-05) (hash(x)=143232237)
131
+ 4300 val loss 5.8024
132
+ 4300 val perplexity 331.0877
133
+ 4300 train 5.709325 (lr=3.3305e-05) (hash(x)=146811670)
134
+ 4400 val loss 5.7871
135
+ 4400 val perplexity 326.0585
136
+ 4400 train 5.821728 (lr=3.2368e-05) (hash(x)=158418746)
137
+ 4500 val loss 5.7772
138
+ 4500 val perplexity 322.8669
139
+ 4500 train 5.743765 (lr=3.1428e-05) (hash(x)=156695778)
140
+ 4600 val loss 5.7656
141
+ 4600 val perplexity 319.1193
142
+ 4600 train 5.785645 (lr=3.0486e-05) (hash(x)=147791497)
143
+ 4700 val loss 5.7494
144
+ 4700 val perplexity 314.0002
145
+ 4700 train 5.637969 (lr=2.9543e-05) (hash(x)=155533088)
146
+ 4800 val loss 5.7457
147
+ 4800 val perplexity 312.8485
148
+ 4800 train 5.597758 (lr=2.8602e-05) (hash(x)=138350044)
149
+ 4900 val loss 5.7309
150
+ 4900 val perplexity 308.2330
151
+ 4900 train 5.726795 (lr=2.7663e-05) (hash(x)=143735284)
152
+ 5000 val loss 5.7179
153
+ 5000 val perplexity 304.2539
154
+ 5000 train 5.672445 (lr=2.6728e-05) (hash(x)=154976463)
155
+ 5100 val loss 5.7154
156
+ 5100 val perplexity 303.5133
157
+ 5100 train 5.600068 (lr=2.5798e-05) (hash(x)=149894982)
158
+ 5200 val loss 5.7096
159
+ 5200 val perplexity 301.7632
160
+ 5200 train 5.481579 (lr=2.4874e-05) (hash(x)=159326689)
161
+ 5300 val loss 5.6972
162
+ 5300 val perplexity 298.0291
163
+ 5300 train 5.708793 (lr=2.3958e-05) (hash(x)=159484800)
164
+ 5400 val loss 5.6821
165
+ 5400 val perplexity 293.5782
166
+ 5400 train 5.585687 (lr=2.3051e-05) (hash(x)=140385615)
167
+ 5500 val loss 5.6753
168
+ 5500 val perplexity 291.5740
169
+ 5500 train 5.498550 (lr=2.2155e-05) (hash(x)=148498335)
170
+ 5600 val loss 5.6751
171
+ 5600 val perplexity 291.5265
172
+ 5600 train 5.412326 (lr=2.1271e-05) (hash(x)=151907614)
173
+ 5700 val loss 5.6657
174
+ 5700 val perplexity 288.7810
175
+ 5700 train 5.637896 (lr=2.0399e-05) (hash(x)=155192267)
176
+ 5800 val loss 5.6533
177
+ 5800 val perplexity 285.2272
178
+ 5800 train 5.701861 (lr=1.9542e-05) (hash(x)=153132158)
179
+ 5900 val loss 5.6519
180
+ 5900 val perplexity 284.8352
181
+ 5900 train 5.619544 (lr=1.8700e-05) (hash(x)=161446764)
182
+ 6000 val loss 5.6450
183
+ 6000 val perplexity 282.8669
184
+ 6000 train 5.439802 (lr=1.7875e-05) (hash(x)=151512446)
185
+ 6100 val loss 5.6340
186
+ 6100 val perplexity 279.7804
187
+ 6100 train 5.747126 (lr=1.7068e-05) (hash(x)=188094053)
188
+ 6200 val loss 5.6256
189
+ 6200 val perplexity 277.4352
190
+ 6200 train 5.508698 (lr=1.6280e-05) (hash(x)=149389789)
191
+ 6300 val loss 5.6217
192
+ 6300 val perplexity 276.3475
193
+ 6300 train 5.438232 (lr=1.5512e-05) (hash(x)=138212820)
194
+ 6400 val loss 5.6208
195
+ 6400 val perplexity 276.1091
196
+ 6400 train 5.405768 (lr=1.4766e-05) (hash(x)=146535423)
197
+ 6500 val loss 5.6109
198
+ 6500 val perplexity 273.3779
199
+ 6500 train 5.479463 (lr=1.4042e-05) (hash(x)=145950843)
200
+ 6600 val loss 5.6075
201
+ 6600 val perplexity 272.4713
202
+ 6600 train 5.507493 (lr=1.3342e-05) (hash(x)=141162902)
203
+ 6700 val loss 5.6048
204
+ 6700 val perplexity 271.7363
205
+ 6700 train 5.515503 (lr=1.2666e-05) (hash(x)=153018737)
206
+ 6800 val loss 5.5980
207
+ 6800 val perplexity 269.8820
208
+ 6800 train 5.513815 (lr=1.2016e-05) (hash(x)=155640155)
209
+ 6900 val loss 5.5941
210
+ 6900 val perplexity 268.8358
211
+ 6900 train 5.567899 (lr=1.1392e-05) (hash(x)=153722115)
212
+ 7000 val loss 5.5878
213
+ 7000 val perplexity 267.1516
214
+ 7000 train 5.493548 (lr=1.0795e-05) (hash(x)=146953450)
215
+ 7100 val loss 5.5874
216
+ 7100 val perplexity 267.0288
217
+ 7100 train 5.427116 (lr=1.0227e-05) (hash(x)=137663885)
218
+ 7200 val loss 5.5790
219
+ 7200 val perplexity 264.7992
220
+ 7200 train 5.650048 (lr=9.6875e-06) (hash(x)=146172950)
221
+ 7300 val loss 5.5747
222
+ 7300 val perplexity 263.6632
223
+ 7300 train 5.460082 (lr=9.1780e-06) (hash(x)=150018163)
224
+ 7400 val loss 5.5715
225
+ 7400 val perplexity 262.8262
226
+ 7400 train 5.576471 (lr=8.6990e-06) (hash(x)=145351166)
227
+ 7500 val loss 5.5722
228
+ 7500 val perplexity 263.0211
229
+ 7500 train 5.347385 (lr=8.2513e-06) (hash(x)=145292116)
230
+ 7600 val loss 5.5646
231
+ 7600 val perplexity 261.0114
232
+ 7600 train 5.485201 (lr=7.8355e-06) (hash(x)=150235132)
233
+ 7700 val loss 5.5624
234
+ 7700 val perplexity 260.4434
235
+ 7700 train 5.463830 (lr=7.4522e-06) (hash(x)=154543455)
236
+ 7800 val loss 5.5601
237
+ 7800 val perplexity 259.8475
238
+ 7800 train 5.462539 (lr=7.1019e-06) (hash(x)=142456852)
239
+ 7900 val loss 5.5605
240
+ 7900 val perplexity 259.9470
241
+ 7900 train 5.337718 (lr=6.7852e-06) (hash(x)=147363479)
242
+ 8000 val loss 5.5541
243
+ 8000 val perplexity 258.2981
244
+ 8000 train 5.583329 (lr=6.5025e-06) (hash(x)=156122973)
245
+ 8100 val loss 5.5513
246
+ 8100 val perplexity 257.5828
247
+ 8100 train 5.597711 (lr=6.2543e-06) (hash(x)=156153179)
248
+ 8200 val loss 5.5499
249
+ 8200 val perplexity 257.2050
250
+ 8200 train 5.596151 (lr=6.0408e-06) (hash(x)=146430698)
251
+ 8300 val loss 5.5482
252
+ 8300 val perplexity 256.7670
253
+ 8300 train 5.411896 (lr=5.8625e-06) (hash(x)=143507257)
254
+ 8400 val loss 5.5431
255
+ 8400 val perplexity 255.4713
256
+ 8400 train 5.553337 (lr=5.7195e-06) (hash(x)=166272643)
257
+ 8500 val loss 5.5421
258
+ 8500 val perplexity 255.2123
259
+ 8500 train 5.447274 (lr=5.6121e-06) (hash(x)=143887848)
260
+ 8600 val loss 5.5419
261
+ 8600 val perplexity 255.1584
262
+ 8600 train 5.574839 (lr=5.5404e-06) (hash(x)=156900341)
263
+ 8700 val loss 5.5367
264
+ 8700 val perplexity 253.8504
265
+ 8700 train 5.774979 (lr=5.5045e-06) (hash(x)=146417632)
266
+ 8749 val loss 5.5365
267
+ 8749 val perplexity 253.7859
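
In both logs the val perplexity column tracks exp(val loss), so either quantity can be recovered from the other. For example, taking the final entry of the new run:

import math

# Final validation loss logged at step 8749 of the new run.
val_loss = 5.5365
print(math.exp(val_loss))  # ~253.79, matching the logged val perplexity of 253.7859
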
attention_kindselective_n_heads4_seed1339/model_08749.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0e0998a3dbdb0477c922f88c6a1516669b2c479174b7ebe6f589d2a397b0bc9e
+ oid sha256:0799e271014bd3a93b4c15f8a90dc002dc00df4c2d676c29130f65bddb5be45b
  size 92843394
attention_kindselective_n_heads4_seed1339/optimizer_08749.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d2a3f247d486e700d63788557e071c6feee4fce9929dc372541588e1855466f1
+ oid sha256:5efbfc12e221b8e452d6c7051c9f70c4a071a6f63d0982e0f3688b6b2e0170f2
  size 179406214
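
model_08749.pt and optimizer_08749.pt are tracked with Git LFS, so the diff only records the new blobs' sha256 and size. A sketch of fetching the updated model checkpoint at this commit and checking it against the pointer; the repo id is a placeholder and no assumption is made about the checkpoint's internal layout:

import hashlib
import torch
from huggingface_hub import hf_hub_download

# Placeholder repo id; the revision pins the commit shown above (2edc727).
path = hf_hub_download(
    repo_id="andrew-healey/<repo-name>",
    filename="attention_kindselective_n_heads4_seed1339/model_08749.pt",
    revision="2edc727",
)

# Verify the download against the sha256 recorded in the LFS pointer.
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == "0799e271014bd3a93b4c15f8a90dc002dc00df4c2d676c29130f65bddb5be45b"

# The checkpoint layout is not documented here; inspect it before use.
state = torch.load(path, map_location="cpu")
print(type(state))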