andrew-healey commited on
Commit
7b93a31
·
verified ·
1 Parent(s): 688fff0

Upload folder using huggingface_hub

Browse files
attention_kindselective_n_heads4_seed1339/args.json CHANGED
@@ -1 +1 @@
1
- {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_9/attention_kindselective_n_heads4_seed1339", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_9", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1339, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "5e-5_61440_4_1339", "n_embd": 256}
 
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_9/attention_kindselective_n_heads4_seed1339", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_9", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1339, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 4.5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "4.5e-5_61440_4_1339", "n_embd": 256}
attention_kindselective_n_heads4_seed1339/log2.txt CHANGED
@@ -1,476 +1,267 @@
1
  max_steps: 8750
2
- 1900 val loss 6.4870
3
- 1900 val perplexity 656.5473
4
- 1900 train 6.404645 (lr=3.7502e-05) (hash(x)=141769419)
5
  0 val loss 11.2808
6
- 0 val perplexity 79287.6797
7
- 2000 val loss 6.4397
8
- 2000 val perplexity 626.2369
9
- 2000 train 6.376182 (lr=3.7143e-05) (hash(x)=151963443)
10
- 0 train 11.289297 (lr=1.0000e-07) (hash(x)=150724848)
11
- 2100 val loss 6.4155
12
- 2100 val perplexity 611.2753
13
- 2100 train 6.466409 (lr=3.6761e-05) (hash(x)=162947470)
14
- 100 val loss 10.0094
15
- 100 val perplexity 22235.4102
16
- 100 train 10.023370 (lr=1.0100e-05) (hash(x)=149910534)
17
- 2200 val loss 6.3645
18
- 2200 val perplexity 580.8438
19
- 2200 train 6.606410 (lr=3.6358e-05) (hash(x)=154954810)
20
- 200 val loss 9.2733
21
- 200 val perplexity 10649.9316
22
- 200 train 9.301048 (lr=2.0100e-05) (hash(x)=148123706)
23
- 2300 val loss 6.3335
24
- 2300 val perplexity 563.1455
25
- 2300 train 6.322995 (lr=3.5935e-05) (hash(x)=151878111)
26
- 300 val loss 8.1598
27
- 300 val perplexity 3497.6423
28
- 300 train 8.051448 (lr=3.0100e-05) (hash(x)=146678221)
29
- 2400 val loss 6.3098
30
- 2400 val perplexity 549.9563
31
- 2400 train 6.204768 (lr=3.5491e-05) (hash(x)=158661057)
32
- 400 val loss 7.6335
33
- 400 val perplexity 2066.3218
34
- 400 train 7.528817 (lr=4.0100e-05) (hash(x)=151700982)
35
- 2500 val loss 6.2701
36
- 2500 val perplexity 528.5074
37
- 2500 train 6.296215 (lr=3.5027e-05) (hash(x)=150925584)
38
- 500 val loss 7.4756
39
- 500 val perplexity 1764.5261
40
- 500 train 7.565072 (lr=5.0000e-05) (hash(x)=156182087)
41
- 2600 val loss 6.2500
42
- 2600 val perplexity 518.0140
43
- 2600 train 6.152835 (lr=3.4545e-05) (hash(x)=144515755)
44
- 600 val loss 7.3815
45
- 600 val perplexity 1605.9897
46
- 600 train 7.367438 (lr=4.9984e-05) (hash(x)=149318660)
47
- 2700 val loss 6.2234
48
- 2700 val perplexity 504.4197
49
- 2700 train 6.215466 (lr=3.4044e-05) (hash(x)=153109144)
50
- 700 val loss 7.3039
51
- 700 val perplexity 1486.1003
52
- 700 train 7.343727 (lr=4.9935e-05) (hash(x)=150482428)
53
- 2800 val loss 6.1948
54
- 2800 val perplexity 490.2085
55
- 2800 train 6.096365 (lr=3.3526e-05) (hash(x)=151152897)
56
- 800 val loss 7.1792
57
- 800 val perplexity 1311.8173
58
- 800 train 7.056084 (lr=4.9853e-05) (hash(x)=143268605)
59
- 2900 val loss 6.1718
60
- 2900 val perplexity 479.0310
61
- 2900 train 6.109528 (lr=3.2992e-05) (hash(x)=145800210)
62
- 900 val loss 7.0813
63
- 900 val perplexity 1189.4648
64
- 900 train 7.047098 (lr=4.9739e-05) (hash(x)=152322423)
65
- 3000 val loss 6.1559
66
- 3000 val perplexity 471.4690
67
- 3000 train 5.950811 (lr=3.2441e-05) (hash(x)=141997485)
68
- 1000 val loss 7.0176
69
- 1000 val perplexity 1116.0995
70
- 1000 train 6.884840 (lr=4.9593e-05) (hash(x)=147904298)
71
- 3100 val loss 6.1280
72
- 3100 val perplexity 458.5026
73
- 3100 train 6.036367 (lr=3.1875e-05) (hash(x)=154049740)
74
- 1100 val loss 6.9427
75
- 1100 val perplexity 1035.5698
76
- 1100 train 7.138206 (lr=4.9415e-05) (hash(x)=154343147)
77
- 3200 val loss 6.1142
78
- 3200 val perplexity 452.2266
79
- 3200 train 6.052361 (lr=3.1295e-05) (hash(x)=150471842)
80
- 1200 val loss 6.8632
81
- 1200 val perplexity 956.4412
82
- 1200 train 6.851167 (lr=4.9205e-05) (hash(x)=141843115)
83
- 3300 val loss 6.0974
84
- 3300 val perplexity 444.6876
85
- 3300 train 6.039181 (lr=3.0702e-05) (hash(x)=149048126)
86
- 1300 val loss 6.8038
87
- 1300 val perplexity 901.3065
88
- 1300 train 6.687198 (lr=4.8964e-05) (hash(x)=145279030)
89
- 3400 val loss 6.0719
90
- 3400 val perplexity 433.5064
91
- 3400 train 6.197778 (lr=3.0095e-05) (hash(x)=161261339)
92
- 1400 val loss 6.7539
93
- 1400 val perplexity 857.3593
94
- 1400 train 6.705647 (lr=4.8691e-05) (hash(x)=152507639)
95
- 3500 val loss 6.0590
96
- 3500 val perplexity 427.9403
97
- 3500 train 6.026854 (lr=2.9477e-05) (hash(x)=157495564)
98
- 1500 val loss 6.6972
99
- 1500 val perplexity 810.1442
100
- 1500 train 6.731950 (lr=4.8388e-05) (hash(x)=148473774)
101
- 3600 val loss 6.0460
102
- 3600 val perplexity 422.4147
103
- 3600 train 5.944861 (lr=2.8849e-05) (hash(x)=144352932)
104
- 1600 val loss 6.6193
105
- 1600 val perplexity 749.4503
106
- 1600 train 6.657555 (lr=4.8055e-05) (hash(x)=151117002)
107
- 3700 val loss 6.0294
108
- 3700 val perplexity 415.4617
109
- 3700 train 6.011248 (lr=2.8210e-05) (hash(x)=149389012)
110
- 1700 val loss 6.5588
111
- 1700 val perplexity 705.4470
112
- 1700 train 6.489615 (lr=4.7691e-05) (hash(x)=138011335)
113
- 3800 val loss 6.0066
114
- 3800 val perplexity 406.1071
115
- 3800 train 5.947698 (lr=2.7562e-05) (hash(x)=146607620)
116
- 1800 val loss 6.5096
117
- 1800 val perplexity 671.5671
118
- 1800 train 6.661233 (lr=4.7299e-05) (hash(x)=171180926)
119
- 3900 val loss 5.9952
120
- 3900 val perplexity 401.5043
121
- 3900 train 5.910914 (lr=2.6907e-05) (hash(x)=143754617)
122
- 1900 val loss 6.4539
123
- 1900 val perplexity 635.1964
124
- 1900 train 6.375162 (lr=4.6878e-05) (hash(x)=141769419)
125
- 4000 val loss 5.9766
126
- 4000 val perplexity 394.0837
127
- 4000 train 5.954946 (lr=2.6244e-05) (hash(x)=156930722)
128
- 2000 val loss 6.4109
129
- 2000 val perplexity 608.4214
130
- 2000 train 6.352161 (lr=4.6428e-05) (hash(x)=151963443)
131
- 4100 val loss 5.9643
132
- 4100 val perplexity 389.2686
133
- 4100 train 5.786150 (lr=2.5575e-05) (hash(x)=147500519)
134
- 2100 val loss 6.3943
135
- 2100 val perplexity 598.4083
136
- 2100 train 6.443270 (lr=4.5951e-05) (hash(x)=162947470)
137
- 4200 val loss 5.9559
138
- 4200 val perplexity 386.0066
139
- 4200 train 5.804139 (lr=2.4900e-05) (hash(x)=143232237)
140
- 2200 val loss 6.3348
141
- 2200 val perplexity 563.8331
142
- 2200 train 6.577147 (lr=4.5448e-05) (hash(x)=154954810)
143
- 4300 val loss 5.9418
144
- 4300 val perplexity 380.6191
145
- 4300 train 5.853108 (lr=2.4222e-05) (hash(x)=146811670)
146
- 2300 val loss 6.3036
147
- 2300 val perplexity 546.5355
148
- 2300 train 6.291760 (lr=4.4918e-05) (hash(x)=151878111)
149
- 4400 val loss 5.9268
150
- 4400 val perplexity 374.9412
151
- 4400 train 5.956989 (lr=2.3540e-05) (hash(x)=158418746)
152
- 2400 val loss 6.2758
153
- 2400 val perplexity 531.5452
154
- 2400 train 6.167399 (lr=4.4363e-05) (hash(x)=158661057)
155
- 4500 val loss 5.9192
156
- 4500 val perplexity 372.1174
157
- 4500 train 5.882172 (lr=2.2856e-05) (hash(x)=156695778)
158
- 2500 val loss 6.2377
159
- 2500 val perplexity 511.6937
160
- 2500 train 6.265279 (lr=4.3784e-05) (hash(x)=150925584)
161
- 4600 val loss 5.9054
162
- 4600 val perplexity 367.0164
163
- 4600 train 5.931722 (lr=2.2171e-05) (hash(x)=147791497)
164
- 2600 val loss 6.2134
165
- 2600 val perplexity 499.3786
166
- 2600 train 6.122252 (lr=4.3181e-05) (hash(x)=144515755)
167
- 4700 val loss 5.8916
168
- 4700 val perplexity 361.9958
169
- 4700 train 5.770754 (lr=2.1486e-05) (hash(x)=155533088)
170
- 2700 val loss 6.1906
171
- 2700 val perplexity 488.1596
172
- 2700 train 6.177955 (lr=4.2555e-05) (hash(x)=153109144)
173
- 4800 val loss 5.8876
174
- 4800 val perplexity 360.5260
175
- 4800 train 5.742024 (lr=2.0801e-05) (hash(x)=138350044)
176
- 2800 val loss 6.1570
177
- 2800 val perplexity 472.0003
178
- 2800 train 6.059934 (lr=4.1908e-05) (hash(x)=151152897)
179
- 4900 val loss 5.8709
180
- 4900 val perplexity 354.5802
181
- 4900 train 5.874714 (lr=2.0118e-05) (hash(x)=143735284)
182
- 2900 val loss 6.1354
183
- 2900 val perplexity 461.9008
184
- 2900 train 6.076121 (lr=4.1240e-05) (hash(x)=145800210)
185
- 5000 val loss 5.8621
186
- 5000 val perplexity 351.4452
187
- 5000 train 5.823290 (lr=1.9438e-05) (hash(x)=154976463)
188
- 3000 val loss 6.1252
189
- 3000 val perplexity 457.2171
190
- 3000 train 5.920308 (lr=4.0551e-05) (hash(x)=141997485)
191
- 5100 val loss 5.8545
192
- 5100 val perplexity 348.8145
193
- 5100 train 5.741332 (lr=1.8762e-05) (hash(x)=149894982)
194
- 3100 val loss 6.0910
195
- 3100 val perplexity 441.8601
196
- 3100 train 6.004350 (lr=3.9844e-05) (hash(x)=154049740)
197
- 5200 val loss 5.8469
198
- 5200 val perplexity 346.1438
199
- 5200 train 5.637977 (lr=1.8090e-05) (hash(x)=159326689)
200
- 3200 val loss 6.0723
201
- 3200 val perplexity 433.6939
202
- 3200 train 6.009145 (lr=3.9119e-05) (hash(x)=150471842)
203
- 5300 val loss 5.8343
204
- 5300 val perplexity 341.8247
205
- 5300 train 5.844418 (lr=1.7424e-05) (hash(x)=159484800)
206
- 3300 val loss 6.0593
207
- 3300 val perplexity 428.0873
208
- 3300 train 6.000336 (lr=3.8377e-05) (hash(x)=149048126)
209
- 5400 val loss 5.8222
210
- 5400 val perplexity 337.7180
211
- 5400 train 5.717206 (lr=1.6765e-05) (hash(x)=140385615)
212
- 3400 val loss 6.0372
213
- 3400 val perplexity 418.7006
214
- 3400 train 6.162151 (lr=3.7619e-05) (hash(x)=161261339)
215
- 5500 val loss 5.8150
216
- 5500 val perplexity 335.2764
217
- 5500 train 5.650219 (lr=1.6113e-05) (hash(x)=148498335)
218
- 3500 val loss 6.0272
219
- 3500 val perplexity 414.5575
220
- 3500 train 5.989676 (lr=3.6847e-05) (hash(x)=157495564)
221
- 5600 val loss 5.8132
222
- 5600 val perplexity 334.6795
223
- 5600 train 5.548153 (lr=1.5469e-05) (hash(x)=151907614)
224
- 3600 val loss 6.0075
225
- 3600 val perplexity 406.4727
226
- 3600 train 5.910073 (lr=3.6061e-05) (hash(x)=144352932)
227
- 5700 val loss 5.8038
228
- 5700 val perplexity 331.5614
229
- 5700 train 5.771846 (lr=1.4836e-05) (hash(x)=155192267)
230
- 3700 val loss 5.9869
231
- 3700 val perplexity 398.1840
232
- 3700 train 5.958940 (lr=3.5263e-05) (hash(x)=149389012)
233
- 5800 val loss 5.7946
234
- 5800 val perplexity 328.5269
235
- 5800 train 5.841525 (lr=1.4212e-05) (hash(x)=153132158)
236
- 3800 val loss 5.9682
237
- 3800 val perplexity 390.7858
238
- 3800 train 5.906829 (lr=3.4453e-05) (hash(x)=146607620)
239
- 5900 val loss 5.7887
240
- 5900 val perplexity 326.5968
241
- 5900 train 5.762638 (lr=1.3600e-05) (hash(x)=161446764)
242
- 3900 val loss 5.9571
243
- 3900 val perplexity 386.4784
244
- 3900 train 5.873564 (lr=3.3633e-05) (hash(x)=143754617)
245
- 6000 val loss 5.7856
246
- 6000 val perplexity 325.5639
247
- 6000 train 5.599146 (lr=1.3000e-05) (hash(x)=151512446)
248
- 4000 val loss 5.9406
249
- 4000 val perplexity 380.1711
250
- 4000 train 5.916785 (lr=3.2805e-05) (hash(x)=156930722)
251
- 6100 val loss 5.7740
252
- 6100 val perplexity 321.8256
253
- 6100 train 5.901221 (lr=1.2413e-05) (hash(x)=188094053)
254
- 4100 val loss 5.9264
255
- 4100 val perplexity 374.7864
256
- 4100 train 5.754153 (lr=3.1968e-05) (hash(x)=147500519)
257
- 6200 val loss 5.7654
258
- 6200 val perplexity 319.0692
259
- 6200 train 5.657475 (lr=1.1840e-05) (hash(x)=149389789)
260
- 4200 val loss 5.9170
261
- 4200 val perplexity 371.2794
262
- 4200 train 5.758588 (lr=3.1126e-05) (hash(x)=143232237)
263
- 6300 val loss 5.7603
264
- 6300 val perplexity 317.4521
265
- 6300 train 5.576626 (lr=1.1282e-05) (hash(x)=138212820)
266
- 4300 val loss 5.8994
267
- 4300 val perplexity 364.8297
268
- 4300 train 5.808607 (lr=3.0277e-05) (hash(x)=146811670)
269
- 6400 val loss 5.7599
270
- 6400 val perplexity 317.3077
271
- 6400 train 5.555570 (lr=1.0739e-05) (hash(x)=146535423)
272
- 4400 val loss 5.8864
273
- 4400 val perplexity 360.1214
274
- 4400 train 5.923263 (lr=2.9425e-05) (hash(x)=158418746)
275
- 6500 val loss 5.7497
276
- 6500 val perplexity 314.1065
277
- 6500 train 5.620624 (lr=1.0213e-05) (hash(x)=145950843)
278
- 4500 val loss 5.8788
279
- 4500 val perplexity 357.3871
280
- 4500 train 5.841973 (lr=2.8571e-05) (hash(x)=156695778)
281
- 6600 val loss 5.7452
282
- 6600 val perplexity 312.6862
283
- 6600 train 5.642556 (lr=9.7032e-06) (hash(x)=141162902)
284
- 4600 val loss 5.8646
285
- 4600 val perplexity 352.3253
286
- 4600 train 5.884218 (lr=2.7714e-05) (hash(x)=147791497)
287
- 6700 val loss 5.7435
288
- 6700 val perplexity 312.1676
289
- 6700 train 5.656245 (lr=9.2116e-06) (hash(x)=153018737)
290
- 4700 val loss 5.8486
291
- 4700 val perplexity 346.7375
292
- 4700 train 5.732183 (lr=2.6857e-05) (hash(x)=155533088)
293
- 6800 val loss 5.7361
294
- 6800 val perplexity 309.8445
295
- 6800 train 5.663511 (lr=8.7387e-06) (hash(x)=155640155)
296
- 4800 val loss 5.8401
297
- 4800 val perplexity 343.7982
298
- 4800 train 5.699566 (lr=2.6002e-05) (hash(x)=138350044)
299
- 6900 val loss 5.7339
300
- 6900 val perplexity 309.1730
301
- 6900 train 5.701223 (lr=8.2849e-06) (hash(x)=153722115)
302
- 4900 val loss 5.8266
303
- 4900 val perplexity 339.2005
304
- 4900 train 5.827739 (lr=2.5148e-05) (hash(x)=143735284)
305
- 7000 val loss 5.7269
306
- 7000 val perplexity 307.0191
307
- 7000 train 5.635604 (lr=7.8510e-06) (hash(x)=146953450)
308
- 5000 val loss 5.8136
309
- 5000 val perplexity 334.8062
310
- 5000 train 5.771458 (lr=2.4298e-05) (hash(x)=154976463)
311
- 7100 val loss 5.7270
312
- 7100 val perplexity 307.0580
313
- 7100 train 5.561845 (lr=7.4377e-06) (hash(x)=137663885)
314
- 5100 val loss 5.8073
315
- 5100 val perplexity 332.7099
316
- 5100 train 5.691780 (lr=2.3452e-05) (hash(x)=149894982)
317
- 7200 val loss 5.7176
318
- 7200 val perplexity 304.1624
319
- 7200 train 5.780611 (lr=7.0455e-06) (hash(x)=146172950)
320
- 5200 val loss 5.8004
321
- 5200 val perplexity 330.4291
322
- 5200 train 5.584211 (lr=2.2613e-05) (hash(x)=159326689)
323
- 7300 val loss 5.7154
324
- 7300 val perplexity 303.5107
325
- 7300 train 5.605722 (lr=6.6749e-06) (hash(x)=150018163)
326
- 5300 val loss 5.7876
327
- 5300 val perplexity 326.2238
328
- 5300 train 5.793699 (lr=2.1780e-05) (hash(x)=159484800)
329
- 7400 val loss 5.7112
330
- 7400 val perplexity 302.2303
331
- 7400 train 5.707638 (lr=6.3266e-06) (hash(x)=145351166)
332
- 5400 val loss 5.7735
333
- 5400 val perplexity 321.6571
334
- 5400 train 5.678544 (lr=2.0956e-05) (hash(x)=140385615)
335
- 7500 val loss 5.7119
336
- 7500 val perplexity 302.4439
337
- 7500 train 5.492382 (lr=6.0010e-06) (hash(x)=145292116)
338
- 5500 val loss 5.7677
339
- 5500 val perplexity 319.7973
340
- 5500 train 5.603406 (lr=2.0141e-05) (hash(x)=148498335)
341
- 7600 val loss 5.7052
342
- 7600 val perplexity 300.4154
343
- 7600 train 5.625059 (lr=5.6986e-06) (hash(x)=150235132)
344
- 5600 val loss 5.7657
345
- 5600 val perplexity 319.1591
346
- 5600 train 5.504023 (lr=1.9337e-05) (hash(x)=151907614)
347
- 7700 val loss 5.7019
348
- 7700 val perplexity 299.4479
349
- 7700 train 5.611199 (lr=5.4198e-06) (hash(x)=154543455)
350
- 5700 val loss 5.7550
351
- 5700 val perplexity 315.7704
352
- 5700 train 5.722241 (lr=1.8545e-05) (hash(x)=155192267)
353
- 7800 val loss 5.6998
354
- 7800 val perplexity 298.8089
355
- 7800 train 5.596223 (lr=5.1650e-06) (hash(x)=142456852)
356
- 5800 val loss 5.7419
357
- 5800 val perplexity 311.6478
358
- 5800 train 5.790777 (lr=1.7765e-05) (hash(x)=153132158)
359
- 7900 val loss 5.7008
360
- 7900 val perplexity 299.1197
361
- 7900 train 5.484799 (lr=4.9347e-06) (hash(x)=147363479)
362
- 5900 val loss 5.7398
363
- 5900 val perplexity 311.0003
364
- 5900 train 5.717348 (lr=1.7000e-05) (hash(x)=161446764)
365
- 8000 val loss 5.6943
366
- 8000 val perplexity 297.1720
367
- 8000 train 5.726096 (lr=4.7291e-06) (hash(x)=156122973)
368
- 6000 val loss 5.7350
369
- 6000 val perplexity 309.5016
370
- 6000 train 5.536606 (lr=1.6250e-05) (hash(x)=151512446)
371
- 8100 val loss 5.6915
372
- 8100 val perplexity 296.3289
373
- 8100 train 5.735093 (lr=4.5486e-06) (hash(x)=156153179)
374
- 6100 val loss 5.7231
375
- 6100 val perplexity 305.8419
376
- 6100 train 5.840291 (lr=1.5516e-05) (hash(x)=188094053)
377
- 8200 val loss 5.6903
378
- 8200 val perplexity 295.9858
379
- 8200 train 5.730418 (lr=4.3933e-06) (hash(x)=146430698)
380
- 6200 val loss 5.7147
381
- 6200 val perplexity 303.2798
382
- 6200 train 5.604211 (lr=1.4800e-05) (hash(x)=149389789)
383
- 8300 val loss 5.6895
384
- 8300 val perplexity 295.7574
385
- 8300 train 5.553320 (lr=4.2636e-06) (hash(x)=143507257)
386
- 6300 val loss 5.7076
387
- 6300 val perplexity 301.1410
388
- 6300 train 5.530816 (lr=1.4102e-05) (hash(x)=138212820)
389
- 8400 val loss 5.6841
390
- 8400 val perplexity 294.1438
391
- 8400 train 5.753223 (lr=4.1596e-06) (hash(x)=166272643)
392
- 6400 val loss 5.7063
393
- 6400 val perplexity 300.7570
394
- 6400 train 5.495872 (lr=1.3424e-05) (hash(x)=146535423)
395
- 8500 val loss 5.6830
396
- 8500 val perplexity 293.8261
397
- 8500 train 5.585660 (lr=4.0815e-06) (hash(x)=143887848)
398
- 6500 val loss 5.6980
399
- 6500 val perplexity 298.2603
400
- 6500 train 5.569312 (lr=1.2766e-05) (hash(x)=145950843)
401
- 8600 val loss 5.6830
402
- 8600 val perplexity 293.8326
403
- 8600 train 5.728127 (lr=4.0294e-06) (hash(x)=156900341)
404
- 6600 val loss 5.6921
405
- 6600 val perplexity 296.5222
406
- 6600 train 5.591072 (lr=1.2129e-05) (hash(x)=141162902)
407
- 8700 val loss 5.6780
408
- 8700 val perplexity 292.3772
409
- 8700 train 5.916970 (lr=4.0033e-06) (hash(x)=146417632)
410
- 8749 val loss 5.6773
411
- 8749 val perplexity 292.1608
412
- 6700 val loss 5.6888
413
- 6700 val perplexity 295.5327
414
- 6700 train 5.603293 (lr=1.1515e-05) (hash(x)=153018737)
415
- 6800 val loss 5.6821
416
- 6800 val perplexity 293.5735
417
- 6800 train 5.600542 (lr=1.0923e-05) (hash(x)=155640155)
418
- 6900 val loss 5.6792
419
- 6900 val perplexity 292.7130
420
- 6900 train 5.647770 (lr=1.0356e-05) (hash(x)=153722115)
421
- 7000 val loss 5.6723
422
- 7000 val perplexity 290.7011
423
- 7000 train 5.584852 (lr=9.8138e-06) (hash(x)=146953450)
424
- 7100 val loss 5.6728
425
- 7100 val perplexity 290.8389
426
- 7100 train 5.512071 (lr=9.2971e-06) (hash(x)=137663885)
427
- 7200 val loss 5.6612
428
- 7200 val perplexity 287.4832
429
- 7200 train 5.725991 (lr=8.8068e-06) (hash(x)=146172950)
430
- 7300 val loss 5.6579
431
- 7300 val perplexity 286.5494
432
- 7300 train 5.547714 (lr=8.3436e-06) (hash(x)=150018163)
433
- 7400 val loss 5.6547
434
- 7400 val perplexity 285.6349
435
- 7400 train 5.652079 (lr=7.9082e-06) (hash(x)=145351166)
436
- 7500 val loss 5.6558
437
- 7500 val perplexity 285.9538
438
- 7500 train 5.439733 (lr=7.5012e-06) (hash(x)=145292116)
439
- 7600 val loss 5.6482
440
- 7600 val perplexity 283.7883
441
- 7600 train 5.567676 (lr=7.1232e-06) (hash(x)=150235132)
442
- 7700 val loss 5.6460
443
- 7700 val perplexity 283.1464
444
- 7700 train 5.560301 (lr=6.7747e-06) (hash(x)=154543455)
445
- 7800 val loss 5.6444
446
- 7800 val perplexity 282.7156
447
- 7800 train 5.542447 (lr=6.4563e-06) (hash(x)=142456852)
448
- 7900 val loss 5.6442
449
- 7900 val perplexity 282.6458
450
- 7900 train 5.425089 (lr=6.1684e-06) (hash(x)=147363479)
451
- 8000 val loss 5.6373
452
- 8000 val perplexity 280.7155
453
- 8000 train 5.670514 (lr=5.9114e-06) (hash(x)=156122973)
454
- 8100 val loss 5.6349
455
- 8100 val perplexity 280.0438
456
- 8100 train 5.677576 (lr=5.6857e-06) (hash(x)=156153179)
457
- 8200 val loss 5.6328
458
- 8200 val perplexity 279.4368
459
- 8200 train 5.678164 (lr=5.4917e-06) (hash(x)=146430698)
460
- 8300 val loss 5.6315
461
- 8300 val perplexity 279.0829
462
- 8300 train 5.499332 (lr=5.3295e-06) (hash(x)=143507257)
463
- 8400 val loss 5.6268
464
- 8400 val perplexity 277.7611
465
- 8400 train 5.664957 (lr=5.1995e-06) (hash(x)=166272643)
466
- 8500 val loss 5.6251
467
- 8500 val perplexity 277.3011
468
- 8500 train 5.523038 (lr=5.1019e-06) (hash(x)=143887848)
469
- 8600 val loss 5.6245
470
- 8600 val perplexity 277.1212
471
- 8600 train 5.663259 (lr=5.0367e-06) (hash(x)=156900341)
472
- 8700 val loss 5.6196
473
- 8700 val perplexity 275.7807
474
- 8700 train 5.859730 (lr=5.0041e-06) (hash(x)=146417632)
475
- 8749 val loss 5.6196
476
- 8749 val perplexity 275.7850
 
1
  max_steps: 8750
 
 
 
2
  0 val loss 11.2808
3
+ 0 val perplexity 79287.8281
4
+ 0 train 11.289343 (lr=9.0000e-08) (hash(x)=150724848)
5
+ 100 val loss 10.0387
6
+ 100 val perplexity 22894.7715
7
+ 100 train 10.053072 (lr=9.0900e-06) (hash(x)=149910534)
8
+ 200 val loss 9.3866
9
+ 200 val perplexity 11926.9707
10
+ 200 train 9.415236 (lr=1.8090e-05) (hash(x)=148123706)
11
+ 300 val loss 8.2246
12
+ 300 val perplexity 3731.5139
13
+ 300 train 8.125541 (lr=2.7090e-05) (hash(x)=146678221)
14
+ 400 val loss 7.6337
15
+ 400 val perplexity 2066.6184
16
+ 400 train 7.529495 (lr=3.6090e-05) (hash(x)=151700982)
17
+ 500 val loss 7.4691
18
+ 500 val perplexity 1752.9563
19
+ 500 train 7.553552 (lr=4.5000e-05) (hash(x)=156182087)
20
+ 600 val loss 7.3525
21
+ 600 val perplexity 1560.1439
22
+ 600 train 7.332888 (lr=4.4985e-05) (hash(x)=149318660)
23
+ 700 val loss 7.2674
24
+ 700 val perplexity 1432.8541
25
+ 700 train 7.306005 (lr=4.4941e-05) (hash(x)=150482428)
26
+ 800 val loss 7.1371
27
+ 800 val perplexity 1257.8130
28
+ 800 train 7.013755 (lr=4.4868e-05) (hash(x)=143268605)
29
+ 900 val loss 7.0395
30
+ 900 val perplexity 1140.7983
31
+ 900 train 7.003427 (lr=4.4766e-05) (hash(x)=152322423)
32
+ 1000 val loss 6.9457
33
+ 1000 val perplexity 1038.6819
34
+ 1000 train 6.811792 (lr=4.4634e-05) (hash(x)=147904298)
35
+ 1100 val loss 6.8599
36
+ 1100 val perplexity 953.2381
37
+ 1100 train 7.054464 (lr=4.4474e-05) (hash(x)=154343147)
38
+ 1200 val loss 6.7650
39
+ 1200 val perplexity 866.9272
40
+ 1200 train 6.752739 (lr=4.4285e-05) (hash(x)=141843115)
41
+ 1300 val loss 6.6892
42
+ 1300 val perplexity 803.6893
43
+ 1300 train 6.579344 (lr=4.4068e-05) (hash(x)=145279030)
44
+ 1400 val loss 6.6278
45
+ 1400 val perplexity 755.7952
46
+ 1400 train 6.572786 (lr=4.3822e-05) (hash(x)=152507639)
47
+ 1500 val loss 6.5699
48
+ 1500 val perplexity 713.3077
49
+ 1500 train 6.604732 (lr=4.3549e-05) (hash(x)=148473774)
50
+ 1600 val loss 6.4935
51
+ 1600 val perplexity 660.8264
52
+ 1600 train 6.535246 (lr=4.3249e-05) (hash(x)=151117002)
53
+ 1700 val loss 6.4515
54
+ 1700 val perplexity 633.6206
55
+ 1700 train 6.387179 (lr=4.2922e-05) (hash(x)=138011335)
56
+ 1800 val loss 6.4102
57
+ 1800 val perplexity 607.9948
58
+ 1800 train 6.550987 (lr=4.2569e-05) (hash(x)=171180926)
59
+ 1900 val loss 6.3649
60
+ 1900 val perplexity 581.0685
61
+ 1900 train 6.291069 (lr=4.2190e-05) (hash(x)=141769419)
62
+ 2000 val loss 6.3258
63
+ 2000 val perplexity 558.8239
64
+ 2000 train 6.267239 (lr=4.1785e-05) (hash(x)=151963443)
65
+ 2100 val loss 6.3109
66
+ 2100 val perplexity 550.5495
67
+ 2100 train 6.360897 (lr=4.1356e-05) (hash(x)=162947470)
68
+ 2200 val loss 6.2599
69
+ 2200 val perplexity 523.1447
70
+ 2200 train 6.505328 (lr=4.0903e-05) (hash(x)=154954810)
71
+ 2300 val loss 6.2322
72
+ 2300 val perplexity 508.8639
73
+ 2300 train 6.212578 (lr=4.0426e-05) (hash(x)=151878111)
74
+ 2400 val loss 6.2105
75
+ 2400 val perplexity 497.9667
76
+ 2400 train 6.103368 (lr=3.9927e-05) (hash(x)=158661057)
77
+ 2500 val loss 6.1755
78
+ 2500 val perplexity 480.8314
79
+ 2500 train 6.202045 (lr=3.9406e-05) (hash(x)=150925584)
80
+ 2600 val loss 6.1552
81
+ 2600 val perplexity 471.1487
82
+ 2600 train 6.061382 (lr=3.8863e-05) (hash(x)=144515755)
83
+ 2700 val loss 6.1338
84
+ 2700 val perplexity 461.1937
85
+ 2700 train 6.129019 (lr=3.8300e-05) (hash(x)=153109144)
86
+ 2800 val loss 6.1062
87
+ 2800 val perplexity 448.6117
88
+ 2800 train 6.009483 (lr=3.7717e-05) (hash(x)=151152897)
89
+ 2900 val loss 6.0859
90
+ 2900 val perplexity 439.6326
91
+ 2900 train 6.025854 (lr=3.7116e-05) (hash(x)=145800210)
92
+ 3000 val loss 6.0743
93
+ 3000 val perplexity 434.5630
94
+ 3000 train 5.871215 (lr=3.6496e-05) (hash(x)=141997485)
95
+ 3100 val loss 6.0462
96
+ 3100 val perplexity 422.4893
97
+ 3100 train 5.963703 (lr=3.5860e-05) (hash(x)=154049740)
98
+ 3200 val loss 6.0270
99
+ 3200 val perplexity 414.4612
100
+ 3200 train 5.956031 (lr=3.5207e-05) (hash(x)=150471842)
101
+ 3300 val loss 6.0160
102
+ 3300 val perplexity 409.9505
103
+ 3300 train 5.961557 (lr=3.4539e-05) (hash(x)=149048126)
104
+ 3400 val loss 5.9919
105
+ 3400 val perplexity 400.1670
106
+ 3400 train 6.120944 (lr=3.3857e-05) (hash(x)=161261339)
107
+ 3500 val loss 5.9792
108
+ 3500 val perplexity 395.1056
109
+ 3500 train 5.945442 (lr=3.3162e-05) (hash(x)=157495564)
110
+ 3600 val loss 5.9652
111
+ 3600 val perplexity 389.6129
112
+ 3600 train 5.867484 (lr=3.2455e-05) (hash(x)=144352932)
113
+ 3700 val loss 5.9466
114
+ 3700 val perplexity 382.4526
115
+ 3700 train 5.919501 (lr=3.1736e-05) (hash(x)=149389012)
116
+ 3800 val loss 5.9294
117
+ 3800 val perplexity 375.9380
118
+ 3800 train 5.871775 (lr=3.1008e-05) (hash(x)=146607620)
119
+ 3900 val loss 5.9150
120
+ 3900 val perplexity 370.5585
121
+ 3900 train 5.833241 (lr=3.0270e-05) (hash(x)=143754617)
122
+ 4000 val loss 5.8975
123
+ 4000 val perplexity 364.1153
124
+ 4000 train 5.870857 (lr=2.9524e-05) (hash(x)=156930722)
125
+ 4100 val loss 5.8849
126
+ 4100 val perplexity 359.5775
127
+ 4100 train 5.711125 (lr=2.8771e-05) (hash(x)=147500519)
128
+ 4200 val loss 5.8724
129
+ 4200 val perplexity 355.0967
130
+ 4200 train 5.711286 (lr=2.8013e-05) (hash(x)=143232237)
131
+ 4300 val loss 5.8596
132
+ 4300 val perplexity 350.5836
133
+ 4300 train 5.764656 (lr=2.7250e-05) (hash(x)=146811670)
134
+ 4400 val loss 5.8431
135
+ 4400 val perplexity 344.8451
136
+ 4400 train 5.873563 (lr=2.6483e-05) (hash(x)=158418746)
137
+ 4500 val loss 5.8401
138
+ 4500 val perplexity 343.8246
139
+ 4500 train 5.804694 (lr=2.5714e-05) (hash(x)=156695778)
140
+ 4600 val loss 5.8231
141
+ 4600 val perplexity 338.0122
142
+ 4600 train 5.847899 (lr=2.4943e-05) (hash(x)=147791497)
143
+ 4700 val loss 5.8089
144
+ 4700 val perplexity 333.2647
145
+ 4700 train 5.697839 (lr=2.4172e-05) (hash(x)=155533088)
146
+ 4800 val loss 5.8027
147
+ 4800 val perplexity 331.2058
148
+ 4800 train 5.654629 (lr=2.3402e-05) (hash(x)=138350044)
149
+ 4900 val loss 5.7873
150
+ 4900 val perplexity 326.1176
151
+ 4900 train 5.788656 (lr=2.2633e-05) (hash(x)=143735284)
152
+ 5000 val loss 5.7758
153
+ 5000 val perplexity 322.3993
154
+ 5000 train 5.734388 (lr=2.1868e-05) (hash(x)=154976463)
155
+ 5100 val loss 5.7729
156
+ 5100 val perplexity 321.4783
157
+ 5100 train 5.654710 (lr=2.1107e-05) (hash(x)=149894982)
158
+ 5200 val loss 5.7619
159
+ 5200 val perplexity 317.9573
160
+ 5200 train 5.548586 (lr=2.0351e-05) (hash(x)=159326689)
161
+ 5300 val loss 5.7530
162
+ 5300 val perplexity 315.1419
163
+ 5300 train 5.758868 (lr=1.9602e-05) (hash(x)=159484800)
164
+ 5400 val loss 5.7385
165
+ 5400 val perplexity 310.5921
166
+ 5400 train 5.640297 (lr=1.8860e-05) (hash(x)=140385615)
167
+ 5500 val loss 5.7310
168
+ 5500 val perplexity 308.2803
169
+ 5500 train 5.561502 (lr=1.8127e-05) (hash(x)=148498335)
170
+ 5600 val loss 5.7311
171
+ 5600 val perplexity 308.3099
172
+ 5600 train 5.465002 (lr=1.7403e-05) (hash(x)=151907614)
173
+ 5700 val loss 5.7218
174
+ 5700 val perplexity 305.4481
175
+ 5700 train 5.691004 (lr=1.6690e-05) (hash(x)=155192267)
176
+ 5800 val loss 5.7090
177
+ 5800 val perplexity 301.5787
178
+ 5800 train 5.762886 (lr=1.5989e-05) (hash(x)=153132158)
179
+ 5900 val loss 5.7053
180
+ 5900 val perplexity 300.4576
181
+ 5900 train 5.679949 (lr=1.5300e-05) (hash(x)=161446764)
182
+ 6000 val loss 5.7017
183
+ 6000 val perplexity 299.3758
184
+ 6000 train 5.506324 (lr=1.4625e-05) (hash(x)=151512446)
185
+ 6100 val loss 5.6905
186
+ 6100 val perplexity 296.0348
187
+ 6100 train 5.810744 (lr=1.3965e-05) (hash(x)=188094053)
188
+ 6200 val loss 5.6813
189
+ 6200 val perplexity 293.3209
190
+ 6200 train 5.564333 (lr=1.3320e-05) (hash(x)=149389789)
191
+ 6300 val loss 5.6772
192
+ 6300 val perplexity 292.1324
193
+ 6300 train 5.496711 (lr=1.2692e-05) (hash(x)=138212820)
194
+ 6400 val loss 5.6753
195
+ 6400 val perplexity 291.5690
196
+ 6400 train 5.467338 (lr=1.2081e-05) (hash(x)=146535423)
197
+ 6500 val loss 5.6654
198
+ 6500 val perplexity 288.7062
199
+ 6500 train 5.533089 (lr=1.1489e-05) (hash(x)=145950843)
200
+ 6600 val loss 5.6610
201
+ 6600 val perplexity 287.4485
202
+ 6600 train 5.562228 (lr=1.0916e-05) (hash(x)=141162902)
203
+ 6700 val loss 5.6593
204
+ 6700 val perplexity 286.9351
205
+ 6700 train 5.572202 (lr=1.0363e-05) (hash(x)=153018737)
206
+ 6800 val loss 5.6498
207
+ 6800 val perplexity 284.2420
208
+ 6800 train 5.566178 (lr=9.8310e-06) (hash(x)=155640155)
209
+ 6900 val loss 5.6472
210
+ 6900 val perplexity 283.4846
211
+ 6900 train 5.618537 (lr=9.3205e-06) (hash(x)=153722115)
212
+ 7000 val loss 5.6406
213
+ 7000 val perplexity 281.6339
214
+ 7000 train 5.552649 (lr=8.8324e-06) (hash(x)=146953450)
215
+ 7100 val loss 5.6403
216
+ 7100 val perplexity 281.5345
217
+ 7100 train 5.483966 (lr=8.3674e-06) (hash(x)=137663885)
218
+ 7200 val loss 5.6309
219
+ 7200 val perplexity 278.9171
220
+ 7200 train 5.704070 (lr=7.9261e-06) (hash(x)=146172950)
221
+ 7300 val loss 5.6273
222
+ 7300 val perplexity 277.9214
223
+ 7300 train 5.517547 (lr=7.5093e-06) (hash(x)=150018163)
224
+ 7400 val loss 5.6247
225
+ 7400 val perplexity 277.1830
226
+ 7400 train 5.627498 (lr=7.1174e-06) (hash(x)=145351166)
227
+ 7500 val loss 5.6248
228
+ 7500 val perplexity 277.2080
229
+ 7500 train 5.407495 (lr=6.7511e-06) (hash(x)=145292116)
230
+ 7600 val loss 5.6174
231
+ 7600 val perplexity 275.1786
232
+ 7600 train 5.533510 (lr=6.4109e-06) (hash(x)=150235132)
233
+ 7700 val loss 5.6147
234
+ 7700 val perplexity 274.4358
235
+ 7700 train 5.519080 (lr=6.0972e-06) (hash(x)=154543455)
236
+ 7800 val loss 5.6131
237
+ 7800 val perplexity 273.9905
238
+ 7800 train 5.514878 (lr=5.8107e-06) (hash(x)=142456852)
239
+ 7900 val loss 5.6123
240
+ 7900 val perplexity 273.7810
241
+ 7900 train 5.390297 (lr=5.5515e-06) (hash(x)=147363479)
242
+ 8000 val loss 5.6059
243
+ 8000 val perplexity 272.0219
244
+ 8000 train 5.638661 (lr=5.3203e-06) (hash(x)=156122973)
245
+ 8100 val loss 5.6041
246
+ 8100 val perplexity 271.5290
247
+ 8100 train 5.650138 (lr=5.1172e-06) (hash(x)=156153179)
248
+ 8200 val loss 5.6029
249
+ 8200 val perplexity 271.2119
250
+ 8200 train 5.651247 (lr=4.9425e-06) (hash(x)=146430698)
251
+ 8300 val loss 5.6011
252
+ 8300 val perplexity 270.7372
253
+ 8300 train 5.470097 (lr=4.7966e-06) (hash(x)=143507257)
254
+ 8400 val loss 5.5964
255
+ 8400 val perplexity 269.4478
256
+ 8400 train 5.624218 (lr=4.6796e-06) (hash(x)=166272643)
257
+ 8500 val loss 5.5950
258
+ 8500 val perplexity 269.0811
259
+ 8500 train 5.498229 (lr=4.5917e-06) (hash(x)=143887848)
260
+ 8600 val loss 5.5944
261
+ 8600 val perplexity 268.9143
262
+ 8600 train 5.633615 (lr=4.5330e-06) (hash(x)=156900341)
263
+ 8700 val loss 5.5894
264
+ 8700 val perplexity 267.5721
265
+ 8700 train 5.830091 (lr=4.5037e-06) (hash(x)=146417632)
266
+ 8749 val loss 5.5894
267
+ 8749 val perplexity 267.5743
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
attention_kindselective_n_heads4_seed1339/model_08749.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:28c1d46f1189bf9e6286f016e3f11c3fb23ed74390497e972b970bc6ca6c48a1
3
  size 92843394
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e0998a3dbdb0477c922f88c6a1516669b2c479174b7ebe6f589d2a397b0bc9e
3
  size 92843394
attention_kindselective_n_heads4_seed1339/optimizer_08749.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a4d7d46c996fb6999606ae1a98b752899b17ac1f319d229f6eb9b477ab5add1b
3
  size 179406214
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2a3f247d486e700d63788557e071c6feee4fce9929dc372541588e1855466f1
3
  size 179406214