andrew-healey commited on
Commit
edb7199
·
verified ·
1 Parent(s): e5ba3f9

Upload folder using huggingface_hub

Browse files
attention_kindselective_n_heads2_seed1338/args.json CHANGED
@@ -1 +1 @@
1
- {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_6/attention_kindselective_n_heads2_seed1338", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 2, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_6", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1338, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "5e-5_10240_2_1338", "n_embd": 128}
 
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_6/attention_kindselective_n_heads2_seed1338", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 2, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_6", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1338, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 0.0001, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "10e-5_10240_2_1338", "n_embd": 128}
attention_kindselective_n_heads2_seed1338/log2.txt CHANGED
@@ -1,606 +1,582 @@
1
  max_steps: 10000
2
- max_steps: 10000
3
- 0 val loss 11.7640
4
- 0 val perplexity 128545.8516
 
 
 
5
  0 val loss 11.7640
6
  0 val perplexity 128545.8516
7
- 0 train 11.762399 (lr=2.5000e-07) (hash(x)=50671684)
8
- 0 train 11.762399 (lr=3.5000e-07) (hash(x)=50671684)
9
- 100 val loss 10.2612
10
- 100 val perplexity 28599.8301
11
- 100 val loss 10.0855
12
- 100 val perplexity 23992.4160
13
- 100 train 10.199640 (lr=2.5250e-05) (hash(x)=52740221)
14
- 100 train 10.027347 (lr=3.5350e-05) (hash(x)=52740221)
15
- 200 val loss 9.0641
16
- 200 val perplexity 8639.7002
17
- 200 train 9.082980 (lr=5.0000e-05) (hash(x)=49034180)
18
- 200 val loss 8.3482
19
- 200 val perplexity 4222.5566
20
- 200 train 8.351851 (lr=7.0000e-05) (hash(x)=49034180)
21
- 300 val loss 8.1940
22
- 300 val perplexity 3619.2537
23
- 300 train 8.292101 (lr=4.9988e-05) (hash(x)=63180688)
24
- 300 val loss 7.7296
25
- 300 val perplexity 2274.6714
26
- 300 train 7.949653 (lr=6.9984e-05) (hash(x)=63180688)
27
- 400 val loss 7.8544
28
- 400 val perplexity 2577.1091
29
- 400 train 7.816474 (lr=4.9954e-05) (hash(x)=50373500)
30
- 400 val loss 7.6567
31
- 400 val perplexity 2114.7312
32
- 400 train 7.666688 (lr=6.9935e-05) (hash(x)=50373500)
33
- 500 val loss 7.6951
34
- 500 val perplexity 2197.5679
35
- 500 train 7.606697 (lr=4.9896e-05) (hash(x)=44547422)
36
- 500 val loss 7.6381
37
- 500 val perplexity 2075.7231
38
- 500 train 7.539029 (lr=6.9854e-05) (hash(x)=44547422)
39
- 600 val loss 7.6593
40
- 600 val perplexity 2120.3745
41
- 600 train 7.554819 (lr=4.9815e-05) (hash(x)=47184699)
42
- 600 val loss 7.6413
43
- 600 val perplexity 2082.3872
44
- 600 train 7.530231 (lr=6.9741e-05) (hash(x)=47184699)
45
- 700 val loss 7.6521
46
- 700 val perplexity 2104.9644
47
- 700 train 7.565280 (lr=4.9712e-05) (hash(x)=51374582)
48
- 700 val loss 7.6389
49
- 700 val perplexity 2077.5461
50
- 700 train 7.553903 (lr=6.9596e-05) (hash(x)=51374582)
51
- 800 val loss 7.6369
52
- 800 val perplexity 2073.2136
53
- 800 train 7.378779 (lr=4.9585e-05) (hash(x)=46264805)
54
- 800 val loss 7.6290
55
- 800 val perplexity 2057.0747
56
- 800 train 7.379150 (lr=6.9419e-05) (hash(x)=46264805)
57
- 900 val loss 7.6547
58
- 900 val perplexity 2110.6108
59
- 900 train 7.937618 (lr=4.9436e-05) (hash(x)=61178712)
60
- 900 val loss 7.6323
61
- 900 val perplexity 2063.8235
62
- 900 train 7.921933 (lr=6.9210e-05) (hash(x)=61178712)
63
- 1000 val loss 7.6362
64
- 1000 val perplexity 2071.7578
65
- 1000 train 7.647681 (lr=4.9264e-05) (hash(x)=50886520)
66
- 1000 val loss 7.6263
67
- 1000 val perplexity 2051.3552
68
- 1000 train 7.646755 (lr=6.8970e-05) (hash(x)=50886520)
69
- 1100 val loss 7.6266
70
- 1100 val perplexity 2052.0225
71
- 1100 train 7.411226 (lr=4.9070e-05) (hash(x)=48600099)
72
- 1100 val loss 7.6231
73
- 1100 val perplexity 2044.9672
74
- 1100 train 7.445827 (lr=6.8698e-05) (hash(x)=48600099)
75
- 1200 val loss 7.6176
76
- 1200 val perplexity 2033.6812
77
- 1200 train 7.354834 (lr=4.8854e-05) (hash(x)=50146792)
78
- 1200 val loss 7.6272
79
- 1200 val perplexity 2053.3645
80
- 1200 train 7.383700 (lr=6.8395e-05) (hash(x)=50146792)
81
- 1300 val loss 7.6195
82
- 1300 val perplexity 2037.5619
83
- 1300 train 7.491829 (lr=4.8616e-05) (hash(x)=52617313)
84
- 1300 val loss 7.6346
85
- 1300 val perplexity 2068.6394
86
- 1300 train 7.519730 (lr=6.8062e-05) (hash(x)=52617313)
87
- 1400 val loss 7.6085
88
- 1400 val perplexity 2015.1982
89
- 1400 train 7.284451 (lr=4.8356e-05) (hash(x)=49794446)
90
- 1400 val loss 7.6309
91
- 1400 val perplexity 2060.8892
92
- 1400 train 7.316226 (lr=6.7698e-05) (hash(x)=49794446)
93
- 1500 val loss 7.6101
94
- 1500 val perplexity 2018.5585
95
- 1500 train 7.262372 (lr=4.8074e-05) (hash(x)=50766317)
96
- 1500 val loss 7.6201
97
- 1500 val perplexity 2038.8419
98
- 1500 train 7.261213 (lr=6.7304e-05) (hash(x)=50766317)
99
- 1600 val loss 7.6007
100
- 1600 val perplexity 1999.5605
101
- 1600 train 7.493066 (lr=4.7772e-05) (hash(x)=55551175)
102
- 1600 val loss 7.5985
103
- 1600 val perplexity 1995.1804
104
- 1600 train 7.499768 (lr=6.6881e-05) (hash(x)=55551175)
105
- 1700 val loss 7.5795
106
- 1700 val perplexity 1957.7035
107
- 1700 train 7.733157 (lr=4.7448e-05) (hash(x)=56717172)
108
- 1700 val loss 7.5674
109
- 1700 val perplexity 1934.0377
110
- 1700 train 7.691600 (lr=6.6428e-05) (hash(x)=56717172)
111
- 1800 val loss 7.5356
112
- 1800 val perplexity 1873.6436
113
- 1800 train 7.837944 (lr=4.7105e-05) (hash(x)=55376447)
114
- 1800 val loss 7.5258
115
- 1800 val perplexity 1855.3097
116
- 1800 train 7.796223 (lr=6.5947e-05) (hash(x)=55376447)
117
- 1900 val loss 7.5072
118
- 1900 val perplexity 1821.0981
119
- 1900 train 7.223547 (lr=4.6741e-05) (hash(x)=43810837)
120
- 1900 val loss 7.5078
121
- 1900 val perplexity 1822.2770
122
- 1900 train 7.237650 (lr=6.5437e-05) (hash(x)=43810837)
123
- 2000 val loss 7.4870
124
- 2000 val perplexity 1784.7079
125
- 2000 train 7.566131 (lr=4.6357e-05) (hash(x)=50881655)
126
- 2000 val loss 7.4835
127
- 2000 val perplexity 1778.4316
128
- 2000 train 7.579944 (lr=6.4900e-05) (hash(x)=50881655)
129
- 2100 val loss 7.4819
130
- 2100 val perplexity 1775.6853
131
- 2100 train 7.358100 (lr=4.5954e-05) (hash(x)=49386015)
132
- 2100 val loss 7.4574
133
- 2100 val perplexity 1732.5850
134
- 2100 train 7.333408 (lr=6.4335e-05) (hash(x)=49386015)
135
- 2200 val loss 7.4734
136
- 2200 val perplexity 1760.6473
137
- 2200 train 7.392198 (lr=4.5532e-05) (hash(x)=48572079)
138
- 2200 val loss 7.4651
139
- 2200 val perplexity 1745.9648
140
- 2200 train 7.405998 (lr=6.3745e-05) (hash(x)=48572079)
141
- 2300 val loss 7.4649
142
- 2300 val perplexity 1745.6702
143
- 2300 train 7.528904 (lr=4.5091e-05) (hash(x)=54950719)
144
- 2300 val loss 7.4181
145
- 2300 val perplexity 1665.8230
146
- 2300 train 7.485063 (lr=6.3128e-05) (hash(x)=54950719)
147
- 2400 val loss 7.4481
148
- 2400 val perplexity 1716.5986
149
- 2400 train 7.104608 (lr=4.4633e-05) (hash(x)=42190240)
150
- 2400 val loss 7.3888
151
- 2400 val perplexity 1617.7377
152
- 2400 train 7.051908 (lr=6.2486e-05) (hash(x)=42190240)
153
- 2500 val loss 7.4384
154
- 2500 val perplexity 1700.0844
155
- 2500 train 7.503536 (lr=4.4156e-05) (hash(x)=45223539)
156
- 2500 val loss 7.3688
157
- 2500 val perplexity 1585.7074
158
- 2500 train 7.440190 (lr=6.1819e-05) (hash(x)=45223539)
159
- 2600 val loss 7.4214
160
- 2600 val perplexity 1671.3839
161
- 2600 train 7.425591 (lr=4.3663e-05) (hash(x)=54037353)
162
- 2600 val loss 7.3434
163
- 2600 val perplexity 1546.0243
164
- 2600 train 7.361808 (lr=6.1128e-05) (hash(x)=54037353)
165
- 2700 val loss 7.3990
166
- 2700 val perplexity 1634.3760
167
- 2700 train 7.943274 (lr=4.3153e-05) (hash(x)=59131616)
168
- 2700 val loss 7.3256
169
- 2700 val perplexity 1518.7589
170
- 2700 train 7.771473 (lr=6.0414e-05) (hash(x)=59131616)
171
- 2800 val loss 7.3895
172
- 2800 val perplexity 1618.9755
173
- 2800 train 7.214322 (lr=4.2627e-05) (hash(x)=45882743)
174
- 2800 val loss 7.2809
175
- 2800 val perplexity 1452.2246
176
- 2800 train 7.131896 (lr=5.9677e-05) (hash(x)=45882743)
177
- 2900 val loss 7.3756
178
- 2900 val perplexity 1596.5599
179
- 2900 train 6.986534 (lr=4.2085e-05) (hash(x)=43758910)
180
- 3000 val loss 7.4025
181
- 3000 val perplexity 1640.0563
182
- 3000 train 7.337453 (lr=4.1529e-05) (hash(x)=47965974)
183
- 2900 val loss 7.2484
184
- 2900 val perplexity 1405.8976
185
- 2900 train 6.918225 (lr=5.8919e-05) (hash(x)=43758910)
186
- 3100 val loss 7.3828
187
- 3100 val perplexity 1608.0157
188
- 3100 train 7.282964 (lr=4.0957e-05) (hash(x)=48205243)
189
- 3000 val loss 7.2421
190
- 3000 val perplexity 1397.0543
191
- 3000 train 7.161748 (lr=5.8140e-05) (hash(x)=47965974)
192
- 3200 val loss 7.3542
193
- 3200 val perplexity 1562.7103
194
- 3200 train 7.431534 (lr=4.0373e-05) (hash(x)=54511383)
195
- 3100 val loss 7.2156
196
- 3100 val perplexity 1360.4982
197
- 3100 train 7.109374 (lr=5.7340e-05) (hash(x)=48205243)
198
- 3300 val loss 7.3452
199
- 3300 val perplexity 1548.7839
200
- 3300 train 7.324124 (lr=3.9775e-05) (hash(x)=54428388)
201
- 3200 val loss 7.2021
202
- 3200 val perplexity 1342.2188
203
- 3200 train 7.273560 (lr=5.6522e-05) (hash(x)=54511383)
204
- 3400 val loss 7.3255
205
- 3400 val perplexity 1518.5460
206
- 3400 train 7.373497 (lr=3.9164e-05) (hash(x)=48115990)
207
- 3300 val loss 7.1854
208
- 3300 val perplexity 1320.0015
209
- 3300 train 7.137845 (lr=5.5684e-05) (hash(x)=54428388)
210
- 3500 val loss 7.3067
211
- 3500 val perplexity 1490.2999
212
- 3500 train 6.951129 (lr=3.8541e-05) (hash(x)=41137345)
213
- 3400 val loss 7.1642
214
- 3400 val perplexity 1292.2944
215
- 3400 train 7.198205 (lr=5.4829e-05) (hash(x)=48115990)
216
- 3600 val loss 7.3099
217
- 3600 val perplexity 1495.0908
218
- 3600 train 7.210977 (lr=3.7907e-05) (hash(x)=55186224)
219
- 3500 val loss 7.1480
220
- 3500 val perplexity 1271.5587
221
- 3500 train 6.770042 (lr=5.3958e-05) (hash(x)=41137345)
222
- 3700 val loss 7.2953
223
- 3700 val perplexity 1473.4081
224
- 3700 train 7.162465 (lr=3.7262e-05) (hash(x)=54990049)
225
- 3600 val loss 7.1600
226
- 3600 val perplexity 1286.9684
227
- 3600 train 7.056310 (lr=5.3070e-05) (hash(x)=55186224)
228
- 3800 val loss 7.2900
229
- 3800 val perplexity 1465.5238
230
- 3800 train 7.036756 (lr=3.6608e-05) (hash(x)=46288812)
231
- 3700 val loss 7.1365
232
- 3700 val perplexity 1257.0605
233
- 3700 train 7.001248 (lr=5.2167e-05) (hash(x)=54990049)
234
- 3900 val loss 7.2680
235
- 3900 val perplexity 1433.7078
236
- 3900 train 6.930634 (lr=3.5944e-05) (hash(x)=45829773)
237
- 3800 val loss 7.1138
238
- 3800 val perplexity 1228.7883
239
- 3800 train 6.855778 (lr=5.1251e-05) (hash(x)=46288812)
240
- 4000 val loss 7.2640
241
- 4000 val perplexity 1427.9847
242
- 4000 train 7.045815 (lr=3.5271e-05) (hash(x)=52499943)
243
- 3900 val loss 7.1105
244
- 3900 val perplexity 1224.7800
245
- 3900 train 6.770267 (lr=5.0321e-05) (hash(x)=45829773)
246
- 4100 val loss 7.2214
247
- 4100 val perplexity 1368.4528
248
- 4100 train 7.119732 (lr=3.4590e-05) (hash(x)=48563796)
249
- 4000 val loss 7.0887
250
- 4000 val perplexity 1198.3544
251
- 4000 train 6.878484 (lr=4.9379e-05) (hash(x)=52499943)
252
- 4200 val loss 7.1987
253
- 4200 val perplexity 1337.7422
254
- 4200 train 7.161106 (lr=3.3902e-05) (hash(x)=49165143)
255
- 4100 val loss 7.0505
256
- 4100 val perplexity 1153.4874
257
- 4100 train 6.955088 (lr=4.8426e-05) (hash(x)=48563796)
258
- 4300 val loss 7.1848
259
- 4300 val perplexity 1319.2406
260
- 4300 train 7.263635 (lr=3.3207e-05) (hash(x)=50973176)
261
- 4200 val loss 6.9988
262
- 4200 val perplexity 1095.3329
263
- 4200 train 6.969129 (lr=4.7463e-05) (hash(x)=49165143)
264
- 4400 val loss 7.1592
265
- 4400 val perplexity 1285.8815
266
- 4400 train 7.206085 (lr=3.2507e-05) (hash(x)=55275124)
267
- 4300 val loss 6.9748
268
- 4300 val perplexity 1069.3761
269
- 4300 train 7.032444 (lr=4.6490e-05) (hash(x)=50973176)
270
- 4500 val loss 7.1423
271
- 4500 val perplexity 1264.3616
272
- 4500 train 7.480568 (lr=3.1801e-05) (hash(x)=58646505)
273
- 4400 val loss 6.9587
274
- 4400 val perplexity 1052.2388
275
- 4400 train 7.023558 (lr=4.5509e-05) (hash(x)=55275124)
276
- 4600 val loss 7.1201
277
- 4600 val perplexity 1236.5511
278
- 4600 train 6.934882 (lr=3.1091e-05) (hash(x)=42554666)
279
- 4500 val loss 6.9399
280
- 4500 val perplexity 1032.6294
281
- 4500 train 7.294427 (lr=4.4521e-05) (hash(x)=58646505)
282
- 4700 val loss 7.1123
283
- 4700 val perplexity 1226.9720
284
- 4700 train 7.021177 (lr=3.0377e-05) (hash(x)=47846764)
285
- 4600 val loss 6.9231
286
- 4600 val perplexity 1015.4349
287
- 4600 train 6.728717 (lr=4.3527e-05) (hash(x)=42554666)
288
- 4800 val loss 7.1072
289
- 4800 val perplexity 1220.7145
290
- 4800 train 7.640239 (lr=2.9661e-05) (hash(x)=58239019)
291
- 4700 val loss 6.9024
292
- 4700 val perplexity 994.6642
293
- 4700 train 6.794252 (lr=4.2528e-05) (hash(x)=47846764)
294
- 4900 val loss 7.0912
295
- 4900 val perplexity 1201.3014
296
- 4900 train 7.122570 (lr=2.8942e-05) (hash(x)=50711220)
297
- 4800 val loss 6.8864
298
- 4800 val perplexity 978.8246
299
- 4800 train 7.385535 (lr=4.1525e-05) (hash(x)=58239019)
300
- 5000 val loss 7.0806
301
- 5000 val perplexity 1188.6388
302
- 5000 train 7.011305 (lr=2.8221e-05) (hash(x)=45994194)
303
- 4900 val loss 6.8732
304
- 4900 val perplexity 966.0390
305
- 4900 train 6.911656 (lr=4.0518e-05) (hash(x)=50711220)
306
- 5100 val loss 7.0847
307
- 5100 val perplexity 1193.5157
308
- 5100 train 6.948140 (lr=2.7500e-05) (hash(x)=48659050)
309
- 5000 val loss 6.8794
310
- 5000 val perplexity 972.0590
311
- 5000 train 6.808244 (lr=3.9510e-05) (hash(x)=45994194)
312
- 5200 val loss 7.0560
313
- 5200 val perplexity 1159.7809
314
- 5200 train 6.998636 (lr=2.6779e-05) (hash(x)=49369682)
315
- 5100 val loss 6.8575
316
- 5100 val perplexity 950.9432
317
- 5100 train 6.727626 (lr=3.8500e-05) (hash(x)=48659050)
318
- 5300 val loss 7.0524
319
- 5300 val perplexity 1155.6340
320
- 5300 train 7.432145 (lr=2.6058e-05) (hash(x)=57787700)
321
- 5200 val loss 6.8367
322
- 5200 val perplexity 931.4388
323
- 5200 train 6.751755 (lr=3.7490e-05) (hash(x)=49369682)
324
- 5400 val loss 7.0336
325
- 5400 val perplexity 1134.0551
326
- 5400 train 6.964849 (lr=2.5339e-05) (hash(x)=49365400)
327
- 5300 val loss 6.8303
328
- 5300 val perplexity 925.4502
329
- 5300 train 7.234496 (lr=3.6482e-05) (hash(x)=57787700)
330
- 5500 val loss 7.0214
331
- 5500 val perplexity 1120.3920
332
- 5500 train 6.973766 (lr=2.4623e-05) (hash(x)=48720412)
333
- 5400 val loss 6.8203
334
- 5400 val perplexity 916.2604
335
- 5400 train 6.747429 (lr=3.5475e-05) (hash(x)=49365400)
336
- 5600 val loss 7.0085
337
- 5600 val perplexity 1106.0455
338
- 5600 train 7.324727 (lr=2.3909e-05) (hash(x)=55784800)
339
- 5500 val loss 6.8173
340
- 5500 val perplexity 913.5103
341
- 5500 train 6.778089 (lr=3.4472e-05) (hash(x)=48720412)
342
- 5700 val loss 7.0040
343
- 5700 val perplexity 1101.0533
344
- 5700 train 6.863188 (lr=2.3199e-05) (hash(x)=50073634)
345
- 5600 val loss 6.8012
346
- 5600 val perplexity 898.9412
347
- 5600 train 7.131571 (lr=3.3473e-05) (hash(x)=55784800)
348
- 5800 val loss 6.9817
349
- 5800 val perplexity 1076.7632
350
- 5800 train 6.810428 (lr=2.2493e-05) (hash(x)=50170324)
351
- 5700 val loss 6.7964
352
- 5700 val perplexity 894.6154
353
- 5700 train 6.646752 (lr=3.2479e-05) (hash(x)=50073634)
354
- 5900 val loss 6.9721
355
- 5900 val perplexity 1066.4613
356
- 5900 train 6.721549 (lr=2.1793e-05) (hash(x)=48410268)
357
- 5800 val loss 6.7835
358
- 5800 val perplexity 883.1710
359
- 5800 train 6.606258 (lr=3.1491e-05) (hash(x)=50170324)
360
- 6000 val loss 6.9624
361
- 6000 val perplexity 1056.1648
362
- 6000 train 6.858176 (lr=2.1098e-05) (hash(x)=49527342)
363
- 5900 val loss 6.7863
364
- 5900 val perplexity 885.6093
365
- 5900 train 6.532552 (lr=3.0510e-05) (hash(x)=48410268)
366
- 6100 val loss 6.9515
367
- 6100 val perplexity 1044.7220
368
- 6100 train 6.763265 (lr=2.0410e-05) (hash(x)=49550294)
369
- 6000 val loss 6.7822
370
- 6000 val perplexity 881.9771
371
- 6000 train 6.668611 (lr=2.9537e-05) (hash(x)=49527342)
372
- 6200 val loss 6.9387
373
- 6200 val perplexity 1031.4410
374
- 6200 train 6.494091 (lr=1.9729e-05) (hash(x)=42126106)
375
- 6100 val loss 6.7750
376
- 6100 val perplexity 875.6949
377
- 6100 train 6.587618 (lr=2.8574e-05) (hash(x)=49550294)
378
- 6300 val loss 6.9254
379
- 6300 val perplexity 1017.7720
380
- 6300 train 6.696783 (lr=1.9056e-05) (hash(x)=49608772)
381
- 6200 val loss 6.7762
382
- 6200 val perplexity 876.6934
383
- 6200 train 6.334079 (lr=2.7621e-05) (hash(x)=42126106)
384
- 6400 val loss 6.9128
385
- 6400 val perplexity 1005.0568
386
- 6400 train 6.485902 (lr=1.8392e-05) (hash(x)=52324417)
387
- 6300 val loss 6.7694
388
- 6300 val perplexity 870.7931
389
- 6300 train 6.520870 (lr=2.6679e-05) (hash(x)=49608772)
390
- 6500 val loss 6.8815
391
- 6500 val perplexity 974.0597
392
- 6500 train 6.923303 (lr=1.7738e-05) (hash(x)=46207215)
393
- 6400 val loss 6.7631
394
- 6400 val perplexity 865.2852
395
- 6400 train 6.338704 (lr=2.5749e-05) (hash(x)=52324417)
396
- 6600 val loss 6.8604
397
- 6600 val perplexity 953.7169
398
- 6600 train 6.797771 (lr=1.7093e-05) (hash(x)=49027014)
399
- 6500 val loss 6.7358
400
- 6500 val perplexity 841.9837
401
- 6500 train 6.789601 (lr=2.4833e-05) (hash(x)=46207215)
402
- 6700 val loss 6.8459
403
- 6700 val perplexity 939.9939
404
- 6700 train 6.819520 (lr=1.6459e-05) (hash(x)=46232513)
405
- 6800 val loss 6.8301
406
- 6800 val perplexity 925.2843
407
- 6800 train 6.766226 (lr=1.5836e-05) (hash(x)=47348403)
408
- 6600 val loss 6.7171
409
- 6600 val perplexity 826.3766
410
- 6600 train 6.647032 (lr=2.3930e-05) (hash(x)=49027014)
411
- 6900 val loss 6.8254
412
- 6900 val perplexity 920.9736
413
- 6900 train 6.799843 (lr=1.5225e-05) (hash(x)=49806647)
414
- 6700 val loss 6.7033
415
- 6700 val perplexity 815.0651
416
- 6700 train 6.672481 (lr=2.3042e-05) (hash(x)=46232513)
417
- 7000 val loss 6.8056
418
- 7000 val perplexity 902.8452
419
- 7000 train 6.843875 (lr=1.4627e-05) (hash(x)=50893018)
420
- 6800 val loss 6.7007
421
- 6800 val perplexity 812.9648
422
- 6800 train 6.624757 (lr=2.2171e-05) (hash(x)=47348403)
423
- 7100 val loss 6.7984
424
- 7100 val perplexity 896.4370
425
- 7100 train 6.822917 (lr=1.4043e-05) (hash(x)=49157639)
426
- 6900 val loss 6.7069
427
- 6900 val perplexity 817.9907
428
- 6900 train 6.675504 (lr=2.1316e-05) (hash(x)=49806647)
429
- 7200 val loss 6.7897
430
- 7200 val perplexity 888.6813
431
- 7200 train 6.761259 (lr=1.3471e-05) (hash(x)=47014759)
432
- 7000 val loss 6.6812
433
- 7000 val perplexity 797.3110
434
- 7000 train 6.741058 (lr=2.0478e-05) (hash(x)=50893018)
435
- 7300 val loss 6.7779
436
- 7300 val perplexity 878.1854
437
- 7300 train 6.779035 (lr=1.2915e-05) (hash(x)=47325591)
438
- 7100 val loss 6.6752
439
- 7100 val perplexity 792.4863
440
- 7100 train 6.693850 (lr=1.9660e-05) (hash(x)=49157639)
441
- 7400 val loss 6.7720
442
- 7400 val perplexity 873.0865
443
- 7400 train 6.654572 (lr=1.2373e-05) (hash(x)=49184604)
444
- 7200 val loss 6.6680
445
- 7200 val perplexity 786.8536
446
- 7200 train 6.651248 (lr=1.8860e-05) (hash(x)=47014759)
447
- 7500 val loss 6.7629
448
- 7500 val perplexity 865.1734
449
- 7500 train 7.052269 (lr=1.1847e-05) (hash(x)=55053584)
450
- 7300 val loss 6.6610
451
- 7300 val perplexity 781.3257
452
- 7300 train 6.657533 (lr=1.8081e-05) (hash(x)=47325591)
453
- 7600 val loss 6.7585
454
- 7600 val perplexity 861.3661
455
- 7600 train 6.715505 (lr=1.1337e-05) (hash(x)=48693923)
456
- 7400 val loss 6.6557
457
- 7400 val perplexity 777.1942
458
- 7400 train 6.538068 (lr=1.7323e-05) (hash(x)=49184604)
459
- 7700 val loss 6.7544
460
- 7700 val perplexity 857.8153
461
- 7700 train 6.298505 (lr=1.0844e-05) (hash(x)=40952882)
462
- 7500 val loss 6.6487
463
- 7500 val perplexity 771.8184
464
- 7500 train 6.927235 (lr=1.6586e-05) (hash(x)=55053584)
465
- 7800 val loss 6.7480
466
- 7800 val perplexity 852.3216
467
- 7800 train 6.836084 (lr=1.0367e-05) (hash(x)=52487845)
468
- 7600 val loss 6.6504
469
- 7600 val perplexity 773.0807
470
- 7600 train 6.592214 (lr=1.5872e-05) (hash(x)=48693923)
471
- 7900 val loss 6.7444
472
- 7900 val perplexity 849.3254
473
- 7900 train 6.821626 (lr=9.9088e-06) (hash(x)=50221547)
474
- 7700 val loss 6.6482
475
- 7700 val perplexity 771.4020
476
- 7700 train 6.223924 (lr=1.5181e-05) (hash(x)=40952882)
477
- 8000 val loss 6.7372
478
- 8000 val perplexity 843.2168
479
- 8000 train 7.036058 (lr=9.4682e-06) (hash(x)=62294204)
480
- 7800 val loss 6.6412
481
- 7800 val perplexity 765.9936
482
- 7800 train 6.724183 (lr=1.4514e-05) (hash(x)=52487845)
483
- 8100 val loss 6.7324
484
- 8100 val perplexity 839.1255
485
- 8100 train 6.425050 (lr=9.0461e-06) (hash(x)=44401967)
486
- 7900 val loss 6.6360
487
- 7900 val perplexity 762.0717
488
- 7900 train 6.738249 (lr=1.3872e-05) (hash(x)=50221547)
489
- 8200 val loss 6.7270
490
- 8200 val perplexity 834.6295
491
- 8200 train 6.687708 (lr=8.6430e-06) (hash(x)=52769095)
492
- 8000 val loss 6.6358
493
- 8000 val perplexity 761.8933
494
- 8000 train 6.951126 (lr=1.3255e-05) (hash(x)=62294204)
495
- 8300 val loss 6.7317
496
- 8300 val perplexity 838.6075
497
- 8300 train 6.673268 (lr=8.2593e-06) (hash(x)=56829883)
498
- 8100 val loss 6.6309
499
- 8100 val perplexity 758.1688
500
- 8100 train 6.340541 (lr=1.2665e-05) (hash(x)=44401967)
501
- 8400 val loss 6.7297
502
- 8400 val perplexity 836.9218
503
- 8400 train 6.691971 (lr=7.8953e-06) (hash(x)=52147375)
504
- 8200 val loss 6.6290
505
- 8200 val perplexity 756.7592
506
- 8200 train 6.576173 (lr=1.2100e-05) (hash(x)=52769095)
507
- 8500 val loss 6.7269
508
- 8500 val perplexity 834.5331
509
- 8500 train 6.963628 (lr=7.5515e-06) (hash(x)=60197820)
510
- 8300 val loss 6.6358
511
- 8300 val perplexity 761.8755
512
- 8300 train 6.583430 (lr=1.1563e-05) (hash(x)=56829883)
513
- 8600 val loss 6.7228
514
- 8600 val perplexity 831.1801
515
- 8600 train 6.488226 (lr=7.2282e-06) (hash(x)=49377068)
516
- 8400 val loss 6.6368
517
- 8400 val perplexity 762.6450
518
- 8400 train 6.574768 (lr=1.1053e-05) (hash(x)=52147375)
519
- 8700 val loss 6.7243
520
- 8700 val perplexity 832.3748
521
- 8700 train 6.683083 (lr=6.9257e-06) (hash(x)=51092724)
522
- 8500 val loss 6.6341
523
- 8500 val perplexity 760.6315
524
- 8500 train 6.861284 (lr=1.0572e-05) (hash(x)=60197820)
525
- 8800 val loss 6.7192
526
- 8800 val perplexity 828.1299
527
- 8800 train 6.728143 (lr=6.6444e-06) (hash(x)=48642928)
528
- 8600 val loss 6.6271
529
- 8600 val perplexity 755.2836
530
- 8600 train 6.383428 (lr=1.0119e-05) (hash(x)=49377068)
531
- 8900 val loss 6.7147
532
- 8900 val perplexity 824.3984
533
- 8900 train 6.926672 (lr=6.3845e-06) (hash(x)=55342246)
534
- 8700 val loss 6.6293
535
- 8700 val perplexity 756.9796
536
- 8700 train 6.602247 (lr=9.6960e-06) (hash(x)=51092724)
537
- 9000 val loss 6.6997
538
- 9000 val perplexity 812.1678
539
- 9000 train 6.671376 (lr=6.1462e-06) (hash(x)=48093368)
540
- 8800 val loss 6.6263
541
- 8800 val perplexity 754.6838
542
- 8800 train 6.621269 (lr=9.3021e-06) (hash(x)=48642928)
543
- 9100 val loss 6.6943
544
- 9100 val perplexity 807.8050
545
- 9100 train 6.789246 (lr=5.9300e-06) (hash(x)=48578183)
546
- 8900 val loss 6.6234
547
- 8900 val perplexity 752.5152
548
- 8900 train 6.830116 (lr=8.9382e-06) (hash(x)=55342246)
549
- 9200 val loss 6.6896
550
- 9200 val perplexity 803.9971
551
- 9200 train 6.888481 (lr=5.7359e-06) (hash(x)=50794720)
552
- 9000 val loss 6.6079
553
- 9000 val perplexity 740.9231
554
- 9000 train 6.601912 (lr=8.6047e-06) (hash(x)=48093368)
555
- 9300 val loss 6.6857
556
- 9300 val perplexity 800.8321
557
- 9300 train 6.454147 (lr=5.5641e-06) (hash(x)=46513190)
558
- 9100 val loss 6.6025
559
- 9100 val perplexity 736.9377
560
- 9100 train 6.679001 (lr=8.3020e-06) (hash(x)=48578183)
561
- 9400 val loss 6.6830
562
- 9400 val perplexity 798.7258
563
- 9400 train 6.323279 (lr=5.4149e-06) (hash(x)=43808238)
564
- 9200 val loss 6.5976
565
- 9200 val perplexity 733.3257
566
- 9200 train 6.786611 (lr=8.0302e-06) (hash(x)=50794720)
567
- 9500 val loss 6.6799
568
- 9500 val perplexity 796.2480
569
- 9500 train 6.484011 (lr=5.2884e-06) (hash(x)=45021888)
570
- 9300 val loss 6.5947
571
- 9300 val perplexity 731.1756
572
- 9300 train 6.369024 (lr=7.7898e-06) (hash(x)=46513190)
573
- 9600 val loss 6.6766
574
- 9600 val perplexity 793.6359
575
- 9600 train 6.773294 (lr=5.1847e-06) (hash(x)=56525570)
576
- 9400 val loss 6.5938
577
- 9400 val perplexity 730.5848
578
- 9400 train 6.250168 (lr=7.5809e-06) (hash(x)=43808238)
579
- 9700 val loss 6.6736
580
- 9700 val perplexity 791.2244
581
- 9700 train 6.871308 (lr=5.1040e-06) (hash(x)=52585913)
582
- 9500 val loss 6.5891
583
- 9500 val perplexity 727.1143
584
- 9500 train 6.406715 (lr=7.4038e-06) (hash(x)=45021888)
585
- 9800 val loss 6.6725
586
- 9800 val perplexity 790.3431
587
- 9800 train 6.852726 (lr=5.0462e-06) (hash(x)=52344698)
588
- 9600 val loss 6.5886
589
- 9600 val perplexity 726.7500
590
- 9600 train 6.674443 (lr=7.2586e-06) (hash(x)=56525570)
591
- 9900 val loss 6.6691
592
- 9900 val perplexity 787.7227
593
- 9900 train 6.682109 (lr=5.0116e-06) (hash(x)=51740945)
594
- 9700 val loss 6.5857
595
- 9700 val perplexity 724.6240
596
- 9700 train 6.775486 (lr=7.1456e-06) (hash(x)=52585913)
597
- 9999 val loss 6.6652
598
- 9999 val perplexity 784.6176
599
- 9800 val loss 6.5830
600
- 9800 val perplexity 722.7222
601
- 9800 train 6.780724 (lr=7.0647e-06) (hash(x)=52344698)
602
- 9900 val loss 6.5810
603
- 9900 val perplexity 721.2598
604
- 9900 train 6.589230 (lr=7.0162e-06) (hash(x)=51740945)
605
- 9999 val loss 6.5788
606
- 9999 val perplexity 719.6898
 
1
  max_steps: 10000
2
+ 100 val loss 9.3442
3
+ 100 val perplexity 11431.8779
4
+ 100 train 9.297195 (lr=7.5750e-05) (hash(x)=52740221)
5
+ 200 val loss 7.7377
6
+ 200 val perplexity 2293.1240
7
+ 200 train 7.742150 (lr=1.5000e-04) (hash(x)=49034180)
8
  0 val loss 11.7640
9
  0 val perplexity 128545.8516
10
+ 300 val loss 7.6958
11
+ 300 val perplexity 2199.1255
12
+ 300 train 7.961709 (lr=1.4997e-04) (hash(x)=63180688)
13
+ 400 val loss 7.6637
14
+ 400 val perplexity 2129.5801
15
+ 400 train 7.680656 (lr=1.4986e-04) (hash(x)=50373500)
16
+ 0 train 11.762399 (lr=5.0000e-07) (hash(x)=50671684)
17
+ 500 val loss 7.6519
18
+ 500 val perplexity 2104.6372
19
+ 500 train 7.527692 (lr=1.4969e-04) (hash(x)=44547422)
20
+ 100 val loss 9.8118
21
+ 100 val perplexity 18248.5859
22
+ 600 val loss 7.7028
23
+ 600 val perplexity 2214.5046
24
+ 100 train 9.757762 (lr=5.0500e-05) (hash(x)=52740221)
25
+ 600 train 7.582813 (lr=1.4945e-04) (hash(x)=47184699)
26
+ 700 val loss 7.6679
27
+ 700 val perplexity 2138.6357
28
+ 700 train 7.571599 (lr=1.4913e-04) (hash(x)=51374582)
29
+ 200 val loss 8.1949
30
+ 200 val perplexity 3622.3582
31
+ 200 train 8.195848 (lr=1.0000e-04) (hash(x)=49034180)
32
+ 300 val loss 7.7078
33
+ 300 val perplexity 2225.6382
34
+ 800 val loss 7.6399
35
+ 800 val perplexity 2079.5740
36
+ 300 train 7.954569 (lr=9.9977e-05) (hash(x)=63180688)
37
+ 800 train 7.360402 (lr=1.4876e-04) (hash(x)=46264805)
38
+ 900 val loss 7.6276
39
+ 900 val perplexity 2054.1938
40
+ 900 train 7.878948 (lr=1.4831e-04) (hash(x)=61178712)
41
+ 400 val loss 7.6837
42
+ 400 val perplexity 2172.6035
43
+ 400 train 7.705904 (lr=9.9908e-05) (hash(x)=50373500)
44
+ 1000 val loss 7.6159
45
+ 1000 val perplexity 2030.1835
46
+ 1000 train 7.582415 (lr=1.4779e-04) (hash(x)=50886520)
47
+ 500 val loss 7.6574
48
+ 500 val perplexity 2116.2241
49
+ 500 train 7.541328 (lr=9.9792e-05) (hash(x)=44547422)
50
+ 1100 val loss 7.5475
51
+ 1100 val perplexity 1895.9528
52
+ 1100 train 7.305693 (lr=1.4721e-04) (hash(x)=48600099)
53
+ 600 val loss 7.6596
54
+ 600 val perplexity 2120.9651
55
+ 600 train 7.551372 (lr=9.9631e-05) (hash(x)=47184699)
56
+ 1200 val loss 7.5112
57
+ 1200 val perplexity 1828.3958
58
+ 1200 train 7.210708 (lr=1.4656e-04) (hash(x)=50146792)
59
+ 700 val loss 7.6287
60
+ 700 val perplexity 2056.4128
61
+ 700 train 7.550183 (lr=9.9423e-05) (hash(x)=51374582)
62
+ 1300 val loss 7.4972
63
+ 1300 val perplexity 1802.9724
64
+ 1300 train 7.355508 (lr=1.4585e-04) (hash(x)=52617313)
65
+ 800 val loss 7.6292
66
+ 800 val perplexity 2057.5054
67
+ 800 train 7.363524 (lr=9.9170e-05) (hash(x)=46264805)
68
+ 1400 val loss 7.4674
69
+ 1400 val perplexity 1749.9814
70
+ 1400 train 7.136637 (lr=1.4507e-04) (hash(x)=49794446)
71
+ 900 val loss 7.6407
72
+ 900 val perplexity 2081.2048
73
+ 900 train 7.929552 (lr=9.8872e-05) (hash(x)=61178712)
74
+ 1500 val loss 7.4519
75
+ 1500 val perplexity 1723.1036
76
+ 1500 train 7.113922 (lr=1.4422e-04) (hash(x)=50766317)
77
+ 1000 val loss 7.6259
78
+ 1000 val perplexity 2050.5642
79
+ 1000 train 7.627928 (lr=9.8528e-05) (hash(x)=50886520)
80
+ 1600 val loss 7.4349
81
+ 1600 val perplexity 1694.0500
82
+ 1600 train 7.307274 (lr=1.4332e-04) (hash(x)=55551175)
83
+ 1100 val loss 7.6154
84
+ 1100 val perplexity 2029.2988
85
+ 1100 train 7.433861 (lr=9.8140e-05) (hash(x)=48600099)
86
+ 1700 val loss 7.3709
87
+ 1700 val perplexity 1589.0658
88
+ 1700 train 7.524827 (lr=1.4235e-04) (hash(x)=56717172)
89
+ 1200 val loss 7.6086
90
+ 1200 val perplexity 2015.4730
91
+ 1200 train 7.319845 (lr=9.7708e-05) (hash(x)=50146792)
92
+ 1800 val loss 7.3132
93
+ 1800 val perplexity 1499.9122
94
+ 1800 train 7.543727 (lr=1.4131e-04) (hash(x)=55376447)
95
+ 1300 val loss 7.6147
96
+ 1300 val perplexity 2027.8499
97
+ 1300 train 7.483940 (lr=9.7231e-05) (hash(x)=52617313)
98
+ 1900 val loss 7.2845
99
+ 1900 val perplexity 1457.4705
100
+ 1900 train 7.016429 (lr=1.4022e-04) (hash(x)=43810837)
101
+ 1400 val loss 7.6027
102
+ 1400 val perplexity 2003.5959
103
+ 1400 train 7.282799 (lr=9.6711e-05) (hash(x)=49794446)
104
+ 2000 val loss 7.2760
105
+ 2000 val perplexity 1445.2384
106
+ 2000 train 7.324104 (lr=1.3907e-04) (hash(x)=50881655)
107
+ 1500 val loss 7.5973
108
+ 1500 val perplexity 1992.9032
109
+ 1500 train 7.251457 (lr=9.6149e-05) (hash(x)=50766317)
110
+ 2100 val loss 7.2904
111
+ 2100 val perplexity 1466.1522
112
+ 2100 train 7.182340 (lr=1.3786e-04) (hash(x)=49386015)
113
+ 2200 val loss 7.2854
114
+ 2200 val perplexity 1458.8618
115
+ 2200 train 7.207294 (lr=1.3660e-04) (hash(x)=48572079)
116
+ 1600 val loss 7.5756
117
+ 1600 val perplexity 1949.9363
118
+ 1600 train 7.480996 (lr=9.5544e-05) (hash(x)=55551175)
119
+ 2300 val loss 7.2263
120
+ 2300 val perplexity 1375.1674
121
+ 2300 train 7.262286 (lr=1.3527e-04) (hash(x)=54950719)
122
+ 1700 val loss 7.5376
123
+ 1700 val perplexity 1877.3765
124
+ 1700 train 7.684578 (lr=9.4897e-05) (hash(x)=56717172)
125
+ 2400 val loss 7.2174
126
+ 2400 val perplexity 1362.8987
127
+ 2400 train 6.879970 (lr=1.3390e-04) (hash(x)=42190240)
128
+ 1800 val loss 7.4586
129
+ 1800 val perplexity 1734.6367
130
+ 1800 train 7.710993 (lr=9.4209e-05) (hash(x)=55376447)
131
+ 2500 val loss 7.1974
132
+ 2500 val perplexity 1335.8898
133
+ 2500 train 7.284463 (lr=1.3247e-04) (hash(x)=45223539)
134
+ 1900 val loss 7.4584
135
+ 1900 val perplexity 1734.4408
136
+ 1900 train 7.186811 (lr=9.3481e-05) (hash(x)=43810837)
137
+ 2600 val loss 7.1936
138
+ 2600 val perplexity 1330.8302
139
+ 2600 train 7.223955 (lr=1.3099e-04) (hash(x)=54037353)
140
+ 2000 val loss 7.4416
141
+ 2000 val perplexity 1705.4586
142
+ 2000 train 7.528937 (lr=9.2714e-05) (hash(x)=50881655)
143
+ 2700 val loss 7.2011
144
+ 2700 val perplexity 1340.8951
145
+ 2700 train 7.754804 (lr=1.2946e-04) (hash(x)=59131616)
146
+ 2100 val loss 7.4330
147
+ 2100 val perplexity 1690.8485
148
+ 2100 train 7.312185 (lr=9.1908e-05) (hash(x)=49386015)
149
+ 2800 val loss 7.1920
150
+ 2800 val perplexity 1328.6990
151
+ 2800 train 7.032724 (lr=1.2788e-04) (hash(x)=45882743)
152
+ 2200 val loss 7.3995
153
+ 2200 val perplexity 1635.2444
154
+ 2200 train 7.314746 (lr=9.1064e-05) (hash(x)=48572079)
155
+ 2900 val loss 7.1831
156
+ 2900 val perplexity 1316.9196
157
+ 2900 train 6.815891 (lr=1.2626e-04) (hash(x)=43758910)
158
+ 2300 val loss 7.3938
159
+ 2300 val perplexity 1625.8793
160
+ 2300 train 7.450395 (lr=9.0182e-05) (hash(x)=54950719)
161
+ 3000 val loss 7.1706
162
+ 3000 val perplexity 1300.5831
163
+ 3000 train 7.098728 (lr=1.2459e-04) (hash(x)=47965974)
164
+ 2400 val loss 7.3578
165
+ 2400 val perplexity 1568.3666
166
+ 2400 train 7.028030 (lr=8.9265e-05) (hash(x)=42190240)
167
+ 3100 val loss 7.1539
168
+ 3100 val perplexity 1279.1355
169
+ 3100 train 7.044363 (lr=1.2287e-04) (hash(x)=48205243)
170
+ 2500 val loss 7.3452
171
+ 2500 val perplexity 1548.7137
172
+ 2500 train 7.422577 (lr=8.8313e-05) (hash(x)=45223539)
173
+ 3200 val loss 7.1652
174
+ 3200 val perplexity 1293.6329
175
+ 3200 train 7.250436 (lr=1.2112e-04) (hash(x)=54511383)
176
+ 2600 val loss 7.3222
177
+ 2600 val perplexity 1513.4851
178
+ 2600 train 7.332448 (lr=8.7326e-05) (hash(x)=54037353)
179
+ 3300 val loss 7.1444
180
+ 3300 val perplexity 1266.9409
181
+ 3300 train 7.101440 (lr=1.1932e-04) (hash(x)=54428388)
182
+ 2700 val loss 7.3374
183
+ 2700 val perplexity 1536.7114
184
+ 2700 train 7.783456 (lr=8.6306e-05) (hash(x)=59131616)
185
+ 3400 val loss 7.1267
186
+ 3400 val perplexity 1244.7963
187
+ 3400 train 7.159663 (lr=1.1749e-04) (hash(x)=48115990)
188
+ 2800 val loss 7.3442
189
+ 2800 val perplexity 1547.1792
190
+ 2800 train 7.180212 (lr=8.5254e-05) (hash(x)=45882743)
191
+ 3500 val loss 7.1118
192
+ 3500 val perplexity 1226.2948
193
+ 3500 train 6.734808 (lr=1.1562e-04) (hash(x)=41137345)
194
+ 2900 val loss 7.3278
195
+ 2900 val perplexity 1522.0278
196
+ 2900 train 6.986289 (lr=8.4170e-05) (hash(x)=43758910)
197
+ 3600 val loss 7.1108
198
+ 3600 val perplexity 1225.1632
199
+ 3600 train 7.007893 (lr=1.1372e-04) (hash(x)=55186224)
200
+ 3000 val loss 7.3364
201
+ 3000 val perplexity 1535.1770
202
+ 3000 train 7.256237 (lr=8.3057e-05) (hash(x)=47965974)
203
+ 3700 val loss 7.1188
204
+ 3700 val perplexity 1235.0267
205
+ 3700 train 6.970138 (lr=1.1179e-04) (hash(x)=54990049)
206
+ 3100 val loss 7.2971
207
+ 3100 val perplexity 1476.0162
208
+ 3100 train 7.206101 (lr=8.1915e-05) (hash(x)=48205243)
209
+ 3800 val loss 7.0919
210
+ 3800 val perplexity 1202.2446
211
+ 3800 train 6.816278 (lr=1.0982e-04) (hash(x)=46288812)
212
+ 3200 val loss 7.2821
213
+ 3200 val perplexity 1454.0240
214
+ 3200 train 7.345427 (lr=8.0745e-05) (hash(x)=54511383)
215
+ 3900 val loss 7.0675
216
+ 3900 val perplexity 1173.2322
217
+ 3900 train 6.737766 (lr=1.0783e-04) (hash(x)=45829773)
218
+ 3300 val loss 7.2780
219
+ 3300 val perplexity 1448.0736
220
+ 3300 train 7.247321 (lr=7.9549e-05) (hash(x)=54428388)
221
+ 4000 val loss 7.0704
222
+ 4000 val perplexity 1176.6615
223
+ 4000 train 6.859371 (lr=1.0581e-04) (hash(x)=52499943)
224
+ 3400 val loss 7.2686
225
+ 3400 val perplexity 1434.5167
226
+ 3400 train 7.313385 (lr=7.8328e-05) (hash(x)=48115990)
227
+ 4100 val loss 7.0087
228
+ 4100 val perplexity 1106.1769
229
+ 4100 train 6.920537 (lr=1.0377e-04) (hash(x)=48563796)
230
+ 3500 val loss 7.2712
231
+ 3500 val perplexity 1438.2900
232
+ 3500 train 6.913240 (lr=7.7082e-05) (hash(x)=41137345)
233
+ 4200 val loss 6.9935
234
+ 4200 val perplexity 1089.5476
235
+ 4200 train 6.965348 (lr=1.0171e-04) (hash(x)=49165143)
236
+ 3600 val loss 7.2623
237
+ 3600 val perplexity 1425.5751
238
+ 3600 train 7.169919 (lr=7.5814e-05) (hash(x)=55186224)
239
+ 4300 val loss 6.9627
240
+ 4300 val perplexity 1056.4453
241
+ 4300 train 7.010408 (lr=9.9622e-05) (hash(x)=50973176)
242
+ 3700 val loss 7.2633
243
+ 3700 val perplexity 1426.9270
244
+ 3700 train 7.101009 (lr=7.4525e-05) (hash(x)=54990049)
245
+ 4400 val loss 6.9507
246
+ 4400 val perplexity 1043.8556
247
+ 4400 train 7.006516 (lr=9.7520e-05) (hash(x)=55275124)
248
+ 4500 val loss 6.9582
249
+ 4500 val perplexity 1051.7567
250
+ 4500 train 7.286173 (lr=9.5403e-05) (hash(x)=58646505)
251
+ 3800 val loss 7.2887
252
+ 3800 val perplexity 1463.6641
253
+ 3800 train 7.029315 (lr=7.3215e-05) (hash(x)=46288812)
254
+ 4600 val loss 6.9107
255
+ 4600 val perplexity 1002.9967
256
+ 4600 train 6.718947 (lr=9.3273e-05) (hash(x)=42554666)
257
+ 3900 val loss 7.2681
258
+ 3900 val perplexity 1433.8232
259
+ 3900 train 6.927856 (lr=7.1887e-05) (hash(x)=45829773)
260
+ 4700 val loss 6.9037
261
+ 4700 val perplexity 995.9628
262
+ 4700 train 6.815775 (lr=9.1132e-05) (hash(x)=47846764)
263
+ 4000 val loss 7.2776
264
+ 4000 val perplexity 1447.4598
265
+ 4000 train 7.064430 (lr=7.0541e-05) (hash(x)=52499943)
266
+ 4800 val loss 6.8718
267
+ 4800 val perplexity 964.6489
268
+ 4800 train 7.392079 (lr=8.8982e-05) (hash(x)=58239019)
269
+ 4100 val loss 7.2279
270
+ 4100 val perplexity 1377.2673
271
+ 4100 train 7.127439 (lr=6.9180e-05) (hash(x)=48563796)
272
+ 4900 val loss 6.8548
273
+ 4900 val perplexity 948.4195
274
+ 4900 train 6.890259 (lr=8.6825e-05) (hash(x)=50711220)
275
+ 4200 val loss 7.2032
276
+ 4200 val perplexity 1343.6794
277
+ 4200 train 7.175355 (lr=6.7804e-05) (hash(x)=49165143)
278
+ 5000 val loss 6.8666
279
+ 5000 val perplexity 959.6856
280
+ 5000 train 6.797673 (lr=8.4663e-05) (hash(x)=45994194)
281
+ 4300 val loss 7.1964
282
+ 4300 val perplexity 1334.5923
283
+ 4300 train 7.246315 (lr=6.6414e-05) (hash(x)=50973176)
284
+ 5100 val loss 6.8380
285
+ 5100 val perplexity 932.5778
286
+ 5100 train 6.689332 (lr=8.2500e-05) (hash(x)=48659050)
287
+ 4400 val loss 7.1871
288
+ 4400 val perplexity 1322.2667
289
+ 4400 train 7.213579 (lr=6.5013e-05) (hash(x)=55275124)
290
+ 5200 val loss 6.8171
291
+ 5200 val perplexity 913.3361
292
+ 5200 train 6.733484 (lr=8.0337e-05) (hash(x)=49369682)
293
+ 4500 val loss 7.1887
294
+ 4500 val perplexity 1324.3516
295
+ 4500 train 7.501039 (lr=6.3602e-05) (hash(x)=58646505)
296
+ 5300 val loss 6.8009
297
+ 5300 val perplexity 898.6852
298
+ 5300 train 7.195892 (lr=7.8175e-05) (hash(x)=57787700)
299
+ 4600 val loss 7.1673
300
+ 4600 val perplexity 1296.2893
301
+ 4600 train 6.993137 (lr=6.2182e-05) (hash(x)=42554666)
302
+ 5400 val loss 6.8112
303
+ 5400 val perplexity 907.9326
304
+ 5400 train 6.721237 (lr=7.6018e-05) (hash(x)=49365400)
305
+ 4700 val loss 7.1780
306
+ 4700 val perplexity 1310.3413
307
+ 4700 train 7.083921 (lr=6.0754e-05) (hash(x)=47846764)
308
+ 5500 val loss 6.7819
309
+ 5500 val perplexity 881.7007
310
+ 5500 train 6.746484 (lr=7.3868e-05) (hash(x)=48720412)
311
+ 4800 val loss 7.1616
312
+ 4800 val perplexity 1289.0020
313
+ 4800 train 7.689332 (lr=5.9321e-05) (hash(x)=58239019)
314
+ 5600 val loss 6.7828
315
+ 5600 val perplexity 882.5264
316
+ 5600 train 7.133316 (lr=7.1727e-05) (hash(x)=55784800)
317
+ 4900 val loss 7.1571
318
+ 4900 val perplexity 1283.2035
319
+ 4900 train 7.220632 (lr=5.7883e-05) (hash(x)=50711220)
320
+ 5700 val loss 6.7763
321
+ 5700 val perplexity 876.8293
322
+ 5700 train 6.594879 (lr=6.9597e-05) (hash(x)=50073634)
323
+ 5000 val loss 7.1550
324
+ 5000 val perplexity 1280.5098
325
+ 5000 train 7.076353 (lr=5.6442e-05) (hash(x)=45994194)
326
+ 5800 val loss 6.7572
327
+ 5800 val perplexity 860.2529
328
+ 5800 train 6.577003 (lr=6.7480e-05) (hash(x)=50170324)
329
+ 5100 val loss 7.1429
330
+ 5100 val perplexity 1265.0339
331
+ 5100 train 7.000565 (lr=5.5000e-05) (hash(x)=48659050)
332
+ 5900 val loss 6.7658
333
+ 5900 val perplexity 867.6870
334
+ 5900 train 6.507083 (lr=6.5378e-05) (hash(x)=48410268)
335
+ 5200 val loss 7.1410
336
+ 5200 val perplexity 1262.6818
337
+ 5200 train 7.094159 (lr=5.3558e-05) (hash(x)=49369682)
338
+ 6000 val loss 6.7626
339
+ 6000 val perplexity 864.9288
340
+ 6000 train 6.629282 (lr=6.3294e-05) (hash(x)=49527342)
341
+ 5300 val loss 7.1292
342
+ 5300 val perplexity 1247.8777
343
+ 5300 train 7.497270 (lr=5.2117e-05) (hash(x)=57787700)
344
+ 6100 val loss 6.7591
345
+ 6100 val perplexity 861.8850
346
+ 6100 train 6.519784 (lr=6.1230e-05) (hash(x)=49550294)
347
+ 5400 val loss 7.1447
348
+ 5400 val perplexity 1267.4213
349
+ 5400 train 7.050255 (lr=5.0679e-05) (hash(x)=49365400)
350
+ 6200 val loss 6.7512
351
+ 6200 val perplexity 855.1146
352
+ 6200 train 6.288379 (lr=5.9188e-05) (hash(x)=42126106)
353
+ 5500 val loss 7.1224
354
+ 5500 val perplexity 1239.3687
355
+ 5500 train 7.068456 (lr=4.9246e-05) (hash(x)=48720412)
356
+ 6300 val loss 6.7449
357
+ 6300 val perplexity 849.7552
358
+ 6300 train 6.486180 (lr=5.7169e-05) (hash(x)=49608772)
359
+ 5600 val loss 7.1197
360
+ 5600 val perplexity 1236.0317
361
+ 5600 train 7.443053 (lr=4.7818e-05) (hash(x)=55784800)
362
+ 6400 val loss 6.7319
363
+ 6400 val perplexity 838.7435
364
+ 6400 train 6.273153 (lr=5.5177e-05) (hash(x)=52324417)
365
+ 5700 val loss 7.1170
366
+ 5700 val perplexity 1232.7433
367
+ 5700 train 6.958042 (lr=4.6398e-05) (hash(x)=50073634)
368
+ 6500 val loss 6.6949
369
+ 6500 val perplexity 808.3016
370
+ 6500 train 6.726796 (lr=5.3213e-05) (hash(x)=46207215)
371
+ 5800 val loss 7.1130
372
+ 5800 val perplexity 1227.8770
373
+ 5800 train 6.936908 (lr=4.4987e-05) (hash(x)=50170324)
374
+ 6600 val loss 6.6733
375
+ 6600 val perplexity 791.0082
376
+ 6600 train 6.626161 (lr=5.1279e-05) (hash(x)=49027014)
377
+ 5900 val loss 7.1241
378
+ 5900 val perplexity 1241.5690
379
+ 5900 train 6.890422 (lr=4.3586e-05) (hash(x)=48410268)
380
+ 6700 val loss 6.6657
381
+ 6700 val perplexity 784.9918
382
+ 6700 train 6.616896 (lr=4.9377e-05) (hash(x)=46232513)
383
+ 6000 val loss 7.1221
384
+ 6000 val perplexity 1239.0070
385
+ 6000 train 7.008251 (lr=4.2196e-05) (hash(x)=49527342)
386
+ 6800 val loss 6.6663
387
+ 6800 val perplexity 785.5156
388
+ 6800 train 6.590093 (lr=4.7509e-05) (hash(x)=47348403)
389
+ 6100 val loss 7.1235
390
+ 6100 val perplexity 1240.8168
391
+ 6100 train 6.934540 (lr=4.0820e-05) (hash(x)=49550294)
392
+ 6900 val loss 6.6790
393
+ 6900 val perplexity 795.5314
394
+ 6900 train 6.656285 (lr=4.5676e-05) (hash(x)=49806647)
395
+ 6200 val loss 7.1133
396
+ 6200 val perplexity 1228.1627
397
+ 6200 train 6.688022 (lr=3.9459e-05) (hash(x)=42126106)
398
+ 7000 val loss 6.6343
399
+ 7000 val perplexity 760.7117
400
+ 7000 train 6.703082 (lr=4.3882e-05) (hash(x)=50893018)
401
+ 6300 val loss 7.1152
402
+ 6300 val perplexity 1230.5397
403
+ 6300 train 6.896497 (lr=3.8113e-05) (hash(x)=49608772)
404
+ 7100 val loss 6.6442
405
+ 7100 val perplexity 768.3418
406
+ 7100 train 6.674629 (lr=4.2128e-05) (hash(x)=49157639)
407
+ 6400 val loss 7.1097
408
+ 6400 val perplexity 1223.7263
409
+ 6400 train 6.698913 (lr=3.6785e-05) (hash(x)=52324417)
410
+ 7200 val loss 6.6269
411
+ 7200 val perplexity 755.1032
412
+ 7200 train 6.619589 (lr=4.0414e-05) (hash(x)=47014759)
413
+ 7300 val loss 6.6245
414
+ 7300 val perplexity 753.3093
415
+ 7300 train 6.609889 (lr=3.8745e-05) (hash(x)=47325591)
416
+ 6500 val loss 7.0836
417
+ 6500 val perplexity 1192.3104
418
+ 6500 train 7.098672 (lr=3.5475e-05) (hash(x)=46207215)
419
+ 7400 val loss 6.6178
420
+ 7400 val perplexity 748.2969
421
+ 7400 train 6.509352 (lr=3.7120e-05) (hash(x)=49184604)
422
+ 6600 val loss 7.0634
423
+ 6600 val perplexity 1168.4180
424
+ 6600 train 6.992548 (lr=3.4186e-05) (hash(x)=49027014)
425
+ 7500 val loss 6.6078
426
+ 7500 val perplexity 740.8351
427
+ 7500 train 6.835536 (lr=3.5541e-05) (hash(x)=55053584)
428
+ 6700 val loss 7.0569
429
+ 6700 val perplexity 1160.8925
430
+ 6700 train 6.979507 (lr=3.2918e-05) (hash(x)=46232513)
431
+ 7600 val loss 6.6067
432
+ 7600 val perplexity 740.0150
433
+ 7600 train 6.554074 (lr=3.4011e-05) (hash(x)=48693923)
434
+ 6800 val loss 7.0579
435
+ 6800 val perplexity 1162.0583
436
+ 6800 train 6.994317 (lr=3.1672e-05) (hash(x)=47348403)
437
+ 7700 val loss 6.6059
438
+ 7700 val perplexity 739.4333
439
+ 7700 train 6.199418 (lr=3.2531e-05) (hash(x)=40952882)
440
+ 6900 val loss 7.0640
441
+ 6900 val perplexity 1169.1509
442
+ 6900 train 7.008790 (lr=3.0451e-05) (hash(x)=49806647)
443
+ 7800 val loss 6.6001
444
+ 7800 val perplexity 735.1403
445
+ 7800 train 6.685646 (lr=3.1102e-05) (hash(x)=52487845)
446
+ 7000 val loss 7.0359
447
+ 7000 val perplexity 1136.6608
448
+ 7000 train 7.066860 (lr=2.9255e-05) (hash(x)=50893018)
449
+ 7900 val loss 6.5936
450
+ 7900 val perplexity 730.3727
451
+ 7900 train 6.667073 (lr=2.9726e-05) (hash(x)=50221547)
452
+ 7100 val loss 7.0304
453
+ 7100 val perplexity 1130.4474
454
+ 7100 train 7.072404 (lr=2.8085e-05) (hash(x)=49157639)
455
+ 8000 val loss 6.5985
456
+ 8000 val perplexity 733.9662
457
+ 8000 train 6.816906 (lr=2.8405e-05) (hash(x)=62294204)
458
+ 7200 val loss 7.0286
459
+ 7200 val perplexity 1128.4714
460
+ 7200 train 6.968489 (lr=2.6943e-05) (hash(x)=47014759)
461
+ 8100 val loss 6.5919
462
+ 8100 val perplexity 729.1611
463
+ 8100 train 6.290696 (lr=2.7138e-05) (hash(x)=44401967)
464
+ 7300 val loss 7.0301
465
+ 7300 val perplexity 1130.1548
466
+ 7300 train 7.023178 (lr=2.5830e-05) (hash(x)=47325591)
467
+ 8200 val loss 6.5940
468
+ 8200 val perplexity 730.6852
469
+ 8200 train 6.534984 (lr=2.5929e-05) (hash(x)=52769095)
470
+ 7400 val loss 7.0179
471
+ 7400 val perplexity 1116.3906
472
+ 7400 train 6.888594 (lr=2.4746e-05) (hash(x)=49184604)
473
+ 8300 val loss 6.6070
474
+ 8300 val perplexity 740.2457
475
+ 8300 train 6.528926 (lr=2.4778e-05) (hash(x)=56829883)
476
+ 7500 val loss 7.0154
477
+ 7500 val perplexity 1113.6748
478
+ 7500 train 7.263641 (lr=2.3694e-05) (hash(x)=55053584)
479
+ 8400 val loss 6.6083
480
+ 8400 val perplexity 741.1849
481
+ 8400 train 6.546304 (lr=2.3686e-05) (hash(x)=52147375)
482
+ 7600 val loss 7.0119
483
+ 7600 val perplexity 1109.7356
484
+ 7600 train 6.966333 (lr=2.2674e-05) (hash(x)=48693923)
485
+ 8500 val loss 6.6043
486
+ 8500 val perplexity 738.2313
487
+ 8500 train 6.838852 (lr=2.2655e-05) (hash(x)=60197820)
488
+ 7700 val loss 7.0081
489
+ 7700 val perplexity 1105.5388
490
+ 7700 train 6.596260 (lr=2.1687e-05) (hash(x)=40952882)
491
+ 8600 val loss 6.5924
492
+ 8600 val perplexity 729.5060
493
+ 8600 train 6.325205 (lr=2.1685e-05) (hash(x)=49377068)
494
+ 7800 val loss 7.0164
495
+ 7800 val perplexity 1114.7831
496
+ 7800 train 7.092906 (lr=2.0735e-05) (hash(x)=52487845)
497
+ 8700 val loss 6.6001
498
+ 8700 val perplexity 735.1347
499
+ 8700 train 6.556189 (lr=2.0777e-05) (hash(x)=51092724)
500
+ 7900 val loss 7.0032
501
+ 7900 val perplexity 1100.1108
502
+ 7900 train 7.059808 (lr=1.9818e-05) (hash(x)=50221547)
503
+ 8800 val loss 6.5955
504
+ 8800 val perplexity 731.8264
505
+ 8800 train 6.565369 (lr=1.9933e-05) (hash(x)=48642928)
506
+ 8000 val loss 7.0017
507
+ 8000 val perplexity 1098.5424
508
+ 8000 train 7.273840 (lr=1.8936e-05) (hash(x)=62294204)
509
+ 8900 val loss 6.5768
510
+ 8900 val perplexity 718.2245
511
+ 8900 train 6.758434 (lr=1.9153e-05) (hash(x)=55342246)
512
+ 8100 val loss 6.9990
513
+ 8100 val perplexity 1095.5623
514
+ 8100 train 6.701060 (lr=1.8092e-05) (hash(x)=44401967)
515
+ 9000 val loss 6.5617
516
+ 9000 val perplexity 707.4598
517
+ 9000 train 6.568685 (lr=1.8439e-05) (hash(x)=48093368)
518
+ 8200 val loss 7.0037
519
+ 8200 val perplexity 1100.6927
520
+ 8200 train 6.968071 (lr=1.7286e-05) (hash(x)=52769095)
521
+ 9100 val loss 6.5556
522
+ 9100 val perplexity 703.1831
523
+ 9100 train 6.602783 (lr=1.7790e-05) (hash(x)=48578183)
524
+ 8300 val loss 7.0071
525
+ 8300 val perplexity 1104.4213
526
+ 8300 train 6.974947 (lr=1.6519e-05) (hash(x)=56829883)
527
+ 9200 val loss 6.5544
528
+ 9200 val perplexity 702.3309
529
+ 9200 train 6.698106 (lr=1.7208e-05) (hash(x)=50794720)
530
+ 8400 val loss 7.0149
531
+ 8400 val perplexity 1113.1216
532
+ 8400 train 6.988824 (lr=1.5791e-05) (hash(x)=52147375)
533
+ 9300 val loss 6.5496
534
+ 9300 val perplexity 698.9913
535
+ 9300 train 6.316019 (lr=1.6692e-05) (hash(x)=46513190)
536
+ 8500 val loss 7.0100
537
+ 8500 val perplexity 1107.6707
538
+ 8500 train 7.226442 (lr=1.5103e-05) (hash(x)=60197820)
539
+ 9400 val loss 6.5440
540
+ 9400 val perplexity 695.0316
541
+ 9400 train 6.202390 (lr=1.6245e-05) (hash(x)=43808238)
542
+ 8600 val loss 7.0079
543
+ 8600 val perplexity 1105.2942
544
+ 8600 train 6.780334 (lr=1.4456e-05) (hash(x)=49377068)
545
+ 9500 val loss 6.5441
546
+ 9500 val perplexity 695.0981
547
+ 9500 train 6.356673 (lr=1.5865e-05) (hash(x)=45021888)
548
+ 8700 val loss 7.0063
549
+ 8700 val perplexity 1103.5695
550
+ 8700 train 6.962779 (lr=1.3851e-05) (hash(x)=51092724)
551
+ 9600 val loss 6.5407
552
+ 9600 val perplexity 692.7766
553
+ 9600 train 6.637218 (lr=1.5554e-05) (hash(x)=56525570)
554
+ 9700 val loss 6.5368
555
+ 9700 val perplexity 690.1021
556
+ 9700 train 6.719944 (lr=1.5312e-05) (hash(x)=52585913)
557
+ 8800 val loss 7.0045
558
+ 8800 val perplexity 1101.6248
559
+ 8800 train 7.003560 (lr=1.3289e-05) (hash(x)=48642928)
560
+ 9800 val loss 6.5355
561
+ 9800 val perplexity 689.1559
562
+ 9800 train 6.706218 (lr=1.5139e-05) (hash(x)=52344698)
563
+ 8900 val loss 6.9983
564
+ 8900 val perplexity 1094.7372
565
+ 8900 train 7.272505 (lr=1.2769e-05) (hash(x)=55342246)
566
+ 9900 val loss 6.5324
567
+ 9900 val perplexity 687.0436
568
+ 9900 train 6.540698 (lr=1.5035e-05) (hash(x)=51740945)
569
+ 9000 val loss 6.9807
570
+ 9000 val perplexity 1075.6921
571
+ 9000 train 6.939584 (lr=1.2292e-05) (hash(x)=48093368)
572
+ 9999 val loss 6.5308
573
+ 9999 val perplexity 685.9659
574
+ 9100 val loss 6.9772
575
+ 9100 val perplexity 1071.9318
576
+ 9100 train 7.052067 (lr=1.1860e-05) (hash(x)=48578183)
577
+ 9200 val loss 6.9744
578
+ 9200 val perplexity 1068.9447
579
+ 9200 train 7.153374 (lr=1.1472e-05) (hash(x)=50794720)
580
+ 9300 val loss 6.9714
581
+ 9300 val perplexity 1065.6759
582
+ 9300 train 6.762959 (lr=1.1128e-05) (hash(x)=46513190)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
attention_kindselective_n_heads2_seed1338/model_02500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88a13d00e75e1cf66932d118b2f84bb24f00babd0ee75e7f5801725ef5c13418
3
  size 38587970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9157bcc1377e68b286966c0a0341a5e69ef9fc2e367bcc48aba49860b5ba4d4
3
  size 38587970
attention_kindselective_n_heads2_seed1338/model_05000.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:32d5afc7324a3d22f4c1218ec09c6c54149343ba625fcba9940acb396b796c11
3
  size 38587970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f41aca9adf817c5b42c183649b46a97fcd1baba88fea3644adb57163bd39ec2
3
  size 38587970
attention_kindselective_n_heads2_seed1338/model_07500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05a121807594d5ecb5feed74e34dfa1e69b69860960bb12751f372377bb006a6
3
  size 38587970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f15805825428b7a60e7c78f2ff49c625a1b71822b49facc1d5e458f2e62df939
3
  size 38587970
attention_kindselective_n_heads2_seed1338/model_09999.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40e0b112f277f6cbba421303357ec83f7697b417831c9e4ff43b473f4f3e1114
3
  size 38587970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d811c4612e563d7ca45c8d8885b79dc2a5745e774a70903db459198e33ac0f4a
3
  size 38587970
attention_kindselective_n_heads2_seed1338/optimizer_02500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d833216fc7bfe1147572ad064c3757e6d11dab6d14b15ed9dbc193d3fe0c652
3
  size 70895430
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47848b329fdc5de70e6f1d2c743be1b06f2fc0459c1128539106cc34eb7bd6cb
3
  size 70895430
attention_kindselective_n_heads2_seed1338/optimizer_05000.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42125ee96db8d16feb270e483c050b0f0ad3baa06ce04fb49898faaa1eb18ca6
3
  size 70895430
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23d2ca9117d48314c932bb7aa5c430b045f768930d81a6a6a25c137857f81aad
3
  size 70895430
attention_kindselective_n_heads2_seed1338/optimizer_07500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4c2b49578a7dc4e2283cd26e5e389f35e17e5b918115b0f054f8c487f39e3d0
3
  size 70895430
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8615fd9af3900fecade21ff7a416b6897eb44dab958718ebeb0d5baae84031b
3
  size 70895430
attention_kindselective_n_heads2_seed1338/optimizer_09999.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42529f3655fc336117c939c1c8ec0fb125634901be3c730bed8da0466e4741ef
3
  size 70895430
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d832d53521de3812a1554dab9c41a6afb51235fdd9d9e0b4f8e9dfe2e3ccd20
3
  size 70895430