andrew-healey commited on
Commit
96bdd65
·
verified ·
1 Parent(s): edb7199

Upload folder using huggingface_hub

Browse files
attention_kindselective_n_heads2_seed1340/args.json CHANGED
@@ -1 +1 @@
1
- {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_6/attention_kindselective_n_heads2_seed1340", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 2, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_6", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1340, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 7e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "7e-5_10240_2_1340", "n_embd": 128}
 
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_6/attention_kindselective_n_heads2_seed1340", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 2, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_6", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1340, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 0.00015, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "15e-5_10240_2_1340", "n_embd": 128}
attention_kindselective_n_heads2_seed1340/log2.txt CHANGED
@@ -1,603 +1,423 @@
1
  max_steps: 10000
 
 
 
2
  0 val loss 11.8201
3
  0 val perplexity 135955.4688
4
- 0 val loss 11.8201
5
- 0 val perplexity 135955.4688
6
- 0 train 11.815376 (lr=2.5000e-07) (hash(x)=57791809)
7
- 0 train 11.815376 (lr=3.5000e-07) (hash(x)=57791809)
8
- 100 val loss 10.1415
9
- 100 val perplexity 25373.9316
10
- 100 train 10.167628 (lr=2.5250e-05) (hash(x)=48211824)
11
- 100 val loss 9.9846
12
- 100 val perplexity 21690.5820
13
- 100 train 10.007619 (lr=3.5350e-05) (hash(x)=48211824)
14
- 200 val loss 8.9560
15
- 200 val perplexity 7754.3325
16
- 200 train 8.916373 (lr=5.0000e-05) (hash(x)=50375849)
17
- 200 val loss 8.3972
18
- 200 val perplexity 4434.5674
19
- 200 train 8.351953 (lr=7.0000e-05) (hash(x)=50375849)
20
- 300 val loss 8.1359
21
- 300 val perplexity 3414.7334
22
- 300 train 8.400735 (lr=4.9988e-05) (hash(x)=57250808)
23
- 300 val loss 7.9232
24
- 300 val perplexity 2760.7124
25
- 300 train 8.212564 (lr=6.9984e-05) (hash(x)=57250808)
26
- 400 val loss 7.8720
27
- 400 val perplexity 2622.8079
28
- 400 train 8.402547 (lr=4.9954e-05) (hash(x)=62519858)
29
- 400 val loss 7.7554
30
- 400 val perplexity 2334.1978
31
- 400 train 8.325225 (lr=6.9935e-05) (hash(x)=62519858)
32
- 500 val loss 7.7049
33
- 500 val perplexity 2219.0955
34
- 500 train 7.604552 (lr=4.9896e-05) (hash(x)=47226806)
35
- 500 val loss 7.6750
36
- 500 val perplexity 2153.8015
37
- 500 train 7.583454 (lr=6.9854e-05) (hash(x)=47226806)
38
- 600 val loss 7.6373
39
- 600 val perplexity 2074.1904
40
- 600 train 7.663011 (lr=4.9815e-05) (hash(x)=51149322)
41
- 600 val loss 7.6327
42
- 600 val perplexity 2064.5627
43
- 600 train 7.648909 (lr=6.9741e-05) (hash(x)=51149322)
44
- 700 val loss 7.6224
45
- 700 val perplexity 2043.4777
46
- 700 train 7.632802 (lr=4.9712e-05) (hash(x)=51564551)
47
- 700 val loss 7.6209
48
- 700 val perplexity 2040.4641
49
- 700 train 7.626678 (lr=6.9596e-05) (hash(x)=51564551)
50
- 800 val loss 7.6059
51
- 800 val perplexity 2010.0964
52
- 800 train 7.368664 (lr=4.9585e-05) (hash(x)=45093459)
53
- 800 val loss 7.6051
54
- 800 val perplexity 2008.5107
55
- 800 train 7.361159 (lr=6.9419e-05) (hash(x)=45093459)
56
- 900 val loss 7.6009
57
- 900 val perplexity 1999.9039
58
- 900 train 7.899217 (lr=4.9436e-05) (hash(x)=54988361)
59
- 900 val loss 7.5850
60
- 900 val perplexity 1968.4664
61
- 900 train 7.888000 (lr=6.9210e-05) (hash(x)=54988361)
62
- 1000 val loss 7.5910
63
- 1000 val perplexity 1980.3516
64
- 1000 train 7.476419 (lr=4.9264e-05) (hash(x)=47588648)
65
- 1000 val loss 7.5780
66
- 1000 val perplexity 1954.8005
67
- 1000 train 7.459035 (lr=6.8970e-05) (hash(x)=47588648)
68
- 1100 val loss 7.5702
69
- 1100 val perplexity 1939.6012
70
- 1100 train 7.172775 (lr=4.9070e-05) (hash(x)=37984588)
71
- 1100 val loss 7.5466
72
- 1100 val perplexity 1894.2449
73
- 1100 train 7.153616 (lr=6.8698e-05) (hash(x)=37984588)
74
- 1200 val loss 7.5385
75
- 1200 val perplexity 1878.9194
76
- 1200 train 7.705000 (lr=4.8854e-05) (hash(x)=56333817)
77
- 1300 val loss 7.5312
78
- 1300 val perplexity 1865.3256
79
- 1200 val loss 7.5242
80
- 1200 val perplexity 1852.2689
81
- 1300 train 7.666130 (lr=4.8616e-05) (hash(x)=53454056)
82
- 1200 train 7.697269 (lr=6.8395e-05) (hash(x)=56333817)
83
- 1400 val loss 7.4924
84
- 1400 val perplexity 1794.3845
85
- 1400 train 7.599366 (lr=4.8356e-05) (hash(x)=55284163)
86
- 1300 val loss 7.5314
87
- 1300 val perplexity 1865.7615
88
- 1300 train 7.681477 (lr=6.8062e-05) (hash(x)=53454056)
89
- 1500 val loss 7.4758
90
- 1500 val perplexity 1764.8745
91
- 1500 train 7.364212 (lr=4.8074e-05) (hash(x)=48162598)
92
- 1400 val loss 7.4858
93
- 1400 val perplexity 1782.5961
94
- 1400 train 7.581354 (lr=6.7698e-05) (hash(x)=55284163)
95
- 1600 val loss 7.4609
96
- 1600 val perplexity 1738.7384
97
- 1600 train 7.523167 (lr=4.7772e-05) (hash(x)=54214535)
98
- 1500 val loss 7.4677
99
- 1500 val perplexity 1750.5398
100
- 1500 train 7.366483 (lr=6.7304e-05) (hash(x)=48162598)
101
- 1700 val loss 7.4489
102
- 1700 val perplexity 1718.0170
103
- 1700 train 7.658994 (lr=4.7448e-05) (hash(x)=53525003)
104
- 1600 val loss 7.4239
105
- 1600 val perplexity 1675.5126
106
- 1600 train 7.474539 (lr=6.6881e-05) (hash(x)=54214535)
107
- 1800 val loss 7.4343
108
- 1800 val perplexity 1693.1310
109
- 1800 train 7.507501 (lr=4.7105e-05) (hash(x)=51848994)
110
- 1700 val loss 7.3955
111
- 1700 val perplexity 1628.6207
112
- 1700 train 7.592460 (lr=6.6428e-05) (hash(x)=53525003)
113
- 1900 val loss 7.4327
114
- 1900 val perplexity 1690.2996
115
- 1900 train 7.251468 (lr=4.6741e-05) (hash(x)=48405987)
116
- 1800 val loss 7.3557
117
- 1800 val perplexity 1565.1370
118
- 1800 train 7.437174 (lr=6.5947e-05) (hash(x)=51848994)
119
- 2000 val loss 7.4107
120
- 2000 val perplexity 1653.5981
121
- 2000 train 7.784187 (lr=4.6357e-05) (hash(x)=58592291)
122
- 1900 val loss 7.3525
123
- 1900 val perplexity 1560.1082
124
- 1900 train 7.168375 (lr=6.5437e-05) (hash(x)=48405987)
125
- 2100 val loss 7.4144
126
- 2100 val perplexity 1659.6460
127
- 2100 train 7.489458 (lr=4.5954e-05) (hash(x)=51167081)
128
- 2000 val loss 7.3089
129
- 2000 val perplexity 1493.5795
130
- 2000 train 7.667870 (lr=6.4900e-05) (hash(x)=58592291)
131
- 2200 val loss 7.3779
132
- 2200 val perplexity 1600.2672
133
- 2200 train 7.367453 (lr=4.5532e-05) (hash(x)=47994988)
134
- 2100 val loss 7.3073
135
- 2100 val perplexity 1491.0868
136
- 2100 train 7.386800 (lr=6.4335e-05) (hash(x)=51167081)
137
- 2300 val loss 7.3693
138
- 2300 val perplexity 1586.4584
139
- 2300 train 7.348310 (lr=4.5091e-05) (hash(x)=47377604)
140
- 2200 val loss 7.3030
141
- 2200 val perplexity 1484.7701
142
- 2200 train 7.295847 (lr=6.3745e-05) (hash(x)=47994988)
143
- 2400 val loss 7.3560
144
- 2400 val perplexity 1565.5953
145
- 2400 train 7.408455 (lr=4.4633e-05) (hash(x)=53554323)
146
- 2300 val loss 7.2556
147
- 2300 val perplexity 1416.0649
148
- 2300 train 7.237532 (lr=6.3128e-05) (hash(x)=47377604)
149
- 2500 val loss 7.3607
150
- 2500 val perplexity 1573.0103
151
- 2500 train 7.371253 (lr=4.4156e-05) (hash(x)=50780417)
152
- 2400 val loss 7.2298
153
- 2400 val perplexity 1379.9376
154
- 2400 train 7.275304 (lr=6.2486e-05) (hash(x)=53554323)
155
- 2600 val loss 7.3246
156
- 2600 val perplexity 1517.1007
157
- 2600 train 7.223897 (lr=4.3663e-05) (hash(x)=46453562)
158
- 2500 val loss 7.2124
159
- 2500 val perplexity 1356.2070
160
- 2500 train 7.234367 (lr=6.1819e-05) (hash(x)=50780417)
161
- 2700 val loss 7.3181
162
- 2700 val perplexity 1507.3157
163
- 2700 train 7.332546 (lr=4.3153e-05) (hash(x)=54404221)
164
- 2600 val loss 7.1852
165
- 2600 val perplexity 1319.7157
166
- 2600 train 7.078228 (lr=6.1128e-05) (hash(x)=46453562)
167
- 2800 val loss 7.3003
168
- 2800 val perplexity 1480.7711
169
- 2800 train 8.040918 (lr=4.2627e-05) (hash(x)=59318895)
170
- 2700 val loss 7.1844
171
- 2700 val perplexity 1318.7426
172
- 2700 train 7.171799 (lr=6.0414e-05) (hash(x)=54404221)
173
- 2900 val loss 7.3080
174
- 2900 val perplexity 1492.1636
175
- 2900 train 7.285590 (lr=4.2085e-05) (hash(x)=47845760)
176
- 2800 val loss 7.1699
177
- 2800 val perplexity 1299.7759
178
- 2800 train 7.846885 (lr=5.9677e-05) (hash(x)=59318895)
179
- 3000 val loss 7.2772
180
- 3000 val perplexity 1446.9222
181
- 3000 train 6.974861 (lr=4.1529e-05) (hash(x)=44336167)
182
- 2900 val loss 7.1458
183
- 2900 val perplexity 1268.7697
184
- 2900 train 7.136559 (lr=5.8919e-05) (hash(x)=47845760)
185
- 3100 val loss 7.2409
186
- 3100 val perplexity 1395.3539
187
- 3100 train 7.539340 (lr=4.0957e-05) (hash(x)=44479330)
188
- 3000 val loss 7.1352
189
- 3000 val perplexity 1255.3414
190
- 3000 train 6.846057 (lr=5.8140e-05) (hash(x)=44336167)
191
- 3200 val loss 7.2380
192
- 3200 val perplexity 1391.3442
193
- 3200 train 7.291301 (lr=4.0373e-05) (hash(x)=54593096)
194
- 3100 val loss 7.1081
195
- 3100 val perplexity 1221.8489
196
- 3100 train 7.400929 (lr=5.7340e-05) (hash(x)=44479330)
197
- 3300 val loss 7.2317
198
- 3300 val perplexity 1382.6051
199
- 3300 train 7.120160 (lr=3.9775e-05) (hash(x)=45347643)
200
- 3200 val loss 7.1089
201
- 3200 val perplexity 1222.8351
202
- 3200 train 7.181127 (lr=5.6522e-05) (hash(x)=54593096)
203
- 3400 val loss 7.1952
204
- 3400 val perplexity 1332.9685
205
- 3400 train 7.318199 (lr=3.9164e-05) (hash(x)=47797247)
206
- 3300 val loss 7.0572
207
- 3300 val perplexity 1161.2457
208
- 3300 train 6.946620 (lr=5.5684e-05) (hash(x)=45347643)
209
- 3500 val loss 7.1690
210
- 3500 val perplexity 1298.5449
211
- 3500 train 7.078238 (lr=3.8541e-05) (hash(x)=46115683)
212
- 3400 val loss 7.0117
213
- 3400 val perplexity 1109.5795
214
- 3400 train 7.125251 (lr=5.4829e-05) (hash(x)=47797247)
215
- 3600 val loss 7.1592
216
- 3600 val perplexity 1285.8961
217
- 3600 train 7.025664 (lr=3.7907e-05) (hash(x)=44502074)
218
- 3500 val loss 6.9839
219
- 3500 val perplexity 1079.1410
220
- 3500 train 6.878851 (lr=5.3958e-05) (hash(x)=46115683)
221
- 3700 val loss 7.1425
222
- 3700 val perplexity 1264.6299
223
- 3700 train 7.283512 (lr=3.7262e-05) (hash(x)=55388443)
224
- 3600 val loss 6.9588
225
- 3600 val perplexity 1052.3683
226
- 3600 train 6.803612 (lr=5.3070e-05) (hash(x)=44502074)
227
- 3800 val loss 7.1436
228
- 3800 val perplexity 1266.0127
229
- 3800 train 6.931653 (lr=3.6608e-05) (hash(x)=43790341)
230
- 3900 val loss 7.1233
231
- 3900 val perplexity 1240.5541
232
- 3900 train 7.126774 (lr=3.5944e-05) (hash(x)=50013318)
233
- 3700 val loss 6.9234
234
- 3700 val perplexity 1015.7560
235
- 3700 train 7.075602 (lr=5.2167e-05) (hash(x)=55388443)
236
- 4000 val loss 7.1181
237
- 4000 val perplexity 1234.0818
238
- 4000 train 7.197229 (lr=3.5271e-05) (hash(x)=51704787)
239
- 3800 val loss 6.9045
240
- 3800 val perplexity 996.7182
241
- 3800 train 6.700143 (lr=5.1251e-05) (hash(x)=43790341)
242
- 4100 val loss 7.1075
243
- 4100 val perplexity 1221.0736
244
- 4100 train 7.178077 (lr=3.4590e-05) (hash(x)=50821964)
245
- 3900 val loss 6.8876
246
- 3900 val perplexity 980.0497
247
- 3900 train 6.885145 (lr=5.0321e-05) (hash(x)=50013318)
248
- 4200 val loss 7.1000
249
- 4200 val perplexity 1211.9559
250
- 4200 train 7.117894 (lr=3.3902e-05) (hash(x)=49675080)
251
- 4000 val loss 6.8554
252
- 4000 val perplexity 948.9904
253
- 4000 train 6.929147 (lr=4.9379e-05) (hash(x)=51704787)
254
- 4300 val loss 7.0877
255
- 4300 val perplexity 1197.1761
256
- 4300 train 6.751846 (lr=3.3207e-05) (hash(x)=43239281)
257
- 4100 val loss 6.8422
258
- 4100 val perplexity 936.5167
259
- 4100 train 6.914855 (lr=4.8426e-05) (hash(x)=50821964)
260
- 4400 val loss 7.0756
261
- 4400 val perplexity 1182.7396
262
- 4400 train 6.766150 (lr=3.2507e-05) (hash(x)=45076737)
263
- 4200 val loss 6.8086
264
- 4200 val perplexity 905.6094
265
- 4200 train 6.836856 (lr=4.7463e-05) (hash(x)=49675080)
266
- 4500 val loss 7.0691
267
- 4500 val perplexity 1175.0457
268
- 4500 train 7.197128 (lr=3.1801e-05) (hash(x)=57930262)
269
- 4300 val loss 6.7909
270
- 4300 val perplexity 889.7120
271
- 4300 train 6.455951 (lr=4.6490e-05) (hash(x)=43239281)
272
- 4600 val loss 7.0560
273
- 4600 val perplexity 1159.7599
274
- 4600 train 6.852347 (lr=3.1091e-05) (hash(x)=46721614)
275
- 4400 val loss 6.7779
276
- 4400 val perplexity 878.2114
277
- 4400 train 6.462660 (lr=4.5509e-05) (hash(x)=45076737)
278
- 4700 val loss 7.0485
279
- 4700 val perplexity 1151.1281
280
- 4700 train 6.876356 (lr=3.0377e-05) (hash(x)=49837920)
281
- 4500 val loss 6.7576
282
- 4500 val perplexity 860.5627
283
- 4500 train 6.874951 (lr=4.4521e-05) (hash(x)=57930262)
284
- 4800 val loss 7.0353
285
- 4800 val perplexity 1135.9937
286
- 4800 train 7.160434 (lr=2.9661e-05) (hash(x)=48380045)
287
- 4600 val loss 6.7361
288
- 4600 val perplexity 842.2592
289
- 4600 train 6.513708 (lr=4.3527e-05) (hash(x)=46721614)
290
- 4900 val loss 7.0306
291
- 4900 val perplexity 1130.7338
292
- 4900 train 6.909075 (lr=2.8942e-05) (hash(x)=44202577)
293
- 4700 val loss 6.7285
294
- 4700 val perplexity 835.9263
295
- 4700 train 6.564556 (lr=4.2528e-05) (hash(x)=49837920)
296
- 5000 val loss 7.0021
297
- 5000 val perplexity 1098.8903
298
- 5000 train 7.005834 (lr=2.8221e-05) (hash(x)=52038024)
299
- 4800 val loss 6.7083
300
- 4800 val perplexity 819.1469
301
- 4800 train 6.879846 (lr=4.1525e-05) (hash(x)=48380045)
302
- 5100 val loss 6.9806
303
- 5100 val perplexity 1075.6146
304
- 5100 train 7.158219 (lr=2.7500e-05) (hash(x)=53700038)
305
- 4900 val loss 6.6987
306
- 4900 val perplexity 811.3144
307
- 4900 train 6.585225 (lr=4.0518e-05) (hash(x)=44202577)
308
- 5200 val loss 6.9704
309
- 5200 val perplexity 1064.6042
310
- 5200 train 6.970703 (lr=2.6779e-05) (hash(x)=48137625)
311
- 5000 val loss 6.6739
312
- 5000 val perplexity 791.4561
313
- 5000 train 6.682315 (lr=3.9510e-05) (hash(x)=52038024)
314
- 5300 val loss 6.9584
315
- 5300 val perplexity 1051.9539
316
- 5300 train 6.836740 (lr=2.6058e-05) (hash(x)=43161573)
317
- 5100 val loss 6.6505
318
- 5100 val perplexity 773.1574
319
- 5100 train 6.805350 (lr=3.8500e-05) (hash(x)=53700038)
320
- 5400 val loss 6.9467
321
- 5400 val perplexity 1039.6730
322
- 5400 train 7.132883 (lr=2.5339e-05) (hash(x)=56673322)
323
- 5200 val loss 6.6534
324
- 5200 val perplexity 775.4347
325
- 5200 train 6.657163 (lr=3.7490e-05) (hash(x)=48137625)
326
- 5500 val loss 6.9299
327
- 5500 val perplexity 1022.4167
328
- 5500 train 7.111983 (lr=2.4623e-05) (hash(x)=53468295)
329
- 5300 val loss 6.6340
330
- 5300 val perplexity 760.5112
331
- 5300 train 6.477094 (lr=3.6482e-05) (hash(x)=43161573)
332
- 5600 val loss 6.9158
333
- 5600 val perplexity 1008.0984
334
- 5600 train 7.229111 (lr=2.3909e-05) (hash(x)=59287280)
335
- 5400 val loss 6.6298
336
- 5400 val perplexity 757.3374
337
- 5400 train 6.820119 (lr=3.5475e-05) (hash(x)=56673322)
338
- 5700 val loss 6.9053
339
- 5700 val perplexity 997.5583
340
- 5700 train 7.143720 (lr=2.3199e-05) (hash(x)=57575806)
341
- 5500 val loss 6.6147
342
- 5500 val perplexity 746.0064
343
- 5500 train 6.824338 (lr=3.4472e-05) (hash(x)=53468295)
344
- 5800 val loss 6.8949
345
- 5800 val perplexity 987.2734
346
- 5800 train 6.836562 (lr=2.2493e-05) (hash(x)=46897279)
347
- 5600 val loss 6.6057
348
- 5600 val perplexity 739.3046
349
- 5600 train 6.898512 (lr=3.3473e-05) (hash(x)=59287280)
350
- 5900 val loss 6.8898
351
- 5900 val perplexity 982.2181
352
- 5900 train 6.771390 (lr=2.1793e-05) (hash(x)=47565679)
353
- 5700 val loss 6.5965
354
- 5700 val perplexity 732.5139
355
- 5700 train 6.830234 (lr=3.2479e-05) (hash(x)=57575806)
356
- 6000 val loss 6.8769
357
- 6000 val perplexity 969.5782
358
- 6000 train 6.764159 (lr=2.1098e-05) (hash(x)=51590090)
359
- 5800 val loss 6.5881
360
- 5800 val perplexity 726.3762
361
- 5800 train 6.606775 (lr=3.1491e-05) (hash(x)=46897279)
362
- 6100 val loss 6.8705
363
- 6100 val perplexity 963.4629
364
- 6100 train 7.256005 (lr=2.0410e-05) (hash(x)=59732271)
365
- 5900 val loss 6.5897
366
- 5900 val perplexity 727.5562
367
- 5900 train 6.480405 (lr=3.0510e-05) (hash(x)=47565679)
368
- 6200 val loss 6.8648
369
- 6200 val perplexity 957.9373
370
- 6200 train 6.838703 (lr=1.9729e-05) (hash(x)=46394422)
371
- 6000 val loss 6.5795
372
- 6000 val perplexity 720.1879
373
- 6000 train 6.441273 (lr=2.9537e-05) (hash(x)=51590090)
374
- 6300 val loss 6.8538
375
- 6300 val perplexity 947.4933
376
- 6300 train 6.875601 (lr=1.9056e-05) (hash(x)=53748145)
377
- 6100 val loss 6.5637
378
- 6100 val perplexity 708.8743
379
- 6100 train 7.011058 (lr=2.8574e-05) (hash(x)=59732271)
380
- 6400 val loss 6.8485
381
- 6400 val perplexity 942.4641
382
- 6400 train 6.731420 (lr=1.8392e-05) (hash(x)=46054751)
383
- 6200 val loss 6.5600
384
- 6200 val perplexity 706.2467
385
- 6200 train 6.588350 (lr=2.7621e-05) (hash(x)=46394422)
386
- 6500 val loss 6.8386
387
- 6500 val perplexity 933.1525
388
- 6500 train 7.054920 (lr=1.7738e-05) (hash(x)=51816809)
389
- 6300 val loss 6.5540
390
- 6300 val perplexity 702.0677
391
- 6300 train 6.569966 (lr=2.6679e-05) (hash(x)=53748145)
392
- 6600 val loss 6.8378
393
- 6600 val perplexity 932.3933
394
- 6600 train 6.719009 (lr=1.7093e-05) (hash(x)=52453336)
395
- 6400 val loss 6.5467
396
- 6400 val perplexity 696.9398
397
- 6400 train 6.418003 (lr=2.5749e-05) (hash(x)=46054751)
398
- 6700 val loss 6.8274
399
- 6700 val perplexity 922.7719
400
- 6700 train 6.842637 (lr=1.6459e-05) (hash(x)=49108775)
401
- 6800 val loss 6.8191
402
- 6800 val perplexity 915.1290
403
- 6800 train 6.744590 (lr=1.5836e-05) (hash(x)=46745396)
404
- 6500 val loss 6.5368
405
- 6500 val perplexity 690.0514
406
- 6500 train 6.811157 (lr=2.4833e-05) (hash(x)=51816809)
407
- 6900 val loss 6.8036
408
- 6900 val perplexity 901.1028
409
- 6900 train 6.938798 (lr=1.5225e-05) (hash(x)=46534986)
410
- 6600 val loss 6.5317
411
- 6600 val perplexity 686.5802
412
- 6600 train 6.414201 (lr=2.3930e-05) (hash(x)=52453336)
413
- 7000 val loss 6.7982
414
- 7000 val perplexity 896.2391
415
- 7000 train 7.138734 (lr=1.4627e-05) (hash(x)=49317888)
416
- 6700 val loss 6.5316
417
- 6700 val perplexity 686.5275
418
- 6700 train 6.547672 (lr=2.3042e-05) (hash(x)=49108775)
419
- 7100 val loss 6.7898
420
- 7100 val perplexity 888.7249
421
- 7100 train 6.800003 (lr=1.4043e-05) (hash(x)=50360484)
422
- 6800 val loss 6.5200
423
- 6800 val perplexity 678.6013
424
- 6800 train 6.438734 (lr=2.2171e-05) (hash(x)=46745396)
425
- 7200 val loss 6.7843
426
- 7200 val perplexity 883.8223
427
- 7200 train 6.675083 (lr=1.3471e-05) (hash(x)=49515094)
428
- 6900 val loss 6.5101
429
- 6900 val perplexity 671.8820
430
- 6900 train 6.690259 (lr=2.1316e-05) (hash(x)=46534986)
431
- 7300 val loss 6.7862
432
- 7300 val perplexity 885.5561
433
- 7300 train 6.775707 (lr=1.2915e-05) (hash(x)=51546861)
434
- 7000 val loss 6.4946
435
- 7000 val perplexity 661.5610
436
- 7000 train 6.878696 (lr=2.0478e-05) (hash(x)=49317888)
437
- 7400 val loss 6.7751
438
- 7400 val perplexity 875.7767
439
- 7400 train 6.777234 (lr=1.2373e-05) (hash(x)=48320948)
440
- 7100 val loss 6.4876
441
- 7100 val perplexity 656.9729
442
- 7100 train 6.499301 (lr=1.9660e-05) (hash(x)=50360484)
443
- 7500 val loss 6.7680
444
- 7500 val perplexity 869.6076
445
- 7500 train 6.685040 (lr=1.1847e-05) (hash(x)=40167457)
446
- 7200 val loss 6.4838
447
- 7200 val perplexity 654.4562
448
- 7200 train 6.345729 (lr=1.8860e-05) (hash(x)=49515094)
449
- 7600 val loss 6.7610
450
- 7600 val perplexity 863.4881
451
- 7600 train 6.716415 (lr=1.1337e-05) (hash(x)=49942165)
452
- 7300 val loss 6.4852
453
- 7300 val perplexity 655.3403
454
- 7300 train 6.473186 (lr=1.8081e-05) (hash(x)=51546861)
455
- 7700 val loss 6.7578
456
- 7700 val perplexity 860.7277
457
- 7700 train 6.508514 (lr=1.0844e-05) (hash(x)=48853311)
458
- 7400 val loss 6.4816
459
- 7400 val perplexity 653.0450
460
- 7400 train 6.474329 (lr=1.7323e-05) (hash(x)=48320948)
461
- 7800 val loss 6.7528
462
- 7800 val perplexity 856.4212
463
- 7800 train 6.658655 (lr=1.0367e-05) (hash(x)=48510117)
464
- 7500 val loss 6.4817
465
- 7500 val perplexity 653.0565
466
- 7500 train 6.411730 (lr=1.6586e-05) (hash(x)=40167457)
467
- 7900 val loss 6.7509
468
- 7900 val perplexity 854.8618
469
- 7900 train 6.762117 (lr=9.9088e-06) (hash(x)=48339781)
470
- 7600 val loss 6.4654
471
- 7600 val perplexity 642.5402
472
- 7600 train 6.415967 (lr=1.5872e-05) (hash(x)=49942165)
473
- 8000 val loss 6.7459
474
- 8000 val perplexity 850.5717
475
- 8000 train 6.878915 (lr=9.4682e-06) (hash(x)=54927320)
476
- 7700 val loss 6.4599
477
- 7700 val perplexity 638.9849
478
- 7700 train 6.188712 (lr=1.5181e-05) (hash(x)=48853311)
479
- 8100 val loss 6.7406
480
- 8100 val perplexity 846.0574
481
- 8100 train 6.488393 (lr=9.0461e-06) (hash(x)=46461786)
482
- 7800 val loss 6.4578
483
- 7800 val perplexity 637.6554
484
- 7800 train 6.345714 (lr=1.4514e-05) (hash(x)=48510117)
485
- 8200 val loss 6.7377
486
- 8200 val perplexity 843.5936
487
- 8200 train 6.667369 (lr=8.6430e-06) (hash(x)=51536260)
488
- 7900 val loss 6.4579
489
- 7900 val perplexity 637.7202
490
- 7900 train 6.452045 (lr=1.3872e-05) (hash(x)=48339781)
491
- 8300 val loss 6.7352
492
- 8300 val perplexity 841.4913
493
- 8300 train 6.494115 (lr=8.2593e-06) (hash(x)=44770722)
494
- 8000 val loss 6.4506
495
- 8000 val perplexity 633.1036
496
- 8000 train 6.566749 (lr=1.3255e-05) (hash(x)=54927320)
497
- 8400 val loss 6.7321
498
- 8400 val perplexity 838.9099
499
- 8400 train 6.740037 (lr=7.8953e-06) (hash(x)=50104957)
500
- 8100 val loss 6.4473
501
- 8100 val perplexity 631.0035
502
- 8100 train 6.158883 (lr=1.2665e-05) (hash(x)=46461786)
503
- 8500 val loss 6.7283
504
- 8500 val perplexity 835.6963
505
- 8500 train 6.834835 (lr=7.5515e-06) (hash(x)=50132971)
506
- 8200 val loss 6.4479
507
- 8200 val perplexity 631.3887
508
- 8200 train 6.372143 (lr=1.2100e-05) (hash(x)=51536260)
509
- 8600 val loss 6.7235
510
- 8600 val perplexity 831.6856
511
- 8600 train 6.676203 (lr=7.2282e-06) (hash(x)=52193699)
512
- 8300 val loss 6.4479
513
- 8300 val perplexity 631.3722
514
- 8300 train 6.184940 (lr=1.1563e-05) (hash(x)=44770722)
515
- 8700 val loss 6.7189
516
- 8700 val perplexity 827.9468
517
- 8700 train 6.706408 (lr=6.9257e-06) (hash(x)=47902319)
518
- 8400 val loss 6.4421
519
- 8400 val perplexity 627.6946
520
- 8400 train 6.443494 (lr=1.1053e-05) (hash(x)=50104957)
521
- 8800 val loss 6.7133
522
- 8800 val perplexity 823.3087
523
- 8800 train 6.985338 (lr=6.6444e-06) (hash(x)=54904230)
524
- 8500 val loss 6.4391
525
- 8500 val perplexity 625.8142
526
- 8500 train 6.534442 (lr=1.0572e-05) (hash(x)=50132971)
527
- 8900 val loss 6.7132
528
- 8900 val perplexity 823.1639
529
- 8900 train 6.608115 (lr=6.3845e-06) (hash(x)=46311615)
530
- 8600 val loss 6.4317
531
- 8600 val perplexity 621.2293
532
- 8600 train 6.393053 (lr=1.0119e-05) (hash(x)=52193699)
533
- 9000 val loss 6.7123
534
- 9000 val perplexity 822.4835
535
- 9000 train 6.588591 (lr=6.1462e-06) (hash(x)=48535188)
536
- 8700 val loss 6.4298
537
- 8700 val perplexity 620.0411
538
- 8700 train 6.417854 (lr=9.6960e-06) (hash(x)=47902319)
539
- 9100 val loss 6.7054
540
- 9100 val perplexity 816.8428
541
- 9100 train 6.734750 (lr=5.9300e-06) (hash(x)=51757372)
542
- 8800 val loss 6.4236
543
- 8800 val perplexity 616.1947
544
- 8800 train 6.713367 (lr=9.3021e-06) (hash(x)=54904230)
545
- 9200 val loss 6.7031
546
- 9200 val perplexity 814.9101
547
- 9200 train 6.588158 (lr=5.7359e-06) (hash(x)=51131708)
548
- 8900 val loss 6.4212
549
- 8900 val perplexity 614.7217
550
- 8900 train 6.308746 (lr=8.9382e-06) (hash(x)=46311615)
551
- 9300 val loss 6.7031
552
- 9300 val perplexity 814.9078
553
- 9300 train 6.644286 (lr=5.5641e-06) (hash(x)=44784276)
554
- 9000 val loss 6.4235
555
- 9000 val perplexity 616.1304
556
- 9000 train 6.292938 (lr=8.6047e-06) (hash(x)=48535188)
557
- 9400 val loss 6.7000
558
- 9400 val perplexity 812.3867
559
- 9400 train 6.797466 (lr=5.4149e-06) (hash(x)=51981169)
560
- 9100 val loss 6.4145
561
- 9100 val perplexity 610.6580
562
- 9100 train 6.448308 (lr=8.3020e-06) (hash(x)=51757372)
563
- 9500 val loss 6.6971
564
- 9500 val perplexity 810.0333
565
- 9500 train 6.648178 (lr=5.2884e-06) (hash(x)=47232936)
566
- 9200 val loss 6.4128
567
- 9200 val perplexity 609.6115
568
- 9200 train 6.280666 (lr=8.0302e-06) (hash(x)=51131708)
569
- 9600 val loss 6.6957
570
- 9600 val perplexity 808.9078
571
- 9600 train 6.730829 (lr=5.1847e-06) (hash(x)=53800450)
572
- 9300 val loss 6.4138
573
- 9300 val perplexity 610.1989
574
- 9300 train 6.370407 (lr=7.7898e-06) (hash(x)=44784276)
575
- 9700 val loss 6.6942
576
- 9700 val perplexity 807.7153
577
- 9700 train 6.812382 (lr=5.1040e-06) (hash(x)=55768123)
578
- 9400 val loss 6.4115
579
- 9400 val perplexity 608.8001
580
- 9400 train 6.524776 (lr=7.5809e-06) (hash(x)=51981169)
581
- 9800 val loss 6.6956
582
- 9800 val perplexity 808.8314
583
- 9800 train 6.635161 (lr=5.0462e-06) (hash(x)=47745177)
584
- 9500 val loss 6.4100
585
- 9500 val perplexity 607.8681
586
- 9500 train 6.373538 (lr=7.4038e-06) (hash(x)=47232936)
587
- 9900 val loss 6.6930
588
- 9900 val perplexity 806.7683
589
- 9900 train 6.937419 (lr=5.0116e-06) (hash(x)=56592246)
590
- 9600 val loss 6.4066
591
- 9600 val perplexity 605.8401
592
- 9600 train 6.441170 (lr=7.2586e-06) (hash(x)=53800450)
593
- 9999 val loss 6.6932
594
- 9999 val perplexity 806.9299
595
- 9700 val loss 6.4071
596
- 9700 val perplexity 606.1292
597
- 9700 train 6.529077 (lr=7.1456e-06) (hash(x)=55768123)
598
- 9800 val loss 6.4078
599
- 9800 val perplexity 606.5295
600
- 9800 train 6.343826 (lr=7.0647e-06) (hash(x)=47745177)
601
- 9900 val loss 6.4075
602
- 9900 val perplexity 606.3977
603
- 9900 train 6.676889 (lr=7.0162e-06) (hash(x)=56592246)
 
1
  max_steps: 10000
2
+ 3000 val loss 6.8497
3
+ 3000 val perplexity 943.5969
4
+ 3000 train 6.557367 (lr=8.3057e-05) (hash(x)=44336167)
5
  0 val loss 11.8201
6
  0 val perplexity 135955.4688
7
+ 3100 val loss 6.8108
8
+ 3100 val perplexity 907.6118
9
+ 3100 train 7.108453 (lr=8.1915e-05) (hash(x)=44479330)
10
+ 3200 val loss 6.8168
11
+ 3200 val perplexity 913.0430
12
+ 3200 train 6.861060 (lr=8.0745e-05) (hash(x)=54593096)
13
+ 3300 val loss 6.7666
14
+ 3300 val perplexity 868.3719
15
+ 3300 train 6.689936 (lr=7.9549e-05) (hash(x)=45347643)
16
+ 0 train 11.815376 (lr=7.5000e-07) (hash(x)=57791809)
17
+ 3400 val loss 6.7480
18
+ 3400 val perplexity 852.3517
19
+ 3400 train 6.868625 (lr=7.8328e-05) (hash(x)=47797247)
20
+ 100 val loss 9.4103
21
+ 100 val perplexity 12214.1318
22
+ 100 train 9.435302 (lr=7.5750e-05) (hash(x)=48211824)
23
+ 3500 val loss 6.7277
24
+ 3500 val perplexity 835.2067
25
+ 3500 train 6.586505 (lr=7.7082e-05) (hash(x)=46115683)
26
+ 200 val loss 7.8489
27
+ 200 val perplexity 2562.9167
28
+ 200 train 7.782820 (lr=1.5000e-04) (hash(x)=50375849)
29
+ 3600 val loss 6.7276
30
+ 3600 val perplexity 835.1287
31
+ 3600 train 6.583769 (lr=7.5814e-05) (hash(x)=44502074)
32
+ 300 val loss 7.7186
33
+ 300 val perplexity 2249.8511
34
+ 300 train 8.037295 (lr=1.4997e-04) (hash(x)=57250808)
35
+ 3700 val loss 6.6958
36
+ 3700 val perplexity 808.9891
37
+ 3700 train 6.847288 (lr=7.4525e-05) (hash(x)=55388443)
38
+ 3800 val loss 6.6831
39
+ 3800 val perplexity 798.8195
40
+ 3800 train 6.478742 (lr=7.3215e-05) (hash(x)=43790341)
41
+ 400 val loss 7.6896
42
+ 400 val perplexity 2185.4397
43
+ 400 train 8.281481 (lr=1.4986e-04) (hash(x)=62519858)
44
+ 3900 val loss 6.6852
45
+ 3900 val perplexity 800.4603
46
+ 3900 train 6.680488 (lr=7.1887e-05) (hash(x)=50013318)
47
+ 500 val loss 7.6498
48
+ 500 val perplexity 2100.2070
49
+ 500 train 7.555541 (lr=1.4969e-04) (hash(x)=47226806)
50
+ 600 val loss 7.6013
51
+ 600 val perplexity 2000.8110
52
+ 600 train 7.611244 (lr=1.4945e-04) (hash(x)=51149322)
53
+ 4000 val loss 6.6633
54
+ 4000 val perplexity 783.1157
55
+ 4000 train 6.724937 (lr=7.0541e-05) (hash(x)=51704787)
56
+ 700 val loss 7.5505
57
+ 700 val perplexity 1901.7231
58
+ 700 train 7.570621 (lr=1.4913e-04) (hash(x)=51564551)
59
+ 4100 val loss 6.6502
60
+ 4100 val perplexity 772.9760
61
+ 4100 train 6.690805 (lr=6.9180e-05) (hash(x)=50821964)
62
+ 800 val loss 7.4868
63
+ 800 val perplexity 1784.4075
64
+ 800 train 7.224887 (lr=1.4876e-04) (hash(x)=45093459)
65
+ 4200 val loss 6.6401
66
+ 4200 val perplexity 765.1970
67
+ 4200 train 6.676926 (lr=6.7804e-05) (hash(x)=49675080)
68
+ 900 val loss 7.4101
69
+ 900 val perplexity 1652.5939
70
+ 900 train 7.676486 (lr=1.4831e-04) (hash(x)=54988361)
71
+ 4300 val loss 6.6300
72
+ 4300 val perplexity 757.5169
73
+ 4300 train 6.297640 (lr=6.6414e-05) (hash(x)=43239281)
74
+ 1000 val loss 7.3711
75
+ 1000 val perplexity 1589.3545
76
+ 1000 train 7.263645 (lr=1.4779e-04) (hash(x)=47588648)
77
+ 4400 val loss 6.6187
78
+ 4400 val perplexity 748.9337
79
+ 4400 train 6.297700 (lr=6.5013e-05) (hash(x)=45076737)
80
+ 1100 val loss 7.3459
81
+ 1100 val perplexity 1549.8197
82
+ 1100 train 6.974550 (lr=1.4721e-04) (hash(x)=37984588)
83
+ 4500 val loss 6.6069
84
+ 4500 val perplexity 740.1642
85
+ 4500 train 6.723946 (lr=6.3602e-05) (hash(x)=57930262)
86
+ 1200 val loss 7.2713
87
+ 1200 val perplexity 1438.3854
88
+ 1200 train 7.423686 (lr=1.4656e-04) (hash(x)=56333817)
89
+ 4600 val loss 6.5956
90
+ 4600 val perplexity 731.8596
91
+ 4600 train 6.346147 (lr=6.2182e-05) (hash(x)=46721614)
92
+ 1300 val loss 7.2478
93
+ 1300 val perplexity 1405.0525
94
+ 1300 train 7.408738 (lr=1.4585e-04) (hash(x)=53454056)
95
+ 4700 val loss 6.5840
96
+ 4700 val perplexity 723.4270
97
+ 4700 train 6.413177 (lr=6.0754e-05) (hash(x)=49837920)
98
+ 1400 val loss 7.1724
99
+ 1400 val perplexity 1302.9978
100
+ 1400 train 7.307054 (lr=1.4507e-04) (hash(x)=55284163)
101
+ 4800 val loss 6.5830
102
+ 4800 val perplexity 722.7350
103
+ 4800 train 6.747669 (lr=5.9321e-05) (hash(x)=48380045)
104
+ 1500 val loss 7.1053
105
+ 1500 val perplexity 1218.3500
106
+ 1500 train 6.960086 (lr=1.4422e-04) (hash(x)=48162598)
107
+ 4900 val loss 6.5705
108
+ 4900 val perplexity 713.7289
109
+ 4900 train 6.455478 (lr=5.7883e-05) (hash(x)=44202577)
110
+ 1600 val loss 7.0936
111
+ 1600 val perplexity 1204.2488
112
+ 1600 train 7.176346 (lr=1.4332e-04) (hash(x)=54214535)
113
+ 5000 val loss 6.5548
114
+ 5000 val perplexity 702.6354
115
+ 5000 train 6.531763 (lr=5.6442e-05) (hash(x)=52038024)
116
+ 1700 val loss 7.0477
117
+ 1700 val perplexity 1150.2354
118
+ 1700 train 7.288129 (lr=1.4235e-04) (hash(x)=53525003)
119
+ 5100 val loss 6.5472
120
+ 5100 val perplexity 697.2672
121
+ 5100 train 6.682254 (lr=5.5000e-05) (hash(x)=53700038)
122
+ 1800 val loss 6.9886
123
+ 1800 val perplexity 1084.1967
124
+ 1800 train 7.055763 (lr=1.4131e-04) (hash(x)=51848994)
125
+ 5200 val loss 6.5449
126
+ 5200 val perplexity 695.6867
127
+ 5200 train 6.514779 (lr=5.3558e-05) (hash(x)=48137625)
128
+ 1900 val loss 6.9741
129
+ 1900 val perplexity 1068.6400
130
+ 1900 train 6.808705 (lr=1.4022e-04) (hash(x)=48405987)
131
+ 5300 val loss 6.5362
132
+ 5300 val perplexity 689.6800
133
+ 5300 train 6.367496 (lr=5.2117e-05) (hash(x)=43161573)
134
+ 2000 val loss 6.9172
135
+ 2000 val perplexity 1009.4669
136
+ 2000 train 7.280006 (lr=1.3907e-04) (hash(x)=58592291)
137
+ 2100 val loss 6.9460
138
+ 2100 val perplexity 1038.9731
139
+ 2100 train 7.032774 (lr=1.3786e-04) (hash(x)=51167081)
140
+ 5400 val loss 6.5369
141
+ 5400 val perplexity 690.1711
142
+ 5400 train 6.705731 (lr=5.0679e-05) (hash(x)=56673322)
143
+ 2200 val loss 6.8788
144
+ 2200 val perplexity 971.4705
145
+ 2200 train 6.855141 (lr=1.3660e-04) (hash(x)=47994988)
146
+ 5500 val loss 6.5188
147
+ 5500 val perplexity 677.7745
148
+ 5500 train 6.729006 (lr=4.9246e-05) (hash(x)=53468295)
149
+ 2300 val loss 6.9196
150
+ 2300 val perplexity 1011.8920
151
+ 2300 train 6.912843 (lr=1.3527e-04) (hash(x)=47377604)
152
+ 5600 val loss 6.5010
153
+ 5600 val perplexity 665.8105
154
+ 5600 train 6.796459 (lr=4.7818e-05) (hash(x)=59287280)
155
+ 2400 val loss 6.8704
156
+ 2400 val perplexity 963.3668
157
+ 2400 train 6.928603 (lr=1.3390e-04) (hash(x)=53554323)
158
+ 5700 val loss 6.5016
159
+ 5700 val perplexity 666.2183
160
+ 5700 train 6.710692 (lr=4.6398e-05) (hash(x)=57575806)
161
+ 2500 val loss 6.8641
162
+ 2500 val perplexity 957.2761
163
+ 2500 train 6.931783 (lr=1.3247e-04) (hash(x)=50780417)
164
+ 5800 val loss 6.4968
165
+ 5800 val perplexity 663.0387
166
+ 5800 train 6.534732 (lr=4.4987e-05) (hash(x)=46897279)
167
+ 2600 val loss 6.8701
168
+ 2600 val perplexity 963.0816
169
+ 2600 train 6.758279 (lr=1.3099e-04) (hash(x)=46453562)
170
+ 5900 val loss 6.4894
171
+ 5900 val perplexity 658.1490
172
+ 5900 train 6.375528 (lr=4.3586e-05) (hash(x)=47565679)
173
+ 2700 val loss 6.8107
174
+ 2700 val perplexity 907.5153
175
+ 2700 train 6.781026 (lr=1.2946e-04) (hash(x)=54404221)
176
+ 6000 val loss 6.4841
177
+ 6000 val perplexity 654.6342
178
+ 6000 train 6.333650 (lr=4.2196e-05) (hash(x)=51590090)
179
+ 2800 val loss 6.7924
180
+ 2800 val perplexity 891.0690
181
+ 2800 train 7.439435 (lr=1.2788e-04) (hash(x)=59318895)
182
+ 6100 val loss 6.4780
183
+ 6100 val perplexity 650.6495
184
+ 6100 train 6.915969 (lr=4.0820e-05) (hash(x)=59732271)
185
+ 2900 val loss 6.7875
186
+ 2900 val perplexity 886.6631
187
+ 2900 train 6.780449 (lr=1.2626e-04) (hash(x)=47845760)
188
+ 6200 val loss 6.4670
189
+ 6200 val perplexity 643.5385
190
+ 6200 train 6.530252 (lr=3.9459e-05) (hash(x)=46394422)
191
+ 3000 val loss 6.7641
192
+ 3000 val perplexity 866.1562
193
+ 3000 train 6.461198 (lr=1.2459e-04) (hash(x)=44336167)
194
+ 6300 val loss 6.4697
195
+ 6300 val perplexity 645.2667
196
+ 6300 train 6.481519 (lr=3.8113e-05) (hash(x)=53748145)
197
+ 3100 val loss 6.7478
198
+ 3100 val perplexity 852.1684
199
+ 3100 train 7.055303 (lr=1.2287e-04) (hash(x)=44479330)
200
+ 6400 val loss 6.4581
201
+ 6400 val perplexity 637.8579
202
+ 6400 train 6.312898 (lr=3.6785e-05) (hash(x)=46054751)
203
+ 3200 val loss 6.7569
204
+ 3200 val perplexity 859.9979
205
+ 3200 train 6.806598 (lr=1.2112e-04) (hash(x)=54593096)
206
+ 6500 val loss 6.4519
207
+ 6500 val perplexity 633.8904
208
+ 6500 train 6.736262 (lr=3.5475e-05) (hash(x)=51816809)
209
+ 3300 val loss 6.7278
210
+ 3300 val perplexity 835.3075
211
+ 3300 train 6.684790 (lr=1.1932e-04) (hash(x)=45347643)
212
+ 6600 val loss 6.4588
213
+ 6600 val perplexity 638.2906
214
+ 6600 train 6.274892 (lr=3.4186e-05) (hash(x)=52453336)
215
+ 3400 val loss 6.7000
216
+ 3400 val perplexity 812.3824
217
+ 3400 train 6.819838 (lr=1.1749e-04) (hash(x)=47797247)
218
+ 6700 val loss 6.4472
219
+ 6700 val perplexity 630.9039
220
+ 6700 train 6.439846 (lr=3.2918e-05) (hash(x)=49108775)
221
+ 3500 val loss 6.6888
222
+ 3500 val perplexity 803.3629
223
+ 3500 train 6.543953 (lr=1.1562e-04) (hash(x)=46115683)
224
+ 6800 val loss 6.4345
225
+ 6800 val perplexity 622.9869
226
+ 6800 train 6.340245 (lr=3.1672e-05) (hash(x)=46745396)
227
+ 3600 val loss 6.6714
228
+ 3600 val perplexity 789.5205
229
+ 3600 train 6.545329 (lr=1.1372e-04) (hash(x)=44502074)
230
+ 6900 val loss 6.4267
231
+ 6900 val perplexity 618.1205
232
+ 6900 train 6.602197 (lr=3.0451e-05) (hash(x)=46534986)
233
+ 3700 val loss 6.6690
234
+ 3700 val perplexity 787.6163
235
+ 3700 train 6.809144 (lr=1.1179e-04) (hash(x)=55388443)
236
+ 7000 val loss 6.4191
237
+ 7000 val perplexity 613.4761
238
+ 7000 train 6.785058 (lr=2.9255e-05) (hash(x)=49317888)
239
+ 3800 val loss 6.6656
240
+ 3800 val perplexity 784.8956
241
+ 3800 train 6.453290 (lr=1.0982e-04) (hash(x)=43790341)
242
+ 7100 val loss 6.4132
243
+ 7100 val perplexity 609.8173
244
+ 7100 train 6.441988 (lr=2.8085e-05) (hash(x)=50360484)
245
+ 3900 val loss 6.6528
246
+ 3900 val perplexity 774.9131
247
+ 3900 train 6.637487 (lr=1.0783e-04) (hash(x)=50013318)
248
+ 7200 val loss 6.4056
249
+ 7200 val perplexity 605.1969
250
+ 7200 train 6.250135 (lr=2.6943e-05) (hash(x)=49515094)
251
+ 4000 val loss 6.6511
252
+ 4000 val perplexity 773.6136
253
+ 4000 train 6.725606 (lr=1.0581e-04) (hash(x)=51704787)
254
+ 7300 val loss 6.4160
255
+ 7300 val perplexity 611.5779
256
+ 7300 train 6.397116 (lr=2.5830e-05) (hash(x)=51546861)
257
+ 4100 val loss 6.6422
258
+ 4100 val perplexity 766.7709
259
+ 4100 train 6.687906 (lr=1.0377e-04) (hash(x)=50821964)
260
+ 7400 val loss 6.4147
261
+ 7400 val perplexity 610.7546
262
+ 7400 train 6.393087 (lr=2.4746e-05) (hash(x)=48320948)
263
+ 4200 val loss 6.6313
264
+ 4200 val perplexity 758.4469
265
+ 4200 train 6.676696 (lr=1.0171e-04) (hash(x)=49675080)
266
+ 7500 val loss 6.4144
267
+ 7500 val perplexity 610.5697
268
+ 7500 train 6.344778 (lr=2.3694e-05) (hash(x)=40167457)
269
+ 4300 val loss 6.6157
270
+ 4300 val perplexity 746.7346
271
+ 4300 train 6.275900 (lr=9.9622e-05) (hash(x)=43239281)
272
+ 7600 val loss 6.3956
273
+ 7600 val perplexity 599.2078
274
+ 7600 train 6.340053 (lr=2.2674e-05) (hash(x)=49942165)
275
+ 4400 val loss 6.6233
276
+ 4400 val perplexity 752.4334
277
+ 4400 train 6.317001 (lr=9.7520e-05) (hash(x)=45076737)
278
+ 7700 val loss 6.3912
279
+ 7700 val perplexity 596.5556
280
+ 7700 train 6.109773 (lr=2.1687e-05) (hash(x)=48853311)
281
+ 4500 val loss 6.6173
282
+ 4500 val perplexity 747.8881
283
+ 4500 train 6.730391 (lr=9.5403e-05) (hash(x)=57930262)
284
+ 7800 val loss 6.3869
285
+ 7800 val perplexity 594.0394
286
+ 7800 train 6.283017 (lr=2.0735e-05) (hash(x)=48510117)
287
+ 4600 val loss 6.5940
288
+ 4600 val perplexity 730.6702
289
+ 4600 train 6.354017 (lr=9.3273e-05) (hash(x)=46721614)
290
+ 7900 val loss 6.3907
291
+ 7900 val perplexity 596.3019
292
+ 7900 train 6.387419 (lr=1.9818e-05) (hash(x)=48339781)
293
+ 4700 val loss 6.5861
294
+ 4700 val perplexity 724.9485
295
+ 4700 train 6.410762 (lr=9.1132e-05) (hash(x)=49837920)
296
+ 8000 val loss 6.3852
297
+ 8000 val perplexity 593.0266
298
+ 8000 train 6.477322 (lr=1.8936e-05) (hash(x)=54927320)
299
+ 4800 val loss 6.5926
300
+ 4800 val perplexity 729.6981
301
+ 4800 train 6.733562 (lr=8.8982e-05) (hash(x)=48380045)
302
+ 8100 val loss 6.3803
303
+ 8100 val perplexity 590.0946
304
+ 8100 train 6.103539 (lr=1.8092e-05) (hash(x)=46461786)
305
+ 4900 val loss 6.5911
306
+ 4900 val perplexity 728.6015
307
+ 4900 train 6.470277 (lr=8.6825e-05) (hash(x)=44202577)
308
+ 8200 val loss 6.3766
309
+ 8200 val perplexity 587.9359
310
+ 8200 train 6.291387 (lr=1.7286e-05) (hash(x)=51536260)
311
+ 5000 val loss 6.5647
312
+ 5000 val perplexity 709.6234
313
+ 5000 train 6.568296 (lr=8.4663e-05) (hash(x)=52038024)
314
+ 8300 val loss 6.3755
315
+ 8300 val perplexity 587.3049
316
+ 8300 train 6.118826 (lr=1.6519e-05) (hash(x)=44770722)
317
+ 5100 val loss 6.5647
318
+ 5100 val perplexity 709.6227
319
+ 5100 train 6.695176 (lr=8.2500e-05) (hash(x)=53700038)
320
+ 8400 val loss 6.3739
321
+ 8400 val perplexity 586.3613
322
+ 8400 train 6.378792 (lr=1.5791e-05) (hash(x)=50104957)
323
+ 5200 val loss 6.5524
324
+ 5200 val perplexity 700.9124
325
+ 5200 train 6.519544 (lr=8.0337e-05) (hash(x)=48137625)
326
+ 8500 val loss 6.3685
327
+ 8500 val perplexity 583.1942
328
+ 8500 train 6.459326 (lr=1.5103e-05) (hash(x)=50132971)
329
+ 5300 val loss 6.5438
330
+ 5300 val perplexity 694.8887
331
+ 5300 train 6.366642 (lr=7.8175e-05) (hash(x)=43161573)
332
+ 8600 val loss 6.3651
333
+ 8600 val perplexity 581.2270
334
+ 8600 train 6.323270 (lr=1.4456e-05) (hash(x)=52193699)
335
+ 5400 val loss 6.5453
336
+ 5400 val perplexity 695.9648
337
+ 5400 train 6.720807 (lr=7.6018e-05) (hash(x)=56673322)
338
+ 8700 val loss 6.3602
339
+ 8700 val perplexity 578.3399
340
+ 8700 train 6.344991 (lr=1.3851e-05) (hash(x)=47902319)
341
+ 5500 val loss 6.5354
342
+ 5500 val perplexity 689.1136
343
+ 5500 train 6.723743 (lr=7.3868e-05) (hash(x)=53468295)
344
+ 8800 val loss 6.3528
345
+ 8800 val perplexity 574.0699
346
+ 8800 train 6.642093 (lr=1.3289e-05) (hash(x)=54904230)
347
+ 5600 val loss 6.5390
348
+ 5600 val perplexity 691.6022
349
+ 5600 train 6.854773 (lr=7.1727e-05) (hash(x)=59287280)
350
+ 8900 val loss 6.3538
351
+ 8900 val perplexity 574.6966
352
+ 8900 train 6.230078 (lr=1.2769e-05) (hash(x)=46311615)
353
+ 5700 val loss 6.5331
354
+ 5700 val perplexity 687.5286
355
+ 5700 train 6.746007 (lr=6.9597e-05) (hash(x)=57575806)
356
+ 9000 val loss 6.3527
357
+ 9000 val perplexity 574.0275
358
+ 9000 train 6.200503 (lr=1.2292e-05) (hash(x)=48535188)
359
+ 5800 val loss 6.5352
360
+ 5800 val perplexity 688.9601
361
+ 5800 train 6.542769 (lr=6.7480e-05) (hash(x)=46897279)
362
+ 9100 val loss 6.3477
363
+ 9100 val perplexity 571.1558
364
+ 9100 train 6.375886 (lr=1.1860e-05) (hash(x)=51757372)
365
+ 5900 val loss 6.5230
366
+ 5900 val perplexity 680.6183
367
+ 5900 train 6.415868 (lr=6.5378e-05) (hash(x)=47565679)
368
+ 9200 val loss 6.3460
369
+ 9200 val perplexity 570.2044
370
+ 9200 train 6.207469 (lr=1.1472e-05) (hash(x)=51131708)
371
+ 6000 val loss 6.5250
372
+ 6000 val perplexity 682.0081
373
+ 6000 train 6.372841 (lr=6.3294e-05) (hash(x)=51590090)
374
+ 9300 val loss 6.3423
375
+ 9300 val perplexity 568.0868
376
+ 9300 train 6.299280 (lr=1.1128e-05) (hash(x)=44784276)
377
+ 6100 val loss 6.5130
378
+ 6100 val perplexity 673.8539
379
+ 6100 train 6.930476 (lr=6.1230e-05) (hash(x)=59732271)
380
+ 9400 val loss 6.3432
381
+ 9400 val perplexity 568.6069
382
+ 9400 train 6.450517 (lr=1.0830e-05) (hash(x)=51981169)
383
+ 6200 val loss 6.5090
384
+ 6200 val perplexity 671.1452
385
+ 9500 val loss 6.3407
386
+ 9500 val perplexity 567.1857
387
+ 6200 train 6.571561 (lr=5.9188e-05) (hash(x)=46394422)
388
+ 9500 train 6.312754 (lr=1.0577e-05) (hash(x)=47232936)
389
+ 9600 val loss 6.3410
390
+ 9600 val perplexity 567.3699
391
+ 9600 train 6.355495 (lr=1.0369e-05) (hash(x)=53800450)
392
+ 6300 val loss 6.5214
393
+ 6300 val perplexity 679.5251
394
+ 6300 train 6.526929 (lr=5.7169e-05) (hash(x)=53748145)
395
+ 9700 val loss 6.3406
396
+ 9700 val perplexity 567.1387
397
+ 9700 train 6.445551 (lr=1.0208e-05) (hash(x)=55768123)
398
+ 6400 val loss 6.5170
399
+ 6400 val perplexity 676.5420
400
+ 6400 train 6.375550 (lr=5.5177e-05) (hash(x)=46054751)
401
+ 9800 val loss 6.3420
402
+ 9800 val perplexity 567.9086
403
+ 9800 train 6.281539 (lr=1.0092e-05) (hash(x)=47745177)
404
+ 6500 val loss 6.5060
405
+ 6500 val perplexity 669.1499
406
+ 6500 train 6.733078 (lr=5.3213e-05) (hash(x)=51816809)
407
+ 9900 val loss 6.3431
408
+ 9900 val perplexity 568.5347
409
+ 9900 train 6.613006 (lr=1.0023e-05) (hash(x)=56592246)
410
+ 6600 val loss 6.4993
411
+ 6600 val perplexity 664.6857
412
+ 6600 train 6.381551 (lr=5.1279e-05) (hash(x)=52453336)
413
+ 9999 val loss 6.3466
414
+ 9999 val perplexity 570.5632
415
+ 6700 val loss 6.5058
416
+ 6700 val perplexity 669.0009
417
+ 6700 train 6.497029 (lr=4.9377e-05) (hash(x)=49108775)
418
+ 6800 val loss 6.4920
419
+ 6800 val perplexity 659.8412
420
+ 6800 train 6.398888 (lr=4.7509e-05) (hash(x)=46745396)
421
+ 6900 val loss 6.4899
422
+ 6900 val perplexity 658.4554
423
+ 6900 train 6.663884 (lr=4.5676e-05) (hash(x)=46534986)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
attention_kindselective_n_heads2_seed1340/model_02500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c94c460d7c2e918bd19dd6c7d0a40fbf731522e2be5d922fe6d210895f4b6cd
3
  size 38587970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db5e71e2651a7e4c04adf1b1bdd2fd67898b9838ff7ce2857cd735f4766b5532
3
  size 38587970
attention_kindselective_n_heads2_seed1340/model_05000.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99967d2565438966d9e46b6de33f253f4b8e326c9f4cfe096170054f1ecb7401
3
  size 38587970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87a3a4e1a720bf19b1c7f850b83a50505c837892b3964a6f17b36ac335b8a606
3
  size 38587970
attention_kindselective_n_heads2_seed1340/model_07500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3fc8260bce2fd1ee0a25bb0aefdf1b927435a4e83a5f3001b7cd9a010becef3e
3
  size 38587970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43a6634ddb7efa2797ba98f1129965459e095430bafbbef119764211ecce5d4f
3
  size 38587970
attention_kindselective_n_heads2_seed1340/model_09999.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ffa02ec9ceef9ae047bb6c9f2df20bb20a78f2a2d95be921f8573aad83aa4f34
3
  size 38587970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3b59f7ce706fb6b80035e89fbc4a42332e13eb0a0bc36b121c007413b2e495b
3
  size 38587970
attention_kindselective_n_heads2_seed1340/optimizer_02500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e00018265047cc47ecf529e2cb77c9da38ab5ffdb63eb4dd62a77f724a4e61f1
3
  size 70895430
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad182381a14e875a3c04a9d0fce780b0641e040b738f8454b13df6255a740e34
3
  size 70895430
attention_kindselective_n_heads2_seed1340/optimizer_05000.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1e45fc5874ff2b4f6e33851010911bf7688145d2c6254585473d064c73335ec
3
  size 70895430
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89e51f2fe0afb3163bdb42b8d6c2d3968fb0c1d3a51b33ea7ae5061524b2b719
3
  size 70895430
attention_kindselective_n_heads2_seed1340/optimizer_07500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3fa9a215741606396ff07a59b06d77363e8598933439c4e40b9445a216a12bff
3
  size 70895430
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28aada8d81583aad9ff815ca63797851095f2d61265dbd7042c05b5071bebf1c
3
  size 70895430
attention_kindselective_n_heads2_seed1340/optimizer_09999.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b894c3c084b1f0ca1c18954ebede742cd1b3528fd4d2b7daeef15cd86b6a87b
3
  size 70895430
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93e34e940783142b8f4cb2c5281bcbe179e89f9432b789eb0baeb86a876f3e07
3
  size 70895430