andrew-healey commited on
Commit
fa1fb26
·
verified ·
1 Parent(s): c0abbe7

Upload folder using huggingface_hub

Browse files
attention_kindselective_n_heads4_seed1338/args.json CHANGED
@@ -1 +1 @@
1
- {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_9/attention_kindselective_n_heads4_seed1338", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_9", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1338, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 4.5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "4.5e-5_61440_4_1338", "n_embd": 256}
 
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_9/attention_kindselective_n_heads4_seed1338", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_9", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1338, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "5e-5_61440_4_1338", "n_embd": 256}
attention_kindselective_n_heads4_seed1338/log2.txt CHANGED
@@ -1,267 +1,525 @@
1
  max_steps: 8750
 
 
 
2
  0 val loss 11.2646
3
- 0 val perplexity 78014.2266
4
- 0 train 11.267744 (lr=9.0000e-08) (hash(x)=150327452)
5
- 100 val loss 9.9869
6
- 100 val perplexity 21740.5742
7
- 100 train 10.040390 (lr=9.0900e-06) (hash(x)=166780046)
8
- 200 val loss 9.3625
9
- 200 val perplexity 11643.5410
10
- 200 train 9.319342 (lr=1.8090e-05) (hash(x)=155040610)
11
- 300 val loss 8.4028
12
- 300 val perplexity 4459.5981
13
- 300 train 8.416439 (lr=2.7090e-05) (hash(x)=155504036)
14
- 400 val loss 7.7030
15
- 400 val perplexity 2215.0740
16
- 400 train 7.579350 (lr=3.6090e-05) (hash(x)=143823248)
17
- 500 val loss 7.4875
18
- 500 val perplexity 1785.6263
19
- 500 train 7.457825 (lr=4.5000e-05) (hash(x)=143734685)
20
- 600 val loss 7.4083
21
- 600 val perplexity 1649.5944
22
- 600 train 7.185647 (lr=4.4985e-05) (hash(x)=150678249)
23
- 700 val loss 7.3038
24
- 700 val perplexity 1485.9933
25
- 700 train 7.454690 (lr=4.4941e-05) (hash(x)=175802021)
26
- 800 val loss 7.1949
27
- 800 val perplexity 1332.6075
28
- 800 train 7.207471 (lr=4.4868e-05) (hash(x)=158681215)
29
- 900 val loss 7.1159
30
- 900 val perplexity 1231.3386
31
- 900 train 7.008085 (lr=4.4766e-05) (hash(x)=146108145)
32
- 1000 val loss 7.0572
33
- 1000 val perplexity 1161.1853
34
- 1000 train 6.899328 (lr=4.4634e-05) (hash(x)=154996086)
35
- 1100 val loss 6.9827
36
- 1100 val perplexity 1077.7751
37
- 1100 train 6.962049 (lr=4.4474e-05) (hash(x)=153885445)
38
- 1200 val loss 6.9096
39
- 1200 val perplexity 1001.8237
40
- 1200 train 6.864575 (lr=4.4285e-05) (hash(x)=142353087)
41
- 1300 val loss 6.8560
42
- 1300 val perplexity 949.5422
43
- 1300 train 6.762306 (lr=4.4068e-05) (hash(x)=150750353)
44
- 1400 val loss 6.7982
45
- 1400 val perplexity 896.2370
46
- 1400 train 6.618343 (lr=4.3822e-05) (hash(x)=152767913)
47
- 1500 val loss 6.7233
48
- 1500 val perplexity 831.5230
49
- 1500 train 6.760575 (lr=4.3549e-05) (hash(x)=151562048)
50
- 1600 val loss 6.6423
51
- 1600 val perplexity 766.8309
52
- 1600 train 6.643299 (lr=4.3249e-05) (hash(x)=166486165)
53
- 1700 val loss 6.5837
54
- 1700 val perplexity 723.2342
55
- 1700 train 6.229164 (lr=4.2922e-05) (hash(x)=130835396)
56
- 1800 val loss 6.5306
57
- 1800 val perplexity 685.8266
58
- 1800 train 6.576537 (lr=4.2569e-05) (hash(x)=158851816)
59
- 1900 val loss 6.4929
60
- 1900 val perplexity 660.4493
61
- 1900 train 6.526217 (lr=4.2190e-05) (hash(x)=153313879)
62
- 2000 val loss 6.4303
63
- 2000 val perplexity 620.3297
64
- 2000 train 6.358914 (lr=4.1785e-05) (hash(x)=158245023)
65
- 2100 val loss 6.3892
66
- 2100 val perplexity 595.4015
67
- 2100 train 6.213952 (lr=4.1356e-05) (hash(x)=157204896)
68
- 2200 val loss 6.3634
69
- 2200 val perplexity 580.2315
70
- 2200 train 6.200081 (lr=4.0903e-05) (hash(x)=137541932)
71
- 2300 val loss 6.3336
72
- 2300 val perplexity 563.1638
73
- 2300 train 6.286159 (lr=4.0426e-05) (hash(x)=150149692)
74
- 2400 val loss 6.2952
75
- 2400 val perplexity 541.9569
76
- 2400 train 6.276896 (lr=3.9927e-05) (hash(x)=151730720)
77
- 2500 val loss 6.2657
78
- 2500 val perplexity 526.2218
79
- 2500 train 6.048392 (lr=3.9406e-05) (hash(x)=143406752)
80
- 2600 val loss 6.2478
81
- 2600 val perplexity 516.8922
82
- 2600 train 6.079948 (lr=3.8863e-05) (hash(x)=157272496)
83
- 2700 val loss 6.2204
84
- 2700 val perplexity 502.9050
85
- 2700 train 6.206706 (lr=3.8300e-05) (hash(x)=155342327)
86
- 2800 val loss 6.1862
87
- 2800 val perplexity 485.9829
88
- 2800 train 6.062219 (lr=3.7717e-05) (hash(x)=140626679)
89
- 2900 val loss 6.1618
90
- 2900 val perplexity 474.2803
91
- 2900 train 6.014542 (lr=3.7116e-05) (hash(x)=144953350)
92
- 3000 val loss 6.1459
93
- 3000 val perplexity 466.8199
94
- 3000 train 6.044561 (lr=3.6496e-05) (hash(x)=172449837)
95
- 3100 val loss 6.1350
96
- 3100 val perplexity 461.7465
97
- 3100 train 5.924545 (lr=3.5860e-05) (hash(x)=141710086)
98
- 3200 val loss 6.0963
99
- 3200 val perplexity 444.2307
100
- 3200 train 6.025388 (lr=3.5207e-05) (hash(x)=151299772)
101
- 3300 val loss 6.0834
102
- 3300 val perplexity 438.5322
103
- 3300 train 5.949057 (lr=3.4539e-05) (hash(x)=146473110)
104
- 3400 val loss 6.0566
105
- 3400 val perplexity 426.9290
106
- 3400 train 6.142395 (lr=3.3857e-05) (hash(x)=153954157)
107
- 3500 val loss 6.0423
108
- 3500 val perplexity 420.8712
109
- 3500 train 5.983976 (lr=3.3162e-05) (hash(x)=153717336)
110
- 3600 val loss 6.0249
111
- 3600 val perplexity 413.6099
112
- 3600 train 5.780908 (lr=3.2455e-05) (hash(x)=144965161)
113
- 3700 val loss 6.0154
114
- 3700 val perplexity 409.6885
115
- 3700 train 5.792235 (lr=3.1736e-05) (hash(x)=125969741)
116
- 3800 val loss 5.9919
117
- 3800 val perplexity 400.1568
118
- 3800 train 5.857428 (lr=3.1008e-05) (hash(x)=155070487)
119
- 3900 val loss 5.9737
120
- 3900 val perplexity 392.9507
121
- 3900 train 5.845443 (lr=3.0270e-05) (hash(x)=149444644)
122
- 4000 val loss 5.9613
123
- 4000 val perplexity 388.1236
124
- 4000 train 5.812981 (lr=2.9524e-05) (hash(x)=151663033)
125
- 4100 val loss 5.9495
126
- 4100 val perplexity 383.5775
127
- 4100 train 5.907077 (lr=2.8771e-05) (hash(x)=143688282)
128
- 4200 val loss 5.9299
129
- 4200 val perplexity 376.1001
130
- 4200 train 5.928873 (lr=2.8013e-05) (hash(x)=163361651)
131
- 4300 val loss 5.9129
132
- 4300 val perplexity 369.7745
133
- 4300 train 5.957870 (lr=2.7250e-05) (hash(x)=153619361)
134
- 4400 val loss 5.9052
135
- 4400 val perplexity 366.9546
136
- 4400 train 6.219810 (lr=2.6483e-05) (hash(x)=168527064)
137
- 4500 val loss 5.8973
138
- 4500 val perplexity 364.0627
139
- 4500 train 5.713622 (lr=2.5714e-05) (hash(x)=125588037)
140
- 4600 val loss 5.8746
141
- 4600 val perplexity 355.8658
142
- 4600 train 5.831522 (lr=2.4943e-05) (hash(x)=143710941)
143
- 4700 val loss 5.8623
144
- 4700 val perplexity 351.5312
145
- 4700 train 5.698743 (lr=2.4172e-05) (hash(x)=150952742)
146
- 4800 val loss 5.8534
147
- 4800 val perplexity 348.4177
148
- 4800 train 5.727831 (lr=2.3402e-05) (hash(x)=145323659)
149
- 4900 val loss 5.8470
150
- 4900 val perplexity 346.1969
151
- 4900 train 5.861619 (lr=2.2633e-05) (hash(x)=153151397)
152
- 5000 val loss 5.8346
153
- 5000 val perplexity 341.9235
154
- 5000 train 5.740033 (lr=2.1868e-05) (hash(x)=143182059)
155
- 5100 val loss 5.8234
156
- 5100 val perplexity 338.1063
157
- 5100 train 5.859928 (lr=2.1107e-05) (hash(x)=170083586)
158
- 5200 val loss 5.8080
159
- 5200 val perplexity 332.9380
160
- 5200 train 5.692333 (lr=2.0351e-05) (hash(x)=149363919)
161
- 5300 val loss 5.8036
162
- 5300 val perplexity 331.4817
163
- 5300 train 5.690423 (lr=1.9602e-05) (hash(x)=152033784)
164
- 5400 val loss 5.7884
165
- 5400 val perplexity 326.4917
166
- 5400 train 5.901883 (lr=1.8860e-05) (hash(x)=154614289)
167
- 5500 val loss 5.7798
168
- 5500 val perplexity 323.7035
169
- 5500 train 5.871509 (lr=1.8127e-05) (hash(x)=157745174)
170
- 5600 val loss 5.7756
171
- 5600 val perplexity 322.3418
172
- 5600 train 5.672544 (lr=1.7403e-05) (hash(x)=147693222)
173
- 5700 val loss 5.7623
174
- 5700 val perplexity 318.0876
175
- 5700 train 5.693845 (lr=1.6690e-05) (hash(x)=149784627)
176
- 5800 val loss 5.7625
177
- 5800 val perplexity 318.1317
178
- 5800 train 5.686205 (lr=1.5989e-05) (hash(x)=158620729)
179
- 5900 val loss 5.7499
180
- 5900 val perplexity 314.1568
181
- 5900 train 5.698482 (lr=1.5300e-05) (hash(x)=159763910)
182
- 6000 val loss 5.7406
183
- 6000 val perplexity 311.2519
184
- 6000 train 5.675223 (lr=1.4625e-05) (hash(x)=147640561)
185
- 6100 val loss 5.7337
186
- 6100 val perplexity 309.1020
187
- 6100 train 5.713614 (lr=1.3965e-05) (hash(x)=156613394)
188
- 6200 val loss 5.7276
189
- 6200 val perplexity 307.2375
190
- 6200 train 5.664627 (lr=1.3320e-05) (hash(x)=186221290)
191
- 6300 val loss 5.7168
192
- 6300 val perplexity 303.9236
193
- 6300 train 5.591850 (lr=1.2692e-05) (hash(x)=152081419)
194
- 6400 val loss 5.7170
195
- 6400 val perplexity 303.9820
196
- 6400 train 5.669156 (lr=1.2081e-05) (hash(x)=154808349)
197
- 6500 val loss 5.7096
198
- 6500 val perplexity 301.7650
199
- 6500 train 5.673279 (lr=1.1489e-05) (hash(x)=159437208)
200
- 6600 val loss 5.7078
201
- 6600 val perplexity 301.1956
202
- 6600 train 5.518106 (lr=1.0916e-05) (hash(x)=157933074)
203
- 6700 val loss 5.7002
204
- 6700 val perplexity 298.9182
205
- 6700 train 5.680590 (lr=1.0363e-05) (hash(x)=161560240)
206
- 6800 val loss 5.6961
207
- 6800 val perplexity 297.7002
208
- 6800 train 5.674346 (lr=9.8310e-06) (hash(x)=155424292)
209
- 6900 val loss 5.6936
210
- 6900 val perplexity 296.9641
211
- 6900 train 5.687900 (lr=9.3205e-06) (hash(x)=148561470)
212
- 7000 val loss 5.6879
213
- 7000 val perplexity 295.2784
214
- 7000 train 5.582413 (lr=8.8324e-06) (hash(x)=141527450)
215
- 7100 val loss 5.6817
216
- 7100 val perplexity 293.4472
217
- 7100 train 5.636769 (lr=8.3674e-06) (hash(x)=151066339)
218
- 7200 val loss 5.6829
219
- 7200 val perplexity 293.8135
220
- 7200 train 5.478912 (lr=7.9261e-06) (hash(x)=155231264)
221
- 7300 val loss 5.6733
222
- 7300 val perplexity 290.9905
223
- 7300 train 5.695995 (lr=7.5093e-06) (hash(x)=150281149)
224
- 7400 val loss 5.6712
225
- 7400 val perplexity 290.3902
226
- 7400 train 5.553436 (lr=7.1174e-06) (hash(x)=148421717)
227
- 7500 val loss 5.6716
228
- 7500 val perplexity 290.4991
229
- 7500 train 5.431908 (lr=6.7511e-06) (hash(x)=146921118)
230
- 7600 val loss 5.6636
231
- 7600 val perplexity 288.1749
232
- 7600 train 5.711033 (lr=6.4109e-06) (hash(x)=150660048)
233
- 7700 val loss 5.6626
234
- 7700 val perplexity 287.9061
235
- 7700 train 5.536907 (lr=6.0972e-06) (hash(x)=148059852)
236
- 7800 val loss 5.6614
237
- 7800 val perplexity 287.5586
238
- 7800 train 5.351384 (lr=5.8107e-06) (hash(x)=148331002)
239
- 7900 val loss 5.6558
240
- 7900 val perplexity 285.9372
241
- 7900 train 5.604271 (lr=5.5515e-06) (hash(x)=164923883)
242
- 8000 val loss 5.6541
243
- 8000 val perplexity 285.4702
244
- 8000 train 5.535724 (lr=5.3203e-06) (hash(x)=143545384)
245
- 8100 val loss 5.6531
246
- 8100 val perplexity 285.1748
247
- 8100 train 5.466304 (lr=5.1172e-06) (hash(x)=160686959)
248
- 8200 val loss 5.6509
249
- 8200 val perplexity 284.5504
250
- 8200 train 5.601032 (lr=4.9425e-06) (hash(x)=156501889)
251
- 8300 val loss 5.6488
252
- 8300 val perplexity 283.9383
253
- 8300 train 5.486124 (lr=4.7966e-06) (hash(x)=142716875)
254
- 8400 val loss 5.6439
255
- 8400 val perplexity 282.5641
256
- 8400 train 5.561882 (lr=4.6796e-06) (hash(x)=154436684)
257
- 8500 val loss 5.6430
258
- 8500 val perplexity 282.2974
259
- 8500 train 5.726328 (lr=4.5917e-06) (hash(x)=147965839)
260
- 8600 val loss 5.6415
261
- 8600 val perplexity 281.8958
262
- 8600 train 5.439447 (lr=4.5330e-06) (hash(x)=145228097)
263
- 8700 val loss 5.6370
264
- 8700 val perplexity 280.6302
265
- 8700 train 5.829304 (lr=4.5037e-06) (hash(x)=152910357)
266
- 8749 val loss 5.6369
267
- 8749 val perplexity 280.5848
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  max_steps: 8750
2
+ 100 val loss 10.0225
3
+ 100 val perplexity 22528.5176
4
+ 100 train 10.075231 (lr=8.0800e-06) (hash(x)=166780046)
5
  0 val loss 11.2646
6
+ 0 val perplexity 78014.3047
7
+ 200 val loss 9.5031
8
+ 200 val perplexity 13401.6328
9
+ 200 train 9.467043 (lr=1.6080e-05) (hash(x)=155040610)
10
+ 0 train 11.267765 (lr=1.0000e-07) (hash(x)=150327452)
11
+ 300 val loss 8.5047
12
+ 300 val perplexity 4938.0015
13
+ 300 train 8.524590 (lr=2.4080e-05) (hash(x)=155504036)
14
+ 100 val loss 9.9652
15
+ 100 val perplexity 21274.1113
16
+ 100 train 10.017369 (lr=1.0100e-05) (hash(x)=166780046)
17
+ 400 val loss 7.7785
18
+ 400 val perplexity 2388.7463
19
+ 400 train 7.663756 (lr=3.2080e-05) (hash(x)=143823248)
20
+ 200 val loss 9.2958
21
+ 200 val perplexity 10892.5625
22
+ 200 train 9.248415 (lr=2.0100e-05) (hash(x)=155040610)
23
+ 500 val loss 7.5217
24
+ 500 val perplexity 1847.7620
25
+ 500 train 7.493058 (lr=4.0000e-05) (hash(x)=143734685)
26
+ 300 val loss 8.1663
27
+ 300 val perplexity 3520.3948
28
+ 300 train 8.168233 (lr=3.0100e-05) (hash(x)=155504036)
29
+ 600 val loss 7.4353
30
+ 600 val perplexity 1694.7335
31
+ 600 train 7.215819 (lr=3.9987e-05) (hash(x)=150678249)
32
+ 400 val loss 7.6121
33
+ 400 val perplexity 2022.5510
34
+ 400 train 7.478056 (lr=4.0100e-05) (hash(x)=143823248)
35
+ 700 val loss 7.3528
36
+ 700 val perplexity 1560.5144
37
+ 700 train 7.493054 (lr=3.9948e-05) (hash(x)=175802021)
38
+ 500 val loss 7.4302
39
+ 500 val perplexity 1686.1023
40
+ 500 train 7.398177 (lr=5.0000e-05) (hash(x)=143734685)
41
+ 800 val loss 7.2439
42
+ 800 val perplexity 1399.5740
43
+ 800 train 7.254493 (lr=3.9883e-05) (hash(x)=158681215)
44
+ 600 val loss 7.3581
45
+ 600 val perplexity 1568.7780
46
+ 600 train 7.127360 (lr=4.9984e-05) (hash(x)=150678249)
47
+ 900 val loss 7.1674
48
+ 900 val perplexity 1296.4656
49
+ 900 train 7.060616 (lr=3.9792e-05) (hash(x)=146108145)
50
+ 700 val loss 7.2431
51
+ 700 val perplexity 1398.4459
52
+ 700 train 7.374388 (lr=4.9935e-05) (hash(x)=175802021)
53
+ 1000 val loss 7.1057
54
+ 1000 val perplexity 1218.8782
55
+ 1000 train 6.948586 (lr=3.9675e-05) (hash(x)=154996086)
56
+ 800 val loss 7.1141
57
+ 800 val perplexity 1229.1447
58
+ 800 train 7.129083 (lr=4.9853e-05) (hash(x)=158681215)
59
+ 1100 val loss 7.0173
60
+ 1100 val perplexity 1115.7611
61
+ 1100 train 7.000314 (lr=3.9532e-05) (hash(x)=153885445)
62
+ 900 val loss 7.0241
63
+ 900 val perplexity 1123.3754
64
+ 900 train 6.911726 (lr=4.9739e-05) (hash(x)=146108145)
65
+ 1200 val loss 6.9212
66
+ 1200 val perplexity 1013.5053
67
+ 1200 train 6.879501 (lr=3.9364e-05) (hash(x)=142353087)
68
+ 1000 val loss 6.9540
69
+ 1000 val perplexity 1047.3751
70
+ 1000 train 6.793063 (lr=4.9593e-05) (hash(x)=154996086)
71
+ 1300 val loss 6.8506
72
+ 1300 val perplexity 944.4738
73
+ 1300 train 6.759327 (lr=3.9171e-05) (hash(x)=150750353)
74
+ 1100 val loss 6.8664
75
+ 1100 val perplexity 959.4811
76
+ 1100 train 6.845954 (lr=4.9415e-05) (hash(x)=153885445)
77
+ 1400 val loss 6.7955
78
+ 1400 val perplexity 893.8453
79
+ 1400 train 6.617004 (lr=3.8953e-05) (hash(x)=152767913)
80
+ 1200 val loss 6.7910
81
+ 1200 val perplexity 889.7973
82
+ 1200 train 6.746166 (lr=4.9205e-05) (hash(x)=142353087)
83
+ 1500 val loss 6.7221
84
+ 1500 val perplexity 830.5728
85
+ 1500 train 6.764291 (lr=3.8711e-05) (hash(x)=151562048)
86
+ 1300 val loss 6.7221
87
+ 1300 val perplexity 830.5680
88
+ 1300 train 6.625785 (lr=4.8964e-05) (hash(x)=150750353)
89
+ 1600 val loss 6.6517
90
+ 1600 val perplexity 774.0981
91
+ 1600 train 6.655982 (lr=3.8444e-05) (hash(x)=166486165)
92
+ 1400 val loss 6.6699
93
+ 1400 val perplexity 788.3265
94
+ 1400 train 6.484465 (lr=4.8691e-05) (hash(x)=152767913)
95
+ 1700 val loss 6.5988
96
+ 1700 val perplexity 734.1826
97
+ 1700 train 6.246137 (lr=3.8153e-05) (hash(x)=130835396)
98
+ 1500 val loss 6.6045
99
+ 1500 val perplexity 738.4147
100
+ 1500 train 6.649715 (lr=4.8388e-05) (hash(x)=151562048)
101
+ 1800 val loss 6.5471
102
+ 1800 val perplexity 697.1981
103
+ 1800 train 6.590258 (lr=3.7839e-05) (hash(x)=158851816)
104
+ 1600 val loss 6.5291
105
+ 1600 val perplexity 684.7996
106
+ 1600 train 6.542716 (lr=4.8055e-05) (hash(x)=166486165)
107
+ 1900 val loss 6.5106
108
+ 1900 val perplexity 672.2518
109
+ 1900 train 6.546779 (lr=3.7502e-05) (hash(x)=153313879)
110
+ 1700 val loss 6.4875
111
+ 1700 val perplexity 656.9081
112
+ 1700 train 6.130574 (lr=4.7691e-05) (hash(x)=130835396)
113
+ 2000 val loss 6.4463
114
+ 2000 val perplexity 630.3575
115
+ 2000 train 6.380337 (lr=3.7143e-05) (hash(x)=158245023)
116
+ 1800 val loss 6.4404
117
+ 1800 val perplexity 626.6369
118
+ 1800 train 6.488020 (lr=4.7299e-05) (hash(x)=158851816)
119
+ 2100 val loss 6.4013
120
+ 2100 val perplexity 602.6064
121
+ 2100 train 6.222491 (lr=3.6761e-05) (hash(x)=157204896)
122
+ 1900 val loss 6.4053
123
+ 1900 val perplexity 605.0572
124
+ 1900 train 6.435752 (lr=4.6878e-05) (hash(x)=153313879)
125
+ 2200 val loss 6.3789
126
+ 2200 val perplexity 589.2851
127
+ 2200 train 6.219698 (lr=3.6358e-05) (hash(x)=137541932)
128
+ 2000 val loss 6.3458
129
+ 2000 val perplexity 570.0729
130
+ 2000 train 6.278543 (lr=4.6428e-05) (hash(x)=158245023)
131
+ 2300 val loss 6.3499
132
+ 2300 val perplexity 572.4460
133
+ 2300 train 6.300820 (lr=3.5935e-05) (hash(x)=150149692)
134
+ 2100 val loss 6.3149
135
+ 2100 val perplexity 552.7206
136
+ 2100 train 6.137981 (lr=4.5951e-05) (hash(x)=157204896)
137
+ 2400 val loss 6.3041
138
+ 2400 val perplexity 546.8093
139
+ 2400 train 6.291619 (lr=3.5491e-05) (hash(x)=151730720)
140
+ 2200 val loss 6.2885
141
+ 2200 val perplexity 538.3621
142
+ 2200 train 6.130053 (lr=4.5448e-05) (hash(x)=137541932)
143
+ 2500 val loss 6.2745
144
+ 2500 val perplexity 530.8547
145
+ 2500 train 6.058253 (lr=3.5027e-05) (hash(x)=143406752)
146
+ 2300 val loss 6.2654
147
+ 2300 val perplexity 526.0432
148
+ 2300 train 6.222541 (lr=4.4918e-05) (hash(x)=150149692)
149
+ 2600 val loss 6.2600
150
+ 2600 val perplexity 523.2106
151
+ 2600 train 6.092404 (lr=3.4545e-05) (hash(x)=157272496)
152
+ 2400 val loss 6.2239
153
+ 2400 val perplexity 504.6877
154
+ 2400 train 6.207308 (lr=4.4363e-05) (hash(x)=151730720)
155
+ 2700 val loss 6.2352
156
+ 2700 val perplexity 510.4092
157
+ 2700 train 6.229642 (lr=3.4044e-05) (hash(x)=155342327)
158
+ 2500 val loss 6.1932
159
+ 2500 val perplexity 489.4210
160
+ 2500 train 5.986238 (lr=4.3784e-05) (hash(x)=143406752)
161
+ 2800 val loss 6.2031
162
+ 2800 val perplexity 494.2682
163
+ 2800 train 6.078806 (lr=3.3526e-05) (hash(x)=140626679)
164
+ 2600 val loss 6.1778
165
+ 2600 val perplexity 481.9189
166
+ 2600 train 6.010714 (lr=4.3181e-05) (hash(x)=157272496)
167
+ 2900 val loss 6.1813
168
+ 2900 val perplexity 483.6212
169
+ 2900 train 6.041678 (lr=3.2992e-05) (hash(x)=144953350)
170
+ 2700 val loss 6.1551
171
+ 2700 val perplexity 471.0950
172
+ 2700 train 6.130734 (lr=4.2555e-05) (hash(x)=155342327)
173
+ 3000 val loss 6.1637
174
+ 3000 val perplexity 475.1647
175
+ 3000 train 6.065508 (lr=3.2441e-05) (hash(x)=172449837)
176
+ 2800 val loss 6.1203
177
+ 2800 val perplexity 454.9970
178
+ 2800 train 6.002769 (lr=4.1908e-05) (hash(x)=140626679)
179
+ 3100 val loss 6.1536
180
+ 3100 val perplexity 470.3969
181
+ 3100 train 5.937268 (lr=3.1875e-05) (hash(x)=141710086)
182
+ 2900 val loss 6.1003
183
+ 2900 val perplexity 446.0030
184
+ 2900 train 5.960584 (lr=4.1240e-05) (hash(x)=144953350)
185
+ 3200 val loss 6.1185
186
+ 3200 val perplexity 454.1793
187
+ 3200 train 6.050138 (lr=3.1295e-05) (hash(x)=151299772)
188
+ 3000 val loss 6.0852
189
+ 3000 val perplexity 439.3019
190
+ 3000 train 5.978558 (lr=4.0551e-05) (hash(x)=172449837)
191
+ 3300 val loss 6.1050
192
+ 3300 val perplexity 448.0775
193
+ 3300 train 5.970407 (lr=3.0702e-05) (hash(x)=146473110)
194
+ 3100 val loss 6.0773
195
+ 3100 val perplexity 435.8608
196
+ 3100 train 5.868666 (lr=3.9844e-05) (hash(x)=141710086)
197
+ 3400 val loss 6.0857
198
+ 3400 val perplexity 439.5253
199
+ 3400 train 6.168725 (lr=3.0095e-05) (hash(x)=153954157)
200
+ 3200 val loss 6.0406
201
+ 3200 val perplexity 420.1267
202
+ 3200 train 5.970029 (lr=3.9119e-05) (hash(x)=151299772)
203
+ 3500 val loss 6.0671
204
+ 3500 val perplexity 431.4481
205
+ 3500 train 6.002892 (lr=2.9477e-05) (hash(x)=153717336)
206
+ 3300 val loss 6.0289
207
+ 3300 val perplexity 415.2450
208
+ 3300 train 5.899902 (lr=3.8377e-05) (hash(x)=146473110)
209
+ 3600 val loss 6.0520
210
+ 3600 val perplexity 424.9601
211
+ 3600 train 5.805218 (lr=2.8849e-05) (hash(x)=144965161)
212
+ 3400 val loss 6.0029
213
+ 3400 val perplexity 404.5874
214
+ 3400 train 6.088784 (lr=3.7619e-05) (hash(x)=153954157)
215
+ 3700 val loss 6.0448
216
+ 3700 val perplexity 421.9280
217
+ 3700 train 5.821227 (lr=2.8210e-05) (hash(x)=125969741)
218
+ 3500 val loss 5.9897
219
+ 3500 val perplexity 399.2789
220
+ 3500 train 5.933339 (lr=3.6847e-05) (hash(x)=153717336)
221
+ 3800 val loss 6.0203
222
+ 3800 val perplexity 411.6863
223
+ 3800 train 5.885331 (lr=2.7562e-05) (hash(x)=155070487)
224
+ 3600 val loss 5.9693
225
+ 3600 val perplexity 391.2270
226
+ 3600 train 5.724619 (lr=3.6061e-05) (hash(x)=144965161)
227
+ 3900 val loss 6.0036
228
+ 3900 val perplexity 404.8695
229
+ 3900 train 5.874807 (lr=2.6907e-05) (hash(x)=149444644)
230
+ 3700 val loss 5.9651
231
+ 3700 val perplexity 389.6051
232
+ 3700 train 5.751326 (lr=3.5263e-05) (hash(x)=125969741)
233
+ 4000 val loss 5.9952
234
+ 4000 val perplexity 401.4825
235
+ 4000 train 5.852266 (lr=2.6244e-05) (hash(x)=151663033)
236
+ 3800 val loss 5.9416
237
+ 3800 val perplexity 380.5251
238
+ 3800 train 5.806542 (lr=3.4453e-05) (hash(x)=155070487)
239
+ 4100 val loss 5.9847
240
+ 4100 val perplexity 397.3002
241
+ 4100 train 5.937428 (lr=2.5575e-05) (hash(x)=143688282)
242
+ 3900 val loss 5.9216
243
+ 3900 val perplexity 372.9961
244
+ 3900 train 5.789830 (lr=3.3633e-05) (hash(x)=149444644)
245
+ 4200 val loss 5.9638
246
+ 4200 val perplexity 389.0882
247
+ 4200 train 5.963494 (lr=2.4900e-05) (hash(x)=163361651)
248
+ 4000 val loss 5.9130
249
+ 4000 val perplexity 369.8036
250
+ 4000 train 5.765071 (lr=3.2805e-05) (hash(x)=151663033)
251
+ 4300 val loss 5.9510
252
+ 4300 val perplexity 384.1372
253
+ 4300 train 5.998785 (lr=2.4222e-05) (hash(x)=153619361)
254
+ 4100 val loss 5.9076
255
+ 4100 val perplexity 367.8302
256
+ 4100 train 5.866171 (lr=3.1968e-05) (hash(x)=143688282)
257
+ 4400 val loss 5.9426
258
+ 4400 val perplexity 380.9430
259
+ 4400 train 6.251171 (lr=2.3540e-05) (hash(x)=168527064)
260
+ 4200 val loss 5.8817
261
+ 4200 val perplexity 358.4278
262
+ 4200 train 5.884516 (lr=3.1126e-05) (hash(x)=163361651)
263
+ 4500 val loss 5.9357
264
+ 4500 val perplexity 378.3056
265
+ 4500 train 5.746198 (lr=2.2856e-05) (hash(x)=125588037)
266
+ 4300 val loss 5.8654
267
+ 4300 val perplexity 352.6216
268
+ 4300 train 5.909609 (lr=3.0277e-05) (hash(x)=153619361)
269
+ 4600 val loss 5.9151
270
+ 4600 val perplexity 370.5818
271
+ 4600 train 5.879491 (lr=2.2171e-05) (hash(x)=143710941)
272
+ 4400 val loss 5.8604
273
+ 4400 val perplexity 350.8679
274
+ 4400 train 6.193501 (lr=2.9425e-05) (hash(x)=168527064)
275
+ 4700 val loss 5.9033
276
+ 4700 val perplexity 366.2306
277
+ 4700 train 5.731603 (lr=2.1486e-05) (hash(x)=150952742)
278
+ 4500 val loss 5.8487
279
+ 4500 val perplexity 346.7679
280
+ 4500 train 5.672557 (lr=2.8571e-05) (hash(x)=125588037)
281
+ 4800 val loss 5.8950
282
+ 4800 val perplexity 363.2156
283
+ 4800 train 5.766671 (lr=2.0801e-05) (hash(x)=145323659)
284
+ 4600 val loss 5.8316
285
+ 4600 val perplexity 340.9111
286
+ 4600 train 5.789861 (lr=2.7714e-05) (hash(x)=143710941)
287
+ 4900 val loss 5.8908
288
+ 4900 val perplexity 361.6895
289
+ 4900 train 5.910984 (lr=2.0118e-05) (hash(x)=153151397)
290
+ 4700 val loss 5.8176
291
+ 4700 val perplexity 336.1733
292
+ 4700 train 5.653608 (lr=2.6857e-05) (hash(x)=150952742)
293
+ 5000 val loss 5.8805
294
+ 5000 val perplexity 357.9796
295
+ 5000 train 5.776249 (lr=1.9438e-05) (hash(x)=143182059)
296
+ 4800 val loss 5.8069
297
+ 4800 val perplexity 332.5922
298
+ 4800 train 5.684433 (lr=2.6002e-05) (hash(x)=145323659)
299
+ 5100 val loss 5.8728
300
+ 5100 val perplexity 355.2327
301
+ 5100 train 5.917749 (lr=1.8762e-05) (hash(x)=170083586)
302
+ 4900 val loss 5.8035
303
+ 4900 val perplexity 331.4719
304
+ 4900 train 5.826119 (lr=2.5148e-05) (hash(x)=153151397)
305
+ 5200 val loss 5.8569
306
+ 5200 val perplexity 349.6357
307
+ 5200 train 5.746037 (lr=1.8090e-05) (hash(x)=149363919)
308
+ 5000 val loss 5.7909
309
+ 5000 val perplexity 327.2966
310
+ 5000 train 5.696330 (lr=2.4298e-05) (hash(x)=143182059)
311
+ 5300 val loss 5.8524
312
+ 5300 val perplexity 348.0753
313
+ 5300 train 5.746236 (lr=1.7424e-05) (hash(x)=152033784)
314
+ 5100 val loss 5.7796
315
+ 5100 val perplexity 323.6216
316
+ 5100 train 5.819366 (lr=2.3452e-05) (hash(x)=170083586)
317
+ 5400 val loss 5.8388
318
+ 5400 val perplexity 343.3778
319
+ 5400 train 5.942615 (lr=1.6765e-05) (hash(x)=154614289)
320
+ 5200 val loss 5.7686
321
+ 5200 val perplexity 320.0969
322
+ 5200 train 5.652661 (lr=2.2613e-05) (hash(x)=149363919)
323
+ 5500 val loss 5.8314
324
+ 5500 val perplexity 340.8504
325
+ 5500 train 5.922790 (lr=1.6113e-05) (hash(x)=157745174)
326
+ 5300 val loss 5.7625
327
+ 5300 val perplexity 318.1494
328
+ 5300 train 5.647845 (lr=2.1780e-05) (hash(x)=152033784)
329
+ 5600 val loss 5.8262
330
+ 5600 val perplexity 339.0699
331
+ 5600 train 5.719628 (lr=1.5469e-05) (hash(x)=147693222)
332
+ 5400 val loss 5.7464
333
+ 5400 val perplexity 313.0708
334
+ 5400 train 5.859529 (lr=2.0956e-05) (hash(x)=154614289)
335
+ 5700 val loss 5.8140
336
+ 5700 val perplexity 334.9571
337
+ 5700 train 5.746548 (lr=1.4836e-05) (hash(x)=149784627)
338
+ 5500 val loss 5.7393
339
+ 5500 val perplexity 310.8358
340
+ 5500 train 5.840951 (lr=2.0141e-05) (hash(x)=157745174)
341
+ 5800 val loss 5.8125
342
+ 5800 val perplexity 334.4635
343
+ 5800 train 5.733287 (lr=1.4212e-05) (hash(x)=158620729)
344
+ 5600 val loss 5.7353
345
+ 5600 val perplexity 309.6171
346
+ 5600 train 5.634243 (lr=1.9337e-05) (hash(x)=147693222)
347
+ 5900 val loss 5.8028
348
+ 5900 val perplexity 331.2259
349
+ 5900 train 5.752628 (lr=1.3600e-05) (hash(x)=159763910)
350
+ 5700 val loss 5.7216
351
+ 5700 val perplexity 305.3927
352
+ 5700 train 5.658313 (lr=1.8545e-05) (hash(x)=149784627)
353
+ 6000 val loss 5.7939
354
+ 6000 val perplexity 328.2917
355
+ 6000 train 5.729095 (lr=1.3000e-05) (hash(x)=147640561)
356
+ 5800 val loss 5.7199
357
+ 5800 val perplexity 304.8785
358
+ 5800 train 5.642802 (lr=1.7765e-05) (hash(x)=158620729)
359
+ 6100 val loss 5.7863
360
+ 6100 val perplexity 325.8080
361
+ 6100 train 5.767511 (lr=1.2413e-05) (hash(x)=156613394)
362
+ 5900 val loss 5.7106
363
+ 5900 val perplexity 302.0421
364
+ 5900 train 5.663085 (lr=1.7000e-05) (hash(x)=159763910)
365
+ 6200 val loss 5.7810
366
+ 6200 val perplexity 324.0978
367
+ 6200 train 5.732399 (lr=1.1840e-05) (hash(x)=186221290)
368
+ 6000 val loss 5.6995
369
+ 6000 val perplexity 298.7092
370
+ 6000 train 5.637892 (lr=1.6250e-05) (hash(x)=147640561)
371
+ 6300 val loss 5.7715
372
+ 6300 val perplexity 321.0345
373
+ 6300 train 5.650871 (lr=1.1282e-05) (hash(x)=152081419)
374
+ 6100 val loss 5.6932
375
+ 6100 val perplexity 296.8378
376
+ 6100 train 5.677419 (lr=1.5516e-05) (hash(x)=156613394)
377
+ 6400 val loss 5.7706
378
+ 6400 val perplexity 320.7337
379
+ 6400 train 5.718038 (lr=1.0739e-05) (hash(x)=154808349)
380
+ 6200 val loss 5.6879
381
+ 6200 val perplexity 295.2686
382
+ 6200 train 5.623254 (lr=1.4800e-05) (hash(x)=186221290)
383
+ 6500 val loss 5.7654
384
+ 6500 val perplexity 319.0538
385
+ 6500 train 5.731769 (lr=1.0213e-05) (hash(x)=159437208)
386
+ 6300 val loss 5.6766
387
+ 6300 val perplexity 291.9467
388
+ 6300 train 5.555213 (lr=1.4102e-05) (hash(x)=152081419)
389
+ 6600 val loss 5.7626
390
+ 6600 val perplexity 318.1778
391
+ 6600 train 5.570604 (lr=9.7032e-06) (hash(x)=157933074)
392
+ 6400 val loss 5.6760
393
+ 6400 val perplexity 291.7793
394
+ 6400 train 5.630403 (lr=1.3424e-05) (hash(x)=154808349)
395
+ 6700 val loss 5.7543
396
+ 6700 val perplexity 315.5476
397
+ 6500 val loss 5.6699
398
+ 6500 val perplexity 290.0038
399
+ 6700 train 5.733328 (lr=9.2116e-06) (hash(x)=161560240)
400
+ 6500 train 5.638351 (lr=1.2766e-05) (hash(x)=159437208)
401
+ 6800 val loss 5.7510
402
+ 6800 val perplexity 314.5105
403
+ 6800 train 5.735670 (lr=8.7387e-06) (hash(x)=155424292)
404
+ 6600 val loss 5.6674
405
+ 6600 val perplexity 289.2891
406
+ 6600 train 5.483891 (lr=1.2129e-05) (hash(x)=157933074)
407
+ 6900 val loss 5.7498
408
+ 6900 val perplexity 314.1256
409
+ 6900 train 5.741641 (lr=8.2849e-06) (hash(x)=148561470)
410
+ 6700 val loss 5.6597
411
+ 6700 val perplexity 287.0517
412
+ 6700 train 5.636087 (lr=1.1515e-05) (hash(x)=161560240)
413
+ 7000 val loss 5.7422
414
+ 7000 val perplexity 311.7635
415
+ 7000 train 5.635467 (lr=7.8510e-06) (hash(x)=141527450)
416
+ 6800 val loss 5.6551
417
+ 6800 val perplexity 285.7415
418
+ 6800 train 5.644596 (lr=1.0923e-05) (hash(x)=155424292)
419
+ 7100 val loss 5.7372
420
+ 7100 val perplexity 310.2070
421
+ 7100 train 5.695245 (lr=7.4377e-06) (hash(x)=151066339)
422
+ 6900 val loss 5.6537
423
+ 6900 val perplexity 285.3379
424
+ 6900 train 5.654519 (lr=1.0356e-05) (hash(x)=148561470)
425
+ 7200 val loss 5.7389
426
+ 7200 val perplexity 310.7103
427
+ 7200 train 5.537840 (lr=7.0455e-06) (hash(x)=155231264)
428
+ 7000 val loss 5.6465
429
+ 7000 val perplexity 283.2892
430
+ 7000 train 5.545733 (lr=9.8138e-06) (hash(x)=141527450)
431
+ 7300 val loss 5.7299
432
+ 7300 val perplexity 307.9386
433
+ 7300 train 5.751905 (lr=6.6749e-06) (hash(x)=150281149)
434
+ 7100 val loss 5.6420
435
+ 7100 val perplexity 282.0365
436
+ 7100 train 5.597219 (lr=9.2971e-06) (hash(x)=151066339)
437
+ 7400 val loss 5.7280
438
+ 7400 val perplexity 307.3646
439
+ 7400 train 5.612872 (lr=6.3266e-06) (hash(x)=148421717)
440
+ 7200 val loss 5.6433
441
+ 7200 val perplexity 282.3935
442
+ 7200 train 5.440510 (lr=8.8068e-06) (hash(x)=155231264)
443
+ 7500 val loss 5.7281
444
+ 7500 val perplexity 307.3760
445
+ 7500 train 5.486419 (lr=6.0010e-06) (hash(x)=146921118)
446
+ 7300 val loss 5.6343
447
+ 7300 val perplexity 279.8769
448
+ 7300 train 5.658390 (lr=8.3436e-06) (hash(x)=150281149)
449
+ 7600 val loss 5.7210
450
+ 7600 val perplexity 305.2163
451
+ 7600 train 5.768423 (lr=5.6986e-06) (hash(x)=150660048)
452
+ 7400 val loss 5.6316
453
+ 7400 val perplexity 279.1068
454
+ 7400 train 5.521311 (lr=7.9082e-06) (hash(x)=148421717)
455
+ 7700 val loss 5.7194
456
+ 7700 val perplexity 304.7192
457
+ 7700 train 5.598227 (lr=5.4198e-06) (hash(x)=148059852)
458
+ 7500 val loss 5.6322
459
+ 7500 val perplexity 279.2650
460
+ 7500 train 5.399149 (lr=7.5012e-06) (hash(x)=146921118)
461
+ 7800 val loss 5.7185
462
+ 7800 val perplexity 304.4613
463
+ 7800 train 5.413217 (lr=5.1650e-06) (hash(x)=148331002)
464
+ 7600 val loss 5.6242
465
+ 7600 val perplexity 277.0608
466
+ 7600 train 5.672753 (lr=7.1232e-06) (hash(x)=150660048)
467
+ 7900 val loss 5.7133
468
+ 7900 val perplexity 302.8731
469
+ 7900 train 5.659461 (lr=4.9347e-06) (hash(x)=164923883)
470
+ 7700 val loss 5.6228
471
+ 7700 val perplexity 276.6698
472
+ 7700 train 5.499656 (lr=6.7747e-06) (hash(x)=148059852)
473
+ 8000 val loss 5.7111
474
+ 8000 val perplexity 302.1974
475
+ 8000 train 5.595223 (lr=4.7291e-06) (hash(x)=143545384)
476
+ 7800 val loss 5.6219
477
+ 7800 val perplexity 276.4274
478
+ 7800 train 5.311404 (lr=6.4563e-06) (hash(x)=148331002)
479
+ 8100 val loss 5.7107
480
+ 8100 val perplexity 302.0790
481
+ 8100 train 5.530027 (lr=4.5486e-06) (hash(x)=160686959)
482
+ 7900 val loss 5.6168
483
+ 7900 val perplexity 275.0202
484
+ 7900 train 5.559655 (lr=6.1684e-06) (hash(x)=164923883)
485
+ 8000 val loss 5.6148
486
+ 8000 val perplexity 274.4717
487
+ 8000 train 5.498442 (lr=5.9114e-06) (hash(x)=143545384)
488
+ 8200 val loss 5.7087
489
+ 8200 val perplexity 301.4907
490
+ 8200 train 5.666525 (lr=4.3933e-06) (hash(x)=156501889)
491
+ 8100 val loss 5.6141
492
+ 8100 val perplexity 274.2537
493
+ 8100 train 5.425257 (lr=5.6857e-06) (hash(x)=160686959)
494
+ 8300 val loss 5.7069
495
+ 8300 val perplexity 300.9297
496
+ 8300 train 5.548706 (lr=4.2636e-06) (hash(x)=142716875)
497
+ 8400 val loss 5.7022
498
+ 8400 val perplexity 299.5299
499
+ 8200 val loss 5.6117
500
+ 8200 val perplexity 273.6158
501
+ 8400 train 5.616063 (lr=4.1596e-06) (hash(x)=154436684)
502
+ 8200 train 5.565293 (lr=5.4917e-06) (hash(x)=156501889)
503
+ 8300 val loss 5.6098
504
+ 8300 val perplexity 273.0905
505
+ 8500 val loss 5.7006
506
+ 8500 val perplexity 299.0606
507
+ 8300 train 5.455184 (lr=5.3295e-06) (hash(x)=142716875)
508
+ 8500 train 5.777520 (lr=4.0815e-06) (hash(x)=147965839)
509
+ 8400 val loss 5.6050
510
+ 8400 val perplexity 271.7937
511
+ 8600 val loss 5.6996
512
+ 8600 val perplexity 298.7383
513
+ 8400 train 5.522124 (lr=5.1995e-06) (hash(x)=154436684)
514
+ 8600 train 5.497610 (lr=4.0294e-06) (hash(x)=145228097)
515
+ 8500 val loss 5.6036
516
+ 8500 val perplexity 271.4066
517
+ 8700 val loss 5.6953
518
+ 8700 val perplexity 297.4666
519
+ 8500 train 5.683967 (lr=5.1019e-06) (hash(x)=147965839)
520
+ 8700 train 5.885415 (lr=4.0033e-06) (hash(x)=152910357)
521
+ 8749 val loss 5.6961
522
+ 8749 val perplexity 297.7067
523
+ 8600 val loss 5.6034
524
+ 8600 val perplexity 271.3527
525
+ 8600 train 5.399725 (lr=5.0367e-06) (hash(x)=145228097)
attention_kindselective_n_heads4_seed1338/model_08749.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1194215ae1764e27f99d25868ee71ac63c74d1b625dda045798ebb1b8c89462
3
  size 92843394
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a027a07472d227bd517865e15803294bf1cba3e05b7cb35e0461461ed15ff0a
3
  size 92843394
attention_kindselective_n_heads4_seed1338/optimizer_08749.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:719546c6e08256f8acaae03ca5284013062efd1e6207724fcf8920f416f23bd3
3
  size 179406214
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5938aeb6ef1944ddc12159568227c3c4dea45324a2282b520d0faccd7bea83b8
3
  size 179406214