andrew-healey commited on
Commit
35c2929
·
verified ·
1 Parent(s): 92835e0

Upload folder using huggingface_hub

Browse files
attention_kindselective_n_heads2_seed1340/args.json CHANGED
@@ -1 +1 @@
1
- {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_4/attention_kindselective_n_heads2_seed1340", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 2, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_4", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1340, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 30720, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 3e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "3e-5_30720_2_1340", "n_embd": 128}
 
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_6/attention_kindselective_n_heads2_seed1340", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 2, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 10000, "warmup_steps": 200, "group": "wider_is_better_6", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1340, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 40, "total_batch_size": 10240, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 7e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "7e-5_10240_2_1340", "n_embd": 128}
attention_kindselective_n_heads2_seed1340/dataloader_02500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db41c5e5513137877487a93451adf8ec4ed2448ab6e9471ebd5595c8e3293875
3
  size 964
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b2ea67f78ff5a7970d0db044ff7ee527b3dc065f295fd30f588df4b44b568d0
3
  size 964
attention_kindselective_n_heads2_seed1340/dataloader_05000.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6df8947c6ab773db1947914387d3db345a84828521d3a64bae9b652e1b0a410
3
  size 964
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f03ed2ebf741f15e13c79e6cc1e9a19b308450d81cc3b4d8d0338c63d77ca59
3
  size 964
attention_kindselective_n_heads2_seed1340/dataloader_07500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:169891a726a7ff746d1a7aa99f459a66d85ceb4e9f2583f790f5b8501f97b6af
3
  size 964
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82590037fb2eecbec961f7967a8dd1b8d85515d31a252f66b92b8139858a8b7c
3
  size 964
attention_kindselective_n_heads2_seed1340/dataloader_09999.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e685a568a36c792ccbe7b5fcae0b9d630955e589991190bd8902836cea6a91df
3
  size 964
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c544303717d12355a69b8ffb1eb109434e4fdccfd5a61279b6e8ba2e870d6700
3
  size 964
attention_kindselective_n_heads2_seed1340/log2.txt CHANGED
@@ -1,303 +1,603 @@
1
  max_steps: 10000
2
- 0 val loss 11.8210
3
- 0 val perplexity 136085.8281
4
- 0 train 11.833096 (lr=1.5000e-07) (hash(x)=164406924)
5
- 100 val loss 10.2860
6
- 100 val perplexity 29318.0957
7
- 100 train 10.581880 (lr=1.5150e-05) (hash(x)=177407419)
8
- 200 val loss 9.2449
9
- 200 val perplexity 10351.6846
10
- 200 train 9.195401 (lr=3.0000e-05) (hash(x)=144903932)
11
- 300 val loss 8.1777
12
- 300 val perplexity 3560.5066
13
- 300 train 8.432213 (lr=2.9993e-05) (hash(x)=173839165)
14
- 400 val loss 7.8301
15
- 400 val perplexity 2515.1199
16
- 400 train 7.897438 (lr=2.9972e-05) (hash(x)=167734596)
17
- 500 val loss 7.6721
18
- 500 val perplexity 2147.6790
19
- 500 train 7.686492 (lr=2.9938e-05) (hash(x)=153224076)
20
- 600 val loss 7.6069
21
- 600 val perplexity 2012.0172
22
- 600 train 7.531812 (lr=2.9889e-05) (hash(x)=149619098)
23
- 700 val loss 7.5730
24
- 700 val perplexity 1944.9812
25
- 700 train 7.526173 (lr=2.9827e-05) (hash(x)=146539909)
26
- 800 val loss 7.5481
27
- 800 val perplexity 1897.0706
28
- 800 train 7.518998 (lr=2.9751e-05) (hash(x)=153710890)
29
- 900 val loss 7.5388
30
- 900 val perplexity 1879.5011
31
- 900 train 7.499132 (lr=2.9662e-05) (hash(x)=155873620)
32
- 1000 val loss 7.5291
33
- 1000 val perplexity 1861.4382
34
- 1000 train 7.460474 (lr=2.9558e-05) (hash(x)=145450636)
35
- 1100 val loss 7.5105
36
- 1100 val perplexity 1827.0963
37
- 1100 train 7.569565 (lr=2.9442e-05) (hash(x)=154123388)
38
- 1200 val loss 7.4993
39
- 1200 val perplexity 1806.8583
40
- 1200 train 7.362889 (lr=2.9312e-05) (hash(x)=145249251)
41
- 1300 val loss 7.4861
42
- 1300 val perplexity 1783.1503
43
- 1300 train 7.407315 (lr=2.9169e-05) (hash(x)=148937127)
44
- 1400 val loss 7.4802
45
- 1400 val perplexity 1772.6321
46
- 1400 train 7.583092 (lr=2.9013e-05) (hash(x)=150475545)
47
- 1500 val loss 7.4730
48
- 1500 val perplexity 1759.8701
49
- 1500 train 7.411776 (lr=2.8845e-05) (hash(x)=154653428)
50
- 1600 val loss 7.4660
51
- 1600 val perplexity 1747.5458
52
- 1600 train 7.437787 (lr=2.8663e-05) (hash(x)=144483776)
53
- 1700 val loss 7.4535
54
- 1700 val perplexity 1725.8730
55
- 1700 train 7.627961 (lr=2.8469e-05) (hash(x)=157395496)
56
- 1800 val loss 7.4387
57
- 1800 val perplexity 1700.5133
58
- 1800 train 7.462626 (lr=2.8263e-05) (hash(x)=157916369)
59
- 1900 val loss 7.4201
60
- 1900 val perplexity 1669.2788
61
- 1900 train 7.636841 (lr=2.8044e-05) (hash(x)=166073923)
62
- 2000 val loss 7.4135
63
- 2000 val perplexity 1658.2711
64
- 2000 train 7.522213 (lr=2.7814e-05) (hash(x)=154856891)
65
- 2100 val loss 7.4018
66
- 2100 val perplexity 1638.9025
67
- 2100 train 7.377049 (lr=2.7572e-05) (hash(x)=151925203)
68
- 2200 val loss 7.3915
69
- 2200 val perplexity 1622.1089
70
- 2200 train 7.193004 (lr=2.7319e-05) (hash(x)=136191502)
71
- 2300 val loss 7.3719
72
- 2300 val perplexity 1590.5979
73
- 2300 train 7.487554 (lr=2.7055e-05) (hash(x)=153273362)
74
- 2400 val loss 7.3512
75
- 2400 val perplexity 1557.9932
76
- 2400 train 7.283143 (lr=2.6780e-05) (hash(x)=148021541)
77
- 2500 val loss 7.3460
78
- 2500 val perplexity 1550.0237
79
- 2500 train 7.250226 (lr=2.6494e-05) (hash(x)=141356608)
80
- 2600 val loss 7.3185
81
- 2600 val perplexity 1507.8693
82
- 2600 train 7.283971 (lr=2.6198e-05) (hash(x)=146005217)
83
- 2700 val loss 7.3159
84
- 2700 val perplexity 1504.0547
85
- 2700 train 7.160429 (lr=2.5892e-05) (hash(x)=144511718)
86
- 2800 val loss 7.2898
87
- 2800 val perplexity 1465.3499
88
- 2800 train 7.166070 (lr=2.5576e-05) (hash(x)=146019502)
89
- 2900 val loss 7.2652
90
- 2900 val perplexity 1429.6077
91
- 2900 train 7.165171 (lr=2.5251e-05) (hash(x)=146496200)
92
- 3000 val loss 7.2538
93
- 3000 val perplexity 1413.4381
94
- 3000 train 7.196369 (lr=2.4917e-05) (hash(x)=150127281)
95
- 3100 val loss 7.2288
96
- 3100 val perplexity 1378.5229
97
- 3100 train 7.172277 (lr=2.4574e-05) (hash(x)=142022255)
98
- 3200 val loss 7.2119
99
- 3200 val perplexity 1355.4843
100
- 3200 train 7.291126 (lr=2.4224e-05) (hash(x)=154120875)
101
- 3300 val loss 7.1995
102
- 3300 val perplexity 1338.7019
103
- 3300 train 7.225865 (lr=2.3865e-05) (hash(x)=153999717)
104
- 3400 val loss 7.1853
105
- 3400 val perplexity 1319.9038
106
- 3400 train 6.986575 (lr=2.3498e-05) (hash(x)=139694097)
107
- 3500 val loss 7.1647
108
- 3500 val perplexity 1292.9854
109
- 3500 train 7.357235 (lr=2.3125e-05) (hash(x)=162992732)
110
- 3600 val loss 7.1354
111
- 3600 val perplexity 1255.6628
112
- 3600 train 7.120846 (lr=2.2744e-05) (hash(x)=147574101)
113
- 3700 val loss 7.1180
114
- 3700 val perplexity 1233.9659
115
- 3700 train 7.231690 (lr=2.2357e-05) (hash(x)=157763099)
116
- 3800 val loss 7.0878
117
- 3800 val perplexity 1197.3303
118
- 3800 train 7.244596 (lr=2.1965e-05) (hash(x)=170800034)
119
- 3900 val loss 7.0743
120
- 3900 val perplexity 1181.2009
121
- 3900 train 7.151352 (lr=2.1566e-05) (hash(x)=164984528)
122
- 4000 val loss 7.0556
123
- 4000 val perplexity 1159.3071
124
- 4000 train 6.946763 (lr=2.1162e-05) (hash(x)=141743323)
125
- 4100 val loss 7.0423
126
- 4100 val perplexity 1143.9823
127
- 4100 train 7.050414 (lr=2.0754e-05) (hash(x)=153392872)
128
- 4200 val loss 7.0286
129
- 4200 val perplexity 1128.3999
130
- 4200 train 6.904306 (lr=2.0341e-05) (hash(x)=149074933)
131
- 4300 val loss 7.0063
132
- 4300 val perplexity 1103.5922
133
- 4300 train 7.419280 (lr=1.9924e-05) (hash(x)=167823423)
134
- 4400 val loss 6.9891
135
- 4400 val perplexity 1084.7200
136
- 4400 train 6.721108 (lr=1.9504e-05) (hash(x)=141203114)
137
- 4500 val loss 6.9697
138
- 4500 val perplexity 1063.9399
139
- 4500 train 6.962101 (lr=1.9081e-05) (hash(x)=146284780)
140
- 4600 val loss 6.9558
141
- 4600 val perplexity 1049.2501
142
- 4600 train 6.758272 (lr=1.8655e-05) (hash(x)=141126464)
143
- 4700 val loss 6.9420
144
- 4700 val perplexity 1034.8446
145
- 4700 train 6.943596 (lr=1.8226e-05) (hash(x)=154751926)
146
- 4800 val loss 6.9285
147
- 4800 val perplexity 1020.9679
148
- 4800 train 7.011858 (lr=1.7796e-05) (hash(x)=154793198)
149
- 4900 val loss 6.9213
150
- 4900 val perplexity 1013.5986
151
- 4900 train 6.667045 (lr=1.7365e-05) (hash(x)=139406392)
152
- 5000 val loss 6.9092
153
- 5000 val perplexity 1001.4541
154
- 5000 train 6.710131 (lr=1.6933e-05) (hash(x)=153548741)
155
- 5100 val loss 6.8940
156
- 5100 val perplexity 986.2919
157
- 5100 train 6.959525 (lr=1.6500e-05) (hash(x)=160488568)
158
- 5200 val loss 6.8850
159
- 5200 val perplexity 977.5457
160
- 5200 train 6.886682 (lr=1.6067e-05) (hash(x)=149645053)
161
- 5300 val loss 6.8517
162
- 5300 val perplexity 945.4849
163
- 5300 train 6.939456 (lr=1.5635e-05) (hash(x)=155820556)
164
- 5400 val loss 6.8374
165
- 5400 val perplexity 932.0350
166
- 5400 train 6.763284 (lr=1.5204e-05) (hash(x)=147538134)
167
- 5500 val loss 6.8217
168
- 5500 val perplexity 917.5742
169
- 5500 train 7.024829 (lr=1.4774e-05) (hash(x)=166889307)
170
- 5600 val loss 6.8077
171
- 5600 val perplexity 904.7902
172
- 5600 train 6.529876 (lr=1.4345e-05) (hash(x)=139516699)
173
- 5700 val loss 6.7965
174
- 5700 val perplexity 894.6815
175
- 5700 train 6.495327 (lr=1.3919e-05) (hash(x)=140453511)
176
- 5800 val loss 6.7922
177
- 5800 val perplexity 890.8710
178
- 5800 train 6.750346 (lr=1.3496e-05) (hash(x)=162964847)
179
- 5900 val loss 6.7734
180
- 5900 val perplexity 874.2501
181
- 5900 train 6.787580 (lr=1.3076e-05) (hash(x)=150606634)
182
- 6000 val loss 6.7546
183
- 6000 val perplexity 857.9982
184
- 6000 train 6.869712 (lr=1.2659e-05) (hash(x)=149890857)
185
- 6100 val loss 6.7394
186
- 6100 val perplexity 845.0160
187
- 6100 train 6.851167 (lr=1.2246e-05) (hash(x)=173884145)
188
- 6200 val loss 6.7288
189
- 6200 val perplexity 836.1551
190
- 6200 train 6.760945 (lr=1.1838e-05) (hash(x)=151987098)
191
- 6300 val loss 6.7160
192
- 6300 val perplexity 825.5145
193
- 6300 train 6.630625 (lr=1.1434e-05) (hash(x)=148853562)
194
- 6400 val loss 6.7048
195
- 6400 val perplexity 816.2783
196
- 6400 train 6.468220 (lr=1.1035e-05) (hash(x)=141530101)
197
- 6500 val loss 6.6959
198
- 6500 val perplexity 809.1172
199
- 6500 train 6.547406 (lr=1.0643e-05) (hash(x)=142297809)
200
- 6600 val loss 6.6835
201
- 6600 val perplexity 799.1030
202
- 6600 train 6.547137 (lr=1.0256e-05) (hash(x)=142447782)
203
- 6700 val loss 6.6707
204
- 6700 val perplexity 788.9225
205
- 6700 train 6.593633 (lr=9.8753e-06) (hash(x)=147004686)
206
- 6800 val loss 6.6594
207
- 6800 val perplexity 780.0958
208
- 6800 train 6.405754 (lr=9.5017e-06) (hash(x)=133438702)
209
- 6900 val loss 6.6530
210
- 6900 val perplexity 775.0946
211
- 6900 train 6.670481 (lr=9.1353e-06) (hash(x)=157085143)
212
- 7000 val loss 6.6447
213
- 7000 val perplexity 768.6899
214
- 7000 train 6.533543 (lr=8.7764e-06) (hash(x)=139437666)
215
- 7100 val loss 6.6337
216
- 7100 val perplexity 760.2664
217
- 7100 train 6.748162 (lr=8.4255e-06) (hash(x)=159792986)
218
- 7200 val loss 6.6230
219
- 7200 val perplexity 752.1608
220
- 7200 train 6.588307 (lr=8.0829e-06) (hash(x)=144930687)
221
- 7300 val loss 6.6153
222
- 7300 val perplexity 746.4230
223
- 7300 train 6.699518 (lr=7.7489e-06) (hash(x)=156242690)
224
- 7400 val loss 6.6093
225
- 7400 val perplexity 741.9413
226
- 7400 train 6.480586 (lr=7.4239e-06) (hash(x)=148183719)
227
- 7500 val loss 6.6042
228
- 7500 val perplexity 738.1967
229
- 7500 train 6.624214 (lr=7.1083e-06) (hash(x)=152494758)
230
- 7600 val loss 6.5976
231
- 7600 val perplexity 733.3229
232
- 7600 train 6.321466 (lr=6.8023e-06) (hash(x)=142485027)
233
- 7700 val loss 6.5898
234
- 7700 val perplexity 727.6033
235
- 7700 train 6.445391 (lr=6.5062e-06) (hash(x)=147512165)
236
- 7800 val loss 6.5780
237
- 7800 val perplexity 719.1275
238
- 7800 train 6.539083 (lr=6.2205e-06) (hash(x)=160346994)
239
- 7900 val loss 6.5764
240
- 7900 val perplexity 717.9341
241
- 7900 train 6.418441 (lr=5.9453e-06) (hash(x)=144488254)
242
- 8000 val loss 6.5700
243
- 8000 val perplexity 713.3625
244
- 8000 train 6.375028 (lr=5.6809e-06) (hash(x)=147637019)
245
- 8100 val loss 6.5666
246
- 8100 val perplexity 710.9796
247
- 8100 train 6.427360 (lr=5.4277e-06) (hash(x)=147340534)
248
- 8200 val loss 6.5623
249
- 8200 val perplexity 707.9001
250
- 8200 train 6.574669 (lr=5.1858e-06) (hash(x)=151630665)
251
- 8300 val loss 6.5526
252
- 8300 val perplexity 701.0833
253
- 8300 train 6.621968 (lr=4.9556e-06) (hash(x)=149747064)
254
- 8400 val loss 6.5468
255
- 8400 val perplexity 696.9837
256
- 8400 train 6.719865 (lr=4.7372e-06) (hash(x)=154245770)
257
- 8500 val loss 6.5429
258
- 8500 val perplexity 694.2892
259
- 8500 train 6.417419 (lr=4.5309e-06) (hash(x)=152559100)
260
- 8600 val loss 6.5389
261
- 8600 val perplexity 691.4964
262
- 8600 train 7.097158 (lr=4.3369e-06) (hash(x)=181365926)
263
- 8700 val loss 6.5363
264
- 8700 val perplexity 689.7556
265
- 8700 train 6.383374 (lr=4.1554e-06) (hash(x)=154405991)
266
- 8800 val loss 6.5326
267
- 8800 val perplexity 687.1642
268
- 8800 train 6.495862 (lr=3.9866e-06) (hash(x)=153755904)
269
- 8900 val loss 6.5266
270
- 8900 val perplexity 683.0411
271
- 8900 train 6.463516 (lr=3.8307e-06) (hash(x)=152120568)
272
- 9000 val loss 6.5208
273
- 9000 val perplexity 679.1530
274
- 9000 train 6.330629 (lr=3.6877e-06) (hash(x)=142797279)
275
- 9100 val loss 6.5182
276
- 9100 val perplexity 677.3309
277
- 9100 train 6.386018 (lr=3.5580e-06) (hash(x)=143037503)
278
- 9200 val loss 6.5147
279
- 9200 val perplexity 674.9849
280
- 9200 train 6.315207 (lr=3.4415e-06) (hash(x)=113690273)
281
- 9300 val loss 6.5132
282
- 9300 val perplexity 673.9931
283
- 9300 train 6.458709 (lr=3.3385e-06) (hash(x)=158025077)
284
- 9400 val loss 6.5109
285
- 9400 val perplexity 672.4445
286
- 9400 train 6.597129 (lr=3.2490e-06) (hash(x)=158251718)
287
- 9500 val loss 6.5048
288
- 9500 val perplexity 668.3381
289
- 9500 train 6.567007 (lr=3.1730e-06) (hash(x)=154752610)
290
- 9600 val loss 6.5019
291
- 9600 val perplexity 666.3765
292
- 9600 train 6.378345 (lr=3.1108e-06) (hash(x)=146889093)
293
- 9700 val loss 6.4992
294
- 9700 val perplexity 664.6182
295
- 9700 train 6.483963 (lr=3.0624e-06) (hash(x)=156906516)
296
- 9800 val loss 6.4974
297
- 9800 val perplexity 663.4178
298
- 9800 train 6.365656 (lr=3.0277e-06) (hash(x)=153841927)
299
- 9900 val loss 6.4961
300
- 9900 val perplexity 662.5817
301
- 9900 train 6.691042 (lr=3.0069e-06) (hash(x)=163514334)
302
- 9999 val loss 6.4912
303
- 9999 val perplexity 659.2943
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  max_steps: 10000
2
+ 0 val loss 11.8201
3
+ 0 val perplexity 135955.4688
4
+ 0 val loss 11.8201
5
+ 0 val perplexity 135955.4688
6
+ 0 train 11.815376 (lr=2.5000e-07) (hash(x)=57791809)
7
+ 0 train 11.815376 (lr=3.5000e-07) (hash(x)=57791809)
8
+ 100 val loss 10.1415
9
+ 100 val perplexity 25373.9316
10
+ 100 train 10.167628 (lr=2.5250e-05) (hash(x)=48211824)
11
+ 100 val loss 9.9846
12
+ 100 val perplexity 21690.5820
13
+ 100 train 10.007619 (lr=3.5350e-05) (hash(x)=48211824)
14
+ 200 val loss 8.9560
15
+ 200 val perplexity 7754.3325
16
+ 200 train 8.916373 (lr=5.0000e-05) (hash(x)=50375849)
17
+ 200 val loss 8.3972
18
+ 200 val perplexity 4434.5674
19
+ 200 train 8.351953 (lr=7.0000e-05) (hash(x)=50375849)
20
+ 300 val loss 8.1359
21
+ 300 val perplexity 3414.7334
22
+ 300 train 8.400735 (lr=4.9988e-05) (hash(x)=57250808)
23
+ 300 val loss 7.9232
24
+ 300 val perplexity 2760.7124
25
+ 300 train 8.212564 (lr=6.9984e-05) (hash(x)=57250808)
26
+ 400 val loss 7.8720
27
+ 400 val perplexity 2622.8079
28
+ 400 train 8.402547 (lr=4.9954e-05) (hash(x)=62519858)
29
+ 400 val loss 7.7554
30
+ 400 val perplexity 2334.1978
31
+ 400 train 8.325225 (lr=6.9935e-05) (hash(x)=62519858)
32
+ 500 val loss 7.7049
33
+ 500 val perplexity 2219.0955
34
+ 500 train 7.604552 (lr=4.9896e-05) (hash(x)=47226806)
35
+ 500 val loss 7.6750
36
+ 500 val perplexity 2153.8015
37
+ 500 train 7.583454 (lr=6.9854e-05) (hash(x)=47226806)
38
+ 600 val loss 7.6373
39
+ 600 val perplexity 2074.1904
40
+ 600 train 7.663011 (lr=4.9815e-05) (hash(x)=51149322)
41
+ 600 val loss 7.6327
42
+ 600 val perplexity 2064.5627
43
+ 600 train 7.648909 (lr=6.9741e-05) (hash(x)=51149322)
44
+ 700 val loss 7.6224
45
+ 700 val perplexity 2043.4777
46
+ 700 train 7.632802 (lr=4.9712e-05) (hash(x)=51564551)
47
+ 700 val loss 7.6209
48
+ 700 val perplexity 2040.4641
49
+ 700 train 7.626678 (lr=6.9596e-05) (hash(x)=51564551)
50
+ 800 val loss 7.6059
51
+ 800 val perplexity 2010.0964
52
+ 800 train 7.368664 (lr=4.9585e-05) (hash(x)=45093459)
53
+ 800 val loss 7.6051
54
+ 800 val perplexity 2008.5107
55
+ 800 train 7.361159 (lr=6.9419e-05) (hash(x)=45093459)
56
+ 900 val loss 7.6009
57
+ 900 val perplexity 1999.9039
58
+ 900 train 7.899217 (lr=4.9436e-05) (hash(x)=54988361)
59
+ 900 val loss 7.5850
60
+ 900 val perplexity 1968.4664
61
+ 900 train 7.888000 (lr=6.9210e-05) (hash(x)=54988361)
62
+ 1000 val loss 7.5910
63
+ 1000 val perplexity 1980.3516
64
+ 1000 train 7.476419 (lr=4.9264e-05) (hash(x)=47588648)
65
+ 1000 val loss 7.5780
66
+ 1000 val perplexity 1954.8005
67
+ 1000 train 7.459035 (lr=6.8970e-05) (hash(x)=47588648)
68
+ 1100 val loss 7.5702
69
+ 1100 val perplexity 1939.6012
70
+ 1100 train 7.172775 (lr=4.9070e-05) (hash(x)=37984588)
71
+ 1100 val loss 7.5466
72
+ 1100 val perplexity 1894.2449
73
+ 1100 train 7.153616 (lr=6.8698e-05) (hash(x)=37984588)
74
+ 1200 val loss 7.5385
75
+ 1200 val perplexity 1878.9194
76
+ 1200 train 7.705000 (lr=4.8854e-05) (hash(x)=56333817)
77
+ 1300 val loss 7.5312
78
+ 1300 val perplexity 1865.3256
79
+ 1200 val loss 7.5242
80
+ 1200 val perplexity 1852.2689
81
+ 1300 train 7.666130 (lr=4.8616e-05) (hash(x)=53454056)
82
+ 1200 train 7.697269 (lr=6.8395e-05) (hash(x)=56333817)
83
+ 1400 val loss 7.4924
84
+ 1400 val perplexity 1794.3845
85
+ 1400 train 7.599366 (lr=4.8356e-05) (hash(x)=55284163)
86
+ 1300 val loss 7.5314
87
+ 1300 val perplexity 1865.7615
88
+ 1300 train 7.681477 (lr=6.8062e-05) (hash(x)=53454056)
89
+ 1500 val loss 7.4758
90
+ 1500 val perplexity 1764.8745
91
+ 1500 train 7.364212 (lr=4.8074e-05) (hash(x)=48162598)
92
+ 1400 val loss 7.4858
93
+ 1400 val perplexity 1782.5961
94
+ 1400 train 7.581354 (lr=6.7698e-05) (hash(x)=55284163)
95
+ 1600 val loss 7.4609
96
+ 1600 val perplexity 1738.7384
97
+ 1600 train 7.523167 (lr=4.7772e-05) (hash(x)=54214535)
98
+ 1500 val loss 7.4677
99
+ 1500 val perplexity 1750.5398
100
+ 1500 train 7.366483 (lr=6.7304e-05) (hash(x)=48162598)
101
+ 1700 val loss 7.4489
102
+ 1700 val perplexity 1718.0170
103
+ 1700 train 7.658994 (lr=4.7448e-05) (hash(x)=53525003)
104
+ 1600 val loss 7.4239
105
+ 1600 val perplexity 1675.5126
106
+ 1600 train 7.474539 (lr=6.6881e-05) (hash(x)=54214535)
107
+ 1800 val loss 7.4343
108
+ 1800 val perplexity 1693.1310
109
+ 1800 train 7.507501 (lr=4.7105e-05) (hash(x)=51848994)
110
+ 1700 val loss 7.3955
111
+ 1700 val perplexity 1628.6207
112
+ 1700 train 7.592460 (lr=6.6428e-05) (hash(x)=53525003)
113
+ 1900 val loss 7.4327
114
+ 1900 val perplexity 1690.2996
115
+ 1900 train 7.251468 (lr=4.6741e-05) (hash(x)=48405987)
116
+ 1800 val loss 7.3557
117
+ 1800 val perplexity 1565.1370
118
+ 1800 train 7.437174 (lr=6.5947e-05) (hash(x)=51848994)
119
+ 2000 val loss 7.4107
120
+ 2000 val perplexity 1653.5981
121
+ 2000 train 7.784187 (lr=4.6357e-05) (hash(x)=58592291)
122
+ 1900 val loss 7.3525
123
+ 1900 val perplexity 1560.1082
124
+ 1900 train 7.168375 (lr=6.5437e-05) (hash(x)=48405987)
125
+ 2100 val loss 7.4144
126
+ 2100 val perplexity 1659.6460
127
+ 2100 train 7.489458 (lr=4.5954e-05) (hash(x)=51167081)
128
+ 2000 val loss 7.3089
129
+ 2000 val perplexity 1493.5795
130
+ 2000 train 7.667870 (lr=6.4900e-05) (hash(x)=58592291)
131
+ 2200 val loss 7.3779
132
+ 2200 val perplexity 1600.2672
133
+ 2200 train 7.367453 (lr=4.5532e-05) (hash(x)=47994988)
134
+ 2100 val loss 7.3073
135
+ 2100 val perplexity 1491.0868
136
+ 2100 train 7.386800 (lr=6.4335e-05) (hash(x)=51167081)
137
+ 2300 val loss 7.3693
138
+ 2300 val perplexity 1586.4584
139
+ 2300 train 7.348310 (lr=4.5091e-05) (hash(x)=47377604)
140
+ 2200 val loss 7.3030
141
+ 2200 val perplexity 1484.7701
142
+ 2200 train 7.295847 (lr=6.3745e-05) (hash(x)=47994988)
143
+ 2400 val loss 7.3560
144
+ 2400 val perplexity 1565.5953
145
+ 2400 train 7.408455 (lr=4.4633e-05) (hash(x)=53554323)
146
+ 2300 val loss 7.2556
147
+ 2300 val perplexity 1416.0649
148
+ 2300 train 7.237532 (lr=6.3128e-05) (hash(x)=47377604)
149
+ 2500 val loss 7.3607
150
+ 2500 val perplexity 1573.0103
151
+ 2500 train 7.371253 (lr=4.4156e-05) (hash(x)=50780417)
152
+ 2400 val loss 7.2298
153
+ 2400 val perplexity 1379.9376
154
+ 2400 train 7.275304 (lr=6.2486e-05) (hash(x)=53554323)
155
+ 2600 val loss 7.3246
156
+ 2600 val perplexity 1517.1007
157
+ 2600 train 7.223897 (lr=4.3663e-05) (hash(x)=46453562)
158
+ 2500 val loss 7.2124
159
+ 2500 val perplexity 1356.2070
160
+ 2500 train 7.234367 (lr=6.1819e-05) (hash(x)=50780417)
161
+ 2700 val loss 7.3181
162
+ 2700 val perplexity 1507.3157
163
+ 2700 train 7.332546 (lr=4.3153e-05) (hash(x)=54404221)
164
+ 2600 val loss 7.1852
165
+ 2600 val perplexity 1319.7157
166
+ 2600 train 7.078228 (lr=6.1128e-05) (hash(x)=46453562)
167
+ 2800 val loss 7.3003
168
+ 2800 val perplexity 1480.7711
169
+ 2800 train 8.040918 (lr=4.2627e-05) (hash(x)=59318895)
170
+ 2700 val loss 7.1844
171
+ 2700 val perplexity 1318.7426
172
+ 2700 train 7.171799 (lr=6.0414e-05) (hash(x)=54404221)
173
+ 2900 val loss 7.3080
174
+ 2900 val perplexity 1492.1636
175
+ 2900 train 7.285590 (lr=4.2085e-05) (hash(x)=47845760)
176
+ 2800 val loss 7.1699
177
+ 2800 val perplexity 1299.7759
178
+ 2800 train 7.846885 (lr=5.9677e-05) (hash(x)=59318895)
179
+ 3000 val loss 7.2772
180
+ 3000 val perplexity 1446.9222
181
+ 3000 train 6.974861 (lr=4.1529e-05) (hash(x)=44336167)
182
+ 2900 val loss 7.1458
183
+ 2900 val perplexity 1268.7697
184
+ 2900 train 7.136559 (lr=5.8919e-05) (hash(x)=47845760)
185
+ 3100 val loss 7.2409
186
+ 3100 val perplexity 1395.3539
187
+ 3100 train 7.539340 (lr=4.0957e-05) (hash(x)=44479330)
188
+ 3000 val loss 7.1352
189
+ 3000 val perplexity 1255.3414
190
+ 3000 train 6.846057 (lr=5.8140e-05) (hash(x)=44336167)
191
+ 3200 val loss 7.2380
192
+ 3200 val perplexity 1391.3442
193
+ 3200 train 7.291301 (lr=4.0373e-05) (hash(x)=54593096)
194
+ 3100 val loss 7.1081
195
+ 3100 val perplexity 1221.8489
196
+ 3100 train 7.400929 (lr=5.7340e-05) (hash(x)=44479330)
197
+ 3300 val loss 7.2317
198
+ 3300 val perplexity 1382.6051
199
+ 3300 train 7.120160 (lr=3.9775e-05) (hash(x)=45347643)
200
+ 3200 val loss 7.1089
201
+ 3200 val perplexity 1222.8351
202
+ 3200 train 7.181127 (lr=5.6522e-05) (hash(x)=54593096)
203
+ 3400 val loss 7.1952
204
+ 3400 val perplexity 1332.9685
205
+ 3400 train 7.318199 (lr=3.9164e-05) (hash(x)=47797247)
206
+ 3300 val loss 7.0572
207
+ 3300 val perplexity 1161.2457
208
+ 3300 train 6.946620 (lr=5.5684e-05) (hash(x)=45347643)
209
+ 3500 val loss 7.1690
210
+ 3500 val perplexity 1298.5449
211
+ 3500 train 7.078238 (lr=3.8541e-05) (hash(x)=46115683)
212
+ 3400 val loss 7.0117
213
+ 3400 val perplexity 1109.5795
214
+ 3400 train 7.125251 (lr=5.4829e-05) (hash(x)=47797247)
215
+ 3600 val loss 7.1592
216
+ 3600 val perplexity 1285.8961
217
+ 3600 train 7.025664 (lr=3.7907e-05) (hash(x)=44502074)
218
+ 3500 val loss 6.9839
219
+ 3500 val perplexity 1079.1410
220
+ 3500 train 6.878851 (lr=5.3958e-05) (hash(x)=46115683)
221
+ 3700 val loss 7.1425
222
+ 3700 val perplexity 1264.6299
223
+ 3700 train 7.283512 (lr=3.7262e-05) (hash(x)=55388443)
224
+ 3600 val loss 6.9588
225
+ 3600 val perplexity 1052.3683
226
+ 3600 train 6.803612 (lr=5.3070e-05) (hash(x)=44502074)
227
+ 3800 val loss 7.1436
228
+ 3800 val perplexity 1266.0127
229
+ 3800 train 6.931653 (lr=3.6608e-05) (hash(x)=43790341)
230
+ 3900 val loss 7.1233
231
+ 3900 val perplexity 1240.5541
232
+ 3900 train 7.126774 (lr=3.5944e-05) (hash(x)=50013318)
233
+ 3700 val loss 6.9234
234
+ 3700 val perplexity 1015.7560
235
+ 3700 train 7.075602 (lr=5.2167e-05) (hash(x)=55388443)
236
+ 4000 val loss 7.1181
237
+ 4000 val perplexity 1234.0818
238
+ 4000 train 7.197229 (lr=3.5271e-05) (hash(x)=51704787)
239
+ 3800 val loss 6.9045
240
+ 3800 val perplexity 996.7182
241
+ 3800 train 6.700143 (lr=5.1251e-05) (hash(x)=43790341)
242
+ 4100 val loss 7.1075
243
+ 4100 val perplexity 1221.0736
244
+ 4100 train 7.178077 (lr=3.4590e-05) (hash(x)=50821964)
245
+ 3900 val loss 6.8876
246
+ 3900 val perplexity 980.0497
247
+ 3900 train 6.885145 (lr=5.0321e-05) (hash(x)=50013318)
248
+ 4200 val loss 7.1000
249
+ 4200 val perplexity 1211.9559
250
+ 4200 train 7.117894 (lr=3.3902e-05) (hash(x)=49675080)
251
+ 4000 val loss 6.8554
252
+ 4000 val perplexity 948.9904
253
+ 4000 train 6.929147 (lr=4.9379e-05) (hash(x)=51704787)
254
+ 4300 val loss 7.0877
255
+ 4300 val perplexity 1197.1761
256
+ 4300 train 6.751846 (lr=3.3207e-05) (hash(x)=43239281)
257
+ 4100 val loss 6.8422
258
+ 4100 val perplexity 936.5167
259
+ 4100 train 6.914855 (lr=4.8426e-05) (hash(x)=50821964)
260
+ 4400 val loss 7.0756
261
+ 4400 val perplexity 1182.7396
262
+ 4400 train 6.766150 (lr=3.2507e-05) (hash(x)=45076737)
263
+ 4200 val loss 6.8086
264
+ 4200 val perplexity 905.6094
265
+ 4200 train 6.836856 (lr=4.7463e-05) (hash(x)=49675080)
266
+ 4500 val loss 7.0691
267
+ 4500 val perplexity 1175.0457
268
+ 4500 train 7.197128 (lr=3.1801e-05) (hash(x)=57930262)
269
+ 4300 val loss 6.7909
270
+ 4300 val perplexity 889.7120
271
+ 4300 train 6.455951 (lr=4.6490e-05) (hash(x)=43239281)
272
+ 4600 val loss 7.0560
273
+ 4600 val perplexity 1159.7599
274
+ 4600 train 6.852347 (lr=3.1091e-05) (hash(x)=46721614)
275
+ 4400 val loss 6.7779
276
+ 4400 val perplexity 878.2114
277
+ 4400 train 6.462660 (lr=4.5509e-05) (hash(x)=45076737)
278
+ 4700 val loss 7.0485
279
+ 4700 val perplexity 1151.1281
280
+ 4700 train 6.876356 (lr=3.0377e-05) (hash(x)=49837920)
281
+ 4500 val loss 6.7576
282
+ 4500 val perplexity 860.5627
283
+ 4500 train 6.874951 (lr=4.4521e-05) (hash(x)=57930262)
284
+ 4800 val loss 7.0353
285
+ 4800 val perplexity 1135.9937
286
+ 4800 train 7.160434 (lr=2.9661e-05) (hash(x)=48380045)
287
+ 4600 val loss 6.7361
288
+ 4600 val perplexity 842.2592
289
+ 4600 train 6.513708 (lr=4.3527e-05) (hash(x)=46721614)
290
+ 4900 val loss 7.0306
291
+ 4900 val perplexity 1130.7338
292
+ 4900 train 6.909075 (lr=2.8942e-05) (hash(x)=44202577)
293
+ 4700 val loss 6.7285
294
+ 4700 val perplexity 835.9263
295
+ 4700 train 6.564556 (lr=4.2528e-05) (hash(x)=49837920)
296
+ 5000 val loss 7.0021
297
+ 5000 val perplexity 1098.8903
298
+ 5000 train 7.005834 (lr=2.8221e-05) (hash(x)=52038024)
299
+ 4800 val loss 6.7083
300
+ 4800 val perplexity 819.1469
301
+ 4800 train 6.879846 (lr=4.1525e-05) (hash(x)=48380045)
302
+ 5100 val loss 6.9806
303
+ 5100 val perplexity 1075.6146
304
+ 5100 train 7.158219 (lr=2.7500e-05) (hash(x)=53700038)
305
+ 4900 val loss 6.6987
306
+ 4900 val perplexity 811.3144
307
+ 4900 train 6.585225 (lr=4.0518e-05) (hash(x)=44202577)
308
+ 5200 val loss 6.9704
309
+ 5200 val perplexity 1064.6042
310
+ 5200 train 6.970703 (lr=2.6779e-05) (hash(x)=48137625)
311
+ 5000 val loss 6.6739
312
+ 5000 val perplexity 791.4561
313
+ 5000 train 6.682315 (lr=3.9510e-05) (hash(x)=52038024)
314
+ 5300 val loss 6.9584
315
+ 5300 val perplexity 1051.9539
316
+ 5300 train 6.836740 (lr=2.6058e-05) (hash(x)=43161573)
317
+ 5100 val loss 6.6505
318
+ 5100 val perplexity 773.1574
319
+ 5100 train 6.805350 (lr=3.8500e-05) (hash(x)=53700038)
320
+ 5400 val loss 6.9467
321
+ 5400 val perplexity 1039.6730
322
+ 5400 train 7.132883 (lr=2.5339e-05) (hash(x)=56673322)
323
+ 5200 val loss 6.6534
324
+ 5200 val perplexity 775.4347
325
+ 5200 train 6.657163 (lr=3.7490e-05) (hash(x)=48137625)
326
+ 5500 val loss 6.9299
327
+ 5500 val perplexity 1022.4167
328
+ 5500 train 7.111983 (lr=2.4623e-05) (hash(x)=53468295)
329
+ 5300 val loss 6.6340
330
+ 5300 val perplexity 760.5112
331
+ 5300 train 6.477094 (lr=3.6482e-05) (hash(x)=43161573)
332
+ 5600 val loss 6.9158
333
+ 5600 val perplexity 1008.0984
334
+ 5600 train 7.229111 (lr=2.3909e-05) (hash(x)=59287280)
335
+ 5400 val loss 6.6298
336
+ 5400 val perplexity 757.3374
337
+ 5400 train 6.820119 (lr=3.5475e-05) (hash(x)=56673322)
338
+ 5700 val loss 6.9053
339
+ 5700 val perplexity 997.5583
340
+ 5700 train 7.143720 (lr=2.3199e-05) (hash(x)=57575806)
341
+ 5500 val loss 6.6147
342
+ 5500 val perplexity 746.0064
343
+ 5500 train 6.824338 (lr=3.4472e-05) (hash(x)=53468295)
344
+ 5800 val loss 6.8949
345
+ 5800 val perplexity 987.2734
346
+ 5800 train 6.836562 (lr=2.2493e-05) (hash(x)=46897279)
347
+ 5600 val loss 6.6057
348
+ 5600 val perplexity 739.3046
349
+ 5600 train 6.898512 (lr=3.3473e-05) (hash(x)=59287280)
350
+ 5900 val loss 6.8898
351
+ 5900 val perplexity 982.2181
352
+ 5900 train 6.771390 (lr=2.1793e-05) (hash(x)=47565679)
353
+ 5700 val loss 6.5965
354
+ 5700 val perplexity 732.5139
355
+ 5700 train 6.830234 (lr=3.2479e-05) (hash(x)=57575806)
356
+ 6000 val loss 6.8769
357
+ 6000 val perplexity 969.5782
358
+ 6000 train 6.764159 (lr=2.1098e-05) (hash(x)=51590090)
359
+ 5800 val loss 6.5881
360
+ 5800 val perplexity 726.3762
361
+ 5800 train 6.606775 (lr=3.1491e-05) (hash(x)=46897279)
362
+ 6100 val loss 6.8705
363
+ 6100 val perplexity 963.4629
364
+ 6100 train 7.256005 (lr=2.0410e-05) (hash(x)=59732271)
365
+ 5900 val loss 6.5897
366
+ 5900 val perplexity 727.5562
367
+ 5900 train 6.480405 (lr=3.0510e-05) (hash(x)=47565679)
368
+ 6200 val loss 6.8648
369
+ 6200 val perplexity 957.9373
370
+ 6200 train 6.838703 (lr=1.9729e-05) (hash(x)=46394422)
371
+ 6000 val loss 6.5795
372
+ 6000 val perplexity 720.1879
373
+ 6000 train 6.441273 (lr=2.9537e-05) (hash(x)=51590090)
374
+ 6300 val loss 6.8538
375
+ 6300 val perplexity 947.4933
376
+ 6300 train 6.875601 (lr=1.9056e-05) (hash(x)=53748145)
377
+ 6100 val loss 6.5637
378
+ 6100 val perplexity 708.8743
379
+ 6100 train 7.011058 (lr=2.8574e-05) (hash(x)=59732271)
380
+ 6400 val loss 6.8485
381
+ 6400 val perplexity 942.4641
382
+ 6400 train 6.731420 (lr=1.8392e-05) (hash(x)=46054751)
383
+ 6200 val loss 6.5600
384
+ 6200 val perplexity 706.2467
385
+ 6200 train 6.588350 (lr=2.7621e-05) (hash(x)=46394422)
386
+ 6500 val loss 6.8386
387
+ 6500 val perplexity 933.1525
388
+ 6500 train 7.054920 (lr=1.7738e-05) (hash(x)=51816809)
389
+ 6300 val loss 6.5540
390
+ 6300 val perplexity 702.0677
391
+ 6300 train 6.569966 (lr=2.6679e-05) (hash(x)=53748145)
392
+ 6600 val loss 6.8378
393
+ 6600 val perplexity 932.3933
394
+ 6600 train 6.719009 (lr=1.7093e-05) (hash(x)=52453336)
395
+ 6400 val loss 6.5467
396
+ 6400 val perplexity 696.9398
397
+ 6400 train 6.418003 (lr=2.5749e-05) (hash(x)=46054751)
398
+ 6700 val loss 6.8274
399
+ 6700 val perplexity 922.7719
400
+ 6700 train 6.842637 (lr=1.6459e-05) (hash(x)=49108775)
401
+ 6800 val loss 6.8191
402
+ 6800 val perplexity 915.1290
403
+ 6800 train 6.744590 (lr=1.5836e-05) (hash(x)=46745396)
404
+ 6500 val loss 6.5368
405
+ 6500 val perplexity 690.0514
406
+ 6500 train 6.811157 (lr=2.4833e-05) (hash(x)=51816809)
407
+ 6900 val loss 6.8036
408
+ 6900 val perplexity 901.1028
409
+ 6900 train 6.938798 (lr=1.5225e-05) (hash(x)=46534986)
410
+ 6600 val loss 6.5317
411
+ 6600 val perplexity 686.5802
412
+ 6600 train 6.414201 (lr=2.3930e-05) (hash(x)=52453336)
413
+ 7000 val loss 6.7982
414
+ 7000 val perplexity 896.2391
415
+ 7000 train 7.138734 (lr=1.4627e-05) (hash(x)=49317888)
416
+ 6700 val loss 6.5316
417
+ 6700 val perplexity 686.5275
418
+ 6700 train 6.547672 (lr=2.3042e-05) (hash(x)=49108775)
419
+ 7100 val loss 6.7898
420
+ 7100 val perplexity 888.7249
421
+ 7100 train 6.800003 (lr=1.4043e-05) (hash(x)=50360484)
422
+ 6800 val loss 6.5200
423
+ 6800 val perplexity 678.6013
424
+ 6800 train 6.438734 (lr=2.2171e-05) (hash(x)=46745396)
425
+ 7200 val loss 6.7843
426
+ 7200 val perplexity 883.8223
427
+ 7200 train 6.675083 (lr=1.3471e-05) (hash(x)=49515094)
428
+ 6900 val loss 6.5101
429
+ 6900 val perplexity 671.8820
430
+ 6900 train 6.690259 (lr=2.1316e-05) (hash(x)=46534986)
431
+ 7300 val loss 6.7862
432
+ 7300 val perplexity 885.5561
433
+ 7300 train 6.775707 (lr=1.2915e-05) (hash(x)=51546861)
434
+ 7000 val loss 6.4946
435
+ 7000 val perplexity 661.5610
436
+ 7000 train 6.878696 (lr=2.0478e-05) (hash(x)=49317888)
437
+ 7400 val loss 6.7751
438
+ 7400 val perplexity 875.7767
439
+ 7400 train 6.777234 (lr=1.2373e-05) (hash(x)=48320948)
440
+ 7100 val loss 6.4876
441
+ 7100 val perplexity 656.9729
442
+ 7100 train 6.499301 (lr=1.9660e-05) (hash(x)=50360484)
443
+ 7500 val loss 6.7680
444
+ 7500 val perplexity 869.6076
445
+ 7500 train 6.685040 (lr=1.1847e-05) (hash(x)=40167457)
446
+ 7200 val loss 6.4838
447
+ 7200 val perplexity 654.4562
448
+ 7200 train 6.345729 (lr=1.8860e-05) (hash(x)=49515094)
449
+ 7600 val loss 6.7610
450
+ 7600 val perplexity 863.4881
451
+ 7600 train 6.716415 (lr=1.1337e-05) (hash(x)=49942165)
452
+ 7300 val loss 6.4852
453
+ 7300 val perplexity 655.3403
454
+ 7300 train 6.473186 (lr=1.8081e-05) (hash(x)=51546861)
455
+ 7700 val loss 6.7578
456
+ 7700 val perplexity 860.7277
457
+ 7700 train 6.508514 (lr=1.0844e-05) (hash(x)=48853311)
458
+ 7400 val loss 6.4816
459
+ 7400 val perplexity 653.0450
460
+ 7400 train 6.474329 (lr=1.7323e-05) (hash(x)=48320948)
461
+ 7800 val loss 6.7528
462
+ 7800 val perplexity 856.4212
463
+ 7800 train 6.658655 (lr=1.0367e-05) (hash(x)=48510117)
464
+ 7500 val loss 6.4817
465
+ 7500 val perplexity 653.0565
466
+ 7500 train 6.411730 (lr=1.6586e-05) (hash(x)=40167457)
467
+ 7900 val loss 6.7509
468
+ 7900 val perplexity 854.8618
469
+ 7900 train 6.762117 (lr=9.9088e-06) (hash(x)=48339781)
470
+ 7600 val loss 6.4654
471
+ 7600 val perplexity 642.5402
472
+ 7600 train 6.415967 (lr=1.5872e-05) (hash(x)=49942165)
473
+ 8000 val loss 6.7459
474
+ 8000 val perplexity 850.5717
475
+ 8000 train 6.878915 (lr=9.4682e-06) (hash(x)=54927320)
476
+ 7700 val loss 6.4599
477
+ 7700 val perplexity 638.9849
478
+ 7700 train 6.188712 (lr=1.5181e-05) (hash(x)=48853311)
479
+ 8100 val loss 6.7406
480
+ 8100 val perplexity 846.0574
481
+ 8100 train 6.488393 (lr=9.0461e-06) (hash(x)=46461786)
482
+ 7800 val loss 6.4578
483
+ 7800 val perplexity 637.6554
484
+ 7800 train 6.345714 (lr=1.4514e-05) (hash(x)=48510117)
485
+ 8200 val loss 6.7377
486
+ 8200 val perplexity 843.5936
487
+ 8200 train 6.667369 (lr=8.6430e-06) (hash(x)=51536260)
488
+ 7900 val loss 6.4579
489
+ 7900 val perplexity 637.7202
490
+ 7900 train 6.452045 (lr=1.3872e-05) (hash(x)=48339781)
491
+ 8300 val loss 6.7352
492
+ 8300 val perplexity 841.4913
493
+ 8300 train 6.494115 (lr=8.2593e-06) (hash(x)=44770722)
494
+ 8000 val loss 6.4506
495
+ 8000 val perplexity 633.1036
496
+ 8000 train 6.566749 (lr=1.3255e-05) (hash(x)=54927320)
497
+ 8400 val loss 6.7321
498
+ 8400 val perplexity 838.9099
499
+ 8400 train 6.740037 (lr=7.8953e-06) (hash(x)=50104957)
500
+ 8100 val loss 6.4473
501
+ 8100 val perplexity 631.0035
502
+ 8100 train 6.158883 (lr=1.2665e-05) (hash(x)=46461786)
503
+ 8500 val loss 6.7283
504
+ 8500 val perplexity 835.6963
505
+ 8500 train 6.834835 (lr=7.5515e-06) (hash(x)=50132971)
506
+ 8200 val loss 6.4479
507
+ 8200 val perplexity 631.3887
508
+ 8200 train 6.372143 (lr=1.2100e-05) (hash(x)=51536260)
509
+ 8600 val loss 6.7235
510
+ 8600 val perplexity 831.6856
511
+ 8600 train 6.676203 (lr=7.2282e-06) (hash(x)=52193699)
512
+ 8300 val loss 6.4479
513
+ 8300 val perplexity 631.3722
514
+ 8300 train 6.184940 (lr=1.1563e-05) (hash(x)=44770722)
515
+ 8700 val loss 6.7189
516
+ 8700 val perplexity 827.9468
517
+ 8700 train 6.706408 (lr=6.9257e-06) (hash(x)=47902319)
518
+ 8400 val loss 6.4421
519
+ 8400 val perplexity 627.6946
520
+ 8400 train 6.443494 (lr=1.1053e-05) (hash(x)=50104957)
521
+ 8800 val loss 6.7133
522
+ 8800 val perplexity 823.3087
523
+ 8800 train 6.985338 (lr=6.6444e-06) (hash(x)=54904230)
524
+ 8500 val loss 6.4391
525
+ 8500 val perplexity 625.8142
526
+ 8500 train 6.534442 (lr=1.0572e-05) (hash(x)=50132971)
527
+ 8900 val loss 6.7132
528
+ 8900 val perplexity 823.1639
529
+ 8900 train 6.608115 (lr=6.3845e-06) (hash(x)=46311615)
530
+ 8600 val loss 6.4317
531
+ 8600 val perplexity 621.2293
532
+ 8600 train 6.393053 (lr=1.0119e-05) (hash(x)=52193699)
533
+ 9000 val loss 6.7123
534
+ 9000 val perplexity 822.4835
535
+ 9000 train 6.588591 (lr=6.1462e-06) (hash(x)=48535188)
536
+ 8700 val loss 6.4298
537
+ 8700 val perplexity 620.0411
538
+ 8700 train 6.417854 (lr=9.6960e-06) (hash(x)=47902319)
539
+ 9100 val loss 6.7054
540
+ 9100 val perplexity 816.8428
541
+ 9100 train 6.734750 (lr=5.9300e-06) (hash(x)=51757372)
542
+ 8800 val loss 6.4236
543
+ 8800 val perplexity 616.1947
544
+ 8800 train 6.713367 (lr=9.3021e-06) (hash(x)=54904230)
545
+ 9200 val loss 6.7031
546
+ 9200 val perplexity 814.9101
547
+ 9200 train 6.588158 (lr=5.7359e-06) (hash(x)=51131708)
548
+ 8900 val loss 6.4212
549
+ 8900 val perplexity 614.7217
550
+ 8900 train 6.308746 (lr=8.9382e-06) (hash(x)=46311615)
551
+ 9300 val loss 6.7031
552
+ 9300 val perplexity 814.9078
553
+ 9300 train 6.644286 (lr=5.5641e-06) (hash(x)=44784276)
554
+ 9000 val loss 6.4235
555
+ 9000 val perplexity 616.1304
556
+ 9000 train 6.292938 (lr=8.6047e-06) (hash(x)=48535188)
557
+ 9400 val loss 6.7000
558
+ 9400 val perplexity 812.3867
559
+ 9400 train 6.797466 (lr=5.4149e-06) (hash(x)=51981169)
560
+ 9100 val loss 6.4145
561
+ 9100 val perplexity 610.6580
562
+ 9100 train 6.448308 (lr=8.3020e-06) (hash(x)=51757372)
563
+ 9500 val loss 6.6971
564
+ 9500 val perplexity 810.0333
565
+ 9500 train 6.648178 (lr=5.2884e-06) (hash(x)=47232936)
566
+ 9200 val loss 6.4128
567
+ 9200 val perplexity 609.6115
568
+ 9200 train 6.280666 (lr=8.0302e-06) (hash(x)=51131708)
569
+ 9600 val loss 6.6957
570
+ 9600 val perplexity 808.9078
571
+ 9600 train 6.730829 (lr=5.1847e-06) (hash(x)=53800450)
572
+ 9300 val loss 6.4138
573
+ 9300 val perplexity 610.1989
574
+ 9300 train 6.370407 (lr=7.7898e-06) (hash(x)=44784276)
575
+ 9700 val loss 6.6942
576
+ 9700 val perplexity 807.7153
577
+ 9700 train 6.812382 (lr=5.1040e-06) (hash(x)=55768123)
578
+ 9400 val loss 6.4115
579
+ 9400 val perplexity 608.8001
580
+ 9400 train 6.524776 (lr=7.5809e-06) (hash(x)=51981169)
581
+ 9800 val loss 6.6956
582
+ 9800 val perplexity 808.8314
583
+ 9800 train 6.635161 (lr=5.0462e-06) (hash(x)=47745177)
584
+ 9500 val loss 6.4100
585
+ 9500 val perplexity 607.8681
586
+ 9500 train 6.373538 (lr=7.4038e-06) (hash(x)=47232936)
587
+ 9900 val loss 6.6930
588
+ 9900 val perplexity 806.7683
589
+ 9900 train 6.937419 (lr=5.0116e-06) (hash(x)=56592246)
590
+ 9600 val loss 6.4066
591
+ 9600 val perplexity 605.8401
592
+ 9600 train 6.441170 (lr=7.2586e-06) (hash(x)=53800450)
593
+ 9999 val loss 6.6932
594
+ 9999 val perplexity 806.9299
595
+ 9700 val loss 6.4071
596
+ 9700 val perplexity 606.1292
597
+ 9700 train 6.529077 (lr=7.1456e-06) (hash(x)=55768123)
598
+ 9800 val loss 6.4078
599
+ 9800 val perplexity 606.5295
600
+ 9800 train 6.343826 (lr=7.0647e-06) (hash(x)=47745177)
601
+ 9900 val loss 6.4075
602
+ 9900 val perplexity 606.3977
603
+ 9900 train 6.676889 (lr=7.0162e-06) (hash(x)=56592246)
attention_kindselective_n_heads2_seed1340/model_02500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c6723edebcb92ca2e72054a32701bc25d1fc927f93c9e3ffc3e865f8b7415e5
3
  size 38587970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c94c460d7c2e918bd19dd6c7d0a40fbf731522e2be5d922fe6d210895f4b6cd
3
  size 38587970
attention_kindselective_n_heads2_seed1340/model_05000.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03f9fd53f979fe8cf9bf929f3f9ccb04af6199807a56cf30e6401e200e0c1bf4
3
  size 38587970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99967d2565438966d9e46b6de33f253f4b8e326c9f4cfe096170054f1ecb7401
3
  size 38587970
attention_kindselective_n_heads2_seed1340/model_07500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc16680a7b7cd0eb8bab693bc1270640af820af29ff05c8484576c829dca657b
3
  size 38587970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fc8260bce2fd1ee0a25bb0aefdf1b927435a4e83a5f3001b7cd9a010becef3e
3
  size 38587970
attention_kindselective_n_heads2_seed1340/model_09999.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bee304b6d21be05c3175629877a7130ec354483d6f47805582d72d4e56b8b5f5
3
  size 38587970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffa02ec9ceef9ae047bb6c9f2df20bb20a78f2a2d95be921f8573aad83aa4f34
3
  size 38587970
attention_kindselective_n_heads2_seed1340/optimizer_02500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eaf36019f143c8ca2d052bd8495bde7f6853829697fcff4a1af930a2ed55ee8a
3
  size 70895430
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e00018265047cc47ecf529e2cb77c9da38ab5ffdb63eb4dd62a77f724a4e61f1
3
  size 70895430
attention_kindselective_n_heads2_seed1340/optimizer_05000.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a683404880ddecba30f3c5ded4e51f96de7555ad774fd34351ff9d5bc661c4b
3
  size 70895430
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1e45fc5874ff2b4f6e33851010911bf7688145d2c6254585473d064c73335ec
3
  size 70895430
attention_kindselective_n_heads2_seed1340/optimizer_07500.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:51bfb9ae26523dc1ea5f3a0fad4a6e93f8c6ce10ef08bbcd9d25e43431a4005b
3
  size 70895430
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fa9a215741606396ff07a59b06d77363e8598933439c4e40b9445a216a12bff
3
  size 70895430
attention_kindselective_n_heads2_seed1340/optimizer_09999.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8966865e8fa15c38fb89509c163897db98961bbf4dcf7cb885b3b3ff5e1a6ddd
3
  size 70895430
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b894c3c084b1f0ca1c18954ebede742cd1b3528fd4d2b7daeef15cd86b6a87b
3
  size 70895430