andrew-healey commited on
Commit
5f82127
·
verified ·
1 Parent(s): c88c4de

Upload folder using huggingface_hub

Browse files
attention_kindselective_n_heads4_seed1344/args.json CHANGED
@@ -1 +1 @@
1
- {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_9/attention_kindselective_n_heads4_seed1344", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_9", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1344, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 4e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "4e-5_61440_4_1344", "n_embd": 256}
 
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_9/attention_kindselective_n_heads4_seed1344", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_9", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1344, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 3.5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "3.5e-5_61440_4_1344", "n_embd": 256}
attention_kindselective_n_heads4_seed1344/log2.txt CHANGED
@@ -1,494 +1,534 @@
1
  max_steps: 8750
2
- 1300 val loss 6.6895
3
- 1300 val perplexity 803.9285
4
- 1300 train 6.773464 (lr=4.8964e-05) (hash(x)=173675040)
5
  0 val loss 11.2831
6
- 0 val perplexity 79465.5000
7
- 1400 val loss 6.6173
8
- 1400 val perplexity 747.9091
9
- 1400 train 6.590247 (lr=4.8691e-05) (hash(x)=162475906)
10
- 0 train 11.285261 (lr=8.0000e-08) (hash(x)=145830960)
11
- 1500 val loss 6.5603
12
- 1500 val perplexity 706.4640
13
- 1500 train 6.387834 (lr=4.8388e-05) (hash(x)=152785612)
14
- 100 val loss 10.1076
15
- 100 val perplexity 24529.0078
16
- 100 train 10.141041 (lr=8.0800e-06) (hash(x)=144157453)
17
- 1600 val loss 6.4984
18
- 1600 val perplexity 664.0759
19
- 1600 train 6.445701 (lr=4.8055e-05) (hash(x)=151987383)
20
- 200 val loss 9.6421
21
- 200 val perplexity 15400.2119
22
- 200 train 9.681087 (lr=1.6080e-05) (hash(x)=146764602)
23
- 1700 val loss 6.4370
24
- 1700 val perplexity 624.5526
25
- 1700 train 6.434201 (lr=4.7691e-05) (hash(x)=143781605)
26
- 300 val loss 8.5994
27
- 300 val perplexity 5428.6685
28
- 300 train 8.501122 (lr=2.4080e-05) (hash(x)=140559124)
29
- 1800 val loss 6.4001
30
- 1800 val perplexity 601.8904
31
- 1800 train 6.390086 (lr=4.7299e-05) (hash(x)=151040203)
32
- 1900 val loss 6.3648
33
- 1900 val perplexity 581.0037
34
- 1900 train 6.261320 (lr=4.6878e-05) (hash(x)=149414572)
35
- 400 val loss 7.7656
36
- 400 val perplexity 2358.1003
37
- 400 train 8.115546 (lr=3.2080e-05) (hash(x)=166024176)
38
- 2000 val loss 6.3277
39
- 2000 val perplexity 559.8475
40
- 2000 train 6.281135 (lr=4.6428e-05) (hash(x)=156479674)
41
- 500 val loss 7.5214
42
- 500 val perplexity 1847.1365
43
- 500 train 7.493972 (lr=4.0000e-05) (hash(x)=161040668)
44
- 2100 val loss 6.2903
45
- 2100 val perplexity 539.3040
46
- 2100 train 6.165673 (lr=4.5951e-05) (hash(x)=137861481)
47
- 600 val loss 7.4516
48
- 600 val perplexity 1722.6665
49
- 600 train 7.479457 (lr=3.9987e-05) (hash(x)=148878990)
50
- 2200 val loss 6.2634
51
- 2200 val perplexity 524.9904
52
- 2200 train 6.324560 (lr=4.5448e-05) (hash(x)=155937443)
53
- 700 val loss 7.3861
54
- 700 val perplexity 1613.4520
55
- 700 train 7.367638 (lr=3.9948e-05) (hash(x)=150420695)
56
- 2300 val loss 6.2612
57
- 2300 val perplexity 523.8252
58
- 2300 train 5.928801 (lr=4.4918e-05) (hash(x)=131035715)
59
- 800 val loss 7.3272
60
- 800 val perplexity 1521.0419
61
- 800 train 7.300409 (lr=3.9883e-05) (hash(x)=151995229)
62
- 2400 val loss 6.2118
63
- 2400 val perplexity 498.5806
64
- 2400 train 6.209708 (lr=4.4363e-05) (hash(x)=146568981)
65
- 900 val loss 7.2836
66
- 900 val perplexity 1456.1638
67
- 900 train 7.042968 (lr=3.9792e-05) (hash(x)=139853932)
68
- 2500 val loss 6.1798
69
- 2500 val perplexity 482.9110
70
- 2500 train 6.160361 (lr=4.3784e-05) (hash(x)=163317586)
71
- 1000 val loss 7.2349
72
- 1000 val perplexity 1386.9896
73
- 1000 train 7.431800 (lr=3.9675e-05) (hash(x)=174104207)
74
- 2600 val loss 6.1566
75
- 2600 val perplexity 471.8229
76
- 2600 train 5.974750 (lr=4.3181e-05) (hash(x)=144201060)
77
- 1100 val loss 7.1852
78
- 1100 val perplexity 1319.7642
79
- 1100 train 7.018353 (lr=3.9532e-05) (hash(x)=146275038)
80
- 2700 val loss 6.1323
81
- 2700 val perplexity 460.5000
82
- 2700 train 6.032785 (lr=4.2555e-05) (hash(x)=141825701)
83
- 1200 val loss 7.1329
84
- 1200 val perplexity 1252.4744
85
- 1200 train 6.912647 (lr=3.9364e-05) (hash(x)=141403655)
86
- 2800 val loss 6.1118
87
- 2800 val perplexity 451.1669
88
- 2800 train 6.219571 (lr=4.1908e-05) (hash(x)=160561627)
89
- 1300 val loss 7.0764
90
- 1300 val perplexity 1183.7563
91
- 1300 train 7.156072 (lr=3.9171e-05) (hash(x)=173675040)
92
- 2900 val loss 6.0842
93
- 2900 val perplexity 438.8623
94
- 2900 train 6.021353 (lr=4.1240e-05) (hash(x)=151758176)
95
- 1400 val loss 7.0171
96
- 1400 val perplexity 1115.5658
97
- 1400 train 6.980552 (lr=3.8953e-05) (hash(x)=162475906)
98
- 3000 val loss 6.0656
99
- 3000 val perplexity 430.7632
100
- 3000 train 6.088512 (lr=4.0551e-05) (hash(x)=155815751)
101
- 1500 val loss 6.9736
102
- 1500 val perplexity 1068.0918
103
- 1500 train 6.806849 (lr=3.8711e-05) (hash(x)=152785612)
104
- 3100 val loss 6.0532
105
- 3100 val perplexity 425.4607
106
- 3100 train 5.723763 (lr=3.9844e-05) (hash(x)=142307043)
107
- 1600 val loss 6.9146
108
- 1600 val perplexity 1006.8359
109
- 1600 train 6.869276 (lr=3.8444e-05) (hash(x)=151987383)
110
- 3200 val loss 6.0339
111
- 3200 val perplexity 417.3221
112
- 3200 train 6.141800 (lr=3.9119e-05) (hash(x)=156310690)
113
- 1700 val loss 6.8514
114
- 1700 val perplexity 945.2108
115
- 1700 train 6.829998 (lr=3.8153e-05) (hash(x)=143781605)
116
- 3300 val loss 6.0083
117
- 3300 val perplexity 406.8000
118
- 3300 train 6.009042 (lr=3.8377e-05) (hash(x)=151344506)
119
- 1800 val loss 6.8108
120
- 1800 val perplexity 907.5953
121
- 1800 train 6.784783 (lr=3.7839e-05) (hash(x)=151040203)
122
- 3400 val loss 5.9890
123
- 3400 val perplexity 399.0146
124
- 3400 train 6.022029 (lr=3.7619e-05) (hash(x)=162911881)
125
- 1900 val loss 6.7506
126
- 1900 val perplexity 854.5989
127
- 1900 train 6.654410 (lr=3.7502e-05) (hash(x)=149414572)
128
- 3500 val loss 5.9747
129
- 3500 val perplexity 393.3347
130
- 3500 train 5.916205 (lr=3.6847e-05) (hash(x)=142889971)
131
- 2000 val loss 6.6949
132
- 2000 val perplexity 808.3021
133
- 2000 train 6.656959 (lr=3.7143e-05) (hash(x)=156479674)
134
- 3600 val loss 5.9525
135
- 3600 val perplexity 384.7280
136
- 3600 train 5.941948 (lr=3.6061e-05) (hash(x)=149470354)
137
- 2100 val loss 6.6422
138
- 2100 val perplexity 766.7804
139
- 2100 train 6.508312 (lr=3.6761e-05) (hash(x)=137861481)
140
- 3700 val loss 5.9335
141
- 3700 val perplexity 377.4560
142
- 3700 train 5.878082 (lr=3.5263e-05) (hash(x)=154194821)
143
- 2200 val loss 6.5950
144
- 2200 val perplexity 731.4640
145
- 2200 train 6.654806 (lr=3.6358e-05) (hash(x)=155937443)
146
- 3800 val loss 5.9256
147
- 3800 val perplexity 374.5177
148
- 3800 train 5.851969 (lr=3.4453e-05) (hash(x)=148885848)
149
- 2300 val loss 6.5656
150
- 2300 val perplexity 710.2440
151
- 2300 train 6.250451 (lr=3.5935e-05) (hash(x)=131035715)
152
- 3900 val loss 5.9066
153
- 3900 val perplexity 367.4492
154
- 3900 train 5.823092 (lr=3.3633e-05) (hash(x)=148308484)
155
- 2400 val loss 6.5147
156
- 2400 val perplexity 675.0030
157
- 2400 train 6.513992 (lr=3.5491e-05) (hash(x)=146568981)
158
- 4000 val loss 5.8898
159
- 4000 val perplexity 361.3280
160
- 4000 train 5.728326 (lr=3.2805e-05) (hash(x)=139828564)
161
- 4100 val loss 5.8785
162
- 4100 val perplexity 357.2629
163
- 4100 train 5.783032 (lr=3.1968e-05) (hash(x)=139981997)
164
- 2500 val loss 6.4747
165
- 2500 val perplexity 648.5444
166
- 2500 train 6.444137 (lr=3.5027e-05) (hash(x)=163317586)
167
- 4200 val loss 5.8653
168
- 4200 val perplexity 352.5873
169
- 4200 train 6.020027 (lr=3.1126e-05) (hash(x)=150738447)
170
- 2600 val loss 6.4492
171
- 2600 val perplexity 632.2130
172
- 2600 train 6.273980 (lr=3.4545e-05) (hash(x)=144201060)
173
- 4300 val loss 5.8484
174
- 4300 val perplexity 346.6892
175
- 4300 train 5.620984 (lr=3.0277e-05) (hash(x)=142198107)
176
- 2700 val loss 6.4168
177
- 2700 val perplexity 612.0558
178
- 2700 train 6.313120 (lr=3.4044e-05) (hash(x)=141825701)
179
- 4400 val loss 5.8384
180
- 4400 val perplexity 343.2307
181
- 4400 train 5.713844 (lr=2.9425e-05) (hash(x)=142731201)
182
- 2800 val loss 6.3883
183
- 2800 val perplexity 594.8422
184
- 2800 train 6.510801 (lr=3.3526e-05) (hash(x)=160561627)
185
- 4500 val loss 5.8210
186
- 4500 val perplexity 337.3009
187
- 4500 train 5.849483 (lr=2.8571e-05) (hash(x)=154814426)
188
- 2900 val loss 6.3587
189
- 2900 val perplexity 577.5201
190
- 2900 train 6.310970 (lr=3.2992e-05) (hash(x)=151758176)
191
- 4600 val loss 5.8097
192
- 4600 val perplexity 333.5216
193
- 4600 train 5.787569 (lr=2.7714e-05) (hash(x)=155922230)
194
- 3000 val loss 6.3336
195
- 3000 val perplexity 563.1740
196
- 3000 train 6.341168 (lr=3.2441e-05) (hash(x)=155815751)
197
- 4700 val loss 5.8002
198
- 4700 val perplexity 330.3598
199
- 4700 train 5.552835 (lr=2.6857e-05) (hash(x)=139398510)
200
- 3100 val loss 6.3187
201
- 3100 val perplexity 554.8702
202
- 3100 train 5.986887 (lr=3.1875e-05) (hash(x)=142307043)
203
- 4800 val loss 5.7928
204
- 4800 val perplexity 327.9305
205
- 4800 train 5.851633 (lr=2.6002e-05) (hash(x)=140893236)
206
- 3200 val loss 6.2920
207
- 3200 val perplexity 540.2058
208
- 3200 train 6.404789 (lr=3.1295e-05) (hash(x)=156310690)
209
- 4900 val loss 5.7760
210
- 4900 val perplexity 322.4515
211
- 4900 train 5.668627 (lr=2.5148e-05) (hash(x)=153747830)
212
- 3300 val loss 6.2681
213
- 3300 val perplexity 527.4542
214
- 3300 train 6.271742 (lr=3.0702e-05) (hash(x)=151344506)
215
- 5000 val loss 5.7650
216
- 5000 val perplexity 318.9458
217
- 5000 train 6.023503 (lr=2.4298e-05) (hash(x)=148919005)
218
- 3400 val loss 6.2469
219
- 3400 val perplexity 516.4142
220
- 3400 train 6.282847 (lr=3.0095e-05) (hash(x)=162911881)
221
- 5100 val loss 5.7579
222
- 5100 val perplexity 316.6893
223
- 5100 train 5.549688 (lr=2.3452e-05) (hash(x)=142281936)
224
- 3500 val loss 6.2319
225
- 3500 val perplexity 508.7363
226
- 3500 train 6.170129 (lr=2.9477e-05) (hash(x)=142889971)
227
- 5200 val loss 5.7594
228
- 5200 val perplexity 317.1647
229
- 5200 train 5.555599 (lr=2.2613e-05) (hash(x)=143162650)
230
- 3600 val loss 6.2112
231
- 3600 val perplexity 498.2821
232
- 3600 train 6.204837 (lr=2.8849e-05) (hash(x)=149470354)
233
- 5300 val loss 5.7368
234
- 5300 val perplexity 310.0715
235
- 5300 train 5.798581 (lr=2.1780e-05) (hash(x)=148546849)
236
- 3700 val loss 6.1918
237
- 3700 val perplexity 488.7092
238
- 3700 train 6.136409 (lr=2.8210e-05) (hash(x)=154194821)
239
- 5400 val loss 5.7275
240
- 5400 val perplexity 307.2088
241
- 5400 train 5.588672 (lr=2.0956e-05) (hash(x)=143492259)
242
- 3800 val loss 6.1763
243
- 3800 val perplexity 481.2229
244
- 3800 train 6.109374 (lr=2.7562e-05) (hash(x)=148885848)
245
- 5500 val loss 5.7216
246
- 5500 val perplexity 305.4041
247
- 5500 train 5.477193 (lr=2.0141e-05) (hash(x)=141023941)
248
- 3900 val loss 6.1563
249
- 3900 val perplexity 471.6664
250
- 3900 train 6.070787 (lr=2.6907e-05) (hash(x)=148308484)
251
- 5600 val loss 5.7205
252
- 5600 val perplexity 305.0673
253
- 5600 train 5.603661 (lr=1.9337e-05) (hash(x)=142065021)
254
- 4000 val loss 6.1397
255
- 4000 val perplexity 463.9084
256
- 4000 train 5.980125 (lr=2.6244e-05) (hash(x)=139828564)
257
- 5700 val loss 5.7037
258
- 5700 val perplexity 299.9660
259
- 5700 train 5.590576 (lr=1.8545e-05) (hash(x)=145749913)
260
- 4100 val loss 6.1250
261
- 4100 val perplexity 457.1491
262
- 4100 train 6.032627 (lr=2.5575e-05) (hash(x)=139981997)
263
- 5800 val loss 5.6974
264
- 5800 val perplexity 298.0915
265
- 5800 train 5.381054 (lr=1.7765e-05) (hash(x)=140035522)
266
- 4200 val loss 6.1090
267
- 4200 val perplexity 449.9003
268
- 4200 train 6.255210 (lr=2.4900e-05) (hash(x)=150738447)
269
- 5900 val loss 5.6954
270
- 5900 val perplexity 297.5097
271
- 5900 train 5.436858 (lr=1.7000e-05) (hash(x)=154752726)
272
- 4300 val loss 6.0933
273
- 4300 val perplexity 442.8800
274
- 4300 train 5.871885 (lr=2.4222e-05) (hash(x)=142198107)
275
- 6000 val loss 5.6885
276
- 6000 val perplexity 295.4404
277
- 6000 train 5.713228 (lr=1.6250e-05) (hash(x)=158715824)
278
- 4400 val loss 6.0813
279
- 4400 val perplexity 437.6097
280
- 4400 train 5.954550 (lr=2.3540e-05) (hash(x)=142731201)
281
- 6100 val loss 5.6800
282
- 6100 val perplexity 292.9566
283
- 6100 train 5.699704 (lr=1.5516e-05) (hash(x)=137413820)
284
- 4500 val loss 6.0630
285
- 4500 val perplexity 429.6517
286
- 4500 train 6.074458 (lr=2.2856e-05) (hash(x)=154814426)
287
- 6200 val loss 5.6700
288
- 6200 val perplexity 290.0225
289
- 6200 train 5.589405 (lr=1.4800e-05) (hash(x)=151507523)
290
- 4600 val loss 6.0509
291
- 4600 val perplexity 424.4775
292
- 4600 train 6.017695 (lr=2.2171e-05) (hash(x)=155922230)
293
- 6300 val loss 5.6700
294
- 6300 val perplexity 290.0232
295
- 6300 train 5.439530 (lr=1.4102e-05) (hash(x)=147514617)
296
- 4700 val loss 6.0449
297
- 4700 val perplexity 421.9507
298
- 4700 train 5.787765 (lr=2.1486e-05) (hash(x)=139398510)
299
- 6400 val loss 5.6652
300
- 6400 val perplexity 288.6526
301
- 6400 train 5.666666 (lr=1.3424e-05) (hash(x)=151604465)
302
- 6500 val loss 5.6557
303
- 6500 val perplexity 285.9176
304
- 4800 val loss 6.0293
305
- 4800 val perplexity 415.4446
306
- 6500 train 5.715902 (lr=1.2766e-05) (hash(x)=144515881)
307
- 4800 train 6.103447 (lr=2.0801e-05) (hash(x)=140893236)
308
- 6600 val loss 5.6484
309
- 6600 val perplexity 283.8284
310
- 6600 train 5.447026 (lr=1.2129e-05) (hash(x)=136948374)
311
- 4900 val loss 6.0135
312
- 4900 val perplexity 408.9137
313
- 4900 train 5.920414 (lr=2.0118e-05) (hash(x)=153747830)
314
- 6700 val loss 5.6471
315
- 6700 val perplexity 283.4608
316
- 6700 train 5.448362 (lr=1.1515e-05) (hash(x)=146268592)
317
- 5000 val loss 6.0050
318
- 5000 val perplexity 405.4520
319
- 5000 train 6.230835 (lr=1.9438e-05) (hash(x)=148919005)
320
- 6800 val loss 5.6381
321
- 6800 val perplexity 280.9316
322
- 6800 train 5.587202 (lr=1.0923e-05) (hash(x)=152676836)
323
- 5100 val loss 5.9941
324
- 5100 val perplexity 401.0715
325
- 5100 train 5.782193 (lr=1.8762e-05) (hash(x)=142281936)
326
- 6900 val loss 5.6313
327
- 6900 val perplexity 279.0120
328
- 6900 train 5.562898 (lr=1.0356e-05) (hash(x)=134657776)
329
- 5200 val loss 5.9957
330
- 5200 val perplexity 401.7111
331
- 5200 train 5.804429 (lr=1.8090e-05) (hash(x)=143162650)
332
- 7000 val loss 5.6291
333
- 7000 val perplexity 278.4016
334
- 7000 train 5.589009 (lr=9.8138e-06) (hash(x)=166721861)
335
- 5300 val loss 5.9733
336
- 5300 val perplexity 392.7957
337
- 5300 train 6.033432 (lr=1.7424e-05) (hash(x)=148546849)
338
- 7100 val loss 5.6279
339
- 7100 val perplexity 278.0805
340
- 7100 train 5.430983 (lr=9.2971e-06) (hash(x)=135496702)
341
- 5400 val loss 5.9609
342
- 5400 val perplexity 387.9548
343
- 5400 train 5.819170 (lr=1.6765e-05) (hash(x)=143492259)
344
- 7200 val loss 5.6235
345
- 7200 val perplexity 276.8535
346
- 7200 train 5.736270 (lr=8.8068e-06) (hash(x)=155567461)
347
- 5500 val loss 5.9559
348
- 5500 val perplexity 386.0132
349
- 5500 train 5.713650 (lr=1.6113e-05) (hash(x)=141023941)
350
- 7300 val loss 5.6167
351
- 7300 val perplexity 274.9700
352
- 7300 train 5.422538 (lr=8.3436e-06) (hash(x)=142803829)
353
- 5600 val loss 5.9542
354
- 5600 val perplexity 385.3673
355
- 5600 train 5.828371 (lr=1.5469e-05) (hash(x)=142065021)
356
- 7400 val loss 5.6160
357
- 7400 val perplexity 274.8009
358
- 7400 train 5.435742 (lr=7.9082e-06) (hash(x)=145294178)
359
- 5700 val loss 5.9355
360
- 5700 val perplexity 378.2421
361
- 5700 train 5.798361 (lr=1.4836e-05) (hash(x)=145749913)
362
- 7500 val loss 5.6122
363
- 7500 val perplexity 273.7481
364
- 7500 train 5.373355 (lr=7.5012e-06) (hash(x)=150573713)
365
- 5800 val loss 5.9275
366
- 5800 val perplexity 375.2310
367
- 5800 train 5.613351 (lr=1.4212e-05) (hash(x)=140035522)
368
- 7600 val loss 5.6107
369
- 7600 val perplexity 273.3234
370
- 7600 train 5.632406 (lr=7.1232e-06) (hash(x)=142771511)
371
- 5900 val loss 5.9241
372
- 5900 val perplexity 373.9262
373
- 5900 train 5.678686 (lr=1.3600e-05) (hash(x)=154752726)
374
- 7700 val loss 5.6042
375
- 7700 val perplexity 271.5583
376
- 7700 train 5.505773 (lr=6.7747e-06) (hash(x)=143602175)
377
- 6000 val loss 5.9181
378
- 6000 val perplexity 371.7014
379
- 6000 train 5.941476 (lr=1.3000e-05) (hash(x)=158715824)
380
- 7800 val loss 5.6010
381
- 7800 val perplexity 270.6951
382
- 7800 train 5.663585 (lr=6.4563e-06) (hash(x)=152379862)
383
- 6100 val loss 5.9064
384
- 6100 val perplexity 367.3962
385
- 6100 train 5.913001 (lr=1.2413e-05) (hash(x)=137413820)
386
- 7900 val loss 5.5999
387
- 7900 val perplexity 270.4090
388
- 7900 train 5.425761 (lr=6.1684e-06) (hash(x)=146655921)
389
- 6200 val loss 5.8991
390
- 6200 val perplexity 364.7029
391
- 6200 train 5.816253 (lr=1.1840e-05) (hash(x)=151507523)
392
- 8000 val loss 5.6012
393
- 8000 val perplexity 270.7527
394
- 8000 train 5.731220 (lr=5.9114e-06) (hash(x)=148262482)
395
- 6300 val loss 5.8957
396
- 6300 val perplexity 363.4883
397
- 6300 train 5.667901 (lr=1.1282e-05) (hash(x)=147514617)
398
- 8100 val loss 5.5923
399
- 8100 val perplexity 268.3542
400
- 8100 train 5.538779 (lr=5.6857e-06) (hash(x)=147683655)
401
- 6400 val loss 5.8926
402
- 6400 val perplexity 362.3358
403
- 6400 train 5.901004 (lr=1.0739e-05) (hash(x)=151604465)
404
- 8200 val loss 5.5916
405
- 8200 val perplexity 268.1735
406
- 8200 train 5.658997 (lr=5.4917e-06) (hash(x)=164975934)
407
- 6500 val loss 5.8813
408
- 6500 val perplexity 358.2624
409
- 6500 train 5.937473 (lr=1.0213e-05) (hash(x)=144515881)
410
- 8300 val loss 5.5882
411
- 8300 val perplexity 267.2640
412
- 8300 train 5.499679 (lr=5.3295e-06) (hash(x)=145300550)
413
- 6600 val loss 5.8753
414
- 6600 val perplexity 356.1367
415
- 6600 train 5.660358 (lr=9.7032e-06) (hash(x)=136948374)
416
- 8400 val loss 5.5866
417
- 8400 val perplexity 266.8259
418
- 8400 train 5.493410 (lr=5.1995e-06) (hash(x)=150679400)
419
- 6700 val loss 5.8745
420
- 6700 val perplexity 355.8448
421
- 6700 train 5.690690 (lr=9.2116e-06) (hash(x)=146268592)
422
- 8500 val loss 5.5882
423
- 8500 val perplexity 267.2431
424
- 8500 train 5.576384 (lr=5.1019e-06) (hash(x)=164109401)
425
- 6800 val loss 5.8652
426
- 6800 val perplexity 352.5601
427
- 6800 train 5.824735 (lr=8.7387e-06) (hash(x)=152676836)
428
- 8600 val loss 5.5841
429
- 8600 val perplexity 266.1479
430
- 8600 train 5.520129 (lr=5.0367e-06) (hash(x)=161036376)
431
- 6900 val loss 5.8585
432
- 6900 val perplexity 350.1941
433
- 6900 train 5.778625 (lr=8.2849e-06) (hash(x)=134657776)
434
- 8700 val loss 5.5798
435
- 8700 val perplexity 265.0147
436
- 8700 train 5.501758 (lr=5.0041e-06) (hash(x)=153828820)
437
- 8749 val loss 5.5778
438
- 8749 val perplexity 264.4979
439
- 7000 val loss 5.8552
440
- 7000 val perplexity 349.0465
441
- 7000 train 5.827059 (lr=7.8510e-06) (hash(x)=166721861)
442
- 7100 val loss 5.8537
443
- 7100 val perplexity 348.5362
444
- 7100 train 5.664058 (lr=7.4377e-06) (hash(x)=135496702)
445
- 7200 val loss 5.8496
446
- 7200 val perplexity 347.0883
447
- 7200 train 5.947722 (lr=7.0455e-06) (hash(x)=155567461)
448
- 7300 val loss 5.8428
449
- 7300 val perplexity 344.7538
450
- 7300 train 5.637713 (lr=6.6749e-06) (hash(x)=142803829)
451
- 7400 val loss 5.8403
452
- 7400 val perplexity 343.8778
453
- 7400 train 5.662089 (lr=6.3266e-06) (hash(x)=145294178)
454
- 7500 val loss 5.8383
455
- 7500 val perplexity 343.1886
456
- 7500 train 5.605683 (lr=6.0010e-06) (hash(x)=150573713)
457
- 7600 val loss 5.8350
458
- 7600 val perplexity 342.0600
459
- 7600 train 5.852287 (lr=5.6986e-06) (hash(x)=142771511)
460
- 7700 val loss 5.8286
461
- 7700 val perplexity 339.8656
462
- 7700 train 5.725527 (lr=5.4198e-06) (hash(x)=143602175)
463
- 7800 val loss 5.8250
464
- 7800 val perplexity 338.6624
465
- 7800 train 5.857786 (lr=5.1650e-06) (hash(x)=152379862)
466
- 7900 val loss 5.8238
467
- 7900 val perplexity 338.2581
468
- 7900 train 5.664768 (lr=4.9347e-06) (hash(x)=146655921)
469
- 8000 val loss 5.8242
470
- 8000 val perplexity 338.4018
471
- 8000 train 5.935122 (lr=4.7291e-06) (hash(x)=148262482)
472
- 8100 val loss 5.8166
473
- 8100 val perplexity 335.8375
474
- 8100 train 5.753369 (lr=4.5486e-06) (hash(x)=147683655)
475
- 8200 val loss 5.8155
476
- 8200 val perplexity 335.4515
477
- 8200 train 5.876520 (lr=4.3933e-06) (hash(x)=164975934)
478
- 8300 val loss 5.8120
479
- 8300 val perplexity 334.2799
480
- 8300 train 5.729677 (lr=4.2636e-06) (hash(x)=145300550)
481
- 8400 val loss 5.8100
482
- 8400 val perplexity 333.6333
483
- 8400 train 5.721456 (lr=4.1596e-06) (hash(x)=150679400)
484
- 8500 val loss 5.8102
485
- 8500 val perplexity 333.6947
486
- 8500 train 5.822725 (lr=4.0815e-06) (hash(x)=164109401)
487
- 8600 val loss 5.8071
488
- 8600 val perplexity 332.6415
489
- 8600 train 5.800099 (lr=4.0294e-06) (hash(x)=161036376)
490
- 8700 val loss 5.8029
491
- 8700 val perplexity 331.2553
492
- 8700 train 5.736811 (lr=4.0033e-06) (hash(x)=153828820)
493
- 8749 val loss 5.8012
494
- 8749 val perplexity 330.7012
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  max_steps: 8750
2
+ max_steps: 8750
3
+ 0 val loss 11.2831
4
+ 0 val perplexity 79466.0234
5
  0 val loss 11.2831
6
+ 0 val perplexity 79466.0234
7
+ 0 train 11.285278 (lr=7.0000e-08) (hash(x)=145830960)
8
+ 0 train 11.285278 (lr=6.0000e-08) (hash(x)=145830960)
9
+ 100 val loss 10.1647
10
+ 100 val perplexity 25971.0234
11
+ 100 train 10.198525 (lr=7.0700e-06) (hash(x)=144157453)
12
+ 100 val loss 10.2511
13
+ 100 val perplexity 28312.6055
14
+ 100 train 10.282244 (lr=6.0600e-06) (hash(x)=144157453)
15
+ 200 val loss 9.7081
16
+ 200 val perplexity 16450.9492
17
+ 200 train 9.747164 (lr=1.4070e-05) (hash(x)=146764602)
18
+ 200 val loss 9.8092
19
+ 200 val perplexity 18200.1836
20
+ 200 train 9.849621 (lr=1.2060e-05) (hash(x)=146764602)
21
+ 300 val loss 8.8063
22
+ 300 val perplexity 6676.1025
23
+ 300 train 8.721718 (lr=2.1070e-05) (hash(x)=140559124)
24
+ 300 val loss 9.0431
25
+ 300 val perplexity 8460.3369
26
+ 300 train 8.977057 (lr=1.8060e-05) (hash(x)=140559124)
27
+ 400 val loss 7.9303
28
+ 400 val perplexity 2780.2043
29
+ 400 train 8.263488 (lr=2.8070e-05) (hash(x)=166024176)
30
+ 400 val loss 8.1200
31
+ 400 val perplexity 3361.0332
32
+ 400 train 8.433102 (lr=2.4060e-05) (hash(x)=166024176)
33
+ 500 val loss 7.5507
34
+ 500 val perplexity 1902.1611
35
+ 500 train 7.518233 (lr=3.5000e-05) (hash(x)=161040668)
36
+ 500 val loss 7.6197
37
+ 500 val perplexity 2037.9108
38
+ 500 train 7.589498 (lr=3.0000e-05) (hash(x)=161040668)
39
+ 600 val loss 7.4388
40
+ 600 val perplexity 1700.7233
41
+ 600 train 7.468059 (lr=3.4989e-05) (hash(x)=148878990)
42
+ 600 val loss 7.4876
43
+ 600 val perplexity 1785.7812
44
+ 600 train 7.517530 (lr=2.9990e-05) (hash(x)=148878990)
45
+ 700 val loss 7.3485
46
+ 700 val perplexity 1553.8051
47
+ 700 train 7.329822 (lr=3.4954e-05) (hash(x)=150420695)
48
+ 700 val loss 7.4002
49
+ 700 val perplexity 1636.2662
50
+ 700 train 7.380228 (lr=2.9961e-05) (hash(x)=150420695)
51
+ 800 val loss 7.2750
52
+ 800 val perplexity 1443.7603
53
+ 800 train 7.251266 (lr=3.4897e-05) (hash(x)=151995229)
54
+ 800 val loss 7.3320
55
+ 800 val perplexity 1528.5052
56
+ 800 train 7.302983 (lr=2.9912e-05) (hash(x)=151995229)
57
+ 900 val loss 7.2145
58
+ 900 val perplexity 1358.9803
59
+ 900 train 6.968035 (lr=3.4818e-05) (hash(x)=139853932)
60
+ 900 val loss 7.2744
61
+ 900 val perplexity 1442.9172
62
+ 900 train 7.033135 (lr=2.9844e-05) (hash(x)=139853932)
63
+ 1000 val loss 7.1335
64
+ 1000 val perplexity 1253.2953
65
+ 1000 train 7.335016 (lr=3.4715e-05) (hash(x)=174104207)
66
+ 1000 val loss 7.2135
67
+ 1000 val perplexity 1357.5917
68
+ 1000 train 7.412142 (lr=2.9756e-05) (hash(x)=174104207)
69
+ 1100 val loss 7.0682
70
+ 1100 val perplexity 1174.0667
71
+ 1100 train 6.900183 (lr=3.4591e-05) (hash(x)=146275038)
72
+ 1100 val loss 7.1555
73
+ 1100 val perplexity 1281.1627
74
+ 1100 train 6.990688 (lr=2.9649e-05) (hash(x)=146275038)
75
+ 1200 val loss 7.0004
76
+ 1200 val perplexity 1097.0662
77
+ 1200 train 6.771210 (lr=3.4444e-05) (hash(x)=141403655)
78
+ 1200 val loss 7.1005
79
+ 1200 val perplexity 1212.6289
80
+ 1200 train 6.876398 (lr=2.9523e-05) (hash(x)=141403655)
81
+ 1300 val loss 6.9292
82
+ 1300 val perplexity 1021.7067
83
+ 1300 train 7.012178 (lr=3.4275e-05) (hash(x)=173675040)
84
+ 1300 val loss 7.0356
85
+ 1300 val perplexity 1136.3947
86
+ 1300 train 7.122135 (lr=2.9378e-05) (hash(x)=173675040)
87
+ 1400 val loss 6.8715
88
+ 1400 val perplexity 964.4235
89
+ 1400 train 6.844200 (lr=3.4084e-05) (hash(x)=162475906)
90
+ 1400 val loss 6.9720
91
+ 1400 val perplexity 1066.3026
92
+ 1400 train 6.941983 (lr=2.9215e-05) (hash(x)=162475906)
93
+ 1500 val loss 6.8163
94
+ 1500 val perplexity 912.6173
95
+ 1500 train 6.646946 (lr=3.3872e-05) (hash(x)=152785612)
96
+ 1500 val loss 6.9207
97
+ 1500 val perplexity 1013.0270
98
+ 1500 train 6.750599 (lr=2.9033e-05) (hash(x)=152785612)
99
+ 1600 val loss 6.7411
100
+ 1600 val perplexity 846.5029
101
+ 1600 train 6.695332 (lr=3.3638e-05) (hash(x)=151987383)
102
+ 1600 val loss 6.8430
103
+ 1600 val perplexity 937.3285
104
+ 1600 train 6.799621 (lr=2.8833e-05) (hash(x)=151987383)
105
+ 1700 val loss 6.6802
106
+ 1700 val perplexity 796.5001
107
+ 1700 val loss 6.7791
108
+ 1700 val perplexity 879.2971
109
+ 1700 train 6.662879 (lr=3.3384e-05) (hash(x)=143781605)
110
+ 1700 train 6.760727 (lr=2.8615e-05) (hash(x)=143781605)
111
+ 1800 val loss 6.6307
112
+ 1800 val perplexity 758.0203
113
+ 1800 val loss 6.7255
114
+ 1800 val perplexity 833.4037
115
+ 1800 train 6.614835 (lr=3.3109e-05) (hash(x)=151040203)
116
+ 1800 train 6.708007 (lr=2.8379e-05) (hash(x)=151040203)
117
+ 1900 val loss 6.5788
118
+ 1900 val perplexity 719.6393
119
+ 1900 val loss 6.6702
120
+ 1900 val perplexity 788.5310
121
+ 1900 train 6.482689 (lr=3.2814e-05) (hash(x)=149414572)
122
+ 1900 train 6.571493 (lr=2.8127e-05) (hash(x)=149414572)
123
+ 2000 val loss 6.6195
124
+ 2000 val perplexity 749.5918
125
+ 2000 val loss 6.5286
126
+ 2000 val perplexity 684.4636
127
+ 2000 train 6.576480 (lr=2.7857e-05) (hash(x)=156479674)
128
+ 2000 train 6.482908 (lr=3.2500e-05) (hash(x)=156479674)
129
+ 2100 val loss 6.4825
130
+ 2100 val perplexity 653.5756
131
+ 2100 val loss 6.5620
132
+ 2100 val perplexity 707.6946
133
+ 2100 train 6.345187 (lr=3.2166e-05) (hash(x)=137861481)
134
+ 2100 train 6.429942 (lr=2.7571e-05) (hash(x)=137861481)
135
+ 2200 val loss 6.4416
136
+ 2200 val perplexity 627.3801
137
+ 2200 val loss 6.5179
138
+ 2200 val perplexity 677.1461
139
+ 2200 train 6.499350 (lr=3.1813e-05) (hash(x)=155937443)
140
+ 2200 train 6.576139 (lr=2.7269e-05) (hash(x)=155937443)
141
+ 2300 val loss 6.4203
142
+ 2300 val perplexity 614.1733
143
+ 2300 val loss 6.4995
144
+ 2300 val perplexity 664.7992
145
+ 2300 train 6.101424 (lr=3.1443e-05) (hash(x)=131035715)
146
+ 2300 train 6.185195 (lr=2.6951e-05) (hash(x)=131035715)
147
+ 2400 val loss 6.3757
148
+ 2400 val perplexity 587.4105
149
+ 2400 val loss 6.4501
150
+ 2400 val perplexity 632.7535
151
+ 2400 train 6.374401 (lr=3.1054e-05) (hash(x)=146568981)
152
+ 2400 train 6.447587 (lr=2.6618e-05) (hash(x)=146568981)
153
+ 2500 val loss 6.3448
154
+ 2500 val perplexity 569.5289
155
+ 2500 val loss 6.4192
156
+ 2500 val perplexity 613.5393
157
+ 2500 train 6.317100 (lr=3.0649e-05) (hash(x)=163317586)
158
+ 2500 train 6.389849 (lr=2.6270e-05) (hash(x)=163317586)
159
+ 2600 val loss 6.3181
160
+ 2600 val perplexity 554.5211
161
+ 2600 val loss 6.3867
162
+ 2600 val perplexity 593.9120
163
+ 2600 train 6.141277 (lr=3.0227e-05) (hash(x)=144201060)
164
+ 2600 train 6.207941 (lr=2.5909e-05) (hash(x)=144201060)
165
+ 2700 val loss 6.2931
166
+ 2700 val perplexity 540.8275
167
+ 2700 val loss 6.3637
168
+ 2700 val perplexity 580.3699
169
+ 2700 train 6.193167 (lr=2.9789e-05) (hash(x)=141825701)
170
+ 2700 train 6.259410 (lr=2.5533e-05) (hash(x)=141825701)
171
+ 2800 val loss 6.2649
172
+ 2800 val perplexity 525.7846
173
+ 2800 val loss 6.3332
174
+ 2800 val perplexity 562.9761
175
+ 2800 train 6.384350 (lr=2.9336e-05) (hash(x)=160561627)
176
+ 2800 train 6.452592 (lr=2.5145e-05) (hash(x)=160561627)
177
+ 2900 val loss 6.2419
178
+ 2900 val perplexity 513.8532
179
+ 2900 val loss 6.3035
180
+ 2900 val perplexity 546.5074
181
+ 2900 train 6.189714 (lr=2.8868e-05) (hash(x)=151758176)
182
+ 2900 train 6.255837 (lr=2.4744e-05) (hash(x)=151758176)
183
+ 3000 val loss 6.2147
184
+ 3000 val perplexity 500.0573
185
+ 3000 val loss 6.2803
186
+ 3000 val perplexity 533.9685
187
+ 3000 train 6.226311 (lr=2.8386e-05) (hash(x)=155815751)
188
+ 3000 train 6.286588 (lr=2.4331e-05) (hash(x)=155815751)
189
+ 3100 val loss 6.2016
190
+ 3100 val perplexity 493.5247
191
+ 3100 val loss 6.2672
192
+ 3100 val perplexity 527.0150
193
+ 3100 train 5.858310 (lr=2.7891e-05) (hash(x)=142307043)
194
+ 3100 train 5.926589 (lr=2.3906e-05) (hash(x)=142307043)
195
+ 3200 val loss 6.1807
196
+ 3200 val perplexity 483.3375
197
+ 3200 train 6.293739 (lr=2.7383e-05) (hash(x)=156310690)
198
+ 3200 val loss 6.2449
199
+ 3200 val perplexity 515.3918
200
+ 3200 train 6.361027 (lr=2.3471e-05) (hash(x)=156310690)
201
+ 3300 val loss 6.2148
202
+ 3300 val perplexity 500.1102
203
+ 3300 val loss 6.1538
204
+ 3300 val perplexity 470.4862
205
+ 3300 train 6.216784 (lr=2.3026e-05) (hash(x)=151344506)
206
+ 3300 train 6.156497 (lr=2.6864e-05) (hash(x)=151344506)
207
+ 3400 val loss 6.2043
208
+ 3400 val perplexity 494.8538
209
+ 3400 val loss 6.1388
210
+ 3400 val perplexity 463.4761
211
+ 3400 train 6.240551 (lr=2.2572e-05) (hash(x)=162911881)
212
+ 3400 train 6.174221 (lr=2.6333e-05) (hash(x)=162911881)
213
+ 3500 val loss 6.1855
214
+ 3500 val perplexity 485.6743
215
+ 3500 val loss 6.1274
216
+ 3500 val perplexity 458.2211
217
+ 3500 train 6.123961 (lr=2.2108e-05) (hash(x)=142889971)
218
+ 3500 train 6.059972 (lr=2.5793e-05) (hash(x)=142889971)
219
+ 3600 val loss 6.1675
220
+ 3600 val perplexity 476.9710
221
+ 3600 val loss 6.1050
222
+ 3600 val perplexity 448.1018
223
+ 3600 train 6.160964 (lr=2.1637e-05) (hash(x)=149470354)
224
+ 3600 train 6.092852 (lr=2.5243e-05) (hash(x)=149470354)
225
+ 3700 val loss 6.1523
226
+ 3700 val perplexity 469.7756
227
+ 3700 val loss 6.0880
228
+ 3700 val perplexity 440.5436
229
+ 3700 train 6.096142 (lr=2.1158e-05) (hash(x)=154194821)
230
+ 3700 train 6.030409 (lr=2.4684e-05) (hash(x)=154194821)
231
+ 3800 val loss 6.1387
232
+ 3800 val perplexity 463.4522
233
+ 3800 val loss 6.0779
234
+ 3800 val perplexity 436.1298
235
+ 3800 train 6.069734 (lr=2.0672e-05) (hash(x)=148885848)
236
+ 3800 train 6.011635 (lr=2.4117e-05) (hash(x)=148885848)
237
+ 3900 val loss 6.1210
238
+ 3900 val perplexity 455.3399
239
+ 3900 val loss 6.0583
240
+ 3900 val perplexity 427.6662
241
+ 3900 train 6.032857 (lr=2.0180e-05) (hash(x)=148308484)
242
+ 3900 train 5.971268 (lr=2.3543e-05) (hash(x)=148308484)
243
+ 4000 val loss 6.1042
244
+ 4000 val perplexity 447.7170
245
+ 4000 val loss 6.0434
246
+ 4000 val perplexity 421.3136
247
+ 4000 train 5.941088 (lr=1.9683e-05) (hash(x)=139828564)
248
+ 4000 train 5.881995 (lr=2.2963e-05) (hash(x)=139828564)
249
+ 4100 val loss 6.0952
250
+ 4100 val perplexity 443.7090
251
+ 4100 val loss 6.0326
252
+ 4100 val perplexity 416.8078
253
+ 4100 train 6.002620 (lr=1.9181e-05) (hash(x)=139981997)
254
+ 4100 train 5.940226 (lr=2.2378e-05) (hash(x)=139981997)
255
+ 4200 val loss 6.0805
256
+ 4200 val perplexity 437.2441
257
+ 4200 train 6.236762 (lr=1.8675e-05) (hash(x)=150738447)
258
+ 4200 val loss 6.0164
259
+ 4200 val perplexity 410.0989
260
+ 4200 train 6.170493 (lr=2.1788e-05) (hash(x)=150738447)
261
+ 4300 val loss 6.0659
262
+ 4300 val perplexity 430.9070
263
+ 4300 train 5.848683 (lr=1.8166e-05) (hash(x)=142198107)
264
+ 4300 val loss 6.0055
265
+ 4300 val perplexity 405.6398
266
+ 4300 train 5.784900 (lr=2.1194e-05) (hash(x)=142198107)
267
+ 4400 val loss 6.0590
268
+ 4400 val perplexity 427.9448
269
+ 4400 train 5.930000 (lr=1.7655e-05) (hash(x)=142731201)
270
+ 4400 val loss 5.9963
271
+ 4400 val perplexity 401.9518
272
+ 4400 train 5.867516 (lr=2.0598e-05) (hash(x)=142731201)
273
+ 4500 val loss 6.0420
274
+ 4500 val perplexity 420.7414
275
+ 4500 train 6.054802 (lr=1.7142e-05) (hash(x)=154814426)
276
+ 4500 val loss 5.9799
277
+ 4500 val perplexity 395.4100
278
+ 4500 train 5.997611 (lr=1.9999e-05) (hash(x)=154814426)
279
+ 4600 val loss 6.0305
280
+ 4600 val perplexity 415.9094
281
+ 4600 train 5.998842 (lr=1.6629e-05) (hash(x)=155922230)
282
+ 4600 val loss 5.9672
283
+ 4600 val perplexity 390.4033
284
+ 4600 train 5.934934 (lr=1.9400e-05) (hash(x)=155922230)
285
+ 4700 val loss 6.0251
286
+ 4700 val perplexity 413.6869
287
+ 4700 train 5.771210 (lr=1.6114e-05) (hash(x)=139398510)
288
+ 4700 val loss 5.9632
289
+ 4700 val perplexity 388.8658
290
+ 4700 train 5.711072 (lr=1.8800e-05) (hash(x)=139398510)
291
+ 4800 val loss 6.0116
292
+ 4800 val perplexity 408.1422
293
+ 4800 train 6.080204 (lr=1.5601e-05) (hash(x)=140893236)
294
+ 4800 val loss 5.9466
295
+ 4800 val perplexity 382.4628
296
+ 4800 train 6.004228 (lr=1.8201e-05) (hash(x)=140893236)
297
+ 4900 val loss 5.9966
298
+ 4900 val perplexity 402.0549
299
+ 4900 val loss 5.9329
300
+ 4900 val perplexity 377.2361
301
+ 4900 train 5.900732 (lr=1.5089e-05) (hash(x)=153747830)
302
+ 4900 train 5.829324 (lr=1.7604e-05) (hash(x)=153747830)
303
+ 5000 val loss 5.9886
304
+ 5000 val perplexity 398.8546
305
+ 5000 val loss 5.9249
306
+ 5000 val perplexity 374.2589
307
+ 5000 train 6.216745 (lr=1.4579e-05) (hash(x)=148919005)
308
+ 5000 train 6.162242 (lr=1.7009e-05) (hash(x)=148919005)
309
+ 5100 val loss 5.9818
310
+ 5100 val perplexity 396.1589
311
+ 5100 val loss 5.9171
312
+ 5100 val perplexity 371.3396
313
+ 5100 train 5.771548 (lr=1.4071e-05) (hash(x)=142281936)
314
+ 5100 train 5.707696 (lr=1.6417e-05) (hash(x)=142281936)
315
+ 5200 val loss 5.9826
316
+ 5200 val perplexity 396.4724
317
+ 5200 val loss 5.9180
318
+ 5200 val perplexity 371.6624
319
+ 5200 train 5.790337 (lr=1.3568e-05) (hash(x)=143162650)
320
+ 5200 train 5.723374 (lr=1.5829e-05) (hash(x)=143162650)
321
+ 5300 val loss 5.9621
322
+ 5300 val perplexity 388.4120
323
+ 5300 val loss 5.8968
324
+ 5300 val perplexity 363.8678
325
+ 5300 train 6.020540 (lr=1.3068e-05) (hash(x)=148546849)
326
+ 5300 train 5.958957 (lr=1.5246e-05) (hash(x)=148546849)
327
+ 5400 val loss 5.9524
328
+ 5400 val perplexity 384.6609
329
+ 5400 val loss 5.8871
330
+ 5400 val perplexity 360.3507
331
+ 5400 train 5.812173 (lr=1.2573e-05) (hash(x)=143492259)
332
+ 5400 train 5.746771 (lr=1.4669e-05) (hash(x)=143492259)
333
+ 5500 val loss 5.9471
334
+ 5500 val perplexity 382.6500
335
+ 5500 val loss 5.8826
336
+ 5500 val perplexity 358.7458
337
+ 5500 train 5.701526 (lr=1.2085e-05) (hash(x)=141023941)
338
+ 5500 train 5.634879 (lr=1.4099e-05) (hash(x)=141023941)
339
+ 5600 val loss 5.9471
340
+ 5600 val perplexity 382.6252
341
+ 5600 val loss 5.8808
342
+ 5600 val perplexity 358.0882
343
+ 5600 train 5.817373 (lr=1.1602e-05) (hash(x)=142065021)
344
+ 5600 train 5.754811 (lr=1.3536e-05) (hash(x)=142065021)
345
+ 5700 val loss 5.9307
346
+ 5700 val perplexity 376.4344
347
+ 5700 val loss 5.8645
348
+ 5700 val perplexity 352.3100
349
+ 5700 train 5.791829 (lr=1.1127e-05) (hash(x)=145749913)
350
+ 5700 train 5.731421 (lr=1.2981e-05) (hash(x)=145749913)
351
+ 5800 val loss 5.8578
352
+ 5800 val perplexity 349.9652
353
+ 5800 val loss 5.9241
354
+ 5800 val perplexity 373.9264
355
+ 5800 train 5.537057 (lr=1.2436e-05) (hash(x)=140035522)
356
+ 5800 train 5.610334 (lr=1.0659e-05) (hash(x)=140035522)
357
+ 5900 val loss 5.8524
358
+ 5900 val perplexity 348.0614
359
+ 5900 val loss 5.9200
360
+ 5900 val perplexity 372.4236
361
+ 5900 train 5.614779 (lr=1.1900e-05) (hash(x)=154752726)
362
+ 5900 train 5.682038 (lr=1.0200e-05) (hash(x)=154752726)
363
+ 6000 val loss 5.8496
364
+ 6000 val perplexity 347.1107
365
+ 6000 val loss 5.9171
366
+ 6000 val perplexity 371.3166
367
+ 6000 train 5.874280 (lr=1.1375e-05) (hash(x)=158715824)
368
+ 6000 train 5.940979 (lr=9.7500e-06) (hash(x)=158715824)
369
+ 6100 val loss 5.8378
370
+ 6100 val perplexity 343.0087
371
+ 6100 val loss 5.9049
372
+ 6100 val perplexity 366.8322
373
+ 6100 train 5.851554 (lr=1.0861e-05) (hash(x)=137413820)
374
+ 6100 train 5.914152 (lr=9.3098e-06) (hash(x)=137413820)
375
+ 6200 val loss 5.8303
376
+ 6200 val perplexity 340.4680
377
+ 6200 train 5.751894 (lr=1.0360e-05) (hash(x)=151507523)
378
+ 6200 val loss 5.8989
379
+ 6200 val perplexity 364.6396
380
+ 6200 train 5.823660 (lr=8.8800e-06) (hash(x)=151507523)
381
+ 6300 val loss 5.8296
382
+ 6300 val perplexity 340.2184
383
+ 6300 train 5.600849 (lr=9.8715e-06) (hash(x)=147514617)
384
+ 6300 val loss 5.8977
385
+ 6300 val perplexity 364.1924
386
+ 6300 train 5.668419 (lr=8.4613e-06) (hash(x)=147514617)
387
+ 6400 val loss 5.8260
388
+ 6400 val perplexity 338.9898
389
+ 6400 train 5.836993 (lr=9.3966e-06) (hash(x)=151604465)
390
+ 6400 val loss 5.8942
391
+ 6400 val perplexity 362.9248
392
+ 6400 train 5.905121 (lr=8.0542e-06) (hash(x)=151604465)
393
+ 6500 val loss 5.8152
394
+ 6500 val perplexity 335.3742
395
+ 6500 train 5.872994 (lr=8.9359e-06) (hash(x)=144515881)
396
+ 6500 val loss 5.8838
397
+ 6500 val perplexity 359.1624
398
+ 6500 train 5.940874 (lr=7.6594e-06) (hash(x)=144515881)
399
+ 6600 val loss 5.8092
400
+ 6600 val perplexity 333.3445
401
+ 6600 train 5.602404 (lr=8.4903e-06) (hash(x)=136948374)
402
+ 6600 val loss 5.8793
403
+ 6600 val perplexity 357.5739
404
+ 6600 train 5.664325 (lr=7.2774e-06) (hash(x)=136948374)
405
+ 6700 val loss 5.8089
406
+ 6700 val perplexity 333.2539
407
+ 6700 train 5.620818 (lr=8.0602e-06) (hash(x)=146268592)
408
+ 6700 val loss 5.8775
409
+ 6700 val perplexity 356.8992
410
+ 6700 train 5.695412 (lr=6.9087e-06) (hash(x)=146268592)
411
+ 6800 val loss 5.8009
412
+ 6800 val perplexity 330.5903
413
+ 6800 train 5.757844 (lr=7.6463e-06) (hash(x)=152676836)
414
+ 6800 val loss 5.8705
415
+ 6800 val perplexity 354.4432
416
+ 6800 train 5.824782 (lr=6.5540e-06) (hash(x)=152676836)
417
+ 6900 val loss 5.7935
418
+ 6900 val perplexity 328.1473
419
+ 6900 train 5.711613 (lr=7.2493e-06) (hash(x)=134657776)
420
+ 6900 val loss 5.8644
421
+ 6900 val perplexity 352.2704
422
+ 6900 train 5.785072 (lr=6.2137e-06) (hash(x)=134657776)
423
+ 7000 val loss 5.7921
424
+ 7000 val perplexity 327.6984
425
+ 7000 train 5.767048 (lr=6.8697e-06) (hash(x)=166721861)
426
+ 7000 val loss 5.8625
427
+ 7000 val perplexity 351.6026
428
+ 7000 train 5.835593 (lr=5.8883e-06) (hash(x)=166721861)
429
+ 7100 val loss 5.7891
430
+ 7100 val perplexity 326.7293
431
+ 7100 train 5.594516 (lr=6.5080e-06) (hash(x)=135496702)
432
+ 7100 val loss 5.8591
433
+ 7100 val perplexity 350.4164
434
+ 7100 train 5.666256 (lr=5.5783e-06) (hash(x)=135496702)
435
+ 7200 val loss 5.7857
436
+ 7200 val perplexity 325.5949
437
+ 7200 train 5.894931 (lr=6.1648e-06) (hash(x)=155567461)
438
+ 7200 val loss 5.8553
439
+ 7200 val perplexity 349.0879
440
+ 7200 train 5.959052 (lr=5.2841e-06) (hash(x)=155567461)
441
+ 7300 val loss 5.7788
442
+ 7300 val perplexity 323.3828
443
+ 7300 train 5.581634 (lr=5.8405e-06) (hash(x)=142803829)
444
+ 7300 val loss 5.8498
445
+ 7300 val perplexity 347.1809
446
+ 7300 train 5.649696 (lr=5.0062e-06) (hash(x)=142803829)
447
+ 7400 val loss 5.7771
448
+ 7400 val perplexity 322.8163
449
+ 7400 train 5.600485 (lr=5.5357e-06) (hash(x)=145294178)
450
+ 7400 val loss 5.8478
451
+ 7400 val perplexity 346.4585
452
+ 7400 train 5.666378 (lr=4.7449e-06) (hash(x)=145294178)
453
+ 7500 val loss 5.7761
454
+ 7500 val perplexity 322.5064
455
+ 7500 train 5.544384 (lr=5.2508e-06) (hash(x)=150573713)
456
+ 7500 val loss 5.8469
457
+ 7500 val perplexity 346.1439
458
+ 7500 train 5.614801 (lr=4.5007e-06) (hash(x)=150573713)
459
+ 7600 val loss 5.7728
460
+ 7600 val perplexity 321.4253
461
+ 7600 train 5.784178 (lr=4.9862e-06) (hash(x)=142771511)
462
+ 7600 val loss 5.8438
463
+ 7600 val perplexity 345.0785
464
+ 7600 train 5.856913 (lr=4.2739e-06) (hash(x)=142771511)
465
+ 7700 val loss 5.7669
466
+ 7700 val perplexity 319.5430
467
+ 7700 train 5.666105 (lr=4.7423e-06) (hash(x)=143602175)
468
+ 7700 val loss 5.8380
469
+ 7700 val perplexity 343.1030
470
+ 7700 train 5.733759 (lr=4.0648e-06) (hash(x)=143602175)
471
+ 7800 val loss 5.7637
472
+ 7800 val perplexity 318.5268
473
+ 7800 train 5.804039 (lr=4.5194e-06) (hash(x)=152379862)
474
+ 7800 val loss 5.8356
475
+ 7800 val perplexity 342.2685
476
+ 7800 train 5.870350 (lr=3.8738e-06) (hash(x)=152379862)
477
+ 7900 val loss 5.7638
478
+ 7900 val perplexity 318.5619
479
+ 7900 train 5.605504 (lr=4.3179e-06) (hash(x)=146655921)
480
+ 7900 val loss 5.8343
481
+ 7900 val perplexity 341.8145
482
+ 7900 train 5.679904 (lr=3.7010e-06) (hash(x)=146655921)
483
+ 8000 val loss 5.7638
484
+ 8000 val perplexity 318.5710
485
+ 8000 train 5.879199 (lr=4.1380e-06) (hash(x)=148262482)
486
+ 8000 val loss 5.8350
487
+ 8000 val perplexity 342.0742
488
+ 8000 train 5.947587 (lr=3.5468e-06) (hash(x)=148262482)
489
+ 8100 val loss 5.7564
490
+ 8100 val perplexity 316.1968
491
+ 8100 train 5.696403 (lr=3.9800e-06) (hash(x)=147683655)
492
+ 8100 val loss 5.8278
493
+ 8100 val perplexity 339.6119
494
+ 8100 train 5.762560 (lr=3.4114e-06) (hash(x)=147683655)
495
+ 8200 val loss 5.8263
496
+ 8200 val perplexity 339.1142
497
+ 8200 train 5.888067 (lr=3.2950e-06) (hash(x)=164975934)
498
+ 8200 val loss 5.7551
499
+ 8200 val perplexity 315.8080
500
+ 8200 train 5.818132 (lr=3.8442e-06) (hash(x)=164975934)
501
+ 8300 val loss 5.8229
502
+ 8300 val perplexity 337.9619
503
+ 8300 train 5.741323 (lr=3.1977e-06) (hash(x)=145300550)
504
+ 8300 val loss 5.7525
505
+ 8300 val perplexity 314.9815
506
+ 8300 train 5.667071 (lr=3.7307e-06) (hash(x)=145300550)
507
+ 8400 val loss 5.8219
508
+ 8400 val perplexity 337.6240
509
+ 8400 train 5.738000 (lr=3.1197e-06) (hash(x)=150679400)
510
+ 8400 val loss 5.7511
511
+ 8400 val perplexity 314.5375
512
+ 8400 train 5.666170 (lr=3.6397e-06) (hash(x)=150679400)
513
+ 8500 val loss 5.8224
514
+ 8500 val perplexity 337.7816
515
+ 8500 train 5.843013 (lr=3.0611e-06) (hash(x)=164109401)
516
+ 8500 val loss 5.7511
517
+ 8500 val perplexity 314.5462
518
+ 8500 train 5.767133 (lr=3.5713e-06) (hash(x)=164109401)
519
+ 8600 val loss 5.8202
520
+ 8600 val perplexity 337.0385
521
+ 8600 train 5.823380 (lr=3.0220e-06) (hash(x)=161036376)
522
+ 8600 val loss 5.7481
523
+ 8600 val perplexity 313.5803
524
+ 8600 train 5.730533 (lr=3.5257e-06) (hash(x)=161036376)
525
+ 8700 val loss 5.8156
526
+ 8700 val perplexity 335.5001
527
+ 8700 train 5.750320 (lr=3.0024e-06) (hash(x)=153828820)
528
+ 8700 val loss 5.7438
529
+ 8700 val perplexity 312.2529
530
+ 8700 train 5.674262 (lr=3.5029e-06) (hash(x)=153828820)
531
+ 8749 val loss 5.8145
532
+ 8749 val perplexity 335.1370
533
+ 8749 val loss 5.7424
534
+ 8749 val perplexity 311.8204
attention_kindselective_n_heads4_seed1344/model_08749.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:883f9a6eef276c05beb952634c52bbc6e83e399d790fd34ec66cd002ab63319e
3
  size 92843394
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05d476d1b1967d89cb078db0cb8dde81e6a201530c206f968474e1bd60409cec
3
  size 92843394
attention_kindselective_n_heads4_seed1344/optimizer_08749.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11a8da35040e3dcb60b93f1d0e06cd32200bd13e83ffd949d3626688fcd9dd31
3
  size 179406214
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bd43931f26ce626d3345e60608b25bfbdb809636a0d94748839af3204037042
3
  size 179406214