andrew-healey commited on
Commit
e6b1362
·
verified ·
1 Parent(s): 97fd97b

Upload folder using huggingface_hub

Browse files
logs/fix_1_latent_mask/1_latent_mask_lr_30e-4_n_latent_masks_2_seed_1339/args.json CHANGED
@@ -1 +1 @@
1
- {"hellaswag": true, "attention_kind": "selective", "log_dir": "logs/fix_1_latent_mask/1_latent_mask_lr_30e-4_n_latent_masks_2_seed_1339", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 12, "n_embd": 264, "head_dim": 22, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 4375, "warmup_steps": 250, "group": "fix_1_latent_mask", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1339, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "n_latent_masks", "selection_head_linear_combo_scale": 1.0, "disable_selection_head_linear_combo_bias": false, "assert_latent_matches_no_head": false, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": false, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 32, "total_batch_size": 131072, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": 2, "init_latent_masks_to_identity": true, "latent_mask_scale": null, "latent_mask_sigmoid": false, "S_layernorm": false, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 0.003, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "1_latent_mask_lr_30e-4_n_latent_masks_2"}
 
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "logs/fix_1_latent_mask/1_latent_mask_lr_30e-4_n_latent_masks_2_seed_1339", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 12, "n_embd": 264, "head_dim": 22, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 4375, "warmup_steps": 250, "group": "fix_1_latent_mask", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1339, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "n_latent_masks", "selection_head_linear_combo_scale": 1.0, "disable_selection_head_linear_combo_bias": false, "assert_latent_matches_no_head": false, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 32, "total_batch_size": 8192, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": 2, "init_latent_masks_to_identity": true, "latent_mask_scale": null, "latent_mask_sigmoid": false, "S_layernorm": false, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 0.003, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "1_latent_mask_lr_30e-4_n_latent_masks_2"}
logs/fix_1_latent_mask/1_latent_mask_lr_30e-4_n_latent_masks_2_seed_1339/dataloader_04374.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6031fd3e2855a036f7a5531cc24555aabd1115f9dd6618b8b2ca6f55279ef0b2
3
  size 964
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c8d49024f3ab594241ab14ad69140ca5f3ff3f7a1401f0f49b94f9bdaf4a9a8
3
  size 964
logs/fix_1_latent_mask/1_latent_mask_lr_30e-4_n_latent_masks_2_seed_1339/log2.txt CHANGED
@@ -1,548 +1,529 @@
1
  max_steps: 4375
2
- 0 val loss 11.3204
3
- 0 val perplexity 82489.9453
4
- 0 hella 0.2564
5
- 0 train 11.303893 (lr=4.1958e-06) (hash(x)=44992657)
6
- 10 train 9.940910 (lr=4.6154e-05) (hash(x)=33468021)
7
- 20 train 9.380983 (lr=8.8112e-05) (hash(x)=40941803)
8
- 30 train 8.686651 (lr=1.3007e-04) (hash(x)=36715902)
9
- 40 train 7.949743 (lr=1.7203e-04) (hash(x)=32710993)
10
- 50 train 7.709585 (lr=2.1399e-04) (hash(x)=43839896)
11
- 60 train 7.750377 (lr=2.5594e-04) (hash(x)=40675468)
12
- 70 train 7.430070 (lr=2.9790e-04) (hash(x)=34592631)
13
- 80 train 7.412242 (lr=3.3986e-04) (hash(x)=44444845)
14
- 90 train 7.184612 (lr=3.8182e-04) (hash(x)=41965258)
15
- 100 val loss 7.0697
16
- 100 val perplexity 1175.7603
17
- 100 train 6.922948 (lr=4.2378e-04) (hash(x)=41284750)
18
- 110 train 6.848353 (lr=4.6573e-04) (hash(x)=41118734)
19
- 120 train 6.713826 (lr=5.0769e-04) (hash(x)=37537547)
20
- 130 train 6.551871 (lr=5.4965e-04) (hash(x)=43625179)
21
- 140 train 6.511169 (lr=5.9161e-04) (hash(x)=41940760)
22
- 150 train 6.638185 (lr=6.3357e-04) (hash(x)=39210431)
23
- 160 train 6.539367 (lr=6.7552e-04) (hash(x)=41128294)
24
- 170 train 6.416253 (lr=7.1748e-04) (hash(x)=41590227)
25
- 180 train 6.346042 (lr=7.5944e-04) (hash(x)=38084871)
26
- 190 train 6.271191 (lr=8.0140e-04) (hash(x)=34534333)
27
- 200 val loss 6.1653
28
- 200 val perplexity 475.9663
29
- 200 train 6.201835 (lr=8.4336e-04) (hash(x)=41614299)
30
- 210 train 6.090825 (lr=8.8531e-04) (hash(x)=35551597)
31
- 220 train 6.017484 (lr=9.2727e-04) (hash(x)=44615432)
32
- 230 train 6.147698 (lr=9.6923e-04) (hash(x)=37078768)
33
- 240 train 5.839825 (lr=1.0112e-03) (hash(x)=40644078)
34
- 250 hella 0.2413
35
- 250 train 5.891753 (lr=1.0531e-03) (hash(x)=44636066)
36
- 260 train 5.799315 (lr=1.0951e-03) (hash(x)=43247952)
37
- 270 train 5.729215 (lr=1.1371e-03) (hash(x)=37989370)
38
- 280 train 5.646903 (lr=1.1790e-03) (hash(x)=36229730)
39
- 290 train 5.571360 (lr=1.2210e-03) (hash(x)=35032996)
40
- 300 val loss 5.6987
41
- 300 val perplexity 298.4753
42
- 300 train 5.445377 (lr=1.2629e-03) (hash(x)=33868636)
43
- 310 train 5.386279 (lr=1.3049e-03) (hash(x)=39782705)
44
- 320 train 5.487435 (lr=1.3469e-03) (hash(x)=43525765)
45
- 330 train 5.582733 (lr=1.3888e-03) (hash(x)=38074926)
46
- 340 train 5.619603 (lr=1.4308e-03) (hash(x)=50125702)
47
- 350 train 5.602553 (lr=1.4727e-03) (hash(x)=35062222)
48
- 360 train 5.550775 (lr=1.5147e-03) (hash(x)=39671001)
49
- 370 train 5.557224 (lr=1.5566e-03) (hash(x)=35748364)
50
- 380 train 5.366469 (lr=1.5986e-03) (hash(x)=39962666)
51
- 390 train 5.336782 (lr=1.6406e-03) (hash(x)=39213360)
52
- 400 val loss 5.3712
53
- 400 val perplexity 215.1239
54
- 400 train 5.349317 (lr=1.6825e-03) (hash(x)=37919584)
55
- 410 train 5.302530 (lr=1.7245e-03) (hash(x)=42001527)
56
- 420 train 5.284353 (lr=1.7664e-03) (hash(x)=33940005)
57
- 430 train 5.321514 (lr=1.8084e-03) (hash(x)=41749217)
58
- 440 train 5.215183 (lr=1.8503e-03) (hash(x)=45219739)
59
- 450 train 5.205154 (lr=1.8923e-03) (hash(x)=38515090)
60
- 460 train 5.117400 (lr=1.9343e-03) (hash(x)=37518607)
61
- 470 train 4.942443 (lr=1.9762e-03) (hash(x)=46830716)
62
- 480 train 5.065008 (lr=2.0182e-03) (hash(x)=49262598)
63
- 490 train 4.857032 (lr=2.0601e-03) (hash(x)=41370559)
64
- 500 val loss 5.1671
65
- 500 val perplexity 175.4098
66
- 500 hella 0.2358
67
- 500 train 4.938737 (lr=2.1021e-03) (hash(x)=35237459)
68
- 510 train 5.045574 (lr=2.1441e-03) (hash(x)=35070579)
69
- 520 train 5.109946 (lr=2.1860e-03) (hash(x)=37355110)
70
- 530 train 5.230884 (lr=2.2280e-03) (hash(x)=39398044)
71
- 540 train 5.126976 (lr=2.2699e-03) (hash(x)=39156676)
72
- 550 train 5.072031 (lr=2.3119e-03) (hash(x)=38553909)
73
- 560 train 5.116980 (lr=2.3538e-03) (hash(x)=36555371)
74
- 570 train 5.048121 (lr=2.3958e-03) (hash(x)=42048991)
75
- 580 train 4.984550 (lr=2.4378e-03) (hash(x)=37359399)
76
- 590 train 5.011451 (lr=2.4797e-03) (hash(x)=39254663)
77
- 600 val loss 4.9725
78
- 600 val perplexity 144.3879
79
- 600 train 4.988099 (lr=2.5217e-03) (hash(x)=38982539)
80
- 610 train 4.910698 (lr=2.5636e-03) (hash(x)=35859145)
81
- 620 train 4.878326 (lr=2.6056e-03) (hash(x)=35593542)
82
- 630 train 4.783599 (lr=2.6476e-03) (hash(x)=39124418)
83
- 640 train 4.788286 (lr=2.6895e-03) (hash(x)=39107374)
84
- 650 train 4.714919 (lr=2.7315e-03) (hash(x)=34529151)
85
- 660 train 4.648001 (lr=2.7734e-03) (hash(x)=39483931)
86
- 670 train 4.635495 (lr=2.8154e-03) (hash(x)=43008010)
87
- 680 train 4.546259 (lr=2.8573e-03) (hash(x)=33469328)
88
- 690 train 4.580680 (lr=2.8993e-03) (hash(x)=35025253)
89
- 700 val loss 4.8377
90
- 700 val perplexity 126.1745
91
- 700 train 4.567236 (lr=2.9413e-03) (hash(x)=43280188)
92
- 710 train 4.833117 (lr=2.9832e-03) (hash(x)=40999263)
93
- 720 train 4.825602 (lr=3.0000e-03) (hash(x)=38304742)
94
- 730 train 4.826076 (lr=2.9999e-03) (hash(x)=34705188)
95
- 740 train 4.781812 (lr=2.9997e-03) (hash(x)=40028964)
96
- 750 hella 0.2519
97
- 750 train 4.730214 (lr=2.9994e-03) (hash(x)=39051591)
98
- 760 train 4.778199 (lr=2.9990e-03) (hash(x)=41286198)
99
- 770 train 4.711543 (lr=2.9985e-03) (hash(x)=44512967)
100
- 780 train 4.766361 (lr=2.9979e-03) (hash(x)=45171627)
101
- 790 train 4.725800 (lr=2.9972e-03) (hash(x)=46345673)
102
- 800 val loss 4.7074
103
- 800 val perplexity 110.7644
104
- 800 train 4.673951 (lr=2.9964e-03) (hash(x)=35529545)
105
- 810 train 4.578589 (lr=2.9955e-03) (hash(x)=42321793)
106
- 820 train 4.589988 (lr=2.9945e-03) (hash(x)=49327569)
107
- 830 train 4.659699 (lr=2.9934e-03) (hash(x)=41325057)
108
- 840 train 4.601190 (lr=2.9922e-03) (hash(x)=37305614)
109
- 850 train 4.560716 (lr=2.9909e-03) (hash(x)=38000800)
110
- 860 train 4.683061 (lr=2.9896e-03) (hash(x)=44047697)
111
- 870 train 4.672626 (lr=2.9881e-03) (hash(x)=41576206)
112
- 880 train 4.652940 (lr=2.9865e-03) (hash(x)=44685458)
113
- 890 train 4.629519 (lr=2.9848e-03) (hash(x)=40398597)
114
- 900 val loss 4.6103
115
- 900 val perplexity 100.5094
116
- 900 train 4.567309 (lr=2.9830e-03) (hash(x)=39312673)
117
- 910 train 4.583805 (lr=2.9811e-03) (hash(x)=38881875)
118
- 920 train 4.621392 (lr=2.9792e-03) (hash(x)=39140166)
119
- 930 train 4.512815 (lr=2.9771e-03) (hash(x)=39075781)
120
- 940 train 4.527113 (lr=2.9749e-03) (hash(x)=46887071)
121
- 950 train 4.519114 (lr=2.9726e-03) (hash(x)=39740603)
122
- 960 train 4.572857 (lr=2.9703e-03) (hash(x)=39590268)
123
- 970 train 4.519058 (lr=2.9678e-03) (hash(x)=43580052)
124
- 980 train 4.456566 (lr=2.9652e-03) (hash(x)=40226735)
125
- 990 train 4.563897 (lr=2.9626e-03) (hash(x)=40723896)
126
- 1000 val loss 4.5191
127
- 1000 val perplexity 91.7522
128
- 1000 hella 0.2466
129
- 1000 train 4.516461 (lr=2.9598e-03) (hash(x)=39408050)
130
- 1010 train 4.710083 (lr=2.9570e-03) (hash(x)=32247950)
131
- 1020 train 4.602966 (lr=2.9540e-03) (hash(x)=39049938)
132
- 1030 train 4.559230 (lr=2.9510e-03) (hash(x)=37453736)
133
- 1040 train 4.474250 (lr=2.9478e-03) (hash(x)=36649666)
134
- 1050 train 4.537192 (lr=2.9446e-03) (hash(x)=41404097)
135
- 1060 train 4.520281 (lr=2.9412e-03) (hash(x)=40701599)
136
- 1070 train 4.539452 (lr=2.9378e-03) (hash(x)=41787393)
137
- 1080 train 4.430599 (lr=2.9343e-03) (hash(x)=42325465)
138
- 1090 train 4.457718 (lr=2.9307e-03) (hash(x)=45018925)
139
- 1100 val loss 4.4571
140
- 1100 val perplexity 86.2368
141
- 1100 train 4.489385 (lr=2.9270e-03) (hash(x)=53751982)
142
- 1110 train 4.418241 (lr=2.9231e-03) (hash(x)=43118376)
143
- 1120 train 4.436633 (lr=2.9192e-03) (hash(x)=41489942)
144
- 1130 train 4.461071 (lr=2.9152e-03) (hash(x)=48020487)
145
- 1140 train 4.315529 (lr=2.9112e-03) (hash(x)=43266072)
146
- 1150 train 4.515281 (lr=2.9070e-03) (hash(x)=41790132)
147
- 1160 train 4.497178 (lr=2.9027e-03) (hash(x)=33739090)
148
- 1170 train 4.459414 (lr=2.8983e-03) (hash(x)=43885030)
149
- 1180 train 4.411491 (lr=2.8939e-03) (hash(x)=39046714)
150
- 1190 train 4.439357 (lr=2.8893e-03) (hash(x)=44387023)
151
- 1200 val loss 4.3930
152
- 1200 val perplexity 80.8832
153
- 1200 train 4.305639 (lr=2.8847e-03) (hash(x)=36667141)
154
- 1210 train 4.379218 (lr=2.8800e-03) (hash(x)=45487179)
155
- 1220 train 4.352225 (lr=2.8751e-03) (hash(x)=43456413)
156
- 1230 train 4.290226 (lr=2.8702e-03) (hash(x)=42473632)
157
- 1240 train 4.367168 (lr=2.8652e-03) (hash(x)=42966315)
158
- 1250 hella 0.2512
159
- 1250 train 4.203194 (lr=2.8601e-03) (hash(x)=42930524)
160
- 1260 train 4.365462 (lr=2.8550e-03) (hash(x)=43467676)
161
- 1270 train 4.285242 (lr=2.8497e-03) (hash(x)=40872580)
162
- 1280 train 4.357602 (lr=2.8443e-03) (hash(x)=40619157)
163
- 1290 train 4.372588 (lr=2.8389e-03) (hash(x)=44406165)
164
- 1300 val loss 4.3511
165
- 1300 val perplexity 77.5662
166
- 1300 train 4.334260 (lr=2.8333e-03) (hash(x)=40808029)
167
- 1310 train 4.512774 (lr=2.8277e-03) (hash(x)=46125736)
168
- 1320 train 4.272034 (lr=2.8220e-03) (hash(x)=41698487)
169
- 1330 train 4.313764 (lr=2.8162e-03) (hash(x)=39302878)
170
- 1340 train 4.417560 (lr=2.8103e-03) (hash(x)=41826369)
171
- 1350 train 4.235077 (lr=2.8044e-03) (hash(x)=37061413)
172
- 1360 train 4.356169 (lr=2.7983e-03) (hash(x)=40770974)
173
- 1370 train 4.275344 (lr=2.7922e-03) (hash(x)=38738718)
174
- 1380 train 4.272817 (lr=2.7860e-03) (hash(x)=34961558)
175
- 1390 train 4.318449 (lr=2.7797e-03) (hash(x)=38559342)
176
- 1400 val loss 4.3350
177
- 1400 val perplexity 76.3239
178
- 1400 train 4.275109 (lr=2.7733e-03) (hash(x)=38840020)
179
- 1410 train 4.432845 (lr=2.7668e-03) (hash(x)=44448100)
180
- 1420 train 4.489516 (lr=2.7603e-03) (hash(x)=39199838)
181
- 1430 train 4.286215 (lr=2.7536e-03) (hash(x)=41669873)
182
- 1440 train 4.411998 (lr=2.7469e-03) (hash(x)=53253836)
183
- 1450 train 4.326722 (lr=2.7401e-03) (hash(x)=41320631)
184
- 1460 train 4.249620 (lr=2.7333e-03) (hash(x)=48462803)
185
- 1470 train 4.294695 (lr=2.7263e-03) (hash(x)=42356864)
186
- 1480 train 4.341355 (lr=2.7193e-03) (hash(x)=41118068)
187
- 1490 train 4.251918 (lr=2.7121e-03) (hash(x)=37242854)
188
- 1500 val loss 4.2882
189
- 1500 val perplexity 72.8381
190
- 1500 hella 0.2500
191
- 1500 train 4.308035 (lr=2.7050e-03) (hash(x)=40571031)
192
- 1510 train 4.257050 (lr=2.6977e-03) (hash(x)=35384608)
193
- 1520 train 4.283376 (lr=2.6903e-03) (hash(x)=35341056)
194
- 1530 train 4.175711 (lr=2.6829e-03) (hash(x)=36471779)
195
- 1540 train 4.209238 (lr=2.6754e-03) (hash(x)=40356153)
196
- 1550 train 4.246115 (lr=2.6678e-03) (hash(x)=40133377)
197
- 1560 train 4.177083 (lr=2.6602e-03) (hash(x)=33722160)
198
- 1570 train 4.331501 (lr=2.6525e-03) (hash(x)=39555295)
199
- 1580 train 4.230425 (lr=2.6447e-03) (hash(x)=35950666)
200
- 1590 train 4.383768 (lr=2.6368e-03) (hash(x)=36528535)
201
- 1600 val loss 4.2654
202
- 1600 val perplexity 71.1916
203
- 1600 train 4.272776 (lr=2.6289e-03) (hash(x)=42613017)
204
- 1610 train 4.416926 (lr=2.6208e-03) (hash(x)=42845965)
205
- 1620 train 4.291862 (lr=2.6128e-03) (hash(x)=36137074)
206
- 1630 train 4.256735 (lr=2.6046e-03) (hash(x)=37560569)
207
- 1640 train 4.353688 (lr=2.5964e-03) (hash(x)=43680614)
208
- 1650 train 4.208091 (lr=2.5881e-03) (hash(x)=37281218)
209
- 1660 train 4.245766 (lr=2.5797e-03) (hash(x)=36165222)
210
- 1670 train 4.199383 (lr=2.5713e-03) (hash(x)=50222513)
211
- 1680 train 4.081939 (lr=2.5628e-03) (hash(x)=35239173)
212
- 1690 train 4.138228 (lr=2.5542e-03) (hash(x)=40287349)
213
- 1700 val loss 4.2736
214
- 1700 val perplexity 71.7819
215
- 1700 train 4.327542 (lr=2.5455e-03) (hash(x)=37444780)
216
- 1710 train 4.305676 (lr=2.5368e-03) (hash(x)=38962723)
217
- 1720 train 4.309755 (lr=2.5281e-03) (hash(x)=33882305)
218
- 1730 train 4.292196 (lr=2.5192e-03) (hash(x)=36501540)
219
- 1740 train 4.289313 (lr=2.5103e-03) (hash(x)=38723266)
220
- 1750 hella 0.2545
221
- 1750 train 4.217596 (lr=2.5014e-03) (hash(x)=38772789)
222
- 1760 train 4.297316 (lr=2.4924e-03) (hash(x)=37707898)
223
- 1770 train 4.154685 (lr=2.4833e-03) (hash(x)=30529327)
224
- 1780 train 4.239169 (lr=2.4741e-03) (hash(x)=37885464)
225
- 1790 train 4.259258 (lr=2.4649e-03) (hash(x)=39013967)
226
- 1800 val loss 4.2242
227
- 1800 val perplexity 68.3232
228
- 1800 train 4.295778 (lr=2.4556e-03) (hash(x)=34906955)
229
- 1810 train 4.166261 (lr=2.4463e-03) (hash(x)=40790682)
230
- 1820 train 4.038406 (lr=2.4369e-03) (hash(x)=39912223)
231
- 1830 train 4.163924 (lr=2.4275e-03) (hash(x)=39304486)
232
- 1840 train 4.136629 (lr=2.4180e-03) (hash(x)=37247975)
233
- 1850 train 4.128059 (lr=2.4084e-03) (hash(x)=41340614)
234
- 1860 train 4.283616 (lr=2.3988e-03) (hash(x)=39924627)
235
- 1870 train 4.268031 (lr=2.3891e-03) (hash(x)=40235993)
236
- 1880 train 4.161170 (lr=2.3794e-03) (hash(x)=37693476)
237
- 1890 train 4.274020 (lr=2.3696e-03) (hash(x)=40449864)
238
- 1900 val loss 4.1992
239
- 1900 val perplexity 66.6310
240
- 1900 train 4.173097 (lr=2.3598e-03) (hash(x)=37907749)
241
- 1910 train 4.154132 (lr=2.3499e-03) (hash(x)=34547901)
242
- 1920 train 4.186179 (lr=2.3400e-03) (hash(x)=37771788)
243
- 1930 train 4.057622 (lr=2.3300e-03) (hash(x)=36049454)
244
- 1940 train 4.149772 (lr=2.3200e-03) (hash(x)=29564336)
245
- 1950 train 4.137430 (lr=2.3099e-03) (hash(x)=41689281)
246
- 1960 train 4.059596 (lr=2.2998e-03) (hash(x)=41976699)
247
- 1970 train 4.248756 (lr=2.2896e-03) (hash(x)=41538621)
248
- 1980 train 4.142632 (lr=2.2793e-03) (hash(x)=36490902)
249
- 1990 train 4.170862 (lr=2.2691e-03) (hash(x)=42739315)
250
- 2000 val loss 4.1790
251
- 2000 val perplexity 65.3017
252
- 2000 hella 0.2500
253
- 2000 train 4.184011 (lr=2.2588e-03) (hash(x)=38600074)
254
- 2010 train 4.199542 (lr=2.2484e-03) (hash(x)=34555488)
255
- 2020 train 4.211539 (lr=2.2380e-03) (hash(x)=37227345)
256
- 2030 train 4.165526 (lr=2.2275e-03) (hash(x)=38172386)
257
- 2040 train 4.281218 (lr=2.2170e-03) (hash(x)=39340786)
258
- 2050 train 4.116986 (lr=2.2065e-03) (hash(x)=37237298)
259
- 2060 train 4.358812 (lr=2.1959e-03) (hash(x)=35894782)
260
- 2070 train 4.180064 (lr=2.1853e-03) (hash(x)=44161861)
261
- 2080 train 4.195827 (lr=2.1746e-03) (hash(x)=38062558)
262
- 2090 train 4.043560 (lr=2.1639e-03) (hash(x)=38608301)
263
- 2100 val loss 4.1557
264
- 2100 val perplexity 63.7947
265
- 2100 train 4.074593 (lr=2.1532e-03) (hash(x)=47611537)
266
- 2110 train 4.072067 (lr=2.1424e-03) (hash(x)=38076651)
267
- 2120 train 4.084739 (lr=2.1316e-03) (hash(x)=39870893)
268
- 2130 train 4.230171 (lr=2.1208e-03) (hash(x)=48123321)
269
- 2140 train 4.264887 (lr=2.1099e-03) (hash(x)=44124977)
270
- 2150 train 4.164749 (lr=2.0990e-03) (hash(x)=42181374)
271
- 2160 train 4.191967 (lr=2.0881e-03) (hash(x)=41837878)
272
- 2170 train 4.202684 (lr=2.0771e-03) (hash(x)=42161375)
273
- 2180 train 4.175346 (lr=2.0661e-03) (hash(x)=41792986)
274
- 2190 train 4.142838 (lr=2.0550e-03) (hash(x)=41872416)
275
- 2200 val loss 4.1354
276
- 2200 val perplexity 62.5162
277
- 2200 train 4.115455 (lr=2.0440e-03) (hash(x)=37395985)
278
- 2210 train 4.170252 (lr=2.0329e-03) (hash(x)=40616096)
279
- 2220 train 4.071719 (lr=2.0217e-03) (hash(x)=40118423)
280
- 2230 train 3.985840 (lr=2.0106e-03) (hash(x)=39641586)
281
- 2240 train 4.077491 (lr=1.9994e-03) (hash(x)=34720116)
282
- 2250 hella 0.2494
283
- 2250 train 4.016000 (lr=1.9882e-03) (hash(x)=43641508)
284
- 2260 train 4.039526 (lr=1.9770e-03) (hash(x)=31179786)
285
- 2270 train 4.010214 (lr=1.9657e-03) (hash(x)=42721932)
286
- 2280 train 4.338384 (lr=1.9544e-03) (hash(x)=38474505)
287
- 2290 train 4.162931 (lr=1.9431e-03) (hash(x)=41398545)
288
- 2300 val loss 4.1174
289
- 2300 val perplexity 61.4012
290
- 2300 train 4.089173 (lr=1.9318e-03) (hash(x)=44131094)
291
- 2310 train 4.172026 (lr=1.9205e-03) (hash(x)=41667948)
292
- 2320 train 4.246173 (lr=1.9091e-03) (hash(x)=36339232)
293
- 2330 train 4.127896 (lr=1.8977e-03) (hash(x)=41630644)
294
- 2340 train 4.212961 (lr=1.8863e-03) (hash(x)=41278955)
295
- 2350 train 4.086136 (lr=1.8749e-03) (hash(x)=39491976)
296
- 2360 train 4.156052 (lr=1.8635e-03) (hash(x)=41771238)
297
- 2370 train 4.251866 (lr=1.8520e-03) (hash(x)=45374570)
298
- 2380 train 4.125002 (lr=1.8406e-03) (hash(x)=41419267)
299
- 2390 train 4.160855 (lr=1.8291e-03) (hash(x)=38869169)
300
- 2400 val loss 4.1071
301
- 2400 val perplexity 60.7679
302
- 2400 train 4.062453 (lr=1.8176e-03) (hash(x)=38619293)
303
- 2410 train 3.891639 (lr=1.8061e-03) (hash(x)=41600240)
304
- 2420 train 4.010066 (lr=1.7946e-03) (hash(x)=40891045)
305
- 2430 train 4.003143 (lr=1.7830e-03) (hash(x)=41023249)
306
- 2440 train 3.973526 (lr=1.7715e-03) (hash(x)=33813452)
307
- 2450 train 3.921864 (lr=1.7600e-03) (hash(x)=38464119)
308
- 2460 train 3.820446 (lr=1.7484e-03) (hash(x)=40699982)
309
- 2470 train 3.830823 (lr=1.7368e-03) (hash(x)=38254854)
310
- 2480 train 4.116926 (lr=1.7253e-03) (hash(x)=42736069)
311
- 2490 train 4.133172 (lr=1.7137e-03) (hash(x)=41928525)
312
- 2500 val loss 4.0853
313
- 2500 val perplexity 59.4626
314
- 2500 hella 0.2567
315
- 2500 train 4.082916 (lr=1.7021e-03) (hash(x)=45864011)
316
- 2510 train 4.097414 (lr=1.6906e-03) (hash(x)=41380960)
317
- 2520 train 4.093293 (lr=1.6790e-03) (hash(x)=38366545)
318
- 2530 train 4.100535 (lr=1.6674e-03) (hash(x)=41045176)
319
- 2540 train 4.124891 (lr=1.6558e-03) (hash(x)=43435705)
320
- 2550 train 4.079135 (lr=1.6442e-03) (hash(x)=40693090)
321
- 2560 train 3.995791 (lr=1.6326e-03) (hash(x)=40502478)
322
- 2570 train 4.026785 (lr=1.6210e-03) (hash(x)=38610920)
323
- 2580 train 3.954050 (lr=1.6094e-03) (hash(x)=38333499)
324
- 2590 train 4.030530 (lr=1.5979e-03) (hash(x)=41982736)
325
- 2600 val loss 4.0690
326
- 2600 val perplexity 58.5005
327
- 2600 train 4.003514 (lr=1.5863e-03) (hash(x)=37724702)
328
- 2610 train 3.948205 (lr=1.5747e-03) (hash(x)=39564630)
329
- 2620 train 3.892067 (lr=1.5632e-03) (hash(x)=38465803)
330
- 2630 train 3.818472 (lr=1.5516e-03) (hash(x)=36955007)
331
- 2640 train 3.851274 (lr=1.5400e-03) (hash(x)=32994568)
332
- 2650 train 4.032062 (lr=1.5285e-03) (hash(x)=41165765)
333
- 2660 train 4.045706 (lr=1.5170e-03) (hash(x)=42934000)
334
- 2670 train 4.253387 (lr=1.5054e-03) (hash(x)=42726316)
335
- 2680 train 4.133718 (lr=1.4939e-03) (hash(x)=35880318)
336
- 2690 train 4.012588 (lr=1.4824e-03) (hash(x)=41747329)
337
- 2700 val loss 4.0599
338
- 2700 val perplexity 57.9673
339
- 2700 train 4.142472 (lr=1.4709e-03) (hash(x)=40259630)
340
- 2710 train 4.258617 (lr=1.4594e-03) (hash(x)=39895798)
341
- 2720 train 3.989928 (lr=1.4480e-03) (hash(x)=36146683)
342
- 2730 train 3.972654 (lr=1.4365e-03) (hash(x)=36181984)
343
- 2740 train 3.946752 (lr=1.4251e-03) (hash(x)=43700349)
344
- 2750 hella 0.2556
345
- 2750 train 4.017509 (lr=1.4137e-03) (hash(x)=41610597)
346
- 2760 train 4.008912 (lr=1.4023e-03) (hash(x)=31183639)
347
- 2770 train 3.999880 (lr=1.3909e-03) (hash(x)=37722489)
348
- 2780 train 3.943001 (lr=1.3795e-03) (hash(x)=47290688)
349
- 2790 train 3.977911 (lr=1.3682e-03) (hash(x)=41205574)
350
- 2800 val loss 4.0418
351
- 2800 val perplexity 56.9290
352
- 2800 train 3.830600 (lr=1.3569e-03) (hash(x)=40257962)
353
- 2810 train 3.803396 (lr=1.3456e-03) (hash(x)=39529014)
354
- 2820 train 3.824431 (lr=1.3343e-03) (hash(x)=42244749)
355
- 2830 train 3.881585 (lr=1.3230e-03) (hash(x)=32820090)
356
- 2840 train 3.837623 (lr=1.3118e-03) (hash(x)=40315769)
357
- 2850 train 4.083408 (lr=1.3006e-03) (hash(x)=41524462)
358
- 2860 train 4.074898 (lr=1.2894e-03) (hash(x)=38365734)
359
- 2870 train 4.210594 (lr=1.2783e-03) (hash(x)=37682602)
360
- 2880 train 4.055043 (lr=1.2671e-03) (hash(x)=39162991)
361
- 2890 train 3.912293 (lr=1.2560e-03) (hash(x)=33316384)
362
- 2900 val loss 4.0159
363
- 2900 val perplexity 55.4750
364
- 2900 train 3.959939 (lr=1.2450e-03) (hash(x)=37271132)
365
- 2910 train 4.157994 (lr=1.2339e-03) (hash(x)=35586242)
366
- 2920 train 4.043862 (lr=1.2229e-03) (hash(x)=33320586)
367
- 2930 train 4.057598 (lr=1.2119e-03) (hash(x)=43531361)
368
- 2940 train 3.941150 (lr=1.2010e-03) (hash(x)=37368286)
369
- 2950 train 3.975514 (lr=1.1901e-03) (hash(x)=40363394)
370
- 2960 train 3.941284 (lr=1.1792e-03) (hash(x)=42253792)
371
- 2970 train 3.884914 (lr=1.1684e-03) (hash(x)=38072598)
372
- 2980 train 3.956899 (lr=1.1576e-03) (hash(x)=41470557)
373
- 2990 train 3.886922 (lr=1.1468e-03) (hash(x)=42600033)
374
- 3000 val loss 4.0172
375
- 3000 val perplexity 55.5463
376
- 3000 hella 0.2537
377
- 3000 train 3.838809 (lr=1.1361e-03) (hash(x)=46890983)
378
- 3010 train 3.709950 (lr=1.1254e-03) (hash(x)=37986759)
379
- 3020 train 3.832775 (lr=1.1147e-03) (hash(x)=36270703)
380
- 3030 train 4.097351 (lr=1.1041e-03) (hash(x)=38228599)
381
- 3040 train 3.943844 (lr=1.0935e-03) (hash(x)=44344296)
382
- 3050 train 4.034949 (lr=1.0830e-03) (hash(x)=39962297)
383
- 3060 train 4.010700 (lr=1.0725e-03) (hash(x)=39817394)
384
- 3070 train 4.012198 (lr=1.0620e-03) (hash(x)=41763868)
385
- 3080 train 4.094848 (lr=1.0516e-03) (hash(x)=42343051)
386
- 3090 train 4.091245 (lr=1.0412e-03) (hash(x)=39204893)
387
- 3100 val loss 3.9969
388
- 3100 val perplexity 54.4278
389
- 3100 train 3.994044 (lr=1.0309e-03) (hash(x)=39353599)
390
- 3110 train 4.062894 (lr=1.0207e-03) (hash(x)=36118073)
391
- 3120 train 3.766075 (lr=1.0104e-03) (hash(x)=44070271)
392
- 3130 train 3.790561 (lr=1.0002e-03) (hash(x)=43368151)
393
- 3140 train 3.865196 (lr=9.9011e-04) (hash(x)=40612203)
394
- 3150 train 4.003762 (lr=9.8002e-04) (hash(x)=39598447)
395
- 3160 train 4.051315 (lr=9.6999e-04) (hash(x)=43897682)
396
- 3170 train 4.039204 (lr=9.6000e-04) (hash(x)=39029243)
397
- 3180 train 4.044245 (lr=9.5007e-04) (hash(x)=38586684)
398
- 3190 train 3.998305 (lr=9.4019e-04) (hash(x)=39299115)
399
- 3200 val loss 3.9762
400
- 3200 val perplexity 53.3127
401
- 3200 train 4.022316 (lr=9.3036e-04) (hash(x)=41064949)
402
- 3210 train 4.014362 (lr=9.2058e-04) (hash(x)=41752822)
403
- 3220 train 4.016970 (lr=9.1085e-04) (hash(x)=37484805)
404
- 3230 train 3.982487 (lr=9.0118e-04) (hash(x)=40514919)
405
- 3240 train 4.037061 (lr=8.9157e-04) (hash(x)=42440343)
406
- 3250 hella 0.2550
407
- 3250 train 3.935954 (lr=8.8201e-04) (hash(x)=32464136)
408
- 3260 train 3.959235 (lr=8.7251e-04) (hash(x)=47270558)
409
- 3270 train 3.889747 (lr=8.6307e-04) (hash(x)=39307546)
410
- 3280 train 3.930236 (lr=8.5368e-04) (hash(x)=40410299)
411
- 3290 train 3.926274 (lr=8.4435e-04) (hash(x)=40541497)
412
- 3300 val loss 3.9705
413
- 3300 val perplexity 53.0091
414
- 3300 train 3.824443 (lr=8.3508e-04) (hash(x)=38089139)
415
- 3310 train 3.862606 (lr=8.2588e-04) (hash(x)=42006576)
416
- 3320 train 3.952976 (lr=8.1673e-04) (hash(x)=40422556)
417
- 3330 train 3.748334 (lr=8.0764e-04) (hash(x)=39021643)
418
- 3340 train 4.004063 (lr=7.9862e-04) (hash(x)=40434305)
419
- 3350 train 4.036251 (lr=7.8966e-04) (hash(x)=40833559)
420
- 3360 train 4.031414 (lr=7.8076e-04) (hash(x)=40045231)
421
- 3370 train 3.999085 (lr=7.7192e-04) (hash(x)=34668317)
422
- 3380 train 3.993289 (lr=7.6315e-04) (hash(x)=42738568)
423
- 3390 train 4.092020 (lr=7.5445e-04) (hash(x)=38980166)
424
- 3400 val loss 3.9489
425
- 3400 val perplexity 51.8785
426
- 3400 train 3.999529 (lr=7.4581e-04) (hash(x)=43878176)
427
- 3410 train 4.042040 (lr=7.3724e-04) (hash(x)=38201991)
428
- 3420 train 4.022738 (lr=7.2874e-04) (hash(x)=34905889)
429
- 3430 train 4.024003 (lr=7.2030e-04) (hash(x)=41680405)
430
- 3440 train 3.977474 (lr=7.1193e-04) (hash(x)=39948184)
431
- 3450 train 4.024815 (lr=7.0363e-04) (hash(x)=33623728)
432
- 3460 train 3.896141 (lr=6.9541e-04) (hash(x)=41027647)
433
- 3470 train 3.934937 (lr=6.8725e-04) (hash(x)=42779482)
434
- 3480 train 3.948349 (lr=6.7916e-04) (hash(x)=41579288)
435
- 3490 train 3.804666 (lr=6.7114e-04) (hash(x)=37174846)
436
- 3500 val loss 3.9463
437
- 3500 val perplexity 51.7422
438
- 3500 hella 0.2534
439
- 3500 train 3.678488 (lr=6.6320e-04) (hash(x)=37082090)
440
- 3510 train 3.878827 (lr=6.5533e-04) (hash(x)=43131340)
441
- 3520 train 3.796272 (lr=6.4753e-04) (hash(x)=42457250)
442
- 3530 train 4.027472 (lr=6.3981e-04) (hash(x)=39504492)
443
- 3540 train 3.974875 (lr=6.3216e-04) (hash(x)=36225997)
444
- 3550 train 4.101677 (lr=6.2458e-04) (hash(x)=40912087)
445
- 3560 train 3.939329 (lr=6.1708e-04) (hash(x)=41545925)
446
- 3570 train 4.029837 (lr=6.0966e-04) (hash(x)=32553193)
447
- 3580 train 3.873727 (lr=6.0231e-04) (hash(x)=38799856)
448
- 3590 train 3.916789 (lr=5.9504e-04) (hash(x)=40772075)
449
- 3600 val loss 3.9267
450
- 3600 val perplexity 50.7374
451
- 3600 train 3.962950 (lr=5.8785e-04) (hash(x)=39299903)
452
- 3610 train 4.007216 (lr=5.8074e-04) (hash(x)=39503247)
453
- 3620 train 3.897359 (lr=5.7370e-04) (hash(x)=38730668)
454
- 3630 train 3.918308 (lr=5.6675e-04) (hash(x)=40503276)
455
- 3640 train 3.930182 (lr=5.5987e-04) (hash(x)=36504573)
456
- 3650 train 3.950698 (lr=5.5308e-04) (hash(x)=44903075)
457
- 3660 train 3.859430 (lr=5.4636e-04) (hash(x)=33800148)
458
- 3670 train 3.764808 (lr=5.3973e-04) (hash(x)=39404541)
459
- 3680 train 3.707870 (lr=5.3318e-04) (hash(x)=44777432)
460
- 3690 train 3.808076 (lr=5.2671e-04) (hash(x)=45649913)
461
- 3700 val loss 3.9297
462
- 3700 val perplexity 50.8933
463
- 3700 train 3.931644 (lr=5.2033e-04) (hash(x)=33812537)
464
- 3710 train 4.085416 (lr=5.1402e-04) (hash(x)=36161222)
465
- 3720 train 3.959755 (lr=5.0780e-04) (hash(x)=43690616)
466
- 3730 train 3.968621 (lr=5.0167e-04) (hash(x)=35462313)
467
- 3740 train 4.264380 (lr=4.9562e-04) (hash(x)=44674254)
468
- 3750 hella 0.2507
469
- 3750 train 3.940989 (lr=4.8965e-04) (hash(x)=44276297)
470
- 3760 train 3.988477 (lr=4.8377e-04) (hash(x)=38492178)
471
- 3770 train 3.990553 (lr=4.7798e-04) (hash(x)=44347531)
472
- 3780 train 3.908350 (lr=4.7227e-04) (hash(x)=33792059)
473
- 3790 train 4.004879 (lr=4.6665e-04) (hash(x)=38549318)
474
- 3800 val loss 3.9107
475
- 3800 val perplexity 49.9337
476
- 3800 train 3.882101 (lr=4.6112e-04) (hash(x)=41437448)
477
- 3810 train 3.782731 (lr=4.5567e-04) (hash(x)=36656768)
478
- 3820 train 3.947555 (lr=4.5031e-04) (hash(x)=46387893)
479
- 3830 train 3.944655 (lr=4.4504e-04) (hash(x)=44312129)
480
- 3840 train 3.869424 (lr=4.3986e-04) (hash(x)=40341896)
481
- 3850 train 3.916161 (lr=4.3477e-04) (hash(x)=43589160)
482
- 3860 train 3.896981 (lr=4.2977e-04) (hash(x)=40388945)
483
- 3870 train 3.794569 (lr=4.2486e-04) (hash(x)=33336679)
484
- 3880 train 3.891421 (lr=4.2004e-04) (hash(x)=37643311)
485
- 3890 train 3.916653 (lr=4.1530e-04) (hash(x)=38272927)
486
- 3900 val loss 3.9035
487
- 3900 val perplexity 49.5755
488
- 3900 train 3.856800 (lr=4.1066e-04) (hash(x)=41925748)
489
- 3910 train 3.950039 (lr=4.0611e-04) (hash(x)=38377253)
490
- 3920 train 3.955739 (lr=4.0166e-04) (hash(x)=41616611)
491
- 3930 train 3.880027 (lr=3.9729e-04) (hash(x)=32301827)
492
- 3940 train 3.931345 (lr=3.9302e-04) (hash(x)=47697363)
493
- 3950 train 3.887247 (lr=3.8884e-04) (hash(x)=37867767)
494
- 3960 train 3.886584 (lr=3.8475e-04) (hash(x)=37187295)
495
- 3970 train 3.943678 (lr=3.8076e-04) (hash(x)=41952752)
496
- 3980 train 3.911164 (lr=3.7685e-04) (hash(x)=38358660)
497
- 3990 train 3.775970 (lr=3.7305e-04) (hash(x)=40207878)
498
- 4000 val loss 3.8982
499
- 4000 val perplexity 49.3118
500
- 4000 hella 0.2570
501
- 4000 train 3.867509 (lr=3.6933e-04) (hash(x)=39134015)
502
- 4010 train 3.935813 (lr=3.6572e-04) (hash(x)=38313135)
503
- 4020 train 3.867489 (lr=3.6219e-04) (hash(x)=40710513)
504
- 4030 train 3.996215 (lr=3.5876e-04) (hash(x)=35530165)
505
- 4040 train 3.918967 (lr=3.5543e-04) (hash(x)=39494066)
506
- 4050 train 3.910345 (lr=3.5219e-04) (hash(x)=40861777)
507
- 4060 train 3.892280 (lr=3.4905e-04) (hash(x)=36151939)
508
- 4070 train 3.857571 (lr=3.4600e-04) (hash(x)=39008241)
509
- 4080 train 3.938872 (lr=3.4305e-04) (hash(x)=40395746)
510
- 4090 train 3.977077 (lr=3.4019e-04) (hash(x)=38855480)
511
- 4100 val loss 3.8855
512
- 4100 val perplexity 48.6893
513
- 4100 train 3.914841 (lr=3.3744e-04) (hash(x)=45013254)
514
- 4110 train 3.919357 (lr=3.3477e-04) (hash(x)=39934818)
515
- 4120 train 3.921833 (lr=3.3221e-04) (hash(x)=33417705)
516
- 4130 train 3.863236 (lr=3.2974e-04) (hash(x)=41267978)
517
- 4140 train 3.852715 (lr=3.2737e-04) (hash(x)=38961609)
518
- 4150 train 3.853776 (lr=3.2510e-04) (hash(x)=42173878)
519
- 4160 train 3.820437 (lr=3.2292e-04) (hash(x)=36867405)
520
- 4170 train 3.800231 (lr=3.2085e-04) (hash(x)=38991954)
521
- 4180 train 3.863340 (lr=3.1887e-04) (hash(x)=39032592)
522
- 4190 train 3.837803 (lr=3.1699e-04) (hash(x)=61329402)
523
- 4200 val loss 3.8853
524
- 4200 val perplexity 48.6805
525
- 4200 train 3.849986 (lr=3.1520e-04) (hash(x)=40455200)
526
- 4210 train 3.879632 (lr=3.1352e-04) (hash(x)=40913417)
527
- 4220 train 3.915502 (lr=3.1193e-04) (hash(x)=38809704)
528
- 4230 train 3.896493 (lr=3.1044e-04) (hash(x)=39560204)
529
- 4240 train 3.899051 (lr=3.0905e-04) (hash(x)=41496549)
530
- 4250 hella 0.2570
531
- 4250 train 3.935833 (lr=3.0776e-04) (hash(x)=33994768)
532
- 4260 train 3.919878 (lr=3.0657e-04) (hash(x)=39658687)
533
- 4270 train 3.800713 (lr=3.0548e-04) (hash(x)=42583064)
534
- 4280 train 3.863995 (lr=3.0449e-04) (hash(x)=40635811)
535
- 4290 train 3.765610 (lr=3.0359e-04) (hash(x)=35313841)
536
- 4300 val loss 3.8823
537
- 4300 val perplexity 48.5372
538
- 4300 train 3.945179 (lr=3.0280e-04) (hash(x)=39332245)
539
- 4310 train 3.847904 (lr=3.0210e-04) (hash(x)=43732999)
540
- 4320 train 3.848834 (lr=3.0150e-04) (hash(x)=40831835)
541
- 4330 train 3.984694 (lr=3.0101e-04) (hash(x)=41813368)
542
- 4340 train 3.887618 (lr=3.0061e-04) (hash(x)=36952586)
543
- 4350 train 3.911630 (lr=3.0031e-04) (hash(x)=42470745)
544
- 4360 train 3.835622 (lr=3.0011e-04) (hash(x)=39581495)
545
- 4370 train 4.026768 (lr=3.0001e-04) (hash(x)=35914538)
546
- 4374 val loss 3.8754
547
- 4374 val perplexity 48.2003
548
- 4374 hella 0.2559
 
1
  max_steps: 4375
2
+ 0 val loss 11.3198
3
+ 0 val perplexity 82438.3594
4
+ 0 train 11.379463 (lr=1.2000e-05) (hash(x)=44227275)
5
+ 10 train 9.797939 (lr=1.3200e-04) (hash(x)=45818388)
6
+ 20 train 8.633065 (lr=2.5200e-04) (hash(x)=39408131)
7
+ 30 train 7.907566 (lr=3.7200e-04) (hash(x)=40882147)
8
+ 40 train 7.893028 (lr=4.9200e-04) (hash(x)=38039067)
9
+ 50 train 7.709972 (lr=6.1200e-04) (hash(x)=35121892)
10
+ 60 train 7.915734 (lr=7.3200e-04) (hash(x)=38788225)
11
+ 70 train 7.710945 (lr=8.5200e-04) (hash(x)=42846102)
12
+ 80 train 7.764929 (lr=9.7200e-04) (hash(x)=42368835)
13
+ 90 train 7.486595 (lr=1.0920e-03) (hash(x)=39322548)
14
+ 100 val loss 7.3988
15
+ 100 val perplexity 1633.9817
16
+ 100 train 7.489345 (lr=1.2120e-03) (hash(x)=40406420)
17
+ 110 train 7.186479 (lr=1.3320e-03) (hash(x)=35906509)
18
+ 120 train 7.022212 (lr=1.4520e-03) (hash(x)=34719082)
19
+ 130 train 7.233125 (lr=1.5720e-03) (hash(x)=45268027)
20
+ 140 train 7.415177 (lr=1.6920e-03) (hash(x)=41612530)
21
+ 150 train 7.320706 (lr=1.8120e-03) (hash(x)=41411227)
22
+ 160 train 7.423467 (lr=1.9320e-03) (hash(x)=43258342)
23
+ 170 train 6.987679 (lr=2.0520e-03) (hash(x)=37150406)
24
+ 180 train 6.713885 (lr=2.1720e-03) (hash(x)=34145536)
25
+ 190 train 6.850262 (lr=2.2920e-03) (hash(x)=38936209)
26
+ 200 val loss 7.0850
27
+ 200 val perplexity 1193.9506
28
+ 200 train 6.269652 (lr=2.4120e-03) (hash(x)=30327209)
29
+ 210 train 6.897263 (lr=2.5320e-03) (hash(x)=38350146)
30
+ 220 train 7.218045 (lr=2.6520e-03) (hash(x)=45488529)
31
+ 230 train 6.780678 (lr=2.7720e-03) (hash(x)=37882079)
32
+ 240 train 6.867301 (lr=2.8920e-03) (hash(x)=40962992)
33
+ 250 train 7.046554 (lr=3.0000e-03) (hash(x)=43216900)
34
+ 260 train 6.939759 (lr=3.0000e-03) (hash(x)=26725679)
35
+ 270 train 6.865248 (lr=2.9998e-03) (hash(x)=39966727)
36
+ 280 train 7.263814 (lr=2.9996e-03) (hash(x)=46127200)
37
+ 290 train 6.663627 (lr=2.9994e-03) (hash(x)=37965454)
38
+ 300 val loss 6.7726
39
+ 300 val perplexity 873.6116
40
+ 300 train 6.809310 (lr=2.9990e-03) (hash(x)=41918472)
41
+ 310 train 6.793670 (lr=2.9986e-03) (hash(x)=39644633)
42
+ 320 train 6.435929 (lr=2.9981e-03) (hash(x)=37587691)
43
+ 330 train 6.546347 (lr=2.9975e-03) (hash(x)=37481443)
44
+ 340 train 6.849642 (lr=2.9968e-03) (hash(x)=41939532)
45
+ 350 train 6.536463 (lr=2.9961e-03) (hash(x)=38497020)
46
+ 360 train 6.817259 (lr=2.9953e-03) (hash(x)=39744344)
47
+ 370 train 6.296607 (lr=2.9944e-03) (hash(x)=34941475)
48
+ 380 train 6.493192 (lr=2.9934e-03) (hash(x)=38299371)
49
+ 390 train 6.471909 (lr=2.9923e-03) (hash(x)=37556107)
50
+ 400 val loss 6.6084
51
+ 400 val perplexity 741.2896
52
+ 400 train 6.673820 (lr=2.9912e-03) (hash(x)=44179822)
53
+ 410 train 6.640482 (lr=2.9900e-03) (hash(x)=41220755)
54
+ 420 train 6.587605 (lr=2.9887e-03) (hash(x)=38754098)
55
+ 430 train 6.739646 (lr=2.9873e-03) (hash(x)=41527019)
56
+ 440 train 6.537798 (lr=2.9859e-03) (hash(x)=41419190)
57
+ 450 train 6.568528 (lr=2.9844e-03) (hash(x)=39961641)
58
+ 460 train 6.380276 (lr=2.9828e-03) (hash(x)=37888686)
59
+ 470 train 6.339571 (lr=2.9811e-03) (hash(x)=37968322)
60
+ 480 train 6.841465 (lr=2.9793e-03) (hash(x)=43812581)
61
+ 490 train 6.391725 (lr=2.9775e-03) (hash(x)=35797189)
62
+ 500 val loss 6.4994
63
+ 500 val perplexity 664.7291
64
+ 500 train 6.251128 (lr=2.9756e-03) (hash(x)=36532055)
65
+ 510 train 6.504333 (lr=2.9736e-03) (hash(x)=44096730)
66
+ 520 train 6.963413 (lr=2.9716e-03) (hash(x)=48482144)
67
+ 530 train 6.552701 (lr=2.9694e-03) (hash(x)=39203356)
68
+ 540 train 6.309877 (lr=2.9672e-03) (hash(x)=39225929)
69
+ 550 train 7.416578 (lr=2.9649e-03) (hash(x)=53152351)
70
+ 560 train 6.352113 (lr=2.9625e-03) (hash(x)=37033697)
71
+ 570 train 6.850715 (lr=2.9601e-03) (hash(x)=88200175)
72
+ 580 train 6.499003 (lr=2.9576e-03) (hash(x)=39393936)
73
+ 590 train 6.152848 (lr=2.9550e-03) (hash(x)=36692819)
74
+ 600 val loss 6.5519
75
+ 600 val perplexity 700.5750
76
+ 600 train 6.498122 (lr=2.9523e-03) (hash(x)=41491085)
77
+ 610 train 6.646906 (lr=2.9496e-03) (hash(x)=43232078)
78
+ 620 train 5.665857 (lr=2.9468e-03) (hash(x)=23474131)
79
+ 630 train 6.317541 (lr=2.9439e-03) (hash(x)=40546831)
80
+ 640 train 6.764076 (lr=2.9409e-03) (hash(x)=51582695)
81
+ 650 train 6.118343 (lr=2.9378e-03) (hash(x)=38302375)
82
+ 660 train 6.353317 (lr=2.9347e-03) (hash(x)=38626801)
83
+ 670 train 6.385187 (lr=2.9315e-03) (hash(x)=40660592)
84
+ 680 train 6.084672 (lr=2.9283e-03) (hash(x)=31896853)
85
+ 690 train 6.273912 (lr=2.9249e-03) (hash(x)=38331964)
86
+ 700 val loss 6.3372
87
+ 700 val perplexity 565.2305
88
+ 700 train 6.271990 (lr=2.9215e-03) (hash(x)=38643410)
89
+ 710 train 6.095592 (lr=2.9180e-03) (hash(x)=37789972)
90
+ 720 train 6.601953 (lr=2.9144e-03) (hash(x)=40030446)
91
+ 730 train 6.388149 (lr=2.9108e-03) (hash(x)=45839084)
92
+ 740 train 6.416939 (lr=2.9071e-03) (hash(x)=43725009)
93
+ 750 train 6.212621 (lr=2.9033e-03) (hash(x)=37845694)
94
+ 760 train 6.168904 (lr=2.8994e-03) (hash(x)=40609741)
95
+ 770 train 6.523702 (lr=2.8955e-03) (hash(x)=43323269)
96
+ 780 train 6.212938 (lr=2.8915e-03) (hash(x)=35699653)
97
+ 790 train 6.301132 (lr=2.8874e-03) (hash(x)=38268627)
98
+ 800 val loss 6.2753
99
+ 800 val perplexity 531.2865
100
+ 800 train 6.161645 (lr=2.8833e-03) (hash(x)=41743031)
101
+ 810 train 6.651413 (lr=2.8791e-03) (hash(x)=46688666)
102
+ 820 train 6.507217 (lr=2.8748e-03) (hash(x)=43904433)
103
+ 830 train 6.205750 (lr=2.8704e-03) (hash(x)=36792619)
104
+ 840 train 6.300427 (lr=2.8660e-03) (hash(x)=43235991)
105
+ 850 train 6.215741 (lr=2.8615e-03) (hash(x)=38699469)
106
+ 860 train 6.401577 (lr=2.8569e-03) (hash(x)=42620243)
107
+ 870 train 6.427041 (lr=2.8523e-03) (hash(x)=43841580)
108
+ 880 train 6.220507 (lr=2.8476e-03) (hash(x)=39244977)
109
+ 890 train 6.474046 (lr=2.8428e-03) (hash(x)=44897346)
110
+ 900 val loss 6.2168
111
+ 900 val perplexity 501.1090
112
+ 900 train 6.524931 (lr=2.8379e-03) (hash(x)=45852497)
113
+ 910 train 5.960623 (lr=2.8330e-03) (hash(x)=38398819)
114
+ 920 train 6.158507 (lr=2.8280e-03) (hash(x)=39775421)
115
+ 930 train 6.030588 (lr=2.8230e-03) (hash(x)=39701472)
116
+ 940 train 6.094589 (lr=2.8178e-03) (hash(x)=39790089)
117
+ 950 train 5.958650 (lr=2.8127e-03) (hash(x)=38366985)
118
+ 960 train 6.082577 (lr=2.8074e-03) (hash(x)=39284432)
119
+ 970 train 6.318904 (lr=2.8021e-03) (hash(x)=43609541)
120
+ 980 train 6.221474 (lr=2.7967e-03) (hash(x)=39386797)
121
+ 990 train 5.989091 (lr=2.7912e-03) (hash(x)=37028417)
122
+ 1000 val loss 6.1751
123
+ 1000 val perplexity 480.6269
124
+ 1000 train 6.182693 (lr=2.7857e-03) (hash(x)=40181596)
125
+ 1010 train 6.144771 (lr=2.7801e-03) (hash(x)=41599650)
126
+ 1020 train 6.025442 (lr=2.7744e-03) (hash(x)=40205853)
127
+ 1030 train 6.475232 (lr=2.7687e-03) (hash(x)=43271614)
128
+ 1040 train 6.007041 (lr=2.7629e-03) (hash(x)=35882196)
129
+ 1050 train 6.102581 (lr=2.7571e-03) (hash(x)=42513714)
130
+ 1060 train 6.183872 (lr=2.7512e-03) (hash(x)=41941750)
131
+ 1070 train 6.095507 (lr=2.7452e-03) (hash(x)=38670200)
132
+ 1080 train 5.938781 (lr=2.7391e-03) (hash(x)=37161833)
133
+ 1090 train 6.174100 (lr=2.7330e-03) (hash(x)=41885859)
134
+ 1100 val loss 6.1634
135
+ 1100 val perplexity 475.0564
136
+ 1100 train 6.076378 (lr=2.7269e-03) (hash(x)=30752379)
137
+ 1110 train 5.899285 (lr=2.7206e-03) (hash(x)=36731960)
138
+ 1120 train 5.895706 (lr=2.7143e-03) (hash(x)=38122158)
139
+ 1130 train 6.211620 (lr=2.7080e-03) (hash(x)=41476640)
140
+ 1140 train 6.034586 (lr=2.7016e-03) (hash(x)=40147219)
141
+ 1150 train 5.768303 (lr=2.6951e-03) (hash(x)=39740805)
142
+ 1160 train 5.879625 (lr=2.6886e-03) (hash(x)=38564136)
143
+ 1170 train 5.940179 (lr=2.6820e-03) (hash(x)=37201973)
144
+ 1180 train 5.879993 (lr=2.6753e-03) (hash(x)=35761112)
145
+ 1190 train 6.487927 (lr=2.6686e-03) (hash(x)=45564774)
146
+ 1200 val loss 6.1201
147
+ 1200 val perplexity 454.9191
148
+ 1200 train 5.796727 (lr=2.6618e-03) (hash(x)=34912485)
149
+ 1210 train 5.971030 (lr=2.6550e-03) (hash(x)=36630633)
150
+ 1220 train 5.615767 (lr=2.6481e-03) (hash(x)=32533931)
151
+ 1230 train 5.842389 (lr=2.6411e-03) (hash(x)=38233985)
152
+ 1240 train 5.969805 (lr=2.6341e-03) (hash(x)=42794906)
153
+ 1250 train 6.008198 (lr=2.6270e-03) (hash(x)=37951459)
154
+ 1260 train 5.944851 (lr=2.6199e-03) (hash(x)=40970790)
155
+ 1270 train 5.696421 (lr=2.6127e-03) (hash(x)=37853233)
156
+ 1280 train 5.967031 (lr=2.6055e-03) (hash(x)=44243810)
157
+ 1290 train 6.204058 (lr=2.5982e-03) (hash(x)=43089904)
158
+ 1300 val loss 6.0491
159
+ 1300 val perplexity 423.7339
160
+ 1300 train 5.773164 (lr=2.5909e-03) (hash(x)=36986419)
161
+ 1310 train 5.889661 (lr=2.5835e-03) (hash(x)=38197520)
162
+ 1320 train 6.083013 (lr=2.5760e-03) (hash(x)=44102135)
163
+ 1330 train 6.082072 (lr=2.5685e-03) (hash(x)=39183556)
164
+ 1340 train 5.936688 (lr=2.5609e-03) (hash(x)=42652135)
165
+ 1350 train 5.793446 (lr=2.5533e-03) (hash(x)=37542847)
166
+ 1360 train 5.829593 (lr=2.5457e-03) (hash(x)=37357575)
167
+ 1370 train 5.869967 (lr=2.5379e-03) (hash(x)=37211563)
168
+ 1380 train 5.843562 (lr=2.5302e-03) (hash(x)=38293580)
169
+ 1390 train 5.775048 (lr=2.5223e-03) (hash(x)=39750125)
170
+ 1400 val loss 6.0906
171
+ 1400 val perplexity 441.7024
172
+ 1400 train 6.024382 (lr=2.5145e-03) (hash(x)=39537431)
173
+ 1410 train 6.029010 (lr=2.5066e-03) (hash(x)=40644308)
174
+ 1420 train 6.480279 (lr=2.4986e-03) (hash(x)=47023143)
175
+ 1430 train 6.053619 (lr=2.4906e-03) (hash(x)=36669415)
176
+ 1440 train 6.193233 (lr=2.4825e-03) (hash(x)=39721164)
177
+ 1450 train 5.608108 (lr=2.4744e-03) (hash(x)=35718305)
178
+ 1460 train 6.036287 (lr=2.4662e-03) (hash(x)=44235616)
179
+ 1470 train 6.030736 (lr=2.4580e-03) (hash(x)=39483945)
180
+ 1480 train 5.796659 (lr=2.4497e-03) (hash(x)=38800395)
181
+ 1490 train 5.896487 (lr=2.4414e-03) (hash(x)=41087022)
182
+ 1500 val loss 5.9756
183
+ 1500 val perplexity 393.6914
184
+ 1500 train 5.672778 (lr=2.4331e-03) (hash(x)=38132225)
185
+ 1510 train 5.690163 (lr=2.4247e-03) (hash(x)=37307491)
186
+ 1520 train 4.995702 (lr=2.4162e-03) (hash(x)=38656943)
187
+ 1530 train 5.967829 (lr=2.4077e-03) (hash(x)=42111934)
188
+ 1540 train 5.938928 (lr=2.3992e-03) (hash(x)=45439288)
189
+ 1550 train 5.965722 (lr=2.3906e-03) (hash(x)=41611884)
190
+ 1560 train 5.872955 (lr=2.3820e-03) (hash(x)=41163449)
191
+ 1570 train 5.680617 (lr=2.3734e-03) (hash(x)=35031263)
192
+ 1580 train 5.826626 (lr=2.3647e-03) (hash(x)=37462499)
193
+ 1590 train 5.962614 (lr=2.3559e-03) (hash(x)=42648103)
194
+ 1600 val loss 5.9542
195
+ 1600 val perplexity 385.3543
196
+ 1600 train 5.746299 (lr=2.3471e-03) (hash(x)=41299173)
197
+ 1610 train 6.093915 (lr=2.3383e-03) (hash(x)=41251778)
198
+ 1620 train 5.766378 (lr=2.3294e-03) (hash(x)=42046626)
199
+ 1630 train 5.866395 (lr=2.3205e-03) (hash(x)=40406538)
200
+ 1640 train 6.285133 (lr=2.3116e-03) (hash(x)=53507848)
201
+ 1650 train 5.936745 (lr=2.3026e-03) (hash(x)=41616447)
202
+ 1660 train 5.959065 (lr=2.2936e-03) (hash(x)=42076630)
203
+ 1670 train 6.316203 (lr=2.2845e-03) (hash(x)=45363762)
204
+ 1680 train 5.842880 (lr=2.2754e-03) (hash(x)=40213372)
205
+ 1690 train 6.138567 (lr=2.2663e-03) (hash(x)=43005357)
206
+ 1700 val loss 5.9327
207
+ 1700 val perplexity 377.1550
208
+ 1700 train 5.889496 (lr=2.2572e-03) (hash(x)=47362661)
209
+ 1710 train 5.804827 (lr=2.2480e-03) (hash(x)=43006690)
210
+ 1720 train 5.723139 (lr=2.2387e-03) (hash(x)=41020003)
211
+ 1730 train 5.599441 (lr=2.2294e-03) (hash(x)=41388965)
212
+ 1740 train 5.657528 (lr=2.2201e-03) (hash(x)=40240896)
213
+ 1750 train 5.764262 (lr=2.2108e-03) (hash(x)=38161024)
214
+ 1760 train 5.665637 (lr=2.2014e-03) (hash(x)=39187552)
215
+ 1770 train 5.524759 (lr=2.1920e-03) (hash(x)=36997602)
216
+ 1780 train 5.989076 (lr=2.1826e-03) (hash(x)=34519210)
217
+ 1790 train 5.634367 (lr=2.1731e-03) (hash(x)=45294931)
218
+ 1800 val loss 5.9013
219
+ 1800 val perplexity 365.4947
220
+ 1800 train 5.742164 (lr=2.1637e-03) (hash(x)=43856756)
221
+ 1810 train 5.472896 (lr=2.1541e-03) (hash(x)=39538565)
222
+ 1820 train 5.744007 (lr=2.1446e-03) (hash(x)=40850282)
223
+ 1830 train 6.007143 (lr=2.1350e-03) (hash(x)=41530022)
224
+ 1840 train 5.447040 (lr=2.1254e-03) (hash(x)=42128732)
225
+ 1850 train 5.711160 (lr=2.1158e-03) (hash(x)=44294510)
226
+ 1860 train 5.752619 (lr=2.1061e-03) (hash(x)=42547901)
227
+ 1870 train 5.555669 (lr=2.0964e-03) (hash(x)=38750148)
228
+ 1880 train 5.470454 (lr=2.0867e-03) (hash(x)=35073879)
229
+ 1890 train 5.679026 (lr=2.0769e-03) (hash(x)=42170459)
230
+ 1900 val loss 5.9074
231
+ 1900 val perplexity 367.7443
232
+ 1900 train 5.960515 (lr=2.0672e-03) (hash(x)=39937031)
233
+ 1910 train 5.862342 (lr=2.0574e-03) (hash(x)=41940608)
234
+ 1920 train 5.696874 (lr=2.0476e-03) (hash(x)=40555290)
235
+ 1930 train 5.915828 (lr=2.0377e-03) (hash(x)=51847973)
236
+ 1940 train 5.953328 (lr=2.0279e-03) (hash(x)=39150884)
237
+ 1950 train 5.957563 (lr=2.0180e-03) (hash(x)=41853277)
238
+ 1960 train 5.626873 (lr=2.0081e-03) (hash(x)=42463514)
239
+ 1970 train 5.599295 (lr=1.9982e-03) (hash(x)=37438437)
240
+ 1980 train 5.731918 (lr=1.9882e-03) (hash(x)=41342533)
241
+ 1990 train 5.439400 (lr=1.9783e-03) (hash(x)=38374872)
242
+ 2000 val loss 5.8381
243
+ 2000 val perplexity 343.1372
244
+ 2000 train 5.992682 (lr=1.9683e-03) (hash(x)=39550913)
245
+ 2010 train 5.596606 (lr=1.9583e-03) (hash(x)=41949369)
246
+ 2020 train 5.175270 (lr=1.9483e-03) (hash(x)=39876159)
247
+ 2030 train 5.383932 (lr=1.9382e-03) (hash(x)=39465793)
248
+ 2040 train 5.625202 (lr=1.9282e-03) (hash(x)=44765924)
249
+ 2050 train 5.483789 (lr=1.9181e-03) (hash(x)=36425985)
250
+ 2060 train 5.621045 (lr=1.9080e-03) (hash(x)=41478281)
251
+ 2070 train 5.608891 (lr=1.8979e-03) (hash(x)=37088327)
252
+ 2080 train 5.294634 (lr=1.8878e-03) (hash(x)=39731845)
253
+ 2090 train 5.783054 (lr=1.8777e-03) (hash(x)=44481426)
254
+ 2100 val loss 5.8146
255
+ 2100 val perplexity 335.1568
256
+ 2100 train 5.862079 (lr=1.8675e-03) (hash(x)=44598443)
257
+ 2110 train 5.641122 (lr=1.8574e-03) (hash(x)=42265528)
258
+ 2120 train 5.481261 (lr=1.8472e-03) (hash(x)=35395939)
259
+ 2130 train 5.908422 (lr=1.8370e-03) (hash(x)=40597472)
260
+ 2140 train 5.774949 (lr=1.8268e-03) (hash(x)=38042919)
261
+ 2150 train 5.703743 (lr=1.8166e-03) (hash(x)=39716423)
262
+ 2160 train 5.674555 (lr=1.8064e-03) (hash(x)=41499874)
263
+ 2170 train 5.362947 (lr=1.7962e-03) (hash(x)=38544810)
264
+ 2180 train 5.659645 (lr=1.7860e-03) (hash(x)=42581610)
265
+ 2190 train 5.642224 (lr=1.7758e-03) (hash(x)=41838855)
266
+ 2200 val loss 5.7886
267
+ 2200 val perplexity 326.5524
268
+ 2200 train 5.776219 (lr=1.7655e-03) (hash(x)=38519887)
269
+ 2210 train 5.674378 (lr=1.7553e-03) (hash(x)=41058518)
270
+ 2220 train 5.047271 (lr=1.7450e-03) (hash(x)=44141063)
271
+ 2230 train 5.471584 (lr=1.7348e-03) (hash(x)=39348572)
272
+ 2240 train 5.373843 (lr=1.7245e-03) (hash(x)=40565631)
273
+ 2250 train 5.314240 (lr=1.7142e-03) (hash(x)=35679001)
274
+ 2260 train 5.356872 (lr=1.7040e-03) (hash(x)=35327316)
275
+ 2270 train 5.599993 (lr=1.6937e-03) (hash(x)=42319237)
276
+ 2280 train 5.267631 (lr=1.6834e-03) (hash(x)=34722403)
277
+ 2290 train 5.801545 (lr=1.6731e-03) (hash(x)=47167627)
278
+ 2300 val loss 5.7348
279
+ 2300 val perplexity 309.4472
280
+ 2300 train 5.371782 (lr=1.6629e-03) (hash(x)=39196366)
281
+ 2310 train 5.495155 (lr=1.6526e-03) (hash(x)=41448314)
282
+ 2320 train 5.217525 (lr=1.6423e-03) (hash(x)=38083188)
283
+ 2330 train 6.138095 (lr=1.6320e-03) (hash(x)=48217064)
284
+ 2340 train 5.607168 (lr=1.6217e-03) (hash(x)=40418014)
285
+ 2350 train 5.802357 (lr=1.6114e-03) (hash(x)=41000963)
286
+ 2360 train 6.415450 (lr=1.6012e-03) (hash(x)=52212186)
287
+ 2370 train 5.717125 (lr=1.5909e-03) (hash(x)=34152167)
288
+ 2380 train 5.401453 (lr=1.5806e-03) (hash(x)=39727275)
289
+ 2390 train 5.400539 (lr=1.5704e-03) (hash(x)=35701246)
290
+ 2400 val loss 5.6992
291
+ 2400 val perplexity 298.6306
292
+ 2400 train 5.509592 (lr=1.5601e-03) (hash(x)=35750369)
293
+ 2410 train 5.666783 (lr=1.5498e-03) (hash(x)=36921837)
294
+ 2420 train 6.307586 (lr=1.5396e-03) (hash(x)=52333487)
295
+ 2430 train 6.096017 (lr=1.5294e-03) (hash(x)=43700247)
296
+ 2440 train 5.685547 (lr=1.5191e-03) (hash(x)=38790797)
297
+ 2450 train 5.831779 (lr=1.5089e-03) (hash(x)=44909302)
298
+ 2460 train 5.790894 (lr=1.4987e-03) (hash(x)=44520063)
299
+ 2470 train 5.374334 (lr=1.4885e-03) (hash(x)=28926941)
300
+ 2480 train 5.589633 (lr=1.4783e-03) (hash(x)=38034434)
301
+ 2490 train 5.915215 (lr=1.4681e-03) (hash(x)=40370265)
302
+ 2500 val loss 5.6744
303
+ 2500 val perplexity 291.3057
304
+ 2500 train 6.844440 (lr=1.4579e-03) (hash(x)=48759128)
305
+ 2510 train 5.583824 (lr=1.4477e-03) (hash(x)=38903830)
306
+ 2520 train 5.554042 (lr=1.4375e-03) (hash(x)=38343017)
307
+ 2530 train 5.665951 (lr=1.4274e-03) (hash(x)=37353172)
308
+ 2540 train 5.771531 (lr=1.4173e-03) (hash(x)=43056679)
309
+ 2550 train 6.077626 (lr=1.4071e-03) (hash(x)=44884973)
310
+ 2560 train 5.891807 (lr=1.3970e-03) (hash(x)=44270024)
311
+ 2570 train 5.550761 (lr=1.3869e-03) (hash(x)=37552441)
312
+ 2580 train 5.655738 (lr=1.3769e-03) (hash(x)=37548505)
313
+ 2590 train 5.549836 (lr=1.3668e-03) (hash(x)=37070515)
314
+ 2600 val loss 5.5970
315
+ 2600 val perplexity 269.6157
316
+ 2600 train 5.588780 (lr=1.3568e-03) (hash(x)=41861606)
317
+ 2610 train 5.602142 (lr=1.3467e-03) (hash(x)=38749857)
318
+ 2620 train 6.009288 (lr=1.3367e-03) (hash(x)=48282206)
319
+ 2630 train 5.665033 (lr=1.3267e-03) (hash(x)=38695111)
320
+ 2640 train 5.420346 (lr=1.3168e-03) (hash(x)=36162702)
321
+ 2650 train 5.395927 (lr=1.3068e-03) (hash(x)=35619411)
322
+ 2660 train 5.714244 (lr=1.2969e-03) (hash(x)=41525016)
323
+ 2670 train 5.492519 (lr=1.2870e-03) (hash(x)=39454506)
324
+ 2680 train 5.565373 (lr=1.2771e-03) (hash(x)=38205746)
325
+ 2690 train 5.629951 (lr=1.2672e-03) (hash(x)=40239766)
326
+ 2700 val loss 5.5653
327
+ 2700 val perplexity 261.2092
328
+ 2700 train 5.958931 (lr=1.2573e-03) (hash(x)=48386728)
329
+ 2710 train 5.635997 (lr=1.2475e-03) (hash(x)=39766962)
330
+ 2720 train 5.834649 (lr=1.2377e-03) (hash(x)=46288620)
331
+ 2730 train 5.637322 (lr=1.2279e-03) (hash(x)=39531992)
332
+ 2740 train 5.417400 (lr=1.2182e-03) (hash(x)=38557594)
333
+ 2750 train 5.772608 (lr=1.2085e-03) (hash(x)=45712316)
334
+ 2760 train 5.390047 (lr=1.1988e-03) (hash(x)=38148531)
335
+ 2770 train 5.445625 (lr=1.1891e-03) (hash(x)=38199587)
336
+ 2780 train 5.799500 (lr=1.1794e-03) (hash(x)=43995601)
337
+ 2790 train 5.766082 (lr=1.1698e-03) (hash(x)=49076165)
338
+ 2800 val loss 5.5369
339
+ 2800 val perplexity 253.8802
340
+ 2800 train 5.755539 (lr=1.1602e-03) (hash(x)=42585188)
341
+ 2810 train 5.620761 (lr=1.1506e-03) (hash(x)=42076895)
342
+ 2820 train 5.453242 (lr=1.1411e-03) (hash(x)=39243011)
343
+ 2830 train 5.599067 (lr=1.1316e-03) (hash(x)=44189646)
344
+ 2840 train 5.917010 (lr=1.1221e-03) (hash(x)=44534008)
345
+ 2850 train 5.691290 (lr=1.1127e-03) (hash(x)=45896146)
346
+ 2860 train 5.388630 (lr=1.1033e-03) (hash(x)=40607651)
347
+ 2870 train 5.462972 (lr=1.0939e-03) (hash(x)=39953685)
348
+ 2880 train 5.919540 (lr=1.0845e-03) (hash(x)=45594293)
349
+ 2890 train 5.482719 (lr=1.0752e-03) (hash(x)=38530396)
350
+ 2900 val loss 5.5202
351
+ 2900 val perplexity 249.6909
352
+ 2900 train 6.492098 (lr=1.0659e-03) (hash(x)=46001938)
353
+ 2910 train 5.920892 (lr=1.0567e-03) (hash(x)=42053698)
354
+ 2920 train 5.178813 (lr=1.0474e-03) (hash(x)=31288623)
355
+ 2930 train 5.596229 (lr=1.0383e-03) (hash(x)=40957911)
356
+ 2940 train 5.421710 (lr=1.0291e-03) (hash(x)=37565291)
357
+ 2950 train 6.021412 (lr=1.0200e-03) (hash(x)=44688067)
358
+ 2960 train 5.389401 (lr=1.0109e-03) (hash(x)=40554915)
359
+ 2970 train 5.504748 (lr=1.0019e-03) (hash(x)=41858938)
360
+ 2980 train 6.159545 (lr=9.9289e-04) (hash(x)=46677201)
361
+ 2990 train 5.423365 (lr=9.8392e-04) (hash(x)=41191005)
362
+ 3000 val loss 5.4933
363
+ 3000 val perplexity 243.0576
364
+ 3000 train 5.604980 (lr=9.7500e-04) (hash(x)=42901823)
365
+ 3010 train 5.662110 (lr=9.6612e-04) (hash(x)=37714179)
366
+ 3020 train 5.765316 (lr=9.5727e-04) (hash(x)=45451321)
367
+ 3030 train 5.735548 (lr=9.4847e-04) (hash(x)=41111660)
368
+ 3040 train 5.811325 (lr=9.3970e-04) (hash(x)=48126392)
369
+ 3050 train 5.233923 (lr=9.3098e-04) (hash(x)=41165333)
370
+ 3060 train 5.460855 (lr=9.2230e-04) (hash(x)=41222845)
371
+ 3070 train 5.615535 (lr=9.1366e-04) (hash(x)=39756054)
372
+ 3080 train 5.360039 (lr=9.0506e-04) (hash(x)=35102807)
373
+ 3090 train 5.561254 (lr=8.9651e-04) (hash(x)=44800641)
374
+ 3100 val loss 5.4559
375
+ 3100 val perplexity 234.1353
376
+ 3100 train 5.419527 (lr=8.8800e-04) (hash(x)=39146350)
377
+ 3110 train 5.505163 (lr=8.7954e-04) (hash(x)=38968249)
378
+ 3120 train 6.064114 (lr=8.7112e-04) (hash(x)=45757845)
379
+ 3130 train 5.641373 (lr=8.6274e-04) (hash(x)=39272467)
380
+ 3140 train 5.436275 (lr=8.5441e-04) (hash(x)=40568086)
381
+ 3150 train 5.132092 (lr=8.4613e-04) (hash(x)=33341935)
382
+ 3160 train 5.487285 (lr=8.3789e-04) (hash(x)=36747042)
383
+ 3170 train 5.106658 (lr=8.2970e-04) (hash(x)=34255813)
384
+ 3180 train 5.474054 (lr=8.2156e-04) (hash(x)=40994836)
385
+ 3190 train 5.427584 (lr=8.1347e-04) (hash(x)=40204220)
386
+ 3200 val loss 5.4459
387
+ 3200 val perplexity 231.8014
388
+ 3200 train 5.382375 (lr=8.0542e-04) (hash(x)=37855244)
389
+ 3210 train 5.592527 (lr=7.9742e-04) (hash(x)=42937862)
390
+ 3220 train 5.335435 (lr=7.8948e-04) (hash(x)=41573470)
391
+ 3230 train 5.308156 (lr=7.8158e-04) (hash(x)=36922132)
392
+ 3240 train 5.328730 (lr=7.7373e-04) (hash(x)=38756443)
393
+ 3250 train 5.225527 (lr=7.6594e-04) (hash(x)=33303705)
394
+ 3260 train 5.366871 (lr=7.5819e-04) (hash(x)=38545233)
395
+ 3270 train 5.305439 (lr=7.5050e-04) (hash(x)=37078540)
396
+ 3280 train 5.432631 (lr=7.4286e-04) (hash(x)=42842412)
397
+ 3290 train 5.256313 (lr=7.3527e-04) (hash(x)=35416091)
398
+ 3300 val loss 5.4374
399
+ 3300 val perplexity 229.8350
400
+ 3300 train 5.710620 (lr=7.2774e-04) (hash(x)=42943006)
401
+ 3310 train 5.426646 (lr=7.2026e-04) (hash(x)=39078402)
402
+ 3320 train 5.354695 (lr=7.1283e-04) (hash(x)=40069891)
403
+ 3330 train 5.569634 (lr=7.0545e-04) (hash(x)=40560947)
404
+ 3340 train 5.622434 (lr=6.9814e-04) (hash(x)=39168598)
405
+ 3350 train 5.405854 (lr=6.9087e-04) (hash(x)=39228302)
406
+ 3360 train 5.255980 (lr=6.8367e-04) (hash(x)=38696155)
407
+ 3370 train 5.450393 (lr=6.7651e-04) (hash(x)=38097171)
408
+ 3380 train 5.651337 (lr=6.6942e-04) (hash(x)=42707851)
409
+ 3390 train 6.033101 (lr=6.6238e-04) (hash(x)=56570802)
410
+ 3400 val loss 5.3961
411
+ 3400 val perplexity 220.5476
412
+ 3400 train 5.286184 (lr=6.5540e-04) (hash(x)=39381717)
413
+ 3410 train 6.032186 (lr=6.4848e-04) (hash(x)=45129271)
414
+ 3420 train 5.215788 (lr=6.4161e-04) (hash(x)=38942398)
415
+ 3430 train 5.070482 (lr=6.3480e-04) (hash(x)=38488960)
416
+ 3440 train 5.585747 (lr=6.2806e-04) (hash(x)=45767547)
417
+ 3450 train 5.176673 (lr=6.2137e-04) (hash(x)=36198784)
418
+ 3460 train 5.489666 (lr=6.1474e-04) (hash(x)=42778044)
419
+ 3470 train 5.649846 (lr=6.0817e-04) (hash(x)=44950608)
420
+ 3480 train 4.863423 (lr=6.0166e-04) (hash(x)=39044959)
421
+ 3490 train 5.425606 (lr=5.9521e-04) (hash(x)=39987339)
422
+ 3500 val loss 5.3831
423
+ 3500 val perplexity 217.7065
424
+ 3500 train 5.434652 (lr=5.8883e-04) (hash(x)=38401262)
425
+ 3510 train 5.179955 (lr=5.8250e-04) (hash(x)=37122830)
426
+ 3520 train 5.364791 (lr=5.7624e-04) (hash(x)=40987016)
427
+ 3530 train 5.169059 (lr=5.7004e-04) (hash(x)=37900005)
428
+ 3540 train 5.365578 (lr=5.6390e-04) (hash(x)=40038501)
429
+ 3550 train 5.334502 (lr=5.5783e-04) (hash(x)=37671489)
430
+ 3560 train 5.389966 (lr=5.5182e-04) (hash(x)=38596282)
431
+ 3570 train 5.657457 (lr=5.4587e-04) (hash(x)=41774051)
432
+ 3580 train 5.595036 (lr=5.3998e-04) (hash(x)=38610942)
433
+ 3590 train 5.089818 (lr=5.3416e-04) (hash(x)=36069068)
434
+ 3600 val loss 5.3580
435
+ 3600 val perplexity 212.3097
436
+ 3600 train 5.414062 (lr=5.2841e-04) (hash(x)=42363563)
437
+ 3610 train 5.279015 (lr=5.2272e-04) (hash(x)=34615777)
438
+ 3620 train 5.367689 (lr=5.1710e-04) (hash(x)=40899217)
439
+ 3630 train 5.423831 (lr=5.1154e-04) (hash(x)=42238236)
440
+ 3640 train 5.093359 (lr=5.0604e-04) (hash(x)=34553691)
441
+ 3650 train 5.283995 (lr=5.0062e-04) (hash(x)=38697807)
442
+ 3660 train 5.345992 (lr=4.9526e-04) (hash(x)=40766257)
443
+ 3670 train 5.439585 (lr=4.8997e-04) (hash(x)=39337876)
444
+ 3680 train 5.555716 (lr=4.8474e-04) (hash(x)=42335002)
445
+ 3690 train 5.306686 (lr=4.7958e-04) (hash(x)=36528848)
446
+ 3700 val loss 5.3439
447
+ 3700 val perplexity 209.3265
448
+ 3700 train 5.352815 (lr=4.7449e-04) (hash(x)=38817765)
449
+ 3710 train 5.348447 (lr=4.6947e-04) (hash(x)=38409263)
450
+ 3720 train 5.717241 (lr=4.6452e-04) (hash(x)=46547894)
451
+ 3730 train 5.532721 (lr=4.5963e-04) (hash(x)=42716380)
452
+ 3740 train 5.072857 (lr=4.5482e-04) (hash(x)=35016013)
453
+ 3750 train 5.184477 (lr=4.5007e-04) (hash(x)=38076751)
454
+ 3760 train 5.359593 (lr=4.4540e-04) (hash(x)=37632632)
455
+ 3770 train 4.929888 (lr=4.4079e-04) (hash(x)=41300497)
456
+ 3780 train 5.414285 (lr=4.3625e-04) (hash(x)=36222071)
457
+ 3790 train 5.313930 (lr=4.3179e-04) (hash(x)=42341695)
458
+ 3800 val loss 5.3214
459
+ 3800 val perplexity 204.6624
460
+ 3800 train 5.301294 (lr=4.2739e-04) (hash(x)=40448838)
461
+ 3810 train 5.495556 (lr=4.2307e-04) (hash(x)=41458713)
462
+ 3820 train 5.403583 (lr=4.1881e-04) (hash(x)=41813233)
463
+ 3830 train 5.154315 (lr=4.1463e-04) (hash(x)=37783109)
464
+ 3840 train 5.377775 (lr=4.1052e-04) (hash(x)=38909502)
465
+ 3850 train 5.205124 (lr=4.0648e-04) (hash(x)=45321879)
466
+ 3860 train 5.499700 (lr=4.0252e-04) (hash(x)=42854087)
467
+ 3870 train 5.795658 (lr=3.9862e-04) (hash(x)=54309583)
468
+ 3880 train 5.194125 (lr=3.9480e-04) (hash(x)=41118717)
469
+ 3890 train 5.439643 (lr=3.9105e-04) (hash(x)=38995555)
470
+ 3900 val loss 5.3088
471
+ 3900 val perplexity 202.1078
472
+ 3900 train 5.799438 (lr=3.8738e-04) (hash(x)=46283603)
473
+ 3910 train 5.472976 (lr=3.8378e-04) (hash(x)=43245072)
474
+ 3920 train 4.922875 (lr=3.8025e-04) (hash(x)=37286533)
475
+ 3930 train 5.469273 (lr=3.7679e-04) (hash(x)=43413014)
476
+ 3940 train 5.305347 (lr=3.7341e-04) (hash(x)=42540654)
477
+ 3950 train 5.302908 (lr=3.7010e-04) (hash(x)=41489136)
478
+ 3960 train 5.817022 (lr=3.6687e-04) (hash(x)=41594224)
479
+ 3970 train 4.878980 (lr=3.6371e-04) (hash(x)=38126486)
480
+ 3980 train 4.963630 (lr=3.6063e-04) (hash(x)=36803107)
481
+ 3990 train 5.435699 (lr=3.5762e-04) (hash(x)=48697262)
482
+ 4000 val loss 5.2920
483
+ 4000 val perplexity 198.7383
484
+ 4000 train 4.849784 (lr=3.5468e-04) (hash(x)=34780610)
485
+ 4010 train 5.313143 (lr=3.5183e-04) (hash(x)=38957122)
486
+ 4020 train 5.144952 (lr=3.4904e-04) (hash(x)=41312761)
487
+ 4030 train 5.304487 (lr=3.4633e-04) (hash(x)=38815685)
488
+ 4040 train 5.290574 (lr=3.4370e-04) (hash(x)=40548890)
489
+ 4050 train 5.367460 (lr=3.4114e-04) (hash(x)=41487337)
490
+ 4060 train 5.449817 (lr=3.3866e-04) (hash(x)=40326916)
491
+ 4070 train 5.340825 (lr=3.3626e-04) (hash(x)=39315642)
492
+ 4080 train 5.144194 (lr=3.3393e-04) (hash(x)=39593043)
493
+ 4090 train 5.210406 (lr=3.3168e-04) (hash(x)=41855711)
494
+ 4100 val loss 5.2902
495
+ 4100 val perplexity 198.3822
496
+ 4100 train 4.922108 (lr=3.2950e-04) (hash(x)=31879830)
497
+ 4110 train 5.598635 (lr=3.2740e-04) (hash(x)=42446951)
498
+ 4120 train 5.279421 (lr=3.2538e-04) (hash(x)=43833919)
499
+ 4130 train 5.258513 (lr=3.2343e-04) (hash(x)=41527761)
500
+ 4140 train 5.213408 (lr=3.2156e-04) (hash(x)=37972657)
501
+ 4150 train 5.180491 (lr=3.1977e-04) (hash(x)=40068282)
502
+ 4160 train 5.406243 (lr=3.1806e-04) (hash(x)=41369110)
503
+ 4170 train 5.125205 (lr=3.1642e-04) (hash(x)=34821306)
504
+ 4180 train 5.056667 (lr=3.1486e-04) (hash(x)=40148949)
505
+ 4190 train 5.203684 (lr=3.1338e-04) (hash(x)=41315194)
506
+ 4200 val loss 5.2638
507
+ 4200 val perplexity 193.2136
508
+ 4200 train 4.899979 (lr=3.1197e-04) (hash(x)=42586995)
509
+ 4210 train 4.859787 (lr=3.1065e-04) (hash(x)=36718793)
510
+ 4220 train 4.906089 (lr=3.0940e-04) (hash(x)=36462814)
511
+ 4230 train 5.541209 (lr=3.0822e-04) (hash(x)=47694519)
512
+ 4240 train 5.211882 (lr=3.0713e-04) (hash(x)=37953249)
513
+ 4250 train 5.359942 (lr=3.0611e-04) (hash(x)=41039733)
514
+ 4260 train 4.859933 (lr=3.0517e-04) (hash(x)=39791205)
515
+ 4270 train 5.394708 (lr=3.0431e-04) (hash(x)=38797177)
516
+ 4280 train 5.263216 (lr=3.0353e-04) (hash(x)=37485257)
517
+ 4290 train 4.743282 (lr=3.0283e-04) (hash(x)=37238242)
518
+ 4300 val loss 5.2628
519
+ 4300 val perplexity 193.0162
520
+ 4300 train 5.429446 (lr=3.0220e-04) (hash(x)=40166116)
521
+ 4310 train 5.114768 (lr=3.0165e-04) (hash(x)=33290613)
522
+ 4320 train 5.372978 (lr=3.0118e-04) (hash(x)=42501007)
523
+ 4330 train 5.486688 (lr=3.0079e-04) (hash(x)=42070500)
524
+ 4340 train 5.242562 (lr=3.0048e-04) (hash(x)=39474877)
525
+ 4350 train 5.229932 (lr=3.0024e-04) (hash(x)=39074840)
526
+ 4360 train 5.349564 (lr=3.0009e-04) (hash(x)=45427492)
527
+ 4370 train 5.381782 (lr=3.0001e-04) (hash(x)=48198843)
528
+ 4374 val loss 5.2624
529
+ 4374 val perplexity 192.9513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/fix_1_latent_mask/1_latent_mask_lr_30e-4_n_latent_masks_2_seed_1339/model_04374.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b71cdbdcf73b465a3846e9e2b9535c0ce4c34bf0410db9e2118cf35c95dddb8
3
  size 97707314
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4df22df54abde012517a6225585c7cba5451f3c4728d24085e2b48e70a98fda4
3
  size 97707314
logs/fix_1_latent_mask/1_latent_mask_lr_30e-4_n_latent_masks_2_seed_1339/optimizer_04374.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:73aab609d1c5d457f68bd839498de4ce00438bccc31cb2855dfe8ada75dc11b6
3
  size 189136950
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa542e0350dd6e3379f67a45d5ea1b5744ef40c64b550535eaaa6a8ae5706151
3
  size 189136950