andrew-healey committed
Commit 33cc5eb · verified · 1 Parent(s): 12807b2

Upload folder using huggingface_hub

lr6e-4_total_batch_size61440_baseline_seed1340/args.json ADDED
@@ -0,0 +1 @@
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_11/lr6e-4_total_batch_size61440_baseline_seed1340", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_11", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1340, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 0.0006, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "6e-4_61440", "n_embd": 256}
lr6e-4_total_batch_size61440_baseline_seed1340/dataloader_08749.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:953385078aa3787b69fc6857dfd48b0a2cd2f4d27c6f8892e01211aca53d07f5
+ size 964
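The `.pt` files in this commit are stored via Git LFS, so each diff shows only the three-line pointer (spec version, SHA-256 of the blob, and its size in bytes) rather than the binary payload. A minimal parser for this pointer format, for illustration only:

```python
def parse_lfs_pointer(text: str) -> dict:
    """Split a Git LFS pointer file into its key/value fields."""
    return dict(line.split(" ", 1) for line in text.strip().splitlines())

fields = parse_lfs_pointer(
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:953385078aa3787b69fc6857dfd48b0a2cd2f4d27c6f8892e01211aca53d07f5\n"
    "size 964\n"
)
print(fields["oid"], fields["size"])  # sha256:9533... 964
```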
lr6e-4_total_batch_size61440_baseline_seed1340/log2.txt ADDED
@@ -0,0 +1,1054 @@
+ max_steps: 8750
+ 0 val loss 11.2465
+ 0 val perplexity 76607.9531
+ 0 train 11.254622 (lr=1.2000e-06) (hash(x)=164406924)
+ 10 train 10.486423 (lr=1.3200e-05) (hash(x)=152008797)
+ 20 train 9.935457 (lr=2.5200e-05) (hash(x)=153494457)
+ 30 train 9.582285 (lr=3.7200e-05) (hash(x)=137328499)
+ 40 train 9.382899 (lr=4.9200e-05) (hash(x)=159333245)
+ 50 train 8.991772 (lr=6.1200e-05) (hash(x)=177407419)
+ 60 train 8.484509 (lr=7.3200e-05) (hash(x)=127877799)
+ 70 train 8.067822 (lr=8.5200e-05) (hash(x)=140873918)
+ 80 train 7.910814 (lr=9.7200e-05) (hash(x)=160249377)
+ 90 train 7.667356 (lr=1.0920e-04) (hash(x)=154282910)
+ 100 val loss 7.5881
+ 100 val perplexity 1974.5780
+ 100 train 7.494137 (lr=1.2120e-04) (hash(x)=144903932)
+ 110 train 7.504959 (lr=1.3320e-04) (hash(x)=151685723)
+ 120 train 7.299349 (lr=1.4520e-04) (hash(x)=152347864)
+ 130 train 7.293032 (lr=1.5720e-04) (hash(x)=152230926)
+ 140 train 7.148556 (lr=1.6920e-04) (hash(x)=142121048)
+ 150 train 7.265087 (lr=1.8120e-04) (hash(x)=173839165)
+ 160 train 7.050098 (lr=1.9320e-04) (hash(x)=158755931)
+ 170 train 6.881308 (lr=2.0520e-04) (hash(x)=151645697)
+ 180 train 7.167406 (lr=2.1720e-04) (hash(x)=179696111)
+ 190 train 6.812750 (lr=2.2920e-04) (hash(x)=150511423)
+ 200 val loss 6.8505
+ 200 val perplexity 944.3774
+ 200 train 6.975245 (lr=2.4120e-04) (hash(x)=167734596)
+ 210 train 6.547418 (lr=2.5320e-04) (hash(x)=133157560)
+ 220 train 6.813692 (lr=2.6520e-04) (hash(x)=152234670)
+ 230 train 6.621839 (lr=2.7720e-04) (hash(x)=160995039)
+ 240 train 6.495830 (lr=2.8920e-04) (hash(x)=139367591)
+ 250 train 6.664530 (lr=3.0120e-04) (hash(x)=153224076)
+ 260 train 6.560595 (lr=3.1320e-04) (hash(x)=156667529)
+ 270 train 6.467413 (lr=3.2520e-04) (hash(x)=133883725)
+ 280 train 6.291535 (lr=3.3720e-04) (hash(x)=151939311)
+ 290 train 6.328707 (lr=3.4920e-04) (hash(x)=150290478)
+ 300 val loss 6.3322
+ 300 val perplexity 562.4131
+ 300 train 6.290477 (lr=3.6120e-04) (hash(x)=149619098)
+ 310 train 6.208458 (lr=3.7320e-04) (hash(x)=142344072)
+ 320 train 6.271106 (lr=3.8520e-04) (hash(x)=151878766)
+ 330 train 6.121476 (lr=3.9720e-04) (hash(x)=136419763)
+ 340 train 6.240640 (lr=4.0920e-04) (hash(x)=160761498)
+ 350 train 6.089416 (lr=4.2120e-04) (hash(x)=146539909)
+ 360 train 6.200458 (lr=4.3320e-04) (hash(x)=143063312)
+ 370 train 6.056172 (lr=4.4520e-04) (hash(x)=153705818)
+ 380 train 5.972964 (lr=4.5720e-04) (hash(x)=149579175)
+ 390 train 5.918429 (lr=4.6920e-04) (hash(x)=150904406)
+ 400 val loss 6.0112
+ 400 val perplexity 407.9679
+ 400 train 5.876863 (lr=4.8120e-04) (hash(x)=153710890)
+ 410 train 5.771727 (lr=4.9320e-04) (hash(x)=138302968)
+ 420 train 5.777105 (lr=5.0520e-04) (hash(x)=141112989)
+ 430 train 5.830154 (lr=5.1720e-04) (hash(x)=149846944)
+ 440 train 5.828116 (lr=5.2920e-04) (hash(x)=157683479)
+ 450 train 5.824431 (lr=5.4120e-04) (hash(x)=155873620)
+ 460 train 6.028512 (lr=5.5320e-04) (hash(x)=152133659)
+ 470 train 5.799159 (lr=5.6520e-04) (hash(x)=158095308)
+ 480 train 5.649451 (lr=5.7720e-04) (hash(x)=148422063)
+ 490 train 5.826170 (lr=5.8920e-04) (hash(x)=145665071)
+ 500 val loss 5.7841
+ 500 val perplexity 325.1038
+ 500 train 5.776611 (lr=6.0000e-04) (hash(x)=145450636)
+ 510 train 5.759080 (lr=6.0000e-04) (hash(x)=148375186)
+ 520 train 5.775891 (lr=5.9999e-04) (hash(x)=144256573)
+ 530 train 5.816446 (lr=5.9998e-04) (hash(x)=141443048)
+ 540 train 5.650346 (lr=5.9997e-04) (hash(x)=149347480)
+ 550 train 5.734946 (lr=5.9995e-04) (hash(x)=154123388)
+ 560 train 5.695611 (lr=5.9993e-04) (hash(x)=143045783)
+ 570 train 5.565760 (lr=5.9990e-04) (hash(x)=157244060)
+ 580 train 5.561405 (lr=5.9987e-04) (hash(x)=158018166)
+ 590 train 5.635768 (lr=5.9984e-04) (hash(x)=153794251)
+ 600 val loss 5.6070
+ 600 val perplexity 272.3222
+ 600 train 5.557906 (lr=5.9980e-04) (hash(x)=145249251)
+ 610 train 5.607287 (lr=5.9976e-04) (hash(x)=150377688)
+ 620 train 5.564731 (lr=5.9972e-04) (hash(x)=134768519)
+ 630 train 5.479506 (lr=5.9967e-04) (hash(x)=141559208)
+ 640 train 5.457463 (lr=5.9962e-04) (hash(x)=148444988)
+ 650 train 5.460835 (lr=5.9956e-04) (hash(x)=148937127)
+ 660 train 5.367904 (lr=5.9950e-04) (hash(x)=145121305)
+ 670 train 5.500012 (lr=5.9943e-04) (hash(x)=156860740)
+ 680 train 5.281329 (lr=5.9937e-04) (hash(x)=137272598)
+ 690 train 5.285691 (lr=5.9929e-04) (hash(x)=139021179)
+ 700 val loss 5.4152
+ 700 val perplexity 224.7959
+ 700 train 5.586763 (lr=5.9922e-04) (hash(x)=150475545)
+ 710 train 5.376775 (lr=5.9914e-04) (hash(x)=148476077)
+ 720 train 5.262850 (lr=5.9905e-04) (hash(x)=143328717)
+ 730 train 5.183437 (lr=5.9897e-04) (hash(x)=137342488)
+ 740 train 5.278402 (lr=5.9887e-04) (hash(x)=149452392)
+ 750 train 5.261243 (lr=5.9878e-04) (hash(x)=154653428)
+ 760 train 5.379678 (lr=5.9868e-04) (hash(x)=153305312)
+ 770 train 5.227743 (lr=5.9857e-04) (hash(x)=153862299)
+ 780 train 5.243790 (lr=5.9847e-04) (hash(x)=151035519)
+ 790 train 5.340232 (lr=5.9836e-04) (hash(x)=145990773)
+ 800 val loss 5.2937
+ 800 val perplexity 199.0835
+ 800 train 5.304738 (lr=5.9824e-04) (hash(x)=144483776)
+ 810 train 5.202996 (lr=5.9812e-04) (hash(x)=142045737)
+ 820 train 5.332297 (lr=5.9800e-04) (hash(x)=157946886)
+ 830 train 5.202717 (lr=5.9787e-04) (hash(x)=150425385)
+ 840 train 5.245149 (lr=5.9774e-04) (hash(x)=144514796)
+ 850 train 5.278734 (lr=5.9761e-04) (hash(x)=157395496)
+ 860 train 5.250891 (lr=5.9747e-04) (hash(x)=151365298)
+ 870 train 5.275786 (lr=5.9732e-04) (hash(x)=151437005)
+ 880 train 5.180932 (lr=5.9718e-04) (hash(x)=151935250)
+ 890 train 5.223048 (lr=5.9703e-04) (hash(x)=159838534)
+ 900 val loss 5.1791
+ 900 val perplexity 177.5242
+ 900 train 5.195571 (lr=5.9687e-04) (hash(x)=157916369)
+ 910 train 5.251864 (lr=5.9672e-04) (hash(x)=165272484)
+ 920 train 5.105950 (lr=5.9655e-04) (hash(x)=145383083)
+ 930 train 5.092994 (lr=5.9639e-04) (hash(x)=152078393)
+ 940 train 5.209867 (lr=5.9622e-04) (hash(x)=151846396)
+ 950 train 5.135397 (lr=5.9605e-04) (hash(x)=166073923)
+ 960 train 5.104422 (lr=5.9587e-04) (hash(x)=136782937)
+ 970 train 5.076737 (lr=5.9569e-04) (hash(x)=146747407)
+ 980 train 5.097136 (lr=5.9550e-04) (hash(x)=171270168)
+ 990 train 5.102144 (lr=5.9531e-04) (hash(x)=141668491)
+ 1000 val loss 5.1032
+ 1000 val perplexity 164.5528
+ 1000 train 5.003941 (lr=5.9512e-04) (hash(x)=154856891)
+ 1010 train 5.093828 (lr=5.9492e-04) (hash(x)=145288527)
+ 1020 train 4.962240 (lr=5.9472e-04) (hash(x)=144618667)
+ 1030 train 5.199950 (lr=5.9452e-04) (hash(x)=139276042)
+ 1040 train 4.887685 (lr=5.9431e-04) (hash(x)=147217952)
+ 1050 train 4.988291 (lr=5.9410e-04) (hash(x)=151925203)
+ 1060 train 4.949405 (lr=5.9388e-04) (hash(x)=147473652)
+ 1070 train 4.911823 (lr=5.9366e-04) (hash(x)=145345524)
+ 1080 train 4.801612 (lr=5.9344e-04) (hash(x)=155595779)
+ 1090 train 4.851241 (lr=5.9321e-04) (hash(x)=147405225)
+ 1100 val loss 5.0080
+ 1100 val perplexity 149.6043
+ 1100 train 4.702862 (lr=5.9298e-04) (hash(x)=136191502)
+ 1110 train 4.935129 (lr=5.9275e-04) (hash(x)=151847694)
+ 1120 train 5.024825 (lr=5.9251e-04) (hash(x)=144828302)
+ 1130 train 5.058961 (lr=5.9227e-04) (hash(x)=153817096)
+ 1140 train 5.107350 (lr=5.9202e-04) (hash(x)=166352243)
+ 1150 train 5.144423 (lr=5.9177e-04) (hash(x)=153273362)
+ 1160 train 4.983146 (lr=5.9152e-04) (hash(x)=178082599)
+ 1170 train 4.779559 (lr=5.9126e-04) (hash(x)=149460553)
+ 1180 train 4.984323 (lr=5.9100e-04) (hash(x)=151910947)
+ 1190 train 4.881153 (lr=5.9073e-04) (hash(x)=147115336)
+ 1200 val loss 4.9122
+ 1200 val perplexity 135.9390
+ 1200 train 4.933949 (lr=5.9046e-04) (hash(x)=148021541)
+ 1210 train 4.826488 (lr=5.9019e-04) (hash(x)=151495989)
+ 1220 train 4.946360 (lr=5.8992e-04) (hash(x)=156983220)
+ 1230 train 5.015411 (lr=5.8963e-04) (hash(x)=147788187)
+ 1240 train 4.985041 (lr=5.8935e-04) (hash(x)=171125590)
+ 1250 train 4.886112 (lr=5.8906e-04) (hash(x)=141356608)
+ 1260 train 4.929478 (lr=5.8877e-04) (hash(x)=150132098)
+ 1270 train 4.807566 (lr=5.8848e-04) (hash(x)=147917245)
+ 1280 train 4.900241 (lr=5.8818e-04) (hash(x)=148900016)
+ 1290 train 4.870604 (lr=5.8787e-04) (hash(x)=144978605)
+ 1300 val loss 4.8334
+ 1300 val perplexity 125.6381
+ 1300 train 4.781497 (lr=5.8757e-04) (hash(x)=146005217)
+ 1310 train 4.797524 (lr=5.8726e-04) (hash(x)=144892309)
+ 1320 train 4.859780 (lr=5.8694e-04) (hash(x)=165903661)
+ 1330 train 4.807590 (lr=5.8663e-04) (hash(x)=168489989)
+ 1340 train 4.975265 (lr=5.8630e-04) (hash(x)=176281294)
+ 1350 train 4.759254 (lr=5.8598e-04) (hash(x)=144511718)
+ 1360 train 4.823272 (lr=5.8565e-04) (hash(x)=144362722)
+ 1370 train 4.693036 (lr=5.8532e-04) (hash(x)=139964999)
+ 1380 train 4.716559 (lr=5.8498e-04) (hash(x)=193332654)
+ 1390 train 4.761081 (lr=5.8464e-04) (hash(x)=138180866)
+ 1400 val loss 4.7848
+ 1400 val perplexity 119.6811
+ 1400 train 4.762902 (lr=5.8430e-04) (hash(x)=146019502)
+ 1410 train 4.809028 (lr=5.8395e-04) (hash(x)=153245388)
+ 1420 train 4.693506 (lr=5.8360e-04) (hash(x)=149839636)
+ 1430 train 4.802482 (lr=5.8324e-04) (hash(x)=142844796)
+ 1440 train 5.023189 (lr=5.8289e-04) (hash(x)=159787060)
+ 1450 train 4.732884 (lr=5.8252e-04) (hash(x)=146496200)
+ 1460 train 4.655418 (lr=5.8216e-04) (hash(x)=164169521)
+ 1470 train 4.702697 (lr=5.8179e-04) (hash(x)=150906386)
+ 1480 train 4.720006 (lr=5.8142e-04) (hash(x)=148916053)
+ 1490 train 4.651597 (lr=5.8104e-04) (hash(x)=148746554)
+ 1500 val loss 4.7859
+ 1500 val perplexity 119.8145
+ 1500 train 4.742775 (lr=5.8066e-04) (hash(x)=150127281)
+ 1510 train 4.912547 (lr=5.8028e-04) (hash(x)=149760530)
+ 1520 train 4.759301 (lr=5.7989e-04) (hash(x)=144237370)
+ 1530 train 4.567895 (lr=5.7950e-04) (hash(x)=132692898)
+ 1540 train 4.553084 (lr=5.7910e-04) (hash(x)=150979737)
+ 1550 train 4.742966 (lr=5.7870e-04) (hash(x)=142022255)
+ 1560 train 4.673608 (lr=5.7830e-04) (hash(x)=135769745)
+ 1570 train 4.536185 (lr=5.7790e-04) (hash(x)=162241131)
+ 1580 train 4.635792 (lr=5.7749e-04) (hash(x)=166874637)
+ 1590 train 4.600491 (lr=5.7707e-04) (hash(x)=157401041)
+ 1600 val loss 4.6843
+ 1600 val perplexity 108.2363
+ 1600 train 4.623621 (lr=5.7666e-04) (hash(x)=154120875)
+ 1610 train 4.613076 (lr=5.7624e-04) (hash(x)=149407490)
+ 1620 train 4.560949 (lr=5.7581e-04) (hash(x)=144479755)
+ 1630 train 4.640452 (lr=5.7539e-04) (hash(x)=170907716)
+ 1640 train 4.289717 (lr=5.7496e-04) (hash(x)=151914010)
+ 1650 train 4.579615 (lr=5.7452e-04) (hash(x)=149843610)
+ 1660 train 4.474859 (lr=5.7408e-04) (hash(x)=133217001)
+ 1670 train 4.449507 (lr=5.7364e-04) (hash(x)=168227774)
+ 1680 train 4.403447 (lr=5.7320e-04) (hash(x)=157093189)
+ 1690 train 4.241521 (lr=5.7275e-04) (hash(x)=154989819)
+ 1700 val loss 4.6601
+ 1700 val perplexity 105.6499
+ 1700 train 4.764375 (lr=5.7230e-04) (hash(x)=155797680)
+ 1710 train 4.438869 (lr=5.7184e-04) (hash(x)=166050772)
+ 1720 train 4.415730 (lr=5.7138e-04) (hash(x)=151663443)
+ 1730 train 4.637233 (lr=5.7092e-04) (hash(x)=152112619)
+ 1740 train 4.559391 (lr=5.7045e-04) (hash(x)=156132679)
+ 1750 train 4.410774 (lr=5.6999e-04) (hash(x)=149044477)
+ 1760 train 4.588952 (lr=5.6951e-04) (hash(x)=142924719)
+ 1770 train 4.492014 (lr=5.6904e-04) (hash(x)=153467406)
+ 1780 train 4.576634 (lr=5.6856e-04) (hash(x)=148889581)
+ 1790 train 4.516285 (lr=5.6807e-04) (hash(x)=164831182)
+ 1800 val loss 4.5968
+ 1800 val perplexity 99.1682
+ 1800 train 4.524964 (lr=5.6759e-04) (hash(x)=156809396)
+ 1810 train 4.623642 (lr=5.6710e-04) (hash(x)=149245741)
+ 1820 train 4.570191 (lr=5.6660e-04) (hash(x)=146865466)
+ 1830 train 4.640428 (lr=5.6611e-04) (hash(x)=148068618)
+ 1840 train 4.473223 (lr=5.6561e-04) (hash(x)=150088183)
+ 1850 train 4.549740 (lr=5.6510e-04) (hash(x)=146695082)
+ 1860 train 4.488262 (lr=5.6459e-04) (hash(x)=169008624)
+ 1870 train 4.424570 (lr=5.6408e-04) (hash(x)=148645124)
+ 1880 train 4.590400 (lr=5.6357e-04) (hash(x)=139701422)
+ 1890 train 4.548831 (lr=5.6305e-04) (hash(x)=156374257)
+ 1900 val loss 4.5746
+ 1900 val perplexity 96.9929
+ 1900 train 4.496499 (lr=5.6253e-04) (hash(x)=144640294)
+ 1910 train 4.637846 (lr=5.6201e-04) (hash(x)=147484985)
+ 1920 train 4.494401 (lr=5.6148e-04) (hash(x)=142317889)
+ 1930 train 4.482625 (lr=5.6095e-04) (hash(x)=144906216)
+ 1940 train 4.534515 (lr=5.6041e-04) (hash(x)=165332621)
+ 1950 train 4.510262 (lr=5.5988e-04) (hash(x)=145701919)
+ 1960 train 4.410273 (lr=5.5934e-04) (hash(x)=146414118)
+ 1970 train 4.572169 (lr=5.5879e-04) (hash(x)=146766958)
+ 1980 train 4.492216 (lr=5.5824e-04) (hash(x)=148146847)
+ 1990 train 4.256873 (lr=5.5769e-04) (hash(x)=159684604)
+ 2000 val loss 4.5661
+ 2000 val perplexity 96.1705
+ 2000 train 4.404471 (lr=5.5714e-04) (hash(x)=162831106)
+ 2010 train 4.424958 (lr=5.5658e-04) (hash(x)=150583346)
+ 2020 train 4.373349 (lr=5.5602e-04) (hash(x)=142779458)
+ 2030 train 4.354062 (lr=5.5546e-04) (hash(x)=143755114)
+ 2040 train 4.268564 (lr=5.5489e-04) (hash(x)=147324095)
+ 2050 train 4.371172 (lr=5.5432e-04) (hash(x)=141727373)
+ 2060 train 4.443020 (lr=5.5374e-04) (hash(x)=162053052)
+ 2070 train 4.490764 (lr=5.5317e-04) (hash(x)=162596975)
+ 2080 train 4.409820 (lr=5.5259e-04) (hash(x)=161242340)
+ 2090 train 4.390230 (lr=5.5200e-04) (hash(x)=148583522)
+ 2100 val loss 4.5780
+ 2100 val perplexity 97.3198
+ 2100 train 4.465339 (lr=5.5142e-04) (hash(x)=158239484)
+ 2110 train 4.378835 (lr=5.5083e-04) (hash(x)=152610058)
+ 2120 train 4.409603 (lr=5.5023e-04) (hash(x)=146582203)
+ 2130 train 4.148681 (lr=5.4964e-04) (hash(x)=146686436)
+ 2140 train 4.555778 (lr=5.4904e-04) (hash(x)=142827295)
+ 2150 train 4.478709 (lr=5.4843e-04) (hash(x)=150747398)
+ 2160 train 4.763395 (lr=5.4783e-04) (hash(x)=150777134)
+ 2170 train 4.433492 (lr=5.4722e-04) (hash(x)=157002856)
+ 2180 train 4.492192 (lr=5.4661e-04) (hash(x)=150442337)
+ 2190 train 4.460946 (lr=5.4599e-04) (hash(x)=150799081)
+ 2200 val loss 4.4986
+ 2200 val perplexity 89.8879
+ 2200 train 4.381059 (lr=5.4537e-04) (hash(x)=140504180)
+ 2210 train 4.517085 (lr=5.4475e-04) (hash(x)=149088475)
+ 2220 train 4.481184 (lr=5.4413e-04) (hash(x)=155180847)
+ 2230 train 4.402691 (lr=5.4350e-04) (hash(x)=159347164)
+ 2240 train 4.563639 (lr=5.4287e-04) (hash(x)=153574288)
+ 2250 train 4.508776 (lr=5.4223e-04) (hash(x)=142603512)
+ 2260 train 4.519448 (lr=5.4160e-04) (hash(x)=142953831)
+ 2270 train 4.413083 (lr=5.4096e-04) (hash(x)=137642681)
+ 2280 train 4.433607 (lr=5.4031e-04) (hash(x)=158106614)
+ 2290 train 4.436308 (lr=5.3967e-04) (hash(x)=160787891)
+ 2300 val loss 4.4679
+ 2300 val perplexity 87.1697
+ 2300 train 4.533278 (lr=5.3902e-04) (hash(x)=142234024)
+ 2310 train 4.434681 (lr=5.3837e-04) (hash(x)=138192210)
+ 2320 train 4.458347 (lr=5.3771e-04) (hash(x)=147727662)
+ 2330 train 4.403520 (lr=5.3705e-04) (hash(x)=169324653)
+ 2340 train 4.514364 (lr=5.3639e-04) (hash(x)=145350355)
+ 2350 train 4.508618 (lr=5.3573e-04) (hash(x)=156909016)
+ 2360 train 4.430801 (lr=5.3506e-04) (hash(x)=144559543)
+ 2370 train 4.417887 (lr=5.3439e-04) (hash(x)=153212312)
+ 2380 train 4.389775 (lr=5.3372e-04) (hash(x)=131816284)
+ 2390 train 4.450120 (lr=5.3304e-04) (hash(x)=165818309)
+ 2400 val loss 4.4651
+ 2400 val perplexity 86.9287
+ 2400 train 4.476176 (lr=5.3236e-04) (hash(x)=143091562)
+ 2410 train 4.372552 (lr=5.3168e-04) (hash(x)=154010697)
+ 2420 train 4.343225 (lr=5.3099e-04) (hash(x)=144844001)
+ 2430 train 4.501669 (lr=5.3030e-04) (hash(x)=156389834)
+ 2440 train 4.198299 (lr=5.2961e-04) (hash(x)=162425919)
+ 2450 train 4.446963 (lr=5.2892e-04) (hash(x)=154200479)
+ 2460 train 4.370580 (lr=5.2822e-04) (hash(x)=159066617)
+ 2470 train 4.185802 (lr=5.2752e-04) (hash(x)=148278058)
+ 2480 train 4.150192 (lr=5.2682e-04) (hash(x)=165598522)
+ 2490 train 4.454783 (lr=5.2612e-04) (hash(x)=141306086)
+ 2500 val loss 4.4934
+ 2500 val perplexity 89.4246
+ 2500 train 4.308168 (lr=5.2541e-04) (hash(x)=149857456)
+ 2510 train 4.301208 (lr=5.2470e-04) (hash(x)=150368907)
+ 2520 train 4.256428 (lr=5.2398e-04) (hash(x)=153394920)
+ 2530 train 4.233460 (lr=5.2327e-04) (hash(x)=159740116)
+ 2540 train 4.392202 (lr=5.2255e-04) (hash(x)=161620367)
+ 2550 train 4.467092 (lr=5.2183e-04) (hash(x)=147223658)
+ 2560 train 4.554312 (lr=5.2110e-04) (hash(x)=158146613)
+ 2570 train 4.460134 (lr=5.2037e-04) (hash(x)=136375336)
+ 2580 train 4.500267 (lr=5.1964e-04) (hash(x)=149298016)
+ 2590 train 4.415389 (lr=5.1891e-04) (hash(x)=150720933)
+ 2600 val loss 4.4306
+ 2600 val perplexity 83.9809
+ 2600 train 4.525354 (lr=5.1817e-04) (hash(x)=146191551)
+ 2610 train 4.342191 (lr=5.1743e-04) (hash(x)=151413395)
+ 2620 train 4.543557 (lr=5.1669e-04) (hash(x)=162094106)
+ 2630 train 4.428928 (lr=5.1595e-04) (hash(x)=150608302)
+ 2640 train 4.333382 (lr=5.1520e-04) (hash(x)=146115160)
+ 2650 train 4.430802 (lr=5.1445e-04) (hash(x)=138242788)
+ 2660 train 4.641551 (lr=5.1370e-04) (hash(x)=159921837)
+ 2670 train 4.250050 (lr=5.1295e-04) (hash(x)=137205989)
+ 2680 train 4.269504 (lr=5.1219e-04) (hash(x)=158608524)
+ 2690 train 4.314833 (lr=5.1143e-04) (hash(x)=153322085)
+ 2700 val loss 4.4073
+ 2700 val perplexity 82.0512
+ 2700 train 4.354226 (lr=5.1067e-04) (hash(x)=145375752)
+ 2710 train 4.365409 (lr=5.0990e-04) (hash(x)=145036398)
+ 2720 train 4.384626 (lr=5.0913e-04) (hash(x)=140064355)
+ 2730 train 4.415036 (lr=5.0836e-04) (hash(x)=148983355)
+ 2740 train 4.311646 (lr=5.0759e-04) (hash(x)=147636026)
+ 2750 train 4.367444 (lr=5.0681e-04) (hash(x)=151763585)
+ 2760 train 4.299557 (lr=5.0603e-04) (hash(x)=178033416)
+ 2770 train 4.262702 (lr=5.0525e-04) (hash(x)=147097669)
+ 2780 train 4.223789 (lr=5.0447e-04) (hash(x)=140475447)
+ 2790 train 4.337634 (lr=5.0368e-04) (hash(x)=141135962)
+ 2800 val loss 4.3988
+ 2800 val perplexity 81.3571
+ 2800 train 4.240672 (lr=5.0290e-04) (hash(x)=151568014)
+ 2810 train 4.953083 (lr=5.0210e-04) (hash(x)=170629615)
+ 2820 train 4.317946 (lr=5.0131e-04) (hash(x)=151858146)
+ 2830 train 4.232313 (lr=5.0052e-04) (hash(x)=152170305)
+ 2840 train 4.149346 (lr=4.9972e-04) (hash(x)=150705881)
+ 2850 train 4.182802 (lr=4.9892e-04) (hash(x)=146997394)
+ 2860 train 4.164981 (lr=4.9811e-04) (hash(x)=159749180)
+ 2870 train 4.046839 (lr=4.9731e-04) (hash(x)=147618423)
+ 2880 train 4.240561 (lr=4.9650e-04) (hash(x)=140778993)
+ 2890 train 4.124408 (lr=4.9569e-04) (hash(x)=150735837)
+ 2900 val loss 4.3982
+ 2900 val perplexity 81.3030
+ 2900 train 4.178474 (lr=4.9487e-04) (hash(x)=149366597)
+ 2910 train 4.299051 (lr=4.9406e-04) (hash(x)=181602500)
+ 2920 train 4.265178 (lr=4.9324e-04) (hash(x)=148080200)
+ 2930 train 4.104599 (lr=4.9242e-04) (hash(x)=150629961)
+ 2940 train 4.279860 (lr=4.9160e-04) (hash(x)=155786888)
+ 2950 train 4.278832 (lr=4.9077e-04) (hash(x)=148553059)
+ 2960 train 4.396752 (lr=4.8995e-04) (hash(x)=158494862)
+ 2970 train 4.517593 (lr=4.8912e-04) (hash(x)=147684099)
+ 2980 train 4.273145 (lr=4.8829e-04) (hash(x)=159008790)
+ 2990 train 4.419888 (lr=4.8745e-04) (hash(x)=148288782)
+ 3000 val loss 4.3568
+ 3000 val perplexity 78.0109
+ 3000 train 4.292861 (lr=4.8662e-04) (hash(x)=150464442)
+ 3010 train 4.371593 (lr=4.8578e-04) (hash(x)=148665916)
+ 3020 train 4.316908 (lr=4.8494e-04) (hash(x)=159788454)
+ 3030 train 4.389796 (lr=4.8409e-04) (hash(x)=170224765)
+ 3040 train 4.277595 (lr=4.8325e-04) (hash(x)=139746299)
+ 3050 train 4.338845 (lr=4.8240e-04) (hash(x)=173491884)
+ 3060 train 4.332809 (lr=4.8155e-04) (hash(x)=146098725)
+ 3070 train 4.309294 (lr=4.8070e-04) (hash(x)=154713525)
+ 3080 train 4.338870 (lr=4.7984e-04) (hash(x)=151206978)
+ 3090 train 4.370872 (lr=4.7899e-04) (hash(x)=144250687)
+ 3100 val loss 4.3782
+ 3100 val perplexity 79.6952
+ 3100 train 4.409786 (lr=4.7813e-04) (hash(x)=182449036)
+ 3110 train 4.221416 (lr=4.7727e-04) (hash(x)=145504538)
+ 3120 train 4.258609 (lr=4.7641e-04) (hash(x)=147948751)
+ 3130 train 4.212210 (lr=4.7554e-04) (hash(x)=144547711)
+ 3140 train 4.326009 (lr=4.7467e-04) (hash(x)=158622029)
+ 3150 train 4.281703 (lr=4.7380e-04) (hash(x)=155949476)
+ 3160 train 4.287187 (lr=4.7293e-04) (hash(x)=129245664)
+ 3170 train 4.309855 (lr=4.7206e-04) (hash(x)=157162959)
+ 3180 train 4.374291 (lr=4.7118e-04) (hash(x)=163102434)
+ 3190 train 4.401355 (lr=4.7031e-04) (hash(x)=155952009)
+ 3200 val loss 4.3503
+ 3200 val perplexity 77.5037
+ 3200 train 4.289626 (lr=4.6943e-04) (hash(x)=140141286)
+ 3210 train 4.280479 (lr=4.6855e-04) (hash(x)=148811694)
+ 3220 train 4.223735 (lr=4.6766e-04) (hash(x)=143433404)
+ 3230 train 4.279935 (lr=4.6678e-04) (hash(x)=150525065)
+ 3240 train 4.176414 (lr=4.6589e-04) (hash(x)=152014854)
+ 3250 train 4.048805 (lr=4.6500e-04) (hash(x)=144716153)
+ 3260 train 4.385694 (lr=4.6411e-04) (hash(x)=145324818)
+ 3270 train 4.317697 (lr=4.6322e-04) (hash(x)=154930419)
+ 3280 train 4.319047 (lr=4.6232e-04) (hash(x)=144649958)
+ 3290 train 4.301317 (lr=4.6142e-04) (hash(x)=156496147)
+ 3300 val loss 4.3369
+ 3300 val perplexity 76.4678
+ 3300 train 4.258340 (lr=4.6052e-04) (hash(x)=148099414)
+ 3310 train 4.315618 (lr=4.5962e-04) (hash(x)=141081470)
+ 3320 train 4.287163 (lr=4.5872e-04) (hash(x)=151572529)
+ 3330 train 4.253131 (lr=4.5782e-04) (hash(x)=146887343)
+ 3340 train 4.288600 (lr=4.5691e-04) (hash(x)=148612634)
+ 3350 train 4.246165 (lr=4.5600e-04) (hash(x)=145218304)
+ 3360 train 4.167013 (lr=4.5509e-04) (hash(x)=146893345)
+ 3370 train 4.254632 (lr=4.5418e-04) (hash(x)=159302018)
+ 3380 train 4.323816 (lr=4.5326e-04) (hash(x)=164117611)
+ 3390 train 4.194248 (lr=4.5235e-04) (hash(x)=142401925)
+ 3400 val loss 4.3225
+ 3400 val perplexity 75.3787
+ 3400 train 4.173793 (lr=4.5143e-04) (hash(x)=142633951)
+ 3410 train 4.300762 (lr=4.5051e-04) (hash(x)=148491904)
+ 3420 train 4.351557 (lr=4.4959e-04) (hash(x)=144013244)
+ 3430 train 4.161072 (lr=4.4867e-04) (hash(x)=146939843)
+ 3440 train 4.182618 (lr=4.4774e-04) (hash(x)=161055964)
+ 3450 train 4.087534 (lr=4.4682e-04) (hash(x)=134825681)
+ 3460 train 4.276597 (lr=4.4589e-04) (hash(x)=145087511)
+ 3470 train 4.100667 (lr=4.4496e-04) (hash(x)=144200286)
+ 3480 train 4.148541 (lr=4.4403e-04) (hash(x)=157605428)
+ 3490 train 4.325905 (lr=4.4310e-04) (hash(x)=153636990)
+ 3500 val loss 4.3239
+ 3500 val perplexity 75.4790
+ 3500 train 4.178516 (lr=4.4216e-04) (hash(x)=148368965)
+ 3510 train 4.088175 (lr=4.4123e-04) (hash(x)=144775557)
+ 3520 train 4.261998 (lr=4.4029e-04) (hash(x)=151539855)
+ 3530 train 4.280823 (lr=4.3935e-04) (hash(x)=168384321)
+ 3540 train 4.422899 (lr=4.3841e-04) (hash(x)=155671447)
+ 3550 train 4.444036 (lr=4.3747e-04) (hash(x)=157403334)
+ 3560 train 4.223127 (lr=4.3652e-04) (hash(x)=139495714)
+ 3570 train 4.357045 (lr=4.3558e-04) (hash(x)=147685555)
+ 3580 train 4.476269 (lr=4.3463e-04) (hash(x)=154209753)
+ 3590 train 4.097392 (lr=4.3368e-04) (hash(x)=166864372)
+ 3600 val loss 4.3065
+ 3600 val perplexity 74.1818
+ 3600 train 4.208117 (lr=4.3273e-04) (hash(x)=152372067)
+ 3610 train 4.488429 (lr=4.3178e-04) (hash(x)=156579291)
+ 3620 train 4.076468 (lr=4.3083e-04) (hash(x)=152969451)
+ 3630 train 4.051098 (lr=4.2987e-04) (hash(x)=164428105)
+ 3640 train 4.226479 (lr=4.2892e-04) (hash(x)=153325907)
+ 3650 train 4.200468 (lr=4.2796e-04) (hash(x)=159197101)
+ 3660 train 4.191760 (lr=4.2700e-04) (hash(x)=157503290)
+ 3670 train 4.112825 (lr=4.2604e-04) (hash(x)=149036650)
+ 3680 train 4.122258 (lr=4.2508e-04) (hash(x)=144525088)
+ 3690 train 4.232194 (lr=4.2411e-04) (hash(x)=143154211)
+ 3700 val loss 4.3016
+ 3700 val perplexity 73.8149
+ 3700 train 4.222100 (lr=4.2315e-04) (hash(x)=168885609)
+ 3710 train 4.189565 (lr=4.2218e-04) (hash(x)=148815644)
+ 3720 train 4.120987 (lr=4.2122e-04) (hash(x)=153279629)
+ 3730 train 4.140110 (lr=4.2025e-04) (hash(x)=148181200)
+ 3740 train 4.113040 (lr=4.1928e-04) (hash(x)=151357364)
+ 3750 train 4.080936 (lr=4.1831e-04) (hash(x)=145269246)
+ 3760 train 4.149215 (lr=4.1734e-04) (hash(x)=148923398)
+ 3770 train 4.154061 (lr=4.1636e-04) (hash(x)=141205226)
+ 3780 train 4.041055 (lr=4.1539e-04) (hash(x)=146180296)
+ 3790 train 4.117515 (lr=4.1441e-04) (hash(x)=169790000)
+ 3800 val loss 4.3008
+ 3800 val perplexity 73.7604
+ 3800 train 4.153253 (lr=4.1343e-04) (hash(x)=141633734)
+ 3810 train 4.277663 (lr=4.1246e-04) (hash(x)=156306070)
+ 3820 train 4.264000 (lr=4.1148e-04) (hash(x)=158769870)
+ 3830 train 4.237772 (lr=4.1050e-04) (hash(x)=165301927)
+ 3840 train 4.267307 (lr=4.0951e-04) (hash(x)=147025475)
+ 3850 train 4.208849 (lr=4.0853e-04) (hash(x)=141223580)
+ 3860 train 4.236732 (lr=4.0755e-04) (hash(x)=139668795)
+ 3870 train 4.295900 (lr=4.0656e-04) (hash(x)=155839599)
+ 3880 train 4.365773 (lr=4.0557e-04) (hash(x)=150635541)
+ 3890 train 4.353963 (lr=4.0459e-04) (hash(x)=153702524)
+ 3900 val loss 4.2767
+ 3900 val perplexity 72.0022
+ 3900 train 4.179128 (lr=4.0360e-04) (hash(x)=153141007)
+ 3910 train 4.262108 (lr=4.0261e-04) (hash(x)=145483115)
+ 3920 train 4.056787 (lr=4.0162e-04) (hash(x)=147480523)
+ 3930 train 4.230914 (lr=4.0063e-04) (hash(x)=146229467)
+ 3940 train 4.207361 (lr=3.9963e-04) (hash(x)=151376187)
+ 3950 train 4.179211 (lr=3.9864e-04) (hash(x)=153745186)
+ 3960 train 4.150270 (lr=3.9764e-04) (hash(x)=147595615)
+ 3970 train 4.275993 (lr=3.9665e-04) (hash(x)=157979848)
+ 3980 train 4.144835 (lr=3.9565e-04) (hash(x)=153714091)
+ 3990 train 4.175827 (lr=3.9465e-04) (hash(x)=155637629)
+ 4000 val loss 4.2712
+ 4000 val perplexity 71.6050
+ 4000 train 4.201859 (lr=3.9365e-04) (hash(x)=160577202)
+ 4010 train 4.116205 (lr=3.9266e-04) (hash(x)=147432640)
+ 4020 train 4.075618 (lr=3.9165e-04) (hash(x)=135542902)
+ 4030 train 4.209609 (lr=3.9065e-04) (hash(x)=143137909)
+ 4040 train 4.202774 (lr=3.8965e-04) (hash(x)=148269908)
+ 4050 train 4.087222 (lr=3.8865e-04) (hash(x)=141954182)
+ 4060 train 4.169354 (lr=3.8764e-04) (hash(x)=157035179)
+ 4070 train 3.901249 (lr=3.8664e-04) (hash(x)=151361423)
+ 4080 train 4.023126 (lr=3.8563e-04) (hash(x)=144373988)
+ 4090 train 4.241362 (lr=3.8463e-04) (hash(x)=156461428)
+ 4100 val loss 4.2644
+ 4100 val perplexity 71.1221
+ 4100 train 4.343929 (lr=3.8362e-04) (hash(x)=153858169)
+ 4110 train 4.315270 (lr=3.8261e-04) (hash(x)=165506959)
+ 4120 train 4.319119 (lr=3.8160e-04) (hash(x)=158709009)
+ 4130 train 4.398439 (lr=3.8059e-04) (hash(x)=151836522)
+ 4140 train 4.161856 (lr=3.7958e-04) (hash(x)=152917389)
+ 4150 train 4.210320 (lr=3.7857e-04) (hash(x)=146973868)
+ 4160 train 4.311877 (lr=3.7756e-04) (hash(x)=157875887)
+ 4170 train 4.578331 (lr=3.7655e-04) (hash(x)=167933111)
+ 4180 train 4.178298 (lr=3.7553e-04) (hash(x)=161938168)
+ 4190 train 4.148326 (lr=3.7452e-04) (hash(x)=158102630)
+ 4200 val loss 4.2498
+ 4200 val perplexity 70.0920
+ 4200 train 4.269673 (lr=3.7351e-04) (hash(x)=155889149)
+ 4210 train 4.349800 (lr=3.7249e-04) (hash(x)=131046288)
+ 4220 train 4.115254 (lr=3.7148e-04) (hash(x)=149423408)
+ 4230 train 4.205424 (lr=3.7046e-04) (hash(x)=150033580)
+ 4240 train 4.096198 (lr=3.6944e-04) (hash(x)=137509644)
+ 4250 train 4.267327 (lr=3.6843e-04) (hash(x)=161159362)
+ 4260 train 4.173113 (lr=3.6741e-04) (hash(x)=148117355)
+ 4270 train 4.100213 (lr=3.6639e-04) (hash(x)=147315384)
+ 4280 train 4.166073 (lr=3.6537e-04) (hash(x)=156577316)
+ 4290 train 4.342910 (lr=3.6435e-04) (hash(x)=161553761)
+ 4300 val loss 4.2421
+ 4300 val perplexity 69.5559
+ 4300 train 4.006040 (lr=3.6333e-04) (hash(x)=152294662)
+ 4310 train 4.035946 (lr=3.6231e-04) (hash(x)=144616611)
+ 4320 train 4.183397 (lr=3.6129e-04) (hash(x)=154134591)
+ 4330 train 4.045680 (lr=3.6027e-04) (hash(x)=159947834)
+ 4340 train 4.027687 (lr=3.5925e-04) (hash(x)=156261313)
+ 4350 train 4.147388 (lr=3.5822e-04) (hash(x)=141245643)
+ 4360 train 4.113534 (lr=3.5720e-04) (hash(x)=157291204)
+ 4370 train 4.145621 (lr=3.5618e-04) (hash(x)=142877676)
+ 4380 train 4.186522 (lr=3.5515e-04) (hash(x)=155174402)
+ 4390 train 4.036535 (lr=3.5413e-04) (hash(x)=154675451)
+ 4400 val loss 4.2396
+ 4400 val perplexity 69.3832
+ 4400 train 4.203861 (lr=3.5311e-04) (hash(x)=141804386)
+ 4410 train 4.426002 (lr=3.5208e-04) (hash(x)=163930619)
+ 4420 train 4.298167 (lr=3.5106e-04) (hash(x)=139437472)
+ 4430 train 4.234465 (lr=3.5003e-04) (hash(x)=142474831)
+ 4440 train 4.258693 (lr=3.4901e-04) (hash(x)=150102428)
+ 4450 train 4.227980 (lr=3.4798e-04) (hash(x)=135805460)
+ 4460 train 4.189848 (lr=3.4695e-04) (hash(x)=154732100)
+ 4470 train 4.195867 (lr=3.4593e-04) (hash(x)=148554435)
+ 4480 train 4.133001 (lr=3.4490e-04) (hash(x)=142410065)
+ 4490 train 4.155505 (lr=3.4387e-04) (hash(x)=148108338)
+ 4500 val loss 4.2217
+ 4500 val perplexity 68.1509
+ 4500 train 4.071621 (lr=3.4285e-04) (hash(x)=151095242)
+ 4510 train 4.075661 (lr=3.4182e-04) (hash(x)=154911617)
+ 4520 train 4.115407 (lr=3.4079e-04) (hash(x)=150858662)
+ 4530 train 4.061187 (lr=3.3977e-04) (hash(x)=146850830)
+ 4540 train 4.118964 (lr=3.3874e-04) (hash(x)=153506103)
+ 4550 train 4.134123 (lr=3.3771e-04) (hash(x)=157068400)
+ 4560 train 4.206189 (lr=3.3668e-04) (hash(x)=163640327)
+ 4570 train 4.263094 (lr=3.3565e-04) (hash(x)=158962962)
+ 4580 train 4.125972 (lr=3.3463e-04) (hash(x)=143454481)
+ 4590 train 4.134850 (lr=3.3360e-04) (hash(x)=155380269)
+ 4600 val loss 4.2556
+ 4600 val perplexity 70.4971
+ 4600 train 4.288472 (lr=3.3257e-04) (hash(x)=156414699)
+ 4610 train 4.108760 (lr=3.3154e-04) (hash(x)=153520595)
+ 4620 train 4.129999 (lr=3.3051e-04) (hash(x)=151287061)
+ 4630 train 3.961761 (lr=3.2949e-04) (hash(x)=144972877)
+ 4640 train 4.081348 (lr=3.2846e-04) (hash(x)=152031134)
+ 4650 train 4.220479 (lr=3.2743e-04) (hash(x)=155348609)
+ 4660 train 4.006318 (lr=3.2640e-04) (hash(x)=144855343)
+ 4670 train 4.161096 (lr=3.2537e-04) (hash(x)=159247995)
+ 4680 train 4.053130 (lr=3.2435e-04) (hash(x)=133146878)
+ 4690 train 4.289427 (lr=3.2332e-04) (hash(x)=148797338)
+ 4700 val loss 4.2147
+ 4700 val perplexity 67.6768
+ 4700 train 4.169440 (lr=3.2229e-04) (hash(x)=161556686)
+ 4710 train 4.353711 (lr=3.2126e-04) (hash(x)=142528636)
+ 4720 train 4.279312 (lr=3.2023e-04) (hash(x)=148322603)
+ 4730 train 4.196731 (lr=3.1921e-04) (hash(x)=145051555)
+ 4740 train 4.113234 (lr=3.1818e-04) (hash(x)=146193153)
+ 4750 train 4.310147 (lr=3.1715e-04) (hash(x)=165802167)
+ 4760 train 4.260219 (lr=3.1613e-04) (hash(x)=158948628)
+ 4770 train 4.301055 (lr=3.1510e-04) (hash(x)=156177788)
+ 4780 train 4.156542 (lr=3.1407e-04) (hash(x)=175446069)
+ 4790 train 4.179591 (lr=3.1305e-04) (hash(x)=141100706)
+ 4800 val loss 4.2115
+ 4800 val perplexity 67.4604
+ 4800 train 4.103304 (lr=3.1202e-04) (hash(x)=149000293)
+ 4810 train 4.028175 (lr=3.1099e-04) (hash(x)=135891778)
+ 4820 train 4.081861 (lr=3.0997e-04) (hash(x)=145474733)
+ 4830 train 4.141610 (lr=3.0894e-04) (hash(x)=145187742)
+ 4840 train 4.190108 (lr=3.0792e-04) (hash(x)=157685237)
+ 4850 train 4.026490 (lr=3.0689e-04) (hash(x)=128599506)
+ 4860 train 4.117213 (lr=3.0587e-04) (hash(x)=142067051)
+ 4870 train 4.065267 (lr=3.0485e-04) (hash(x)=156215711)
+ 4880 train 4.259776 (lr=3.0382e-04) (hash(x)=150456895)
+ 4890 train 3.884215 (lr=3.0280e-04) (hash(x)=141647202)
+ 4900 val loss 4.2000
+ 4900 val perplexity 66.6849
+ 4900 train 3.945271 (lr=3.0178e-04) (hash(x)=154349989)
+ 4910 train 3.981321 (lr=3.0075e-04) (hash(x)=151563396)
+ 4920 train 3.994511 (lr=2.9973e-04) (hash(x)=138868314)
+ 4930 train 4.139812 (lr=2.9871e-04) (hash(x)=148533460)
+ 4940 train 4.129040 (lr=2.9769e-04) (hash(x)=144892493)
+ 4950 train 4.150645 (lr=2.9667e-04) (hash(x)=170326431)
+ 4960 train 4.149920 (lr=2.9565e-04) (hash(x)=163312680)
+ 4970 train 4.111387 (lr=2.9463e-04) (hash(x)=184352734)
+ 4980 train 4.187467 (lr=2.9361e-04) (hash(x)=140363733)
+ 4990 train 4.085557 (lr=2.9259e-04) (hash(x)=147794873)
+ 5000 val loss 4.1943
+ 5000 val perplexity 66.3055
+ 5000 train 4.177243 (lr=2.9157e-04) (hash(x)=131475967)
+ 5010 train 4.281310 (lr=2.9056e-04) (hash(x)=139560000)
+ 5020 train 4.186268 (lr=2.8954e-04) (hash(x)=160128701)
+ 5030 train 4.298525 (lr=2.8852e-04) (hash(x)=154698531)
+ 5040 train 4.261716 (lr=2.8751e-04) (hash(x)=153833791)
+ 5050 train 4.180938 (lr=2.8649e-04) (hash(x)=145953388)
+ 5060 train 4.522370 (lr=2.8548e-04) (hash(x)=134297881)
+ 5070 train 4.186088 (lr=2.8447e-04) (hash(x)=159987550)
+ 5080 train 4.177457 (lr=2.8345e-04) (hash(x)=147699302)
+ 5090 train 3.901203 (lr=2.8244e-04) (hash(x)=140534876)
+ 5100 val loss 4.1898
+ 5100 val perplexity 66.0123
+ 5100 train 4.105705 (lr=2.8143e-04) (hash(x)=149717902)
+ 5110 train 4.023517 (lr=2.8042e-04) (hash(x)=147648148)
+ 5120 train 4.042538 (lr=2.7941e-04) (hash(x)=170962791)
+ 5130 train 4.088052 (lr=2.7840e-04) (hash(x)=139633288)
+ 5140 train 4.256811 (lr=2.7739e-04) (hash(x)=160163221)
+ 5150 train 4.087533 (lr=2.7638e-04) (hash(x)=146625393)
+ 5160 train 4.150156 (lr=2.7537e-04) (hash(x)=147399092)
+ 5170 train 3.991426 (lr=2.7437e-04) (hash(x)=147172792)
+ 5180 train 4.097240 (lr=2.7336e-04) (hash(x)=140600568)
+ 5190 train 4.018775 (lr=2.7236e-04) (hash(x)=152423962)
+ 5200 val loss 4.1866
+ 5200 val perplexity 65.8001
+ 5200 train 4.127081 (lr=2.7135e-04) (hash(x)=151407999)
+ 5210 train 4.075913 (lr=2.7035e-04) (hash(x)=154874903)
+ 5220 train 3.890605 (lr=2.6935e-04) (hash(x)=154133697)
+ 5230 train 4.077150 (lr=2.6835e-04) (hash(x)=148148721)
+ 5240 train 4.055752 (lr=2.6734e-04) (hash(x)=149681665)
+ 5250 train 4.022038 (lr=2.6635e-04) (hash(x)=150136904)
+ 5260 train 4.137352 (lr=2.6535e-04) (hash(x)=149261170)
+ 5270 train 4.107561 (lr=2.6435e-04) (hash(x)=155540595)
+ 5280 train 4.124571 (lr=2.6335e-04) (hash(x)=141973714)
+ 5290 train 4.570252 (lr=2.6236e-04) (hash(x)=146827439)
+ 5300 val loss 4.1745
+ 5300 val perplexity 65.0058
+ 5300 train 4.140273 (lr=2.6136e-04) (hash(x)=168602728)
+ 5310 train 4.119944 (lr=2.6037e-04) (hash(x)=144035757)
+ 5320 train 4.179716 (lr=2.5937e-04) (hash(x)=143287560)
+ 5330 train 4.110137 (lr=2.5838e-04) (hash(x)=142461814)
+ 5340 train 4.154832 (lr=2.5739e-04) (hash(x)=149905536)
+ 5350 train 4.251827 (lr=2.5640e-04) (hash(x)=134597061)
+ 5360 train 4.276795 (lr=2.5541e-04) (hash(x)=156520228)
+ 5370 train 4.185418 (lr=2.5443e-04) (hash(x)=152690323)
+ 5380 train 4.125360 (lr=2.5344e-04) (hash(x)=140092622)
+ 5390 train 4.088529 (lr=2.5245e-04) (hash(x)=153504017)
+ 5400 val loss 4.1616
+ 5400 val perplexity 64.1746
+ 5400 train 4.112967 (lr=2.5147e-04) (hash(x)=158344511)
+ 5410 train 4.042314 (lr=2.5049e-04) (hash(x)=149681960)
+ 5420 train 4.093867 (lr=2.4950e-04) (hash(x)=143459968)
+ 5430 train 4.008551 (lr=2.4852e-04) (hash(x)=134759020)
+ 5440 train 3.986306 (lr=2.4754e-04) (hash(x)=164760471)
+ 5450 train 4.039769 (lr=2.4657e-04) (hash(x)=148551310)
+ 5460 train 4.175451 (lr=2.4559e-04) (hash(x)=155464239)
+ 5470 train 3.987691 (lr=2.4461e-04) (hash(x)=147309485)
+ 5480 train 3.940682 (lr=2.4364e-04) (hash(x)=163937590)
+ 5490 train 4.226748 (lr=2.4266e-04) (hash(x)=142830147)
+ 5500 val loss 4.1583
+ 5500 val perplexity 63.9600
+ 5500 train 4.054812 (lr=2.4169e-04) (hash(x)=148350057)
+ 5510 train 4.028842 (lr=2.4072e-04) (hash(x)=149007838)
+ 5520 train 3.997600 (lr=2.3975e-04) (hash(x)=158176239)
+ 5530 train 4.104340 (lr=2.3878e-04) (hash(x)=156395740)
+ 5540 train 4.270691 (lr=2.3782e-04) (hash(x)=152453211)
+ 5550 train 3.986491 (lr=2.3685e-04) (hash(x)=144447218)
+ 5560 train 4.157467 (lr=2.3589e-04) (hash(x)=153858804)
+ 5570 train 3.988536 (lr=2.3492e-04) (hash(x)=133929681)
+ 5580 train 4.237970 (lr=2.3396e-04) (hash(x)=159170988)
+ 5590 train 4.175734 (lr=2.3300e-04) (hash(x)=144978886)
+ 5600 val loss 4.1527
+ 5600 val perplexity 63.6048
+ 5600 train 4.194169 (lr=2.3204e-04) (hash(x)=153847323)
+ 5610 train 4.143672 (lr=2.3108e-04) (hash(x)=145553636)
+ 5620 train 5.290322 (lr=2.3013e-04) (hash(x)=153712417)
+ 5630 train 4.174591 (lr=2.2917e-04) (hash(x)=148397520)
+ 5640 train 4.117166 (lr=2.2822e-04) (hash(x)=149424351)
+ 5650 train 4.210706 (lr=2.2727e-04) (hash(x)=140531069)
+ 5660 train 4.148504 (lr=2.2632e-04) (hash(x)=144904009)
+ 5670 train 3.999589 (lr=2.2537e-04) (hash(x)=162469666)
+ 5680 train 4.124962 (lr=2.2442e-04) (hash(x)=149254569)
+ 5690 train 4.041796 (lr=2.2348e-04) (hash(x)=146949494)
+ 5700 val loss 4.1440
+ 5700 val perplexity 63.0532
+ 5700 train 4.104294 (lr=2.2253e-04) (hash(x)=156607405)
+ 5710 train 3.951504 (lr=2.2159e-04) (hash(x)=146349425)
+ 5720 train 4.117228 (lr=2.2065e-04) (hash(x)=160347239)
+ 5730 train 4.053485 (lr=2.1971e-04) (hash(x)=153785045)
+ 5740 train 4.068789 (lr=2.1877e-04) (hash(x)=135967367)
+ 5750 train 4.032428 (lr=2.1784e-04) (hash(x)=140181987)
+ 5760 train 4.184039 (lr=2.1690e-04) (hash(x)=153217075)
+ 5770 train 4.046869 (lr=2.1597e-04) (hash(x)=146360100)
+ 5780 train 4.070055 (lr=2.1504e-04) (hash(x)=161111351)
+ 5790 train 4.006124 (lr=2.1411e-04) (hash(x)=147891055)
+ 5800 val loss 4.1410
+ 5800 val perplexity 62.8682
+ 5800 train 3.884526 (lr=2.1318e-04) (hash(x)=145115031)
+ 5810 train 3.921891 (lr=2.1226e-04) (hash(x)=153661465)
+ 5820 train 4.016182 (lr=2.1133e-04) (hash(x)=148264581)
+ 5830 train 4.116851 (lr=2.1041e-04) (hash(x)=171144748)
+ 5840 train 4.046432 (lr=2.0949e-04) (hash(x)=157863238)
+ 5850 train 3.985251 (lr=2.0857e-04) (hash(x)=144669655)
+ 5860 train 3.898940 (lr=2.0765e-04) (hash(x)=154948909)
+ 5870 train 4.066360 (lr=2.0674e-04) (hash(x)=158981837)
+ 5880 train 4.108955 (lr=2.0582e-04) (hash(x)=151102851)
+ 5890 train 4.306065 (lr=2.0491e-04) (hash(x)=153490628)
+ 5900 val loss 4.1310
+ 5900 val perplexity 62.2424
+ 5900 train 4.096318 (lr=2.0400e-04) (hash(x)=141584622)
+ 5910 train 3.982910 (lr=2.0309e-04) (hash(x)=142896324)
+ 5920 train 4.269152 (lr=2.0218e-04) (hash(x)=148264764)
+ 5930 train 4.129268 (lr=2.0128e-04) (hash(x)=153157848)
+ 5940 train 4.105019 (lr=2.0038e-04) (hash(x)=151086429)
+ 5950 train 4.079765 (lr=1.9948e-04) (hash(x)=146009598)
+ 5960 train 4.190752 (lr=1.9858e-04) (hash(x)=149488374)
+ 5970 train 4.024580 (lr=1.9768e-04) (hash(x)=149307478)
+ 5980 train 3.795582 (lr=1.9678e-04) (hash(x)=156742339)
+ 5990 train 4.059119 (lr=1.9589e-04) (hash(x)=164296391)
+ 6000 val loss 4.1255
+ 6000 val perplexity 61.8984
+ 6000 train 4.154809 (lr=1.9500e-04) (hash(x)=146613857)
+ 6010 train 4.030589 (lr=1.9411e-04) (hash(x)=149742104)
+ 6020 train 3.977369 (lr=1.9322e-04) (hash(x)=145645994)
+ 6030 train 4.059315 (lr=1.9234e-04) (hash(x)=156324150)
+ 6040 train 4.231585 (lr=1.9145e-04) (hash(x)=154655300)
+ 6050 train 4.094217 (lr=1.9057e-04) (hash(x)=153714860)
+ 6060 train 4.046013 (lr=1.8969e-04) (hash(x)=139981556)
+ 6070 train 3.881154 (lr=1.8882e-04) (hash(x)=158753458)
+ 6080 train 4.043500 (lr=1.8794e-04) (hash(x)=152559930)
+ 6090 train 4.016532 (lr=1.8707e-04) (hash(x)=137128715)
+ 6100 val loss 4.1262
+ 6100 val perplexity 61.9437
+ 6100 train 4.030574 (lr=1.8620e-04) (hash(x)=144621768)
+ 6110 train 4.032393 (lr=1.8533e-04) (hash(x)=155614333)
+ 6120 train 4.017185 (lr=1.8446e-04) (hash(x)=145682343)
+ 6130 train 3.926471 (lr=1.8359e-04) (hash(x)=142909600)
+ 6140 train 3.959304 (lr=1.8273e-04) (hash(x)=152993494)
+ 6150 train 3.844943 (lr=1.8187e-04) (hash(x)=157151527)
+ 6160 train 3.993061 (lr=1.8101e-04) (hash(x)=150653611)
+ 6170 train 4.244534 (lr=1.8016e-04) (hash(x)=165237934)
+ 6180 train 4.094344 (lr=1.7930e-04) (hash(x)=155730197)
+ 6190 train 4.112364 (lr=1.7845e-04) (hash(x)=149687169)
+ 6200 val loss 4.1250
+ 6200 val perplexity 61.8648
+ 6200 train 4.106633 (lr=1.7760e-04) (hash(x)=146521760)
+ 6210 train 4.114485 (lr=1.7675e-04) (hash(x)=141008090)
+ 6220 train 4.076965 (lr=1.7591e-04) (hash(x)=143407095)
+ 6230 train 4.213218 (lr=1.7506e-04) (hash(x)=152947604)
+ 6240 train 4.522073 (lr=1.7422e-04) (hash(x)=153876740)
+ 6250 train 4.159676 (lr=1.7338e-04) (hash(x)=144250615)
+ 6260 train 4.153634 (lr=1.7255e-04) (hash(x)=155236959)
+ 6270 train 4.127711 (lr=1.7171e-04) (hash(x)=139785369)
+ 6280 train 4.057211 (lr=1.7088e-04) (hash(x)=165036565)
+ 6290 train 4.078882 (lr=1.7005e-04) (hash(x)=144585028)
+ 6300 val loss 4.1120
+ 6300 val perplexity 61.0669
+ 6300 train 3.931736 (lr=1.6923e-04) (hash(x)=161378136)
+ 6310 train 3.981984 (lr=1.6840e-04) (hash(x)=148483421)
+ 6320 train 4.031734 (lr=1.6758e-04) (hash(x)=149835040)
+ 6330 train 3.944311 (lr=1.6676e-04) (hash(x)=149459414)
+ 6340 train 3.975904 (lr=1.6594e-04) (hash(x)=154290067)
+ 6350 train 4.075600 (lr=1.6513e-04) (hash(x)=147407391)
+ 6360 train 4.010707 (lr=1.6431e-04) (hash(x)=146317149)
+ 6370 train 3.914214 (lr=1.6350e-04) (hash(x)=141970628)
+ 6380 train 3.949024 (lr=1.6269e-04) (hash(x)=139170535)
+ 6390 train 4.145844 (lr=1.6189e-04) (hash(x)=148905963)
+ 6400 val loss 4.1111
+ 6400 val perplexity 61.0108
+ 6400 train 3.937335 (lr=1.6108e-04) (hash(x)=141624235)
+ 6410 train 3.942767 (lr=1.6028e-04) (hash(x)=150520968)
+ 6420 train 3.830738 (lr=1.5948e-04) (hash(x)=155019129)
+ 6430 train 3.888057 (lr=1.5869e-04) (hash(x)=150031836)
+ 6440 train 3.857171 (lr=1.5790e-04) (hash(x)=112835661)
+ 6450 train 3.855033 (lr=1.5710e-04) (hash(x)=141072709)
+ 6460 train 3.998727 (lr=1.5632e-04) (hash(x)=153933796)
+ 6470 train 4.050470 (lr=1.5553e-04) (hash(x)=153315715)
+ 6480 train 4.126259 (lr=1.5475e-04) (hash(x)=158089228)
+ 6490 train 3.978586 (lr=1.5397e-04) (hash(x)=149471788)
+ 6500 val loss 4.1078
+ 6500 val perplexity 60.8111
+ 6500 train 4.108046 (lr=1.5319e-04) (hash(x)=151197095)
+ 6510 train 4.070200 (lr=1.5241e-04) (hash(x)=165554266)
+ 6520 train 4.232692 (lr=1.5164e-04) (hash(x)=157822242)
+ 6530 train 4.166207 (lr=1.5087e-04) (hash(x)=156267861)
+ 6540 train 4.076926 (lr=1.5010e-04) (hash(x)=145825803)
+ 6550 train 4.155082 (lr=1.4933e-04) (hash(x)=151303683)
+ 6560 train 4.217830 (lr=1.4857e-04) (hash(x)=150786942)
+ 6570 train 4.034340 (lr=1.4781e-04) (hash(x)=146237093)
+ 6580 train 3.999360 (lr=1.4705e-04) (hash(x)=147063866)
+ 6590 train 4.233628 (lr=1.4630e-04) (hash(x)=130513396)
+ 6600 val loss 4.1058
+ 6600 val perplexity 60.6909
+ 6600 train 4.125781 (lr=1.4555e-04) (hash(x)=153269571)
+ 6610 train 4.198629 (lr=1.4480e-04) (hash(x)=150340530)
+ 6620 train 4.071691 (lr=1.4405e-04) (hash(x)=153843616)
+ 6630 train 4.143201 (lr=1.4331e-04) (hash(x)=144540858)
+ 6640 train 3.941280 (lr=1.4257e-04) (hash(x)=153330434)
+ 6650 train 3.955559 (lr=1.4183e-04) (hash(x)=146851492)
+ 6660 train 3.974411 (lr=1.4109e-04) (hash(x)=153601788)
+ 6670 train 4.007741 (lr=1.4036e-04) (hash(x)=140005742)
+ 6680 train 3.877739 (lr=1.3963e-04) (hash(x)=154102392)
+ 6690 train 3.874567 (lr=1.3890e-04) (hash(x)=152774975)
+ 6700 val loss 4.1016
+ 6700 val perplexity 60.4390
+ 6700 train 4.074510 (lr=1.3817e-04) (hash(x)=146111181)
+ 6710 train 4.026546 (lr=1.3745e-04) (hash(x)=143988017)
+ 6720 train 4.016078 (lr=1.3673e-04) (hash(x)=146073959)
+ 6730 train 4.267421 (lr=1.3602e-04) (hash(x)=152642956)
+ 6740 train 4.157714 (lr=1.3530e-04) (hash(x)=146964363)
+ 6750 train 3.954356 (lr=1.3459e-04) (hash(x)=162251871)
+ 6760 train 4.301650 (lr=1.3388e-04) (hash(x)=171122166)
+ 6770 train 4.080363 (lr=1.3318e-04) (hash(x)=135199617)
+ 6780 train 4.048437 (lr=1.3248e-04) (hash(x)=160480410)
+ 6790 train 4.193917 (lr=1.3178e-04) (hash(x)=151933948)
+ 6800 val loss 4.0881
+ 6800 val perplexity 59.6239
+ 6800 train 4.094066 (lr=1.3108e-04) (hash(x)=147269760)
+ 6810 train 4.161630 (lr=1.3039e-04) (hash(x)=156710316)
+ 6820 train 4.175751 (lr=1.2970e-04) (hash(x)=157300754)
+ 6830 train 3.989247 (lr=1.2901e-04) (hash(x)=133838057)
+ 6840 train 4.171911 (lr=1.2832e-04) (hash(x)=143518263)
+ 6850 train 4.247186 (lr=1.2764e-04) (hash(x)=160328446)
+ 6860 train 4.085854 (lr=1.2696e-04) (hash(x)=142143427)
+ 6870 train 4.144669 (lr=1.2628e-04) (hash(x)=149045075)
+ 6880 train 4.058600 (lr=1.2561e-04) (hash(x)=150708270)
+ 6890 train 4.058171 (lr=1.2494e-04) (hash(x)=153066841)
+ 6900 val loss 4.0867
+ 6900 val perplexity 59.5415
+ 6900 train 4.168222 (lr=1.2427e-04) (hash(x)=152912762)
+ 6910 train 4.011430 (lr=1.2361e-04) (hash(x)=148346240)
+ 6920 train 4.135716 (lr=1.2295e-04) (hash(x)=152806752)
+ 6930 train 4.104208 (lr=1.2229e-04) (hash(x)=161831829)
+ 6940 train 4.017661 (lr=1.2163e-04) (hash(x)=149483673)
+ 6950 train 4.001894 (lr=1.2098e-04) (hash(x)=119634555)
+ 6960 train 3.924197 (lr=1.2033e-04) (hash(x)=144709540)
+ 6970 train 3.955101 (lr=1.1969e-04) (hash(x)=141277017)
+ 6980 train 4.011095 (lr=1.1904e-04) (hash(x)=140618792)
+ 6990 train 3.988875 (lr=1.1840e-04) (hash(x)=157443505)
+ 7000 val loss 4.0861
+ 7000 val perplexity 59.5067
+ 7000 train 4.222891 (lr=1.1777e-04) (hash(x)=165412343)
+ 7010 train 3.872253 (lr=1.1713e-04) (hash(x)=140746035)
+ 7020 train 3.918733 (lr=1.1650e-04) (hash(x)=141101046)
+ 7030 train 4.062104 (lr=1.1587e-04) (hash(x)=152098342)
+ 7040 train 3.916734 (lr=1.1525e-04) (hash(x)=169244968)
+ 7050 train 4.041646 (lr=1.1463e-04) (hash(x)=149886680)
+ 7060 train 4.208581 (lr=1.1401e-04) (hash(x)=144670487)
+ 7070 train 4.230530 (lr=1.1339e-04) (hash(x)=148774474)
+ 7080 train 4.061625 (lr=1.1278e-04) (hash(x)=155096286)
+ 7090 train 4.037445 (lr=1.1217e-04) (hash(x)=145048246)
+ 7100 val loss 4.0724
+ 7100 val perplexity 58.7005
+ 7100 train 4.182160 (lr=1.1157e-04) (hash(x)=162866028)
+ 7110 train 4.045501 (lr=1.1096e-04) (hash(x)=162308558)
+ 7120 train 4.019279 (lr=1.1036e-04) (hash(x)=153468309)
+ 7130 train 4.038635 (lr=1.0977e-04) (hash(x)=144522880)
+ 7140 train 4.189165 (lr=1.0917e-04) (hash(x)=153055749)
+ 7150 train 4.003743 (lr=1.0858e-04) (hash(x)=171163513)
+ 7160 train 4.081718 (lr=1.0800e-04) (hash(x)=138563864)
+ 7170 train 4.215020 (lr=1.0741e-04) (hash(x)=166996657)
+ 7180 train 3.910931 (lr=1.0683e-04) (hash(x)=154809376)
+ 7190 train 3.989370 (lr=1.0626e-04) (hash(x)=147988525)
+ 7200 val loss 4.0713
+ 7200 val perplexity 58.6325
+ 7200 train 4.039526 (lr=1.0568e-04) (hash(x)=142998115)
+ 7210 train 4.027789 (lr=1.0511e-04) (hash(x)=145518575)
+ 7220 train 4.053323 (lr=1.0454e-04) (hash(x)=145758781)
+ 7230 train 4.155663 (lr=1.0398e-04) (hash(x)=148909637)
+ 7240 train 4.015603 (lr=1.0342e-04) (hash(x)=144422691)
+ 7250 train 3.865092 (lr=1.0286e-04) (hash(x)=140396153)
+ 7260 train 4.101425 (lr=1.0231e-04) (hash(x)=153619124)
+ 7270 train 4.005348 (lr=1.0176e-04) (hash(x)=161980521)
+ 7280 train 3.834925 (lr=1.0121e-04) (hash(x)=135160527)
+ 7290 train 3.962374 (lr=1.0066e-04) (hash(x)=167156181)
+ 7300 val loss 4.0717
+ 7300 val perplexity 58.6537
+ 7300 train 3.901530 (lr=1.0012e-04) (hash(x)=145486999)
+ 7310 train 3.977693 (lr=9.9586e-05) (hash(x)=155013351)
+ 7320 train 4.023978 (lr=9.9052e-05) (hash(x)=169616991)
+ 7330 train 3.901934 (lr=9.8521e-05) (hash(x)=146516856)
+ 7340 train 4.229925 (lr=9.7993e-05) (hash(x)=171918417)
+ 7350 train 4.244969 (lr=9.7469e-05) (hash(x)=160511891)
+ 7360 train 4.087421 (lr=9.6948e-05) (hash(x)=150280167)
+ 7370 train 4.164104 (lr=9.6431e-05) (hash(x)=151373787)
+ 7380 train 4.267698 (lr=9.5917e-05) (hash(x)=163311616)
+ 7390 train 4.250998 (lr=9.5406e-05) (hash(x)=164418521)
+ 7400 val loss 4.0614
+ 7400 val perplexity 58.0563
+ 7400 train 4.013670 (lr=9.4899e-05) (hash(x)=155325873)
+ 7410 train 4.068769 (lr=9.4395e-05) (hash(x)=135020608)
+ 7420 train 3.958039 (lr=9.3894e-05) (hash(x)=142812793)
+ 7430 train 4.070299 (lr=9.3397e-05) (hash(x)=149463283)
+ 7440 train 4.032311 (lr=9.2904e-05) (hash(x)=170512352)
+ 7450 train 4.000636 (lr=9.2413e-05) (hash(x)=146253806)
+ 7460 train 4.657423 (lr=9.1927e-05) (hash(x)=133479853)
+ 7470 train 4.010655 (lr=9.1443e-05) (hash(x)=138198949)
+ 7480 train 3.998845 (lr=9.0964e-05) (hash(x)=143831431)
+ 7490 train 4.159807 (lr=9.0487e-05) (hash(x)=148147144)
+ 7500 val loss 4.0613
+ 7500 val perplexity 58.0501
+ 7500 train 4.057199 (lr=9.0014e-05) (hash(x)=145131256)
+ 7510 train 4.096821 (lr=8.9545e-05) (hash(x)=150932291)
+ 7520 train 4.058319 (lr=8.9079e-05) (hash(x)=147605934)
+ 7530 train 4.011611 (lr=8.8617e-05) (hash(x)=154343507)
+ 7540 train 4.150372 (lr=8.8158e-05) (hash(x)=148440064)
+ 7550 train 3.961226 (lr=8.7702e-05) (hash(x)=140505990)
+ 7560 train 3.936891 (lr=8.7251e-05) (hash(x)=177438878)
+ 7570 train 4.017305 (lr=8.6802e-05) (hash(x)=154527960)
+ 7580 train 4.104770 (lr=8.6357e-05) (hash(x)=153793091)
+ 7590 train 3.928969 (lr=8.5916e-05) (hash(x)=163097232)
+ 7600 val loss 4.0677
+ 7600 val perplexity 58.4208
+ 7600 train 4.060457 (lr=8.5478e-05) (hash(x)=144008365)
+ 7610 train 4.048656 (lr=8.5044e-05) (hash(x)=139649886)
+ 7620 train 3.993801 (lr=8.4613e-05) (hash(x)=145452123)
+ 7630 train 4.018026 (lr=8.4186e-05) (hash(x)=132477285)
+ 7640 train 3.926105 (lr=8.3763e-05) (hash(x)=139030720)
+ 7650 train 4.058655 (lr=8.3343e-05) (hash(x)=157466085)
+ 7660 train 4.067036 (lr=8.2926e-05) (hash(x)=153638385)
+ 7670 train 4.124471 (lr=8.2514e-05) (hash(x)=154823094)
+ 7680 train 4.028829 (lr=8.2104e-05) (hash(x)=153107930)
+ 7690 train 4.012093 (lr=8.1699e-05) (hash(x)=153681916)
+ 7700 val loss 4.0533
+ 7700 val perplexity 57.5871
+ 7700 train 4.325125 (lr=8.1297e-05) (hash(x)=148848532)
+ 7710 train 4.049396 (lr=8.0898e-05) (hash(x)=156640791)
+ 7720 train 4.291998 (lr=8.0503e-05) (hash(x)=152573035)
+ 7730 train 4.129068 (lr=8.0112e-05) (hash(x)=147536091)
+ 7740 train 4.289264 (lr=7.9725e-05) (hash(x)=175687483)
+ 7750 train 4.039374 (lr=7.9341e-05) (hash(x)=143775898)
+ 7760 train 4.064744 (lr=7.8960e-05) (hash(x)=146986193)
+ 7770 train 3.973891 (lr=7.8584e-05) (hash(x)=160924293)
+ 7780 train 4.018727 (lr=7.8211e-05) (hash(x)=132579169)
+ 7790 train 4.244557 (lr=7.7841e-05) (hash(x)=147474225)
+ 7800 val loss 4.0525
+ 7800 val perplexity 57.5394
+ 7800 train 4.086012 (lr=7.7476e-05) (hash(x)=150391642)
+ 7810 train 4.254696 (lr=7.7114e-05) (hash(x)=156984481)
+ 7820 train 3.973237 (lr=7.6755e-05) (hash(x)=139652488)
+ 7830 train 3.994135 (lr=7.6400e-05) (hash(x)=169033643)
+ 7840 train 3.966312 (lr=7.6049e-05) (hash(x)=152645857)
+ 7850 train 4.117519 (lr=7.5702e-05) (hash(x)=150228075)
+ 7860 train 4.032240 (lr=7.5358e-05) (hash(x)=178788133)
+ 7870 train 4.018313 (lr=7.5018e-05) (hash(x)=149891068)
+ 7880 train 3.961585 (lr=7.4682e-05) (hash(x)=157461488)
+ 7890 train 3.927008 (lr=7.4350e-05) (hash(x)=142502446)
+ 7900 val loss 4.0535
+ 7900 val perplexity 57.5997
+ 7900 train 3.859741 (lr=7.4021e-05) (hash(x)=152191414)
+ 7910 train 4.037789 (lr=7.3696e-05) (hash(x)=151329539)
+ 7920 train 3.916925 (lr=7.3374e-05) (hash(x)=134143115)
+ 7930 train 3.921423 (lr=7.3056e-05) (hash(x)=155536160)
+ 7940 train 3.993058 (lr=7.2742e-05) (hash(x)=146709034)
+ 7950 train 4.072831 (lr=7.2432e-05) (hash(x)=141966330)
+ 7960 train 4.045867 (lr=7.2126e-05) (hash(x)=152102296)
+ 7970 train 4.215632 (lr=7.1823e-05) (hash(x)=150272684)
+ 7980 train 4.195538 (lr=7.1524e-05) (hash(x)=150251784)
+ 7990 train 4.177956 (lr=7.1228e-05) (hash(x)=151226159)
+ 8000 val loss 4.0466
+ 8000 val perplexity 57.2035
+ 8000 train 4.068943 (lr=7.0937e-05) (hash(x)=159755587)
+ 8010 train 4.072999 (lr=7.0649e-05) (hash(x)=137607202)
+ 8020 train 4.072789 (lr=7.0365e-05) (hash(x)=149574886)
+ 8030 train 3.981304 (lr=7.0085e-05) (hash(x)=151690927)
+ 8040 train 4.008238 (lr=6.9808e-05) (hash(x)=151181288)
+ 8050 train 4.125662 (lr=6.9536e-05) (hash(x)=150465993)
+ 8060 train 4.140782 (lr=6.9267e-05) (hash(x)=134201020)
+ 8070 train 4.037067 (lr=6.9002e-05) (hash(x)=157696045)
+ 8080 train 4.035491 (lr=6.8740e-05) (hash(x)=135581816)
+ 8090 train 3.966376 (lr=6.8483e-05) (hash(x)=153898278)
+ 8100 val loss 4.0447
+ 8100 val perplexity 57.0941
+ 8100 train 4.095786 (lr=6.8229e-05) (hash(x)=156664468)
+ 8110 train 4.196200 (lr=6.7979e-05) (hash(x)=161572593)
+ 8120 train 3.939740 (lr=6.7733e-05) (hash(x)=154381651)
+ 8130 train 4.201794 (lr=6.7490e-05) (hash(x)=149810514)
+ 8140 train 4.063170 (lr=6.7252e-05) (hash(x)=145085369)
+ 8150 train 3.909049 (lr=6.7017e-05) (hash(x)=149020616)
+ 8160 train 3.791153 (lr=6.6786e-05) (hash(x)=146450644)
+ 8170 train 3.993742 (lr=6.6559e-05) (hash(x)=148094499)
+ 8180 train 3.864647 (lr=6.6335e-05) (hash(x)=140810897)
+ 8190 train 3.968001 (lr=6.6116e-05) (hash(x)=142960309)
+ 8200 val loss 4.0449
+ 8200 val perplexity 57.1079
+ 8200 train 4.004465 (lr=6.5900e-05) (hash(x)=139457379)
+ 8210 train 3.985307 (lr=6.5688e-05) (hash(x)=145575125)
+ 8220 train 3.781904 (lr=6.5480e-05) (hash(x)=152836922)
+ 8230 train 3.868821 (lr=6.5276e-05) (hash(x)=135086706)
+ 8240 train 3.840396 (lr=6.5076e-05) (hash(x)=153448613)
+ 8250 train 3.801072 (lr=6.4879e-05) (hash(x)=138041222)
+ 8260 train 3.861392 (lr=6.4687e-05) (hash(x)=158857968)
+ 8270 train 3.871105 (lr=6.4498e-05) (hash(x)=157265315)
+ 8280 train 3.717079 (lr=6.4313e-05) (hash(x)=136059191)
+ 8290 train 3.791139 (lr=6.4132e-05) (hash(x)=162526712)
+ 8300 val loss 4.0517
+ 8300 val perplexity 57.4950
+ 8300 train 3.868982 (lr=6.3954e-05) (hash(x)=145478564)
+ 8310 train 3.521606 (lr=6.3781e-05) (hash(x)=157367549)
+ 8320 train 3.913667 (lr=6.3612e-05) (hash(x)=156085849)
+ 8330 train 4.191821 (lr=6.3446e-05) (hash(x)=154822719)
+ 8340 train 3.912183 (lr=6.3284e-05) (hash(x)=159688342)
+ 8350 train 4.039035 (lr=6.3126e-05) (hash(x)=144133822)
+ 8360 train 3.984049 (lr=6.2972e-05) (hash(x)=154155471)
+ 8370 train 3.992030 (lr=6.2822e-05) (hash(x)=157077804)
+ 8380 train 3.993416 (lr=6.2676e-05) (hash(x)=147796790)
+ 8390 train 4.002159 (lr=6.2533e-05) (hash(x)=145208254)
+ 8400 val loss 4.0401
+ 8400 val perplexity 56.8332
+ 8400 train 3.949569 (lr=6.2395e-05) (hash(x)=154982769)
+ 8410 train 4.153895 (lr=6.2260e-05) (hash(x)=138073867)
+ 8420 train 4.194023 (lr=6.2129e-05) (hash(x)=159521725)
+ 8430 train 4.111905 (lr=6.2002e-05) (hash(x)=149988578)
+ 8440 train 4.083979 (lr=6.1879e-05) (hash(x)=149457063)
+ 8450 train 3.947554 (lr=6.1760e-05) (hash(x)=134678896)
+ 8460 train 3.998334 (lr=6.1645e-05) (hash(x)=146122249)
+ 8470 train 4.021149 (lr=6.1533e-05) (hash(x)=141393319)
+ 8480 train 4.046381 (lr=6.1426e-05) (hash(x)=165396836)
+ 8490 train 4.119053 (lr=6.1322e-05) (hash(x)=140043806)
+ 8500 val loss 4.0371
+ 8500 val perplexity 56.6632
+ 8500 train 4.071108 (lr=6.1223e-05) (hash(x)=145798118)
+ 8510 train 4.018235 (lr=6.1127e-05) (hash(x)=150358299)
+ 8520 train 3.910102 (lr=6.1035e-05) (hash(x)=152572913)
+ 8530 train 3.981836 (lr=6.0947e-05) (hash(x)=148669229)
+ 8540 train 4.019794 (lr=6.0863e-05) (hash(x)=147078347)
+ 8550 train 3.996059 (lr=6.0783e-05) (hash(x)=146715166)
+ 8560 train 3.969763 (lr=6.0706e-05) (hash(x)=143176392)
+ 8570 train 4.090860 (lr=6.0634e-05) (hash(x)=153825484)
+ 8580 train 4.106710 (lr=6.0566e-05) (hash(x)=163379045)
+ 8590 train 3.902104 (lr=6.0501e-05) (hash(x)=140050873)
+ 8600 val loss 4.0393
+ 8600 val perplexity 56.7860
+ 8600 train 3.978757 (lr=6.0440e-05) (hash(x)=143231551)
+ 8610 train 3.838269 (lr=6.0384e-05) (hash(x)=164291360)
+ 8620 train 3.860530 (lr=6.0331e-05) (hash(x)=159122123)
+ 8630 train 3.836739 (lr=6.0282e-05) (hash(x)=140041695)
+ 8640 train 3.860580 (lr=6.0237e-05) (hash(x)=146010790)
+ 8650 train 3.774630 (lr=6.0196e-05) (hash(x)=146882123)
+ 8660 train 3.595366 (lr=6.0159e-05) (hash(x)=144933705)
+ 8670 train 3.822641 (lr=6.0125e-05) (hash(x)=145580075)
+ 8680 train 3.873542 (lr=6.0096e-05) (hash(x)=160982926)
+ 8690 train 3.746416 (lr=6.0070e-05) (hash(x)=140047108)
+ 8700 val loss 4.0492
+ 8700 val perplexity 57.3495
+ 8700 train 3.877881 (lr=6.0049e-05) (hash(x)=154780112)
+ 8710 train 3.961047 (lr=6.0031e-05) (hash(x)=157568560)
+ 8720 train 4.004353 (lr=6.0018e-05) (hash(x)=159587459)
+ 8730 train 4.001848 (lr=6.0008e-05) (hash(x)=152918103)
+ 8740 train 4.002712 (lr=6.0002e-05) (hash(x)=156928586)
+ 8749 val loss 4.0374
+ 8749 val perplexity 56.6812
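The learning rates in this log are consistent with a linear warmup over `warmup_steps` followed by cosine decay to `max_lr / 10` (the warmup and step counts come from args.json; the decay floor is inferred from the logged numbers, not read from the repo's code). The val perplexity column is simply exp(val loss), e.g. exp(4.0374) ≈ 56.68. A sketch that reproduces the logged values under those assumptions:

```python
import math

MAX_LR, WARMUP, MAX_STEPS = 6e-4, 500, 8750  # from args.json
MIN_LR = MAX_LR / 10                         # inferred decay floor (assumption)

def lr_at(step: int) -> float:
    if step < WARMUP:  # linear warmup from ~0 to MAX_LR
        return MAX_LR * (step + 1) / WARMUP
    ratio = (step - WARMUP) / (MAX_STEPS - WARMUP)  # cosine decay to MIN_LR
    return MIN_LR + 0.5 * (MAX_LR - MIN_LR) * (1 + math.cos(math.pi * ratio))

print(f"{lr_at(0):.4e}")     # 1.2000e-06, matching step 0 in the log
print(f"{lr_at(4190):.4e}")  # 3.7452e-04, matching step 4190 in the log
```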
lr6e-4_total_batch_size61440_baseline_seed1340/model_08749.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8e164078064d8faede2e3f19f43d8a8c8ef062dcaf29f983c8d0f7714b1e8c5b
+ size 92843394
lr6e-4_total_batch_size61440_baseline_seed1340/optimizer_08749.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c9286a5b0a808330739591424ae678a613538c8e3d7048be1a4112b396fbb76c
+ size 179406214
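Because the checkpoints are LFS-backed, they can be fetched and loaded through huggingface_hub once the Hub repo is known. A sketch with a hypothetical repo_id (substitute the repo this commit actually belongs to):

```python
import torch
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="andrew-healey/wider_is_better_11",  # hypothetical; use the real repo id
    filename="lr6e-4_total_batch_size61440_baseline_seed1340/model_08749.pt",
)
state = torch.load(path, map_location="cpu")  # ~93 MB blob per the pointer above
```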