andrew-healey committed (verified)
Commit: efae75f · Parent: 2399e01

Upload folder using huggingface_hub

lr8e-4_total_batch_size61440_baseline_seed1340/args.json ADDED
@@ -0,0 +1 @@
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_11/lr8e-4_total_batch_size61440_baseline_seed1340", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_11", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1340, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 0.0008, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "8e-4_61440", "n_embd": 256}
lr8e-4_total_batch_size61440_baseline_seed1340/dataloader_08749.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:953385078aa3787b69fc6857dfd48b0a2cd2f4d27c6f8892e01211aca53d07f5
3
+ size 964
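dataloader_08749.pt (like the model and optimizer checkpoints below) is stored as a Git LFS pointer: the three lines above record only the spec version, a sha256 oid, and the blob size. A minimal sketch of verifying a fetched blob against such a pointer follows; the file paths are illustrative, and fetching the blob itself (e.g. via `git lfs pull` or `huggingface_hub.hf_hub_download`) is assumed to have already happened:

```python
import hashlib
from pathlib import Path

def check_lfs_pointer(pointer_text: str, blob_path: str) -> bool:
    """Compare a downloaded blob against the oid/size recorded in an LFS pointer."""
    fields = dict(line.split(" ", 1) for line in pointer_text.strip().splitlines())
    expected_oid = fields["oid"].split(":", 1)[1]   # hex digest after "sha256:"
    expected_size = int(fields["size"])
    data = Path(blob_path).read_bytes()
    return len(data) == expected_size and hashlib.sha256(data).hexdigest() == expected_oid

# Hypothetical usage, with the pointer text saved next to the fetched blob:
# ok = check_lfs_pointer(Path("dataloader_08749.pt.pointer").read_text(),
#                        "dataloader_08749.pt")
```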
lr8e-4_total_batch_size61440_baseline_seed1340/log2.txt ADDED
@@ -0,0 +1,1054 @@
1
+ max_steps: 8750
2
+ 0 val loss 11.2465
3
+ 0 val perplexity 76607.9531
4
+ 0 train 11.254622 (lr=1.6000e-06) (hash(x)=164406924)
5
+ 10 train 10.403351 (lr=1.7600e-05) (hash(x)=152008797)
6
+ 20 train 9.868977 (lr=3.3600e-05) (hash(x)=153494457)
7
+ 30 train 9.439003 (lr=4.9600e-05) (hash(x)=137328499)
8
+ 40 train 9.154652 (lr=6.5600e-05) (hash(x)=159333245)
9
+ 50 train 8.698051 (lr=8.1600e-05) (hash(x)=177407419)
10
+ 60 train 8.108204 (lr=9.7600e-05) (hash(x)=127877799)
11
+ 70 train 7.771571 (lr=1.1360e-04) (hash(x)=140873918)
12
+ 80 train 7.731904 (lr=1.2960e-04) (hash(x)=160249377)
13
+ 90 train 7.585963 (lr=1.4560e-04) (hash(x)=154282910)
14
+ 100 val loss 7.5267
15
+ 100 val perplexity 1856.9932
16
+ 100 train 7.433286 (lr=1.6160e-04) (hash(x)=144903932)
17
+ 110 train 7.439407 (lr=1.7760e-04) (hash(x)=151685723)
18
+ 120 train 7.209979 (lr=1.9360e-04) (hash(x)=152347864)
19
+ 130 train 7.200047 (lr=2.0960e-04) (hash(x)=152230926)
20
+ 140 train 7.039568 (lr=2.2560e-04) (hash(x)=142121048)
21
+ 150 train 7.147993 (lr=2.4160e-04) (hash(x)=173839165)
22
+ 160 train 6.931983 (lr=2.5760e-04) (hash(x)=158755931)
23
+ 170 train 6.753846 (lr=2.7360e-04) (hash(x)=151645697)
24
+ 180 train 7.072424 (lr=2.8960e-04) (hash(x)=179696111)
25
+ 190 train 6.703321 (lr=3.0560e-04) (hash(x)=150511423)
26
+ 200 val loss 6.7542
27
+ 200 val perplexity 857.6615
28
+ 200 train 6.878301 (lr=3.2160e-04) (hash(x)=167734596)
29
+ 210 train 6.442939 (lr=3.3760e-04) (hash(x)=133157560)
30
+ 220 train 6.690583 (lr=3.5360e-04) (hash(x)=152234670)
31
+ 230 train 6.510370 (lr=3.6960e-04) (hash(x)=160995039)
32
+ 240 train 6.399655 (lr=3.8560e-04) (hash(x)=139367591)
33
+ 250 train 6.569918 (lr=4.0160e-04) (hash(x)=153224076)
34
+ 260 train 6.472605 (lr=4.1760e-04) (hash(x)=156667529)
35
+ 270 train 6.388954 (lr=4.3360e-04) (hash(x)=133883725)
36
+ 280 train 6.184897 (lr=4.4960e-04) (hash(x)=151939311)
37
+ 290 train 6.241949 (lr=4.6560e-04) (hash(x)=150290478)
38
+ 300 val loss 6.2475
39
+ 300 val perplexity 516.7203
40
+ 300 train 6.209776 (lr=4.8160e-04) (hash(x)=149619098)
41
+ 310 train 6.118904 (lr=4.9760e-04) (hash(x)=142344072)
42
+ 320 train 6.189412 (lr=5.1360e-04) (hash(x)=151878766)
43
+ 330 train 6.037600 (lr=5.2960e-04) (hash(x)=136419763)
44
+ 340 train 6.145535 (lr=5.4560e-04) (hash(x)=160761498)
45
+ 350 train 6.016574 (lr=5.6160e-04) (hash(x)=146539909)
46
+ 360 train 6.113569 (lr=5.7760e-04) (hash(x)=143063312)
47
+ 370 train 5.972857 (lr=5.9360e-04) (hash(x)=153705818)
48
+ 380 train 5.882222 (lr=6.0960e-04) (hash(x)=149579175)
49
+ 390 train 5.831676 (lr=6.2560e-04) (hash(x)=150904406)
50
+ 400 val loss 5.9268
51
+ 400 val perplexity 374.9491
52
+ 400 train 5.795779 (lr=6.4160e-04) (hash(x)=153710890)
53
+ 410 train 5.692109 (lr=6.5760e-04) (hash(x)=138302968)
54
+ 420 train 5.692981 (lr=6.7360e-04) (hash(x)=141112989)
55
+ 430 train 5.750648 (lr=6.8960e-04) (hash(x)=149846944)
56
+ 440 train 5.743404 (lr=7.0560e-04) (hash(x)=157683479)
57
+ 450 train 5.747998 (lr=7.2160e-04) (hash(x)=155873620)
58
+ 460 train 5.953325 (lr=7.3760e-04) (hash(x)=152133659)
59
+ 470 train 5.708988 (lr=7.5360e-04) (hash(x)=158095308)
60
+ 480 train 5.579508 (lr=7.6960e-04) (hash(x)=148422063)
61
+ 490 train 5.766305 (lr=7.8560e-04) (hash(x)=145665071)
62
+ 500 val loss 5.7229
63
+ 500 val perplexity 305.7949
64
+ 500 train 5.692929 (lr=8.0000e-04) (hash(x)=145450636)
65
+ 510 train 5.697438 (lr=8.0000e-04) (hash(x)=148375186)
66
+ 520 train 5.714912 (lr=7.9999e-04) (hash(x)=144256573)
67
+ 530 train 5.765066 (lr=7.9998e-04) (hash(x)=141443048)
68
+ 540 train 5.584963 (lr=7.9996e-04) (hash(x)=149347480)
69
+ 550 train 5.667272 (lr=7.9993e-04) (hash(x)=154123388)
70
+ 560 train 5.640288 (lr=7.9991e-04) (hash(x)=143045783)
71
+ 570 train 5.503636 (lr=7.9987e-04) (hash(x)=157244060)
72
+ 580 train 5.495625 (lr=7.9983e-04) (hash(x)=158018166)
73
+ 590 train 5.568650 (lr=7.9979e-04) (hash(x)=153794251)
74
+ 600 val loss 5.5449
75
+ 600 val perplexity 255.9347
76
+ 600 train 5.487761 (lr=7.9974e-04) (hash(x)=145249251)
77
+ 610 train 5.555068 (lr=7.9968e-04) (hash(x)=150377688)
78
+ 620 train 5.509329 (lr=7.9962e-04) (hash(x)=134768519)
79
+ 630 train 5.423077 (lr=7.9956e-04) (hash(x)=141559208)
80
+ 640 train 5.397127 (lr=7.9949e-04) (hash(x)=148444988)
81
+ 650 train 5.408972 (lr=7.9941e-04) (hash(x)=148937127)
82
+ 660 train 5.298910 (lr=7.9933e-04) (hash(x)=145121305)
83
+ 670 train 5.437715 (lr=7.9925e-04) (hash(x)=156860740)
84
+ 680 train 5.224507 (lr=7.9915e-04) (hash(x)=137272598)
85
+ 690 train 5.228613 (lr=7.9906e-04) (hash(x)=139021179)
86
+ 700 val loss 5.3565
87
+ 700 val perplexity 211.9827
88
+ 700 train 5.517815 (lr=7.9896e-04) (hash(x)=150475545)
89
+ 710 train 5.306555 (lr=7.9885e-04) (hash(x)=148476077)
90
+ 720 train 5.197684 (lr=7.9874e-04) (hash(x)=143328717)
91
+ 730 train 5.128149 (lr=7.9862e-04) (hash(x)=137342488)
92
+ 740 train 5.202574 (lr=7.9850e-04) (hash(x)=149452392)
93
+ 750 train 5.188231 (lr=7.9837e-04) (hash(x)=154653428)
94
+ 760 train 5.320519 (lr=7.9824e-04) (hash(x)=153305312)
95
+ 770 train 5.169168 (lr=7.9810e-04) (hash(x)=153862299)
96
+ 780 train 5.183595 (lr=7.9796e-04) (hash(x)=151035519)
97
+ 790 train 5.272301 (lr=7.9781e-04) (hash(x)=145990773)
98
+ 800 val loss 5.2334
99
+ 800 val perplexity 187.4342
100
+ 800 train 5.242605 (lr=7.9765e-04) (hash(x)=144483776)
101
+ 810 train 5.147661 (lr=7.9749e-04) (hash(x)=142045737)
102
+ 820 train 5.270576 (lr=7.9733e-04) (hash(x)=157946886)
103
+ 830 train 5.146349 (lr=7.9716e-04) (hash(x)=150425385)
104
+ 840 train 5.186336 (lr=7.9699e-04) (hash(x)=144514796)
105
+ 850 train 5.230666 (lr=7.9681e-04) (hash(x)=157395496)
106
+ 860 train 5.188850 (lr=7.9662e-04) (hash(x)=151365298)
107
+ 870 train 5.209791 (lr=7.9643e-04) (hash(x)=151437005)
108
+ 880 train 5.125273 (lr=7.9624e-04) (hash(x)=151935250)
109
+ 890 train 5.171317 (lr=7.9604e-04) (hash(x)=159838534)
110
+ 900 val loss 5.1185
111
+ 900 val perplexity 167.0831
112
+ 900 train 5.096739 (lr=7.9583e-04) (hash(x)=157916369)
113
+ 910 train 5.220689 (lr=7.9562e-04) (hash(x)=165272484)
114
+ 920 train 5.051099 (lr=7.9541e-04) (hash(x)=145383083)
115
+ 930 train 5.021815 (lr=7.9518e-04) (hash(x)=152078393)
116
+ 940 train 5.160800 (lr=7.9496e-04) (hash(x)=151846396)
117
+ 950 train 5.079991 (lr=7.9473e-04) (hash(x)=166073923)
118
+ 960 train 5.027879 (lr=7.9449e-04) (hash(x)=136782937)
119
+ 970 train 5.002523 (lr=7.9425e-04) (hash(x)=146747407)
120
+ 980 train 5.018802 (lr=7.9400e-04) (hash(x)=171270168)
121
+ 990 train 5.032642 (lr=7.9375e-04) (hash(x)=141668491)
122
+ 1000 val loss 5.0419
123
+ 1000 val perplexity 154.7682
124
+ 1000 train 4.942974 (lr=7.9349e-04) (hash(x)=154856891)
125
+ 1010 train 5.023415 (lr=7.9323e-04) (hash(x)=145288527)
126
+ 1020 train 4.891458 (lr=7.9297e-04) (hash(x)=144618667)
127
+ 1030 train 5.122633 (lr=7.9269e-04) (hash(x)=139276042)
128
+ 1040 train 4.822103 (lr=7.9242e-04) (hash(x)=147217952)
129
+ 1050 train 4.909261 (lr=7.9213e-04) (hash(x)=151925203)
130
+ 1060 train 4.889205 (lr=7.9185e-04) (hash(x)=147473652)
131
+ 1070 train 4.843236 (lr=7.9155e-04) (hash(x)=145345524)
132
+ 1080 train 4.721206 (lr=7.9126e-04) (hash(x)=155595779)
133
+ 1090 train 4.774611 (lr=7.9095e-04) (hash(x)=147405225)
134
+ 1100 val loss 4.9275
135
+ 1100 val perplexity 138.0355
136
+ 1100 train 4.630736 (lr=7.9064e-04) (hash(x)=136191502)
137
+ 1110 train 4.868481 (lr=7.9033e-04) (hash(x)=151847694)
138
+ 1120 train 4.962811 (lr=7.9001e-04) (hash(x)=144828302)
139
+ 1130 train 4.973944 (lr=7.8969e-04) (hash(x)=153817096)
140
+ 1140 train 5.029817 (lr=7.8936e-04) (hash(x)=166352243)
141
+ 1150 train 5.074063 (lr=7.8903e-04) (hash(x)=153273362)
142
+ 1160 train 4.891214 (lr=7.8869e-04) (hash(x)=178082599)
143
+ 1170 train 4.706911 (lr=7.8835e-04) (hash(x)=149460553)
144
+ 1180 train 4.890978 (lr=7.8800e-04) (hash(x)=151910947)
145
+ 1190 train 4.809578 (lr=7.8764e-04) (hash(x)=147115336)
146
+ 1200 val loss 4.8477
147
+ 1200 val perplexity 127.4472
148
+ 1200 train 4.877482 (lr=7.8729e-04) (hash(x)=148021541)
149
+ 1210 train 4.768750 (lr=7.8692e-04) (hash(x)=151495989)
150
+ 1220 train 4.880058 (lr=7.8655e-04) (hash(x)=156983220)
151
+ 1230 train 4.935190 (lr=7.8618e-04) (hash(x)=147788187)
152
+ 1240 train 4.916607 (lr=7.8580e-04) (hash(x)=171125590)
153
+ 1250 train 4.835770 (lr=7.8542e-04) (hash(x)=141356608)
154
+ 1260 train 4.869235 (lr=7.8503e-04) (hash(x)=150132098)
155
+ 1270 train 4.739377 (lr=7.8464e-04) (hash(x)=147917245)
156
+ 1280 train 4.835557 (lr=7.8424e-04) (hash(x)=148900016)
157
+ 1290 train 4.795037 (lr=7.8383e-04) (hash(x)=144978605)
158
+ 1300 val loss 4.7704
159
+ 1300 val perplexity 117.9618
160
+ 1300 train 4.727566 (lr=7.8342e-04) (hash(x)=146005217)
161
+ 1310 train 4.730542 (lr=7.8301e-04) (hash(x)=144892309)
162
+ 1320 train 4.810223 (lr=7.8259e-04) (hash(x)=165903661)
163
+ 1330 train 4.754246 (lr=7.8217e-04) (hash(x)=168489989)
164
+ 1340 train 4.920579 (lr=7.8174e-04) (hash(x)=176281294)
165
+ 1350 train 4.715534 (lr=7.8131e-04) (hash(x)=144511718)
166
+ 1360 train 4.766327 (lr=7.8087e-04) (hash(x)=144362722)
167
+ 1370 train 4.656821 (lr=7.8042e-04) (hash(x)=139964999)
168
+ 1380 train 4.687278 (lr=7.7998e-04) (hash(x)=193332654)
169
+ 1390 train 4.714314 (lr=7.7952e-04) (hash(x)=138180866)
170
+ 1400 val loss 4.7345
171
+ 1400 val perplexity 113.8102
172
+ 1400 train 4.719286 (lr=7.7906e-04) (hash(x)=146019502)
173
+ 1410 train 4.751299 (lr=7.7860e-04) (hash(x)=153245388)
174
+ 1420 train 4.651731 (lr=7.7813e-04) (hash(x)=149839636)
175
+ 1430 train 4.751934 (lr=7.7766e-04) (hash(x)=142844796)
176
+ 1440 train 4.976958 (lr=7.7718e-04) (hash(x)=159787060)
177
+ 1450 train 4.693465 (lr=7.7670e-04) (hash(x)=146496200)
178
+ 1460 train 4.615085 (lr=7.7621e-04) (hash(x)=164169521)
179
+ 1470 train 4.659662 (lr=7.7572e-04) (hash(x)=150906386)
180
+ 1480 train 4.679850 (lr=7.7522e-04) (hash(x)=148916053)
181
+ 1490 train 4.613160 (lr=7.7472e-04) (hash(x)=148746554)
182
+ 1500 val loss 4.7497
183
+ 1500 val perplexity 115.5447
184
+ 1500 train 4.710093 (lr=7.7421e-04) (hash(x)=150127281)
185
+ 1510 train 4.875929 (lr=7.7370e-04) (hash(x)=149760530)
186
+ 1520 train 4.708329 (lr=7.7318e-04) (hash(x)=144237370)
187
+ 1530 train 4.524237 (lr=7.7266e-04) (hash(x)=132692898)
188
+ 1540 train 4.510156 (lr=7.7214e-04) (hash(x)=150979737)
189
+ 1550 train 4.708315 (lr=7.7160e-04) (hash(x)=142022255)
190
+ 1560 train 4.633008 (lr=7.7107e-04) (hash(x)=135769745)
191
+ 1570 train 4.496590 (lr=7.7053e-04) (hash(x)=162241131)
192
+ 1580 train 4.591646 (lr=7.6998e-04) (hash(x)=166874637)
193
+ 1590 train 4.558323 (lr=7.6943e-04) (hash(x)=157401041)
194
+ 1600 val loss 4.6503
195
+ 1600 val perplexity 104.6158
196
+ 1600 train 4.591084 (lr=7.6888e-04) (hash(x)=154120875)
197
+ 1610 train 4.559020 (lr=7.6832e-04) (hash(x)=149407490)
198
+ 1620 train 4.525401 (lr=7.6775e-04) (hash(x)=144479755)
199
+ 1630 train 4.602766 (lr=7.6718e-04) (hash(x)=170907716)
200
+ 1640 train 4.240652 (lr=7.6661e-04) (hash(x)=151914010)
201
+ 1650 train 4.549155 (lr=7.6603e-04) (hash(x)=149843610)
202
+ 1660 train 4.444502 (lr=7.6545e-04) (hash(x)=133217001)
203
+ 1670 train 4.400337 (lr=7.6486e-04) (hash(x)=168227774)
204
+ 1680 train 4.357891 (lr=7.6426e-04) (hash(x)=157093189)
205
+ 1690 train 4.199286 (lr=7.6367e-04) (hash(x)=154989819)
206
+ 1700 val loss 4.6285
207
+ 1700 val perplexity 102.3628
208
+ 1700 train 4.723501 (lr=7.6306e-04) (hash(x)=155797680)
209
+ 1710 train 4.406520 (lr=7.6246e-04) (hash(x)=166050772)
210
+ 1720 train 4.370466 (lr=7.6184e-04) (hash(x)=151663443)
211
+ 1730 train 4.605211 (lr=7.6123e-04) (hash(x)=152112619)
212
+ 1740 train 4.541809 (lr=7.6061e-04) (hash(x)=156132679)
213
+ 1750 train 4.385110 (lr=7.5998e-04) (hash(x)=149044477)
214
+ 1760 train 4.568919 (lr=7.5935e-04) (hash(x)=142924719)
215
+ 1770 train 4.461357 (lr=7.5872e-04) (hash(x)=153467406)
216
+ 1780 train 4.542307 (lr=7.5808e-04) (hash(x)=148889581)
217
+ 1790 train 4.487999 (lr=7.5743e-04) (hash(x)=164831182)
218
+ 1800 val loss 4.5634
219
+ 1800 val perplexity 95.9069
220
+ 1800 train 4.488400 (lr=7.5678e-04) (hash(x)=156809396)
221
+ 1810 train 4.583954 (lr=7.5613e-04) (hash(x)=149245741)
222
+ 1820 train 4.536496 (lr=7.5547e-04) (hash(x)=146865466)
223
+ 1830 train 4.607292 (lr=7.5481e-04) (hash(x)=148068618)
224
+ 1840 train 4.442547 (lr=7.5414e-04) (hash(x)=150088183)
225
+ 1850 train 4.528140 (lr=7.5347e-04) (hash(x)=146695082)
226
+ 1860 train 4.454133 (lr=7.5279e-04) (hash(x)=169008624)
227
+ 1870 train 4.399381 (lr=7.5211e-04) (hash(x)=148645124)
228
+ 1880 train 4.554943 (lr=7.5143e-04) (hash(x)=139701422)
229
+ 1890 train 4.520085 (lr=7.5074e-04) (hash(x)=156374257)
230
+ 1900 val loss 4.5413
231
+ 1900 val perplexity 93.8098
232
+ 1900 train 4.468108 (lr=7.5004e-04) (hash(x)=144640294)
233
+ 1910 train 4.617693 (lr=7.4934e-04) (hash(x)=147484985)
234
+ 1920 train 4.462884 (lr=7.4864e-04) (hash(x)=142317889)
235
+ 1930 train 4.455640 (lr=7.4793e-04) (hash(x)=144906216)
236
+ 1940 train 4.495804 (lr=7.4722e-04) (hash(x)=165332621)
237
+ 1950 train 4.480155 (lr=7.4650e-04) (hash(x)=145701919)
238
+ 1960 train 4.377773 (lr=7.4578e-04) (hash(x)=146414118)
239
+ 1970 train 4.549281 (lr=7.4505e-04) (hash(x)=146766958)
240
+ 1980 train 4.467148 (lr=7.4432e-04) (hash(x)=148146847)
241
+ 1990 train 4.222916 (lr=7.4359e-04) (hash(x)=159684604)
242
+ 2000 val loss 4.5393
243
+ 2000 val perplexity 93.6256
244
+ 2000 train 4.372840 (lr=7.4285e-04) (hash(x)=162831106)
245
+ 2010 train 4.392554 (lr=7.4211e-04) (hash(x)=150583346)
246
+ 2020 train 4.344497 (lr=7.4136e-04) (hash(x)=142779458)
247
+ 2030 train 4.326798 (lr=7.4061e-04) (hash(x)=143755114)
248
+ 2040 train 4.237622 (lr=7.3985e-04) (hash(x)=147324095)
249
+ 2050 train 4.332928 (lr=7.3909e-04) (hash(x)=141727373)
250
+ 2060 train 4.418235 (lr=7.3833e-04) (hash(x)=162053052)
251
+ 2070 train 4.464035 (lr=7.3756e-04) (hash(x)=162596975)
252
+ 2080 train 4.385101 (lr=7.3678e-04) (hash(x)=161242340)
253
+ 2090 train 4.371880 (lr=7.3600e-04) (hash(x)=148583522)
254
+ 2100 val loss 4.5540
255
+ 2100 val perplexity 95.0149
256
+ 2100 train 4.437373 (lr=7.3522e-04) (hash(x)=158239484)
257
+ 2110 train 4.354429 (lr=7.3444e-04) (hash(x)=152610058)
258
+ 2120 train 4.409340 (lr=7.3364e-04) (hash(x)=146582203)
259
+ 2130 train 4.129742 (lr=7.3285e-04) (hash(x)=146686436)
260
+ 2140 train 4.537238 (lr=7.3205e-04) (hash(x)=142827295)
261
+ 2150 train 4.455830 (lr=7.3125e-04) (hash(x)=150747398)
262
+ 2160 train 4.733751 (lr=7.3044e-04) (hash(x)=150777134)
263
+ 2170 train 4.413770 (lr=7.2963e-04) (hash(x)=157002856)
264
+ 2180 train 4.465530 (lr=7.2881e-04) (hash(x)=150442337)
265
+ 2190 train 4.423717 (lr=7.2799e-04) (hash(x)=150799081)
266
+ 2200 val loss 4.4807
267
+ 2200 val perplexity 88.2965
268
+ 2200 train 4.362355 (lr=7.2716e-04) (hash(x)=140504180)
269
+ 2210 train 4.492737 (lr=7.2634e-04) (hash(x)=149088475)
270
+ 2220 train 4.455531 (lr=7.2550e-04) (hash(x)=155180847)
271
+ 2230 train 4.377587 (lr=7.2467e-04) (hash(x)=159347164)
272
+ 2240 train 4.543031 (lr=7.2382e-04) (hash(x)=153574288)
273
+ 2250 train 4.486637 (lr=7.2298e-04) (hash(x)=142603512)
274
+ 2260 train 4.500721 (lr=7.2213e-04) (hash(x)=142953831)
275
+ 2270 train 4.397542 (lr=7.2128e-04) (hash(x)=137642681)
276
+ 2280 train 4.408342 (lr=7.2042e-04) (hash(x)=158106614)
277
+ 2290 train 4.403256 (lr=7.1956e-04) (hash(x)=160787891)
278
+ 2300 val loss 4.4439
279
+ 2300 val perplexity 85.1104
280
+ 2300 train 4.512861 (lr=7.1869e-04) (hash(x)=142234024)
281
+ 2310 train 4.417135 (lr=7.1782e-04) (hash(x)=138192210)
282
+ 2320 train 4.441013 (lr=7.1695e-04) (hash(x)=147727662)
283
+ 2330 train 4.380257 (lr=7.1607e-04) (hash(x)=169324653)
284
+ 2340 train 4.493105 (lr=7.1519e-04) (hash(x)=145350355)
285
+ 2350 train 4.494195 (lr=7.1430e-04) (hash(x)=156909016)
286
+ 2360 train 4.408182 (lr=7.1341e-04) (hash(x)=144559543)
287
+ 2370 train 4.400643 (lr=7.1252e-04) (hash(x)=153212312)
288
+ 2380 train 4.370908 (lr=7.1162e-04) (hash(x)=131816284)
289
+ 2390 train 4.426950 (lr=7.1072e-04) (hash(x)=165818309)
290
+ 2400 val loss 4.4459
291
+ 2400 val perplexity 85.2803
292
+ 2400 train 4.459247 (lr=7.0981e-04) (hash(x)=143091562)
293
+ 2410 train 4.344828 (lr=7.0890e-04) (hash(x)=154010697)
294
+ 2420 train 4.326580 (lr=7.0799e-04) (hash(x)=144844001)
295
+ 2430 train 4.488793 (lr=7.0707e-04) (hash(x)=156389834)
296
+ 2440 train 4.171124 (lr=7.0615e-04) (hash(x)=162425919)
297
+ 2450 train 4.425237 (lr=7.0523e-04) (hash(x)=154200479)
298
+ 2460 train 4.354559 (lr=7.0430e-04) (hash(x)=159066617)
299
+ 2470 train 4.158089 (lr=7.0337e-04) (hash(x)=148278058)
300
+ 2480 train 4.122016 (lr=7.0243e-04) (hash(x)=165598522)
301
+ 2490 train 4.438636 (lr=7.0149e-04) (hash(x)=141306086)
302
+ 2500 val loss 4.4752
303
+ 2500 val perplexity 87.8132
304
+ 2500 train 4.294726 (lr=7.0054e-04) (hash(x)=149857456)
305
+ 2510 train 4.284560 (lr=6.9960e-04) (hash(x)=150368907)
306
+ 2520 train 4.236595 (lr=6.9864e-04) (hash(x)=153394920)
307
+ 2530 train 4.216421 (lr=6.9769e-04) (hash(x)=159740116)
308
+ 2540 train 4.369461 (lr=6.9673e-04) (hash(x)=161620367)
309
+ 2550 train 4.451651 (lr=6.9577e-04) (hash(x)=147223658)
310
+ 2560 train 4.545908 (lr=6.9480e-04) (hash(x)=158146613)
311
+ 2570 train 4.444758 (lr=6.9383e-04) (hash(x)=136375336)
312
+ 2580 train 4.481951 (lr=6.9286e-04) (hash(x)=149298016)
313
+ 2590 train 4.405572 (lr=6.9188e-04) (hash(x)=150720933)
314
+ 2600 val loss 4.4096
315
+ 2600 val perplexity 82.2385
316
+ 2600 train 4.510528 (lr=6.9090e-04) (hash(x)=146191551)
317
+ 2610 train 4.317633 (lr=6.8991e-04) (hash(x)=151413395)
318
+ 2620 train 4.533701 (lr=6.8892e-04) (hash(x)=162094106)
319
+ 2630 train 4.404751 (lr=6.8793e-04) (hash(x)=150608302)
320
+ 2640 train 4.319120 (lr=6.8694e-04) (hash(x)=146115160)
321
+ 2650 train 4.413404 (lr=6.8594e-04) (hash(x)=138242788)
322
+ 2660 train 4.600614 (lr=6.8493e-04) (hash(x)=159921837)
323
+ 2670 train 4.239727 (lr=6.8393e-04) (hash(x)=137205989)
324
+ 2680 train 4.257802 (lr=6.8292e-04) (hash(x)=158608524)
325
+ 2690 train 4.291055 (lr=6.8190e-04) (hash(x)=153322085)
326
+ 2700 val loss 4.3896
327
+ 2700 val perplexity 80.6121
328
+ 2700 train 4.331791 (lr=6.8089e-04) (hash(x)=145375752)
329
+ 2710 train 4.348867 (lr=6.7987e-04) (hash(x)=145036398)
330
+ 2720 train 4.369700 (lr=6.7884e-04) (hash(x)=140064355)
331
+ 2730 train 4.403379 (lr=6.7782e-04) (hash(x)=148983355)
332
+ 2740 train 4.296770 (lr=6.7678e-04) (hash(x)=147636026)
333
+ 2750 train 4.354241 (lr=6.7575e-04) (hash(x)=151763585)
334
+ 2760 train 4.278496 (lr=6.7471e-04) (hash(x)=178033416)
335
+ 2770 train 4.243112 (lr=6.7367e-04) (hash(x)=147097669)
336
+ 2780 train 4.208759 (lr=6.7263e-04) (hash(x)=140475447)
337
+ 2790 train 4.322309 (lr=6.7158e-04) (hash(x)=141135962)
338
+ 2800 val loss 4.3828
339
+ 2800 val perplexity 80.0634
340
+ 2800 train 4.221279 (lr=6.7053e-04) (hash(x)=151568014)
341
+ 2810 train 4.934597 (lr=6.6947e-04) (hash(x)=170629615)
342
+ 2820 train 4.303170 (lr=6.6841e-04) (hash(x)=151858146)
343
+ 2830 train 4.210390 (lr=6.6735e-04) (hash(x)=152170305)
344
+ 2840 train 4.129820 (lr=6.6629e-04) (hash(x)=150705881)
345
+ 2850 train 4.163996 (lr=6.6522e-04) (hash(x)=146997394)
346
+ 2860 train 4.152957 (lr=6.6415e-04) (hash(x)=159749180)
347
+ 2870 train 4.034689 (lr=6.6308e-04) (hash(x)=147618423)
348
+ 2880 train 4.229532 (lr=6.6200e-04) (hash(x)=140778993)
349
+ 2890 train 4.103230 (lr=6.6092e-04) (hash(x)=150735837)
350
+ 2900 val loss 4.3834
351
+ 2900 val perplexity 80.1119
352
+ 2900 train 4.162219 (lr=6.5983e-04) (hash(x)=149366597)
353
+ 2910 train 4.283740 (lr=6.5875e-04) (hash(x)=181602500)
354
+ 2920 train 4.244694 (lr=6.5766e-04) (hash(x)=148080200)
355
+ 2930 train 4.081357 (lr=6.5656e-04) (hash(x)=150629961)
356
+ 2940 train 4.262661 (lr=6.5547e-04) (hash(x)=155786888)
357
+ 2950 train 4.263927 (lr=6.5437e-04) (hash(x)=148553059)
358
+ 2960 train 4.380385 (lr=6.5326e-04) (hash(x)=158494862)
359
+ 2970 train 4.492483 (lr=6.5216e-04) (hash(x)=147684099)
360
+ 2980 train 4.263389 (lr=6.5105e-04) (hash(x)=159008790)
361
+ 2990 train 4.413972 (lr=6.4994e-04) (hash(x)=148288782)
362
+ 3000 val loss 4.3429
363
+ 3000 val perplexity 76.9281
364
+ 3000 train 4.276694 (lr=6.4882e-04) (hash(x)=150464442)
365
+ 3010 train 4.349776 (lr=6.4770e-04) (hash(x)=148665916)
366
+ 3020 train 4.292125 (lr=6.4658e-04) (hash(x)=159788454)
367
+ 3030 train 4.379039 (lr=6.4546e-04) (hash(x)=170224765)
368
+ 3040 train 4.265642 (lr=6.4433e-04) (hash(x)=139746299)
369
+ 3050 train 4.342177 (lr=6.4320e-04) (hash(x)=173491884)
370
+ 3060 train 4.319968 (lr=6.4207e-04) (hash(x)=146098725)
371
+ 3070 train 4.291288 (lr=6.4093e-04) (hash(x)=154713525)
372
+ 3080 train 4.318592 (lr=6.3979e-04) (hash(x)=151206978)
373
+ 3090 train 4.356441 (lr=6.3865e-04) (hash(x)=144250687)
374
+ 3100 val loss 4.3596
375
+ 3100 val perplexity 78.2258
376
+ 3100 train 4.395173 (lr=6.3750e-04) (hash(x)=182449036)
377
+ 3110 train 4.214971 (lr=6.3636e-04) (hash(x)=145504538)
378
+ 3120 train 4.246961 (lr=6.3521e-04) (hash(x)=147948751)
379
+ 3130 train 4.192139 (lr=6.3405e-04) (hash(x)=144547711)
380
+ 3140 train 4.310242 (lr=6.3290e-04) (hash(x)=158622029)
381
+ 3150 train 4.271168 (lr=6.3174e-04) (hash(x)=155949476)
382
+ 3160 train 4.276151 (lr=6.3058e-04) (hash(x)=129245664)
383
+ 3170 train 4.289102 (lr=6.2941e-04) (hash(x)=157162959)
384
+ 3180 train 4.353170 (lr=6.2825e-04) (hash(x)=163102434)
385
+ 3190 train 4.390101 (lr=6.2708e-04) (hash(x)=155952009)
386
+ 3200 val loss 4.3352
387
+ 3200 val perplexity 76.3420
388
+ 3200 train 4.271265 (lr=6.2590e-04) (hash(x)=140141286)
389
+ 3210 train 4.270248 (lr=6.2473e-04) (hash(x)=148811694)
390
+ 3220 train 4.210612 (lr=6.2355e-04) (hash(x)=143433404)
391
+ 3230 train 4.263620 (lr=6.2237e-04) (hash(x)=150525065)
392
+ 3240 train 4.156843 (lr=6.2119e-04) (hash(x)=152014854)
393
+ 3250 train 4.045469 (lr=6.2000e-04) (hash(x)=144716153)
394
+ 3260 train 4.375982 (lr=6.1881e-04) (hash(x)=145324818)
395
+ 3270 train 4.309959 (lr=6.1762e-04) (hash(x)=154930419)
396
+ 3280 train 4.306784 (lr=6.1643e-04) (hash(x)=144649958)
397
+ 3290 train 4.294215 (lr=6.1523e-04) (hash(x)=156496147)
398
+ 3300 val loss 4.3204
399
+ 3300 val perplexity 75.2181
400
+ 3300 train 4.240705 (lr=6.1403e-04) (hash(x)=148099414)
401
+ 3310 train 4.302856 (lr=6.1283e-04) (hash(x)=141081470)
402
+ 3320 train 4.276582 (lr=6.1163e-04) (hash(x)=151572529)
403
+ 3330 train 4.232923 (lr=6.1042e-04) (hash(x)=146887343)
404
+ 3340 train 4.273978 (lr=6.0921e-04) (hash(x)=148612634)
405
+ 3350 train 4.232584 (lr=6.0800e-04) (hash(x)=145218304)
406
+ 3360 train 4.165869 (lr=6.0679e-04) (hash(x)=146893345)
407
+ 3370 train 4.241447 (lr=6.0557e-04) (hash(x)=159302018)
408
+ 3380 train 4.306074 (lr=6.0435e-04) (hash(x)=164117611)
409
+ 3390 train 4.181183 (lr=6.0313e-04) (hash(x)=142401925)
410
+ 3400 val loss 4.3082
411
+ 3400 val perplexity 74.3046
412
+ 3400 train 4.163392 (lr=6.0191e-04) (hash(x)=142633951)
413
+ 3410 train 4.288529 (lr=6.0068e-04) (hash(x)=148491904)
414
+ 3420 train 4.335003 (lr=5.9945e-04) (hash(x)=144013244)
415
+ 3430 train 4.146573 (lr=5.9822e-04) (hash(x)=146939843)
416
+ 3440 train 4.175786 (lr=5.9699e-04) (hash(x)=161055964)
417
+ 3450 train 4.076260 (lr=5.9576e-04) (hash(x)=134825681)
418
+ 3460 train 4.268161 (lr=5.9452e-04) (hash(x)=145087511)
419
+ 3470 train 4.089385 (lr=5.9328e-04) (hash(x)=144200286)
420
+ 3480 train 4.130699 (lr=5.9204e-04) (hash(x)=157605428)
421
+ 3490 train 4.316826 (lr=5.9080e-04) (hash(x)=153636990)
422
+ 3500 val loss 4.3115
423
+ 3500 val perplexity 74.5501
424
+ 3500 train 4.169837 (lr=5.8955e-04) (hash(x)=148368965)
425
+ 3510 train 4.075866 (lr=5.8830e-04) (hash(x)=144775557)
426
+ 3520 train 4.250553 (lr=5.8705e-04) (hash(x)=151539855)
427
+ 3530 train 4.262748 (lr=5.8580e-04) (hash(x)=168384321)
428
+ 3540 train 4.407971 (lr=5.8454e-04) (hash(x)=155671447)
429
+ 3550 train 4.441235 (lr=5.8329e-04) (hash(x)=157403334)
430
+ 3560 train 4.209557 (lr=5.8203e-04) (hash(x)=139495714)
431
+ 3570 train 4.347342 (lr=5.8077e-04) (hash(x)=147685555)
432
+ 3580 train 4.472521 (lr=5.7951e-04) (hash(x)=154209753)
433
+ 3590 train 4.086065 (lr=5.7824e-04) (hash(x)=166864372)
434
+ 3600 val loss 4.2907
435
+ 3600 val perplexity 73.0203
436
+ 3600 train 4.200510 (lr=5.7697e-04) (hash(x)=152372067)
437
+ 3610 train 4.481126 (lr=5.7571e-04) (hash(x)=156579291)
438
+ 3620 train 4.064327 (lr=5.7443e-04) (hash(x)=152969451)
439
+ 3630 train 4.037411 (lr=5.7316e-04) (hash(x)=164428105)
440
+ 3640 train 4.215364 (lr=5.7189e-04) (hash(x)=153325907)
441
+ 3650 train 4.196020 (lr=5.7061e-04) (hash(x)=159197101)
442
+ 3660 train 4.183208 (lr=5.6933e-04) (hash(x)=157503290)
443
+ 3670 train 4.105588 (lr=5.6805e-04) (hash(x)=149036650)
444
+ 3680 train 4.110247 (lr=5.6677e-04) (hash(x)=144525088)
445
+ 3690 train 4.224317 (lr=5.6549e-04) (hash(x)=143154211)
446
+ 3700 val loss 4.2872
447
+ 3700 val perplexity 72.7606
448
+ 3700 train 4.217520 (lr=5.6420e-04) (hash(x)=168885609)
449
+ 3710 train 4.182745 (lr=5.6291e-04) (hash(x)=148815644)
450
+ 3720 train 4.113211 (lr=5.6162e-04) (hash(x)=153279629)
451
+ 3730 train 4.131580 (lr=5.6033e-04) (hash(x)=148181200)
452
+ 3740 train 4.099375 (lr=5.5904e-04) (hash(x)=151357364)
453
+ 3750 train 4.063350 (lr=5.5774e-04) (hash(x)=145269246)
454
+ 3760 train 4.141498 (lr=5.5645e-04) (hash(x)=148923398)
455
+ 3770 train 4.138111 (lr=5.5515e-04) (hash(x)=141205226)
456
+ 3780 train 4.027296 (lr=5.5385e-04) (hash(x)=146180296)
457
+ 3790 train 4.098681 (lr=5.5255e-04) (hash(x)=169790000)
458
+ 3800 val loss 4.2884
459
+ 3800 val perplexity 72.8520
460
+ 3800 train 4.140154 (lr=5.5125e-04) (hash(x)=141633734)
461
+ 3810 train 4.264968 (lr=5.4994e-04) (hash(x)=156306070)
462
+ 3820 train 4.257529 (lr=5.4864e-04) (hash(x)=158769870)
463
+ 3830 train 4.221703 (lr=5.4733e-04) (hash(x)=165301927)
464
+ 3840 train 4.261649 (lr=5.4602e-04) (hash(x)=147025475)
465
+ 3850 train 4.204757 (lr=5.4471e-04) (hash(x)=141223580)
466
+ 3860 train 4.229239 (lr=5.4340e-04) (hash(x)=139668795)
467
+ 3870 train 4.290291 (lr=5.4208e-04) (hash(x)=155839599)
468
+ 3880 train 4.349567 (lr=5.4077e-04) (hash(x)=150635541)
469
+ 3890 train 4.344969 (lr=5.3945e-04) (hash(x)=153702524)
470
+ 3900 val loss 4.2637
471
+ 3900 val perplexity 71.0743
472
+ 3900 train 4.167935 (lr=5.3813e-04) (hash(x)=153141007)
473
+ 3910 train 4.255192 (lr=5.3681e-04) (hash(x)=145483115)
474
+ 3920 train 4.044975 (lr=5.3549e-04) (hash(x)=147480523)
475
+ 3930 train 4.221537 (lr=5.3417e-04) (hash(x)=146229467)
476
+ 3940 train 4.202239 (lr=5.3284e-04) (hash(x)=151376187)
477
+ 3950 train 4.170072 (lr=5.3152e-04) (hash(x)=153745186)
478
+ 3960 train 4.128955 (lr=5.3019e-04) (hash(x)=147595615)
479
+ 3970 train 4.266096 (lr=5.2886e-04) (hash(x)=157979848)
480
+ 3980 train 4.131737 (lr=5.2754e-04) (hash(x)=153714091)
481
+ 3990 train 4.166999 (lr=5.2620e-04) (hash(x)=155637629)
482
+ 4000 val loss 4.2596
483
+ 4000 val perplexity 70.7806
484
+ 4000 train 4.197123 (lr=5.2487e-04) (hash(x)=160577202)
485
+ 4010 train 4.104901 (lr=5.2354e-04) (hash(x)=147432640)
486
+ 4020 train 4.067557 (lr=5.2221e-04) (hash(x)=135542902)
487
+ 4030 train 4.200021 (lr=5.2087e-04) (hash(x)=143137909)
488
+ 4040 train 4.196943 (lr=5.1953e-04) (hash(x)=148269908)
489
+ 4050 train 4.077495 (lr=5.1820e-04) (hash(x)=141954182)
490
+ 4060 train 4.168184 (lr=5.1686e-04) (hash(x)=157035179)
491
+ 4070 train 3.890860 (lr=5.1552e-04) (hash(x)=151361423)
492
+ 4080 train 4.016812 (lr=5.1418e-04) (hash(x)=144373988)
493
+ 4090 train 4.229814 (lr=5.1284e-04) (hash(x)=156461428)
494
+ 4100 val loss 4.2536
495
+ 4100 val perplexity 70.3554
496
+ 4100 train 4.337818 (lr=5.1149e-04) (hash(x)=153858169)
497
+ 4110 train 4.302637 (lr=5.1015e-04) (hash(x)=165506959)
498
+ 4120 train 4.307225 (lr=5.0880e-04) (hash(x)=158709009)
499
+ 4130 train 4.383077 (lr=5.0746e-04) (hash(x)=151836522)
500
+ 4140 train 4.145933 (lr=5.0611e-04) (hash(x)=152917389)
501
+ 4150 train 4.194824 (lr=5.0476e-04) (hash(x)=146973868)
502
+ 4160 train 4.292345 (lr=5.0341e-04) (hash(x)=157875887)
503
+ 4170 train 4.574834 (lr=5.0206e-04) (hash(x)=167933111)
504
+ 4180 train 4.161945 (lr=5.0071e-04) (hash(x)=161938168)
505
+ 4190 train 4.143842 (lr=4.9936e-04) (hash(x)=158102630)
506
+ 4200 val loss 4.2357
507
+ 4200 val perplexity 69.1110
508
+ 4200 train 4.255004 (lr=4.9801e-04) (hash(x)=155889149)
509
+ 4210 train 4.343439 (lr=4.9665e-04) (hash(x)=131046288)
510
+ 4220 train 4.102593 (lr=4.9530e-04) (hash(x)=149423408)
511
+ 4230 train 4.191511 (lr=4.9395e-04) (hash(x)=150033580)
512
+ 4240 train 4.084181 (lr=4.9259e-04) (hash(x)=137509644)
513
+ 4250 train 4.258571 (lr=4.9123e-04) (hash(x)=161159362)
514
+ 4260 train 4.159616 (lr=4.8988e-04) (hash(x)=148117355)
515
+ 4270 train 4.094699 (lr=4.8852e-04) (hash(x)=147315384)
516
+ 4280 train 4.150157 (lr=4.8716e-04) (hash(x)=156577316)
517
+ 4290 train 4.340037 (lr=4.8580e-04) (hash(x)=161553761)
518
+ 4300 val loss 4.2312
519
+ 4300 val perplexity 68.7999
520
+ 4300 train 3.985034 (lr=4.8444e-04) (hash(x)=152294662)
521
+ 4310 train 4.017335 (lr=4.8308e-04) (hash(x)=144616611)
522
+ 4320 train 4.176939 (lr=4.8172e-04) (hash(x)=154134591)
523
+ 4330 train 4.037823 (lr=4.8036e-04) (hash(x)=159947834)
524
+ 4340 train 4.018911 (lr=4.7899e-04) (hash(x)=156261313)
525
+ 4350 train 4.131335 (lr=4.7763e-04) (hash(x)=141245643)
526
+ 4360 train 4.101488 (lr=4.7627e-04) (hash(x)=157291204)
527
+ 4370 train 4.132045 (lr=4.7490e-04) (hash(x)=142877676)
528
+ 4380 train 4.179982 (lr=4.7354e-04) (hash(x)=155174402)
529
+ 4390 train 4.027390 (lr=4.7217e-04) (hash(x)=154675451)
530
+ 4400 val loss 4.2285
531
+ 4400 val perplexity 68.6117
532
+ 4400 train 4.197064 (lr=4.7081e-04) (hash(x)=141804386)
533
+ 4410 train 4.414270 (lr=4.6944e-04) (hash(x)=163930619)
534
+ 4420 train 4.285137 (lr=4.6807e-04) (hash(x)=139437472)
535
+ 4430 train 4.228657 (lr=4.6671e-04) (hash(x)=142474831)
536
+ 4440 train 4.242911 (lr=4.6534e-04) (hash(x)=150102428)
537
+ 4450 train 4.209610 (lr=4.6397e-04) (hash(x)=135805460)
538
+ 4460 train 4.174462 (lr=4.6260e-04) (hash(x)=154732100)
539
+ 4470 train 4.177282 (lr=4.6124e-04) (hash(x)=148554435)
540
+ 4480 train 4.120358 (lr=4.5987e-04) (hash(x)=142410065)
541
+ 4490 train 4.150236 (lr=4.5850e-04) (hash(x)=148108338)
542
+ 4500 val loss 4.2082
543
+ 4500 val perplexity 67.2346
544
+ 4500 train 4.047217 (lr=4.5713e-04) (hash(x)=151095242)
545
+ 4510 train 4.058399 (lr=4.5576e-04) (hash(x)=154911617)
546
+ 4520 train 4.106719 (lr=4.5439e-04) (hash(x)=150858662)
547
+ 4530 train 4.055121 (lr=4.5302e-04) (hash(x)=146850830)
548
+ 4540 train 4.116180 (lr=4.5165e-04) (hash(x)=153506103)
549
+ 4550 train 4.117402 (lr=4.5028e-04) (hash(x)=157068400)
550
+ 4560 train 4.200628 (lr=4.4891e-04) (hash(x)=163640327)
551
+ 4570 train 4.254792 (lr=4.4754e-04) (hash(x)=158962962)
552
+ 4580 train 4.123038 (lr=4.4617e-04) (hash(x)=143454481)
553
+ 4590 train 4.127841 (lr=4.4480e-04) (hash(x)=155380269)
554
+ 4600 val loss 4.2468
555
+ 4600 val perplexity 69.8807
556
+ 4600 train 4.274385 (lr=4.4343e-04) (hash(x)=156414699)
557
+ 4610 train 4.101875 (lr=4.4206e-04) (hash(x)=153520595)
558
+ 4620 train 4.126172 (lr=4.4069e-04) (hash(x)=151287061)
559
+ 4630 train 3.953681 (lr=4.3931e-04) (hash(x)=144972877)
560
+ 4640 train 4.076117 (lr=4.3794e-04) (hash(x)=152031134)
561
+ 4650 train 4.214238 (lr=4.3657e-04) (hash(x)=155348609)
562
+ 4660 train 3.999416 (lr=4.3520e-04) (hash(x)=144855343)
563
+ 4670 train 4.153498 (lr=4.3383e-04) (hash(x)=159247995)
564
+ 4680 train 4.042363 (lr=4.3246e-04) (hash(x)=133146878)
565
+ 4690 train 4.272006 (lr=4.3109e-04) (hash(x)=148797338)
566
+ 4700 val loss 4.2037
567
+ 4700 val perplexity 66.9357
568
+ 4700 train 4.158514 (lr=4.2972e-04) (hash(x)=161556686)
569
+ 4710 train 4.343598 (lr=4.2835e-04) (hash(x)=142528636)
570
+ 4720 train 4.275161 (lr=4.2698e-04) (hash(x)=148322603)
571
+ 4730 train 4.187306 (lr=4.2561e-04) (hash(x)=145051555)
572
+ 4740 train 4.103632 (lr=4.2424e-04) (hash(x)=146193153)
573
+ 4750 train 4.299421 (lr=4.2287e-04) (hash(x)=165802167)
574
+ 4760 train 4.241821 (lr=4.2150e-04) (hash(x)=158948628)
575
+ 4770 train 4.297499 (lr=4.2013e-04) (hash(x)=156177788)
576
+ 4780 train 4.148581 (lr=4.1876e-04) (hash(x)=175446069)
577
+ 4790 train 4.170279 (lr=4.1740e-04) (hash(x)=141100706)
578
+ 4800 val loss 4.1999
579
+ 4800 val perplexity 66.6807
580
+ 4800 train 4.087453 (lr=4.1603e-04) (hash(x)=149000293)
581
+ 4810 train 4.022676 (lr=4.1466e-04) (hash(x)=135891778)
582
+ 4820 train 4.074157 (lr=4.1329e-04) (hash(x)=145474733)
583
+ 4830 train 4.132744 (lr=4.1193e-04) (hash(x)=145187742)
584
+ 4840 train 4.176939 (lr=4.1056e-04) (hash(x)=157685237)
585
+ 4850 train 4.014524 (lr=4.0919e-04) (hash(x)=128599506)
586
+ 4860 train 4.106462 (lr=4.0783e-04) (hash(x)=142067051)
587
+ 4870 train 4.060631 (lr=4.0646e-04) (hash(x)=156215711)
588
+ 4880 train 4.241800 (lr=4.0510e-04) (hash(x)=150456895)
589
+ 4890 train 3.870941 (lr=4.0373e-04) (hash(x)=141647202)
590
+ 4900 val loss 4.1891
591
+ 4900 val perplexity 65.9613
592
+ 4900 train 3.934667 (lr=4.0237e-04) (hash(x)=154349989)
593
+ 4910 train 3.962008 (lr=4.0101e-04) (hash(x)=151563396)
594
+ 4920 train 3.985592 (lr=3.9964e-04) (hash(x)=138868314)
595
+ 4930 train 4.125028 (lr=3.9828e-04) (hash(x)=148533460)
596
+ 4940 train 4.120391 (lr=3.9692e-04) (hash(x)=144892493)
597
+ 4950 train 4.139394 (lr=3.9556e-04) (hash(x)=170326431)
598
+ 4960 train 4.133707 (lr=3.9420e-04) (hash(x)=163312680)
599
+ 4970 train 4.099628 (lr=3.9284e-04) (hash(x)=184352734)
600
+ 4980 train 4.171930 (lr=3.9148e-04) (hash(x)=140363733)
601
+ 4990 train 4.072880 (lr=3.9012e-04) (hash(x)=147794873)
602
+ 5000 val loss 4.1830
603
+ 5000 val perplexity 65.5644
604
+ 5000 train 4.167898 (lr=3.8877e-04) (hash(x)=131475967)
605
+ 5010 train 4.271088 (lr=3.8741e-04) (hash(x)=139560000)
606
+ 5020 train 4.173810 (lr=3.8605e-04) (hash(x)=160128701)
607
+ 5030 train 4.287411 (lr=3.8470e-04) (hash(x)=154698531)
608
+ 5040 train 4.248640 (lr=3.8335e-04) (hash(x)=153833791)
609
+ 5050 train 4.170629 (lr=3.8199e-04) (hash(x)=145953388)
610
+ 5060 train 4.517692 (lr=3.8064e-04) (hash(x)=134297881)
611
+ 5070 train 4.169985 (lr=3.7929e-04) (hash(x)=159987550)
612
+ 5080 train 4.168464 (lr=3.7794e-04) (hash(x)=147699302)
613
+ 5090 train 3.886464 (lr=3.7659e-04) (hash(x)=140534876)
614
+ 5100 val loss 4.1779
615
+ 5100 val perplexity 65.2258
616
+ 5100 train 4.094859 (lr=3.7524e-04) (hash(x)=149717902)
617
+ 5110 train 4.011804 (lr=3.7389e-04) (hash(x)=147648148)
618
+ 5120 train 4.026576 (lr=3.7254e-04) (hash(x)=170962791)
619
+ 5130 train 4.078098 (lr=3.7120e-04) (hash(x)=139633288)
620
+ 5140 train 4.241095 (lr=3.6985e-04) (hash(x)=160163221)
621
+ 5150 train 4.080642 (lr=3.6851e-04) (hash(x)=146625393)
622
+ 5160 train 4.141926 (lr=3.6716e-04) (hash(x)=147399092)
623
+ 5170 train 3.977932 (lr=3.6582e-04) (hash(x)=147172792)
624
+ 5180 train 4.083449 (lr=3.6448e-04) (hash(x)=140600568)
625
+ 5190 train 4.010283 (lr=3.6314e-04) (hash(x)=152423962)
626
+ 5200 val loss 4.1738
627
+ 5200 val perplexity 64.9593
628
+ 5200 train 4.113544 (lr=3.6180e-04) (hash(x)=151407999)
629
+ 5210 train 4.060960 (lr=3.6047e-04) (hash(x)=154874903)
630
+ 5220 train 3.878493 (lr=3.5913e-04) (hash(x)=154133697)
631
+ 5230 train 4.058887 (lr=3.5779e-04) (hash(x)=148148721)
632
+ 5240 train 4.048518 (lr=3.5646e-04) (hash(x)=149681665)
633
+ 5250 train 4.019365 (lr=3.5513e-04) (hash(x)=150136904)
634
+ 5260 train 4.124552 (lr=3.5380e-04) (hash(x)=149261170)
635
+ 5270 train 4.098753 (lr=3.5246e-04) (hash(x)=155540595)
636
+ 5280 train 4.112729 (lr=3.5114e-04) (hash(x)=141973714)
637
+ 5290 train 4.549044 (lr=3.4981e-04) (hash(x)=146827439)
638
+ 5300 val loss 4.1624
639
+ 5300 val perplexity 64.2232
640
+ 5300 train 4.121033 (lr=3.4848e-04) (hash(x)=168602728)
641
+ 5310 train 4.109458 (lr=3.4716e-04) (hash(x)=144035757)
642
+ 5320 train 4.164502 (lr=3.4583e-04) (hash(x)=143287560)
643
+ 5330 train 4.103524 (lr=3.4451e-04) (hash(x)=142461814)
644
+ 5340 train 4.138205 (lr=3.4319e-04) (hash(x)=149905536)
645
+ 5350 train 4.240426 (lr=3.4187e-04) (hash(x)=134597061)
646
+ 5360 train 4.260373 (lr=3.4055e-04) (hash(x)=156520228)
647
+ 5370 train 4.166106 (lr=3.3923e-04) (hash(x)=152690323)
648
+ 5380 train 4.114301 (lr=3.3792e-04) (hash(x)=140092622)
649
+ 5390 train 4.074981 (lr=3.3660e-04) (hash(x)=153504017)
650
+ 5400 val loss 4.1477
651
+ 5400 val perplexity 63.2883
652
+ 5400 train 4.093682 (lr=3.3529e-04) (hash(x)=158344511)
653
+ 5410 train 4.028800 (lr=3.3398e-04) (hash(x)=149681960)
654
+ 5420 train 4.079967 (lr=3.3267e-04) (hash(x)=143459968)
655
+ 5430 train 3.996368 (lr=3.3136e-04) (hash(x)=134759020)
656
+ 5440 train 3.970869 (lr=3.3006e-04) (hash(x)=164760471)
657
+ 5450 train 4.027930 (lr=3.2875e-04) (hash(x)=148551310)
658
+ 5460 train 4.165794 (lr=3.2745e-04) (hash(x)=155464239)
659
+ 5470 train 3.976872 (lr=3.2615e-04) (hash(x)=147309485)
660
+ 5480 train 3.932474 (lr=3.2485e-04) (hash(x)=163937590)
661
+ 5490 train 4.215070 (lr=3.2355e-04) (hash(x)=142830147)
662
+ 5500 val loss 4.1447
663
+ 5500 val perplexity 63.1018
664
+ 5500 train 4.041501 (lr=3.2226e-04) (hash(x)=148350057)
665
+ 5510 train 4.018369 (lr=3.2096e-04) (hash(x)=149007838)
666
+ 5520 train 3.987614 (lr=3.1967e-04) (hash(x)=158176239)
667
+ 5530 train 4.085081 (lr=3.1838e-04) (hash(x)=156395740)
668
+ 5540 train 4.269013 (lr=3.1709e-04) (hash(x)=152453211)
669
+ 5550 train 3.966241 (lr=3.1580e-04) (hash(x)=144447218)
670
+ 5560 train 4.153876 (lr=3.1451e-04) (hash(x)=153858804)
671
+ 5570 train 3.975330 (lr=3.1323e-04) (hash(x)=133929681)
672
+ 5580 train 4.230175 (lr=3.1195e-04) (hash(x)=159170988)
673
+ 5590 train 4.166400 (lr=3.1067e-04) (hash(x)=144978886)
674
+ 5600 val loss 4.1395
675
+ 5600 val perplexity 62.7716
676
+ 5600 train 4.180494 (lr=3.0939e-04) (hash(x)=153847323)
677
+ 5610 train 4.129622 (lr=3.0811e-04) (hash(x)=145553636)
678
+ 5620 train 5.283006 (lr=3.0684e-04) (hash(x)=153712417)
679
+ 5630 train 4.160687 (lr=3.0557e-04) (hash(x)=148397520)
680
+ 5640 train 4.106377 (lr=3.0429e-04) (hash(x)=149424351)
681
+ 5650 train 4.192017 (lr=3.0303e-04) (hash(x)=140531069)
682
+ 5660 train 4.134145 (lr=3.0176e-04) (hash(x)=144904009)
683
+ 5670 train 3.978693 (lr=3.0049e-04) (hash(x)=162469666)
684
+ 5680 train 4.111402 (lr=2.9923e-04) (hash(x)=149254569)
685
+ 5690 train 4.030644 (lr=2.9797e-04) (hash(x)=146949494)
686
+ 5700 val loss 4.1318
687
+ 5700 val perplexity 62.2899
688
+ 5700 train 4.090963 (lr=2.9671e-04) (hash(x)=156607405)
689
+ 5710 train 3.940398 (lr=2.9546e-04) (hash(x)=146349425)
690
+ 5720 train 4.115198 (lr=2.9420e-04) (hash(x)=160347239)
691
+ 5730 train 4.043807 (lr=2.9295e-04) (hash(x)=153785045)
692
+ 5740 train 4.048010 (lr=2.9170e-04) (hash(x)=135967367)
693
+ 5750 train 4.020603 (lr=2.9045e-04) (hash(x)=140181987)
694
+ 5760 train 4.170301 (lr=2.8920e-04) (hash(x)=153217075)
695
+ 5770 train 4.040683 (lr=2.8796e-04) (hash(x)=146360100)
696
+ 5780 train 4.062373 (lr=2.8672e-04) (hash(x)=161111351)
697
+ 5790 train 3.995572 (lr=2.8548e-04) (hash(x)=147891055)
698
+ 5800 val loss 4.1293
699
+ 5800 val perplexity 62.1330
700
+ 5800 train 3.867943 (lr=2.8424e-04) (hash(x)=145115031)
701
+ 5810 train 3.911765 (lr=2.8301e-04) (hash(x)=153661465)
702
+ 5820 train 4.009286 (lr=2.8178e-04) (hash(x)=148264581)
703
+ 5830 train 4.105780 (lr=2.8055e-04) (hash(x)=171144748)
704
+ 5840 train 4.037564 (lr=2.7932e-04) (hash(x)=157863238)
705
+ 5850 train 3.976783 (lr=2.7809e-04) (hash(x)=144669655)
706
+ 5860 train 3.887782 (lr=2.7687e-04) (hash(x)=154948909)
707
+ 5870 train 4.053301 (lr=2.7565e-04) (hash(x)=158981837)
708
+ 5880 train 4.102110 (lr=2.7443e-04) (hash(x)=151102851)
709
+ 5890 train 4.291588 (lr=2.7321e-04) (hash(x)=153490628)
710
+ 5900 val loss 4.1200
711
+ 5900 val perplexity 61.5616
712
+ 5900 train 4.085109 (lr=2.7200e-04) (hash(x)=141584622)
713
+ 5910 train 3.965896 (lr=2.7079e-04) (hash(x)=142896324)
714
+ 5920 train 4.256884 (lr=2.6958e-04) (hash(x)=148264764)
715
+ 5930 train 4.115695 (lr=2.6837e-04) (hash(x)=153157848)
716
+ 5940 train 4.092025 (lr=2.6717e-04) (hash(x)=151086429)
717
+ 5950 train 4.073498 (lr=2.6597e-04) (hash(x)=146009598)
718
+ 5960 train 4.175600 (lr=2.6477e-04) (hash(x)=149488374)
719
+ 5970 train 4.019866 (lr=2.6357e-04) (hash(x)=149307478)
720
+ 5980 train 3.781977 (lr=2.6238e-04) (hash(x)=156742339)
721
+ 5990 train 4.043467 (lr=2.6119e-04) (hash(x)=164296391)
722
+ 6000 val loss 4.1115
723
+ 6000 val perplexity 61.0411
724
+ 6000 train 4.145084 (lr=2.6000e-04) (hash(x)=146613857)
725
+ 6010 train 4.018159 (lr=2.5881e-04) (hash(x)=149742104)
726
+ 6020 train 3.962527 (lr=2.5763e-04) (hash(x)=145645994)
727
+ 6030 train 4.046016 (lr=2.5645e-04) (hash(x)=156324150)
728
+ 6040 train 4.222361 (lr=2.5527e-04) (hash(x)=154655300)
729
+ 6050 train 4.085439 (lr=2.5410e-04) (hash(x)=153714860)
730
+ 6060 train 4.041783 (lr=2.5292e-04) (hash(x)=139981556)
731
+ 6070 train 3.864680 (lr=2.5175e-04) (hash(x)=158753458)
732
+ 6080 train 4.033009 (lr=2.5059e-04) (hash(x)=152559930)
733
+ 6090 train 4.000428 (lr=2.4942e-04) (hash(x)=137128715)
734
+ 6100 val loss 4.1123
735
+ 6100 val perplexity 61.0880
736
+ 6100 train 4.019367 (lr=2.4826e-04) (hash(x)=144621768)
737
+ 6110 train 4.023533 (lr=2.4710e-04) (hash(x)=155614333)
738
+ 6120 train 4.006380 (lr=2.4595e-04) (hash(x)=145682343)
739
+ 6130 train 3.914570 (lr=2.4479e-04) (hash(x)=142909600)
740
+ 6140 train 3.943367 (lr=2.4364e-04) (hash(x)=152993494)
741
+ 6150 train 3.831490 (lr=2.4250e-04) (hash(x)=157151527)
742
+ 6160 train 3.979619 (lr=2.4135e-04) (hash(x)=150653611)
743
+ 6170 train 4.241260 (lr=2.4021e-04) (hash(x)=165237934)
744
+ 6180 train 4.084354 (lr=2.3907e-04) (hash(x)=155730197)
745
+ 6190 train 4.098281 (lr=2.3793e-04) (hash(x)=149687169)
746
+ 6200 val loss 4.1127
747
+ 6200 val perplexity 61.1091
748
+ 6200 train 4.095367 (lr=2.3680e-04) (hash(x)=146521760)
749
+ 6210 train 4.108415 (lr=2.3567e-04) (hash(x)=141008090)
750
+ 6220 train 4.062373 (lr=2.3454e-04) (hash(x)=143407095)
751
+ 6230 train 4.196450 (lr=2.3342e-04) (hash(x)=152947604)
752
+ 6240 train 4.505361 (lr=2.3230e-04) (hash(x)=153876740)
753
+ 6250 train 4.151809 (lr=2.3118e-04) (hash(x)=144250615)
754
+ 6260 train 4.140716 (lr=2.3006e-04) (hash(x)=155236959)
755
+ 6270 train 4.118550 (lr=2.2895e-04) (hash(x)=139785369)
756
+ 6280 train 4.034225 (lr=2.2784e-04) (hash(x)=165036565)
757
+ 6290 train 4.068254 (lr=2.2674e-04) (hash(x)=144585028)
758
+ 6300 val loss 4.0962
759
+ 6300 val perplexity 60.1101
760
+ 6300 train 3.925282 (lr=2.2563e-04) (hash(x)=161378136)
761
+ 6310 train 3.975079 (lr=2.2453e-04) (hash(x)=148483421)
762
+ 6320 train 4.015523 (lr=2.2344e-04) (hash(x)=149835040)
763
+ 6330 train 3.933032 (lr=2.2234e-04) (hash(x)=149459414)
764
+ 6340 train 3.966676 (lr=2.2125e-04) (hash(x)=154290067)
765
+ 6350 train 4.060078 (lr=2.2017e-04) (hash(x)=147407391)
766
+ 6360 train 3.994248 (lr=2.1908e-04) (hash(x)=146317149)
767
+ 6370 train 3.898934 (lr=2.1800e-04) (hash(x)=141970628)
768
+ 6380 train 3.933867 (lr=2.1692e-04) (hash(x)=139170535)
769
+ 6390 train 4.129155 (lr=2.1585e-04) (hash(x)=148905963)
770
+ 6400 val loss 4.0970
771
+ 6400 val perplexity 60.1566
772
+ 6400 train 3.924435 (lr=2.1478e-04) (hash(x)=141624235)
773
+ 6410 train 3.926183 (lr=2.1371e-04) (hash(x)=150520968)
774
+ 6420 train 3.817674 (lr=2.1265e-04) (hash(x)=155019129)
775
+ 6430 train 3.879323 (lr=2.1159e-04) (hash(x)=150031836)
776
+ 6440 train 3.837521 (lr=2.1053e-04) (hash(x)=112835661)
777
+ 6450 train 3.843982 (lr=2.0947e-04) (hash(x)=141072709)
778
+ 6460 train 3.982452 (lr=2.0842e-04) (hash(x)=153933796)
779
+ 6470 train 4.041571 (lr=2.0737e-04) (hash(x)=153315715)
780
+ 6480 train 4.108235 (lr=2.0633e-04) (hash(x)=158089228)
781
+ 6490 train 3.965218 (lr=2.0529e-04) (hash(x)=149471788)
782
+ 6500 val loss 4.0934
783
+ 6500 val perplexity 59.9419
784
+ 6500 train 4.100058 (lr=2.0425e-04) (hash(x)=151197095)
785
+ 6510 train 4.049337 (lr=2.0322e-04) (hash(x)=165554266)
786
+ 6520 train 4.220020 (lr=2.0218e-04) (hash(x)=157822242)
787
+ 6530 train 4.153152 (lr=2.0116e-04) (hash(x)=156267861)
788
+ 6540 train 4.063520 (lr=2.0013e-04) (hash(x)=145825803)
789
+ 6550 train 4.144500 (lr=1.9911e-04) (hash(x)=151303683)
790
+ 6560 train 4.201071 (lr=1.9810e-04) (hash(x)=150786942)
791
+ 6570 train 4.016315 (lr=1.9708e-04) (hash(x)=146237093)
792
+ 6580 train 3.989436 (lr=1.9607e-04) (hash(x)=147063866)
793
+ 6590 train 4.218430 (lr=1.9507e-04) (hash(x)=130513396)
794
+ 6600 val loss 4.0895
795
+ 6600 val perplexity 59.7122
796
+ 6600 train 4.109011 (lr=1.9406e-04) (hash(x)=153269571)
797
+ 6610 train 4.177717 (lr=1.9306e-04) (hash(x)=150340530)
798
+ 6620 train 4.066791 (lr=1.9207e-04) (hash(x)=153843616)
799
+ 6630 train 4.131222 (lr=1.9108e-04) (hash(x)=144540858)
800
+ 6640 train 3.927896 (lr=1.9009e-04) (hash(x)=153330434)
801
+ 6650 train 3.938975 (lr=1.8910e-04) (hash(x)=146851492)
802
+ 6660 train 3.961146 (lr=1.8812e-04) (hash(x)=153601788)
803
+ 6670 train 3.995249 (lr=1.8714e-04) (hash(x)=140005742)
804
+ 6680 train 3.858025 (lr=1.8617e-04) (hash(x)=154102392)
805
+ 6690 train 3.862147 (lr=1.8520e-04) (hash(x)=152774975)
806
+ 6700 val loss 4.0877
807
+ 6700 val perplexity 59.6009
808
+ 6700 train 4.065046 (lr=1.8423e-04) (hash(x)=146111181)
809
+ 6710 train 4.014231 (lr=1.8327e-04) (hash(x)=143988017)
810
+ 6720 train 4.001183 (lr=1.8231e-04) (hash(x)=146073959)
811
+ 6730 train 4.255454 (lr=1.8136e-04) (hash(x)=152642956)
812
+ 6740 train 4.142077 (lr=1.8040e-04) (hash(x)=146964363)
813
+ 6750 train 3.936705 (lr=1.7946e-04) (hash(x)=162251871)
814
+ 6760 train 4.288689 (lr=1.7851e-04) (hash(x)=171122166)
815
+ 6770 train 4.061587 (lr=1.7757e-04) (hash(x)=135199617)
816
+ 6780 train 4.032383 (lr=1.7663e-04) (hash(x)=160480410)
817
+ 6790 train 4.180742 (lr=1.7570e-04) (hash(x)=151933948)
818
+ 6800 val loss 4.0727
819
+ 6800 val perplexity 58.7151
820
+ 6800 train 4.071478 (lr=1.7477e-04) (hash(x)=147269760)
821
+ 6810 train 4.142908 (lr=1.7385e-04) (hash(x)=156710316)
822
+ 6820 train 4.167407 (lr=1.7293e-04) (hash(x)=157300754)
823
+ 6830 train 3.968544 (lr=1.7201e-04) (hash(x)=133838057)
824
+ 6840 train 4.165523 (lr=1.7110e-04) (hash(x)=143518263)
825
+ 6850 train 4.232651 (lr=1.7019e-04) (hash(x)=160328446)
826
+ 6860 train 4.067945 (lr=1.6928e-04) (hash(x)=142143427)
827
+ 6870 train 4.127422 (lr=1.6838e-04) (hash(x)=149045075)
828
+ 6880 train 4.037887 (lr=1.6748e-04) (hash(x)=150708270)
829
+ 6890 train 4.039498 (lr=1.6659e-04) (hash(x)=153066841)
830
+ 6900 val loss 4.0709
831
+ 6900 val perplexity 58.6113
832
+ 6900 train 4.150796 (lr=1.6570e-04) (hash(x)=152912762)
833
+ 6910 train 3.991377 (lr=1.6481e-04) (hash(x)=148346240)
834
+ 6920 train 4.112321 (lr=1.6393e-04) (hash(x)=152806752)
835
+ 6930 train 4.095747 (lr=1.6305e-04) (hash(x)=161831829)
836
+ 6940 train 4.010226 (lr=1.6218e-04) (hash(x)=149483673)
837
+ 6950 train 3.990258 (lr=1.6131e-04) (hash(x)=119634555)
838
+ 6960 train 3.909781 (lr=1.6044e-04) (hash(x)=144709540)
839
+ 6970 train 3.941809 (lr=1.5958e-04) (hash(x)=141277017)
840
+ 6980 train 3.994755 (lr=1.5872e-04) (hash(x)=140618792)
841
+ 6990 train 3.973725 (lr=1.5787e-04) (hash(x)=157443505)
842
+ 7000 val loss 4.0699
843
+ 7000 val perplexity 58.5540
844
+ 7000 train 4.211453 (lr=1.5702e-04) (hash(x)=165412343)
845
+ 7010 train 3.857019 (lr=1.5618e-04) (hash(x)=140746035)
846
+ 7020 train 3.902428 (lr=1.5533e-04) (hash(x)=141101046)
847
+ 7030 train 4.042645 (lr=1.5450e-04) (hash(x)=152098342)
848
+ 7040 train 3.900831 (lr=1.5366e-04) (hash(x)=169244968)
849
+ 7050 train 4.025446 (lr=1.5284e-04) (hash(x)=149886680)
850
+ 7060 train 4.192437 (lr=1.5201e-04) (hash(x)=144670487)
851
+ 7070 train 4.215551 (lr=1.5119e-04) (hash(x)=148774474)
852
+ 7080 train 4.048332 (lr=1.5037e-04) (hash(x)=155096286)
853
+ 7090 train 4.019391 (lr=1.4956e-04) (hash(x)=145048246)
854
+ 7100 val loss 4.0556
855
+ 7100 val perplexity 57.7220
856
+ 7100 train 4.162021 (lr=1.4875e-04) (hash(x)=162866028)
857
+ 7110 train 4.033250 (lr=1.4795e-04) (hash(x)=162308558)
858
+ 7120 train 3.996074 (lr=1.4715e-04) (hash(x)=153468309)
859
+ 7130 train 4.026896 (lr=1.4636e-04) (hash(x)=144522880)
860
+ 7140 train 4.171306 (lr=1.4556e-04) (hash(x)=153055749)
861
+ 7150 train 3.984723 (lr=1.4478e-04) (hash(x)=171163513)
862
+ 7160 train 4.063915 (lr=1.4400e-04) (hash(x)=138563864)
863
+ 7170 train 4.198422 (lr=1.4322e-04) (hash(x)=166996657)
864
+ 7180 train 3.894148 (lr=1.4244e-04) (hash(x)=154809376)
865
+ 7190 train 3.980723 (lr=1.4167e-04) (hash(x)=147988525)
866
+ 7200 val loss 4.0552
867
+ 7200 val perplexity 57.6953
868
+ 7200 train 4.023038 (lr=1.4091e-04) (hash(x)=142998115)
869
+ 7210 train 4.007470 (lr=1.4015e-04) (hash(x)=145518575)
870
+ 7220 train 4.037627 (lr=1.3939e-04) (hash(x)=145758781)
871
+ 7230 train 4.144683 (lr=1.3864e-04) (hash(x)=148909637)
872
+ 7240 train 4.000212 (lr=1.3789e-04) (hash(x)=144422691)
873
+ 7250 train 3.846135 (lr=1.3715e-04) (hash(x)=140396153)
874
+ 7260 train 4.084816 (lr=1.3641e-04) (hash(x)=153619124)
875
+ 7270 train 3.990026 (lr=1.3568e-04) (hash(x)=161980521)
876
+ 7280 train 3.819988 (lr=1.3495e-04) (hash(x)=135160527)
877
+ 7290 train 3.953932 (lr=1.3422e-04) (hash(x)=167156181)
878
+ 7300 val loss 4.0550
879
+ 7300 val perplexity 57.6829
880
+ 7300 train 3.889663 (lr=1.3350e-04) (hash(x)=145486999)
881
+ 7310 train 3.960577 (lr=1.3278e-04) (hash(x)=155013351)
882
+ 7320 train 4.004139 (lr=1.3207e-04) (hash(x)=169616991)
883
+ 7330 train 3.890252 (lr=1.3136e-04) (hash(x)=146516856)
884
+ 7340 train 4.210858 (lr=1.3066e-04) (hash(x)=171918417)
885
+ 7350 train 4.222839 (lr=1.2996e-04) (hash(x)=160511891)
886
+ 7360 train 4.070906 (lr=1.2926e-04) (hash(x)=150280167)
887
+ 7370 train 4.147654 (lr=1.2857e-04) (hash(x)=151373787)
888
+ 7380 train 4.246967 (lr=1.2789e-04) (hash(x)=163311616)
889
+ 7390 train 4.233782 (lr=1.2721e-04) (hash(x)=164418521)
890
+ 7400 val loss 4.0432
891
+ 7400 val perplexity 57.0093
892
+ 7400 train 3.998075 (lr=1.2653e-04) (hash(x)=155325873)
893
+ 7410 train 4.055531 (lr=1.2586e-04) (hash(x)=135020608)
894
+ 7420 train 3.943713 (lr=1.2519e-04) (hash(x)=142812793)
895
+ 7430 train 4.056371 (lr=1.2453e-04) (hash(x)=149463283)
896
+ 7440 train 4.016263 (lr=1.2387e-04) (hash(x)=170512352)
897
+ 7450 train 3.993564 (lr=1.2322e-04) (hash(x)=146253806)
898
+ 7460 train 4.654219 (lr=1.2257e-04) (hash(x)=133479853)
899
+ 7470 train 4.002298 (lr=1.2192e-04) (hash(x)=138198949)
900
+ 7480 train 3.980343 (lr=1.2128e-04) (hash(x)=143831431)
901
+ 7490 train 4.143089 (lr=1.2065e-04) (hash(x)=148147144)
902
+ 7500 val loss 4.0429
903
+ 7500 val perplexity 56.9901
904
+ 7500 train 4.045605 (lr=1.2002e-04) (hash(x)=145131256)
905
+ 7510 train 4.081214 (lr=1.1939e-04) (hash(x)=150932291)
906
+ 7520 train 4.041511 (lr=1.1877e-04) (hash(x)=147605934)
907
+ 7530 train 3.996105 (lr=1.1816e-04) (hash(x)=154343507)
908
+ 7540 train 4.132847 (lr=1.1754e-04) (hash(x)=148440064)
909
+ 7550 train 3.938812 (lr=1.1694e-04) (hash(x)=140505990)
910
+ 7560 train 3.922266 (lr=1.1633e-04) (hash(x)=177438878)
911
+ 7570 train 4.001867 (lr=1.1574e-04) (hash(x)=154527960)
912
+ 7580 train 4.090985 (lr=1.1514e-04) (hash(x)=153793091)
913
+ 7590 train 3.910792 (lr=1.1455e-04) (hash(x)=163097232)
914
+ 7600 val loss 4.0522
915
+ 7600 val perplexity 57.5267
916
+ 7600 train 4.042652 (lr=1.1397e-04) (hash(x)=144008365)
917
+ 7610 train 4.032667 (lr=1.1339e-04) (hash(x)=139649886)
918
+ 7620 train 3.979476 (lr=1.1282e-04) (hash(x)=145452123)
919
+ 7630 train 3.996011 (lr=1.1225e-04) (hash(x)=132477285)
920
+ 7640 train 3.912077 (lr=1.1168e-04) (hash(x)=139030720)
921
+ 7650 train 4.040545 (lr=1.1112e-04) (hash(x)=157466085)
922
+ 7660 train 4.055116 (lr=1.1057e-04) (hash(x)=153638385)
923
+ 7670 train 4.101335 (lr=1.1002e-04) (hash(x)=154823094)
924
+ 7680 train 4.008807 (lr=1.0947e-04) (hash(x)=153107930)
925
+ 7690 train 3.999007 (lr=1.0893e-04) (hash(x)=153681916)
926
+ 7700 val loss 4.0351
927
+ 7700 val perplexity 56.5501
928
+ 7700 train 4.314200 (lr=1.0840e-04) (hash(x)=148848532)
929
+ 7710 train 4.039814 (lr=1.0786e-04) (hash(x)=156640791)
930
+ 7720 train 4.270339 (lr=1.0734e-04) (hash(x)=152573035)
931
+ 7730 train 4.111857 (lr=1.0682e-04) (hash(x)=147536091)
932
+ 7740 train 4.281199 (lr=1.0630e-04) (hash(x)=175687483)
933
+ 7750 train 4.021767 (lr=1.0579e-04) (hash(x)=143775898)
934
+ 7760 train 4.044106 (lr=1.0528e-04) (hash(x)=146986193)
935
+ 7770 train 3.960701 (lr=1.0478e-04) (hash(x)=160924293)
936
+ 7780 train 4.009738 (lr=1.0428e-04) (hash(x)=132579169)
937
+ 7790 train 4.225746 (lr=1.0379e-04) (hash(x)=147474225)
938
+ 7800 val loss 4.0338
939
+ 7800 val perplexity 56.4751
940
+ 7800 train 4.070296 (lr=1.0330e-04) (hash(x)=150391642)
941
+ 7810 train 4.240418 (lr=1.0282e-04) (hash(x)=156984481)
942
+ 7820 train 3.954861 (lr=1.0234e-04) (hash(x)=139652488)
943
+ 7830 train 3.973538 (lr=1.0187e-04) (hash(x)=169033643)
944
+ 7840 train 3.948540 (lr=1.0140e-04) (hash(x)=152645857)
945
+ 7850 train 4.096553 (lr=1.0094e-04) (hash(x)=150228075)
946
+ 7860 train 4.015212 (lr=1.0048e-04) (hash(x)=178788133)
947
+ 7870 train 4.004895 (lr=1.0002e-04) (hash(x)=149891068)
948
+ 7880 train 3.945235 (lr=9.9576e-05) (hash(x)=157461488)
949
+ 7890 train 3.909171 (lr=9.9133e-05) (hash(x)=142502446)
950
+ 7900 val loss 4.0353
951
+ 7900 val perplexity 56.5587
952
+ 7900 train 3.841818 (lr=9.8694e-05) (hash(x)=152191414)
953
+ 7910 train 4.029228 (lr=9.8261e-05) (hash(x)=151329539)
954
+ 7920 train 3.905884 (lr=9.7832e-05) (hash(x)=134143115)
955
+ 7930 train 3.911115 (lr=9.7408e-05) (hash(x)=155536160)
956
+ 7940 train 3.972835 (lr=9.6990e-05) (hash(x)=146709034)
957
+ 7950 train 4.055519 (lr=9.6576e-05) (hash(x)=141966330)
958
+ 7960 train 4.027141 (lr=9.6167e-05) (hash(x)=152102296)
959
+ 7970 train 4.198737 (lr=9.5764e-05) (hash(x)=150272684)
960
+ 7980 train 4.170050 (lr=9.5365e-05) (hash(x)=150251784)
961
+ 7990 train 4.157490 (lr=9.4971e-05) (hash(x)=151226159)
962
+ 8000 val loss 4.0288
963
+ 8000 val perplexity 56.1923
964
+ 8000 train 4.056860 (lr=9.4583e-05) (hash(x)=159755587)
965
+ 8010 train 4.059852 (lr=9.4199e-05) (hash(x)=137607202)
966
+ 8020 train 4.054667 (lr=9.3820e-05) (hash(x)=149574886)
967
+ 8030 train 3.968499 (lr=9.3446e-05) (hash(x)=151690927)
968
+ 8040 train 3.989096 (lr=9.3078e-05) (hash(x)=151181288)
969
+ 8050 train 4.102449 (lr=9.2714e-05) (hash(x)=150465993)
970
+ 8060 train 4.126997 (lr=9.2356e-05) (hash(x)=134201020)
971
+ 8070 train 4.016723 (lr=9.2002e-05) (hash(x)=157696045)
972
+ 8080 train 4.022478 (lr=9.1654e-05) (hash(x)=135581816)
973
+ 8090 train 3.951553 (lr=9.1310e-05) (hash(x)=153898278)
974
+ 8100 val loss 4.0254
975
+ 8100 val perplexity 56.0010
976
+ 8100 train 4.071581 (lr=9.0972e-05) (hash(x)=156664468)
977
+ 8110 train 4.173835 (lr=9.0638e-05) (hash(x)=161572593)
978
+ 8120 train 3.923417 (lr=9.0310e-05) (hash(x)=154381651)
979
+ 8130 train 4.184300 (lr=8.9987e-05) (hash(x)=149810514)
980
+ 8140 train 4.037950 (lr=8.9669e-05) (hash(x)=145085369)
981
+ 8150 train 3.896777 (lr=8.9356e-05) (hash(x)=149020616)
982
+ 8160 train 3.779180 (lr=8.9048e-05) (hash(x)=146450644)
983
+ 8170 train 3.979685 (lr=8.8745e-05) (hash(x)=148094499)
984
+ 8180 train 3.846212 (lr=8.8447e-05) (hash(x)=140810897)
985
+ 8190 train 3.955846 (lr=8.8154e-05) (hash(x)=142960309)
986
+ 8200 val loss 4.0263
987
+ 8200 val perplexity 56.0558
988
+ 8200 train 3.989845 (lr=8.7867e-05) (hash(x)=139457379)
989
+ 8210 train 3.969254 (lr=8.7584e-05) (hash(x)=145575125)
990
+ 8220 train 3.766742 (lr=8.7307e-05) (hash(x)=152836922)
991
+ 8230 train 3.852656 (lr=8.7035e-05) (hash(x)=135086706)
992
+ 8240 train 3.822788 (lr=8.6768e-05) (hash(x)=153448613)
993
+ 8250 train 3.786960 (lr=8.6506e-05) (hash(x)=138041222)
994
+ 8260 train 3.844816 (lr=8.6249e-05) (hash(x)=158857968)
995
+ 8270 train 3.861579 (lr=8.5997e-05) (hash(x)=157265315)
996
+ 8280 train 3.699619 (lr=8.5750e-05) (hash(x)=136059191)
997
+ 8290 train 3.771731 (lr=8.5509e-05) (hash(x)=162526712)
998
+ 8300 val loss 4.0331
999
+ 8300 val perplexity 56.4341
1000
+ 8300 train 3.853588 (lr=8.5273e-05) (hash(x)=145478564)
1001
+ 8310 train 3.498652 (lr=8.5041e-05) (hash(x)=157367549)
1002
+ 8320 train 3.892195 (lr=8.4815e-05) (hash(x)=156085849)
1003
+ 8330 train 4.177628 (lr=8.4594e-05) (hash(x)=154822719)
1004
+ 8340 train 3.892810 (lr=8.4379e-05) (hash(x)=159688342)
1005
+ 8350 train 4.026961 (lr=8.4168e-05) (hash(x)=144133822)
1006
+ 8360 train 3.965671 (lr=8.3963e-05) (hash(x)=154155471)
1007
+ 8370 train 3.969778 (lr=8.3762e-05) (hash(x)=157077804)
1008
+ 8380 train 3.977879 (lr=8.3567e-05) (hash(x)=147796790)
1009
+ 8390 train 3.983309 (lr=8.3377e-05) (hash(x)=145208254)
1010
+ 8400 val loss 4.0212
1011
+ 8400 val perplexity 55.7665
1012
+ 8400 train 3.936470 (lr=8.3193e-05) (hash(x)=154982769)
1013
+ 8410 train 4.127687 (lr=8.3013e-05) (hash(x)=138073867)
1014
+ 8420 train 4.170509 (lr=8.2839e-05) (hash(x)=159521725)
1015
+ 8430 train 4.096187 (lr=8.2669e-05) (hash(x)=149988578)
1016
+ 8440 train 4.064534 (lr=8.2505e-05) (hash(x)=149457063)
1017
+ 8450 train 3.931552 (lr=8.2347e-05) (hash(x)=134678896)
1018
+ 8460 train 3.979403 (lr=8.2193e-05) (hash(x)=146122249)
1019
+ 8470 train 4.000155 (lr=8.2044e-05) (hash(x)=141393319)
1020
+ 8480 train 4.029669 (lr=8.1901e-05) (hash(x)=165396836)
1021
+ 8490 train 4.098529 (lr=8.1763e-05) (hash(x)=140043806)
1022
+ 8500 val loss 4.0175
1023
+ 8500 val perplexity 55.5628
1024
+ 8500 train 4.053110 (lr=8.1630e-05) (hash(x)=145798118)
1025
+ 8510 train 4.001561 (lr=8.1502e-05) (hash(x)=150358299)
1026
+ 8520 train 3.894746 (lr=8.1380e-05) (hash(x)=152572913)
1027
+ 8530 train 3.967403 (lr=8.1263e-05) (hash(x)=148669229)
1028
+ 8540 train 3.993587 (lr=8.1150e-05) (hash(x)=147078347)
1029
+ 8550 train 3.978547 (lr=8.1044e-05) (hash(x)=146715166)
1030
+ 8560 train 3.950264 (lr=8.0942e-05) (hash(x)=143176392)
1031
+ 8570 train 4.067235 (lr=8.0845e-05) (hash(x)=153825484)
1032
+ 8580 train 4.096317 (lr=8.0754e-05) (hash(x)=163379045)
1033
+ 8590 train 3.885189 (lr=8.0668e-05) (hash(x)=140050873)
1034
+ 8600 val loss 4.0200
1035
+ 8600 val perplexity 55.7025
1036
+ 8600 train 3.957555 (lr=8.0587e-05) (hash(x)=143231551)
1037
+ 8610 train 3.812697 (lr=8.0511e-05) (hash(x)=164291360)
1038
+ 8620 train 3.844626 (lr=8.0441e-05) (hash(x)=159122123)
1039
+ 8630 train 3.817628 (lr=8.0376e-05) (hash(x)=140041695)
1040
+ 8640 train 3.849620 (lr=8.0316e-05) (hash(x)=146010790)
1041
+ 8650 train 3.763236 (lr=8.0261e-05) (hash(x)=146882123)
1042
+ 8660 train 3.575861 (lr=8.0211e-05) (hash(x)=144933705)
1043
+ 8670 train 3.803743 (lr=8.0167e-05) (hash(x)=145580075)
1044
+ 8680 train 3.850536 (lr=8.0128e-05) (hash(x)=160982926)
1045
+ 8690 train 3.727633 (lr=8.0094e-05) (hash(x)=140047108)
1046
+ 8700 val loss 4.0299
1047
+ 8700 val perplexity 56.2546
1048
+ 8700 train 3.854482 (lr=8.0065e-05) (hash(x)=154780112)
1049
+ 8710 train 3.940895 (lr=8.0042e-05) (hash(x)=157568560)
1050
+ 8720 train 3.983122 (lr=8.0023e-05) (hash(x)=159587459)
1051
+ 8730 train 3.986237 (lr=8.0010e-05) (hash(x)=152918103)
1052
+ 8740 train 3.984485 (lr=8.0003e-05) (hash(x)=156928586)
1053
+ 8749 val loss 4.0185
1054
+ 8749 val perplexity 55.6157
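log2.txt records one train line every 10 steps (loss, current lr, and a hash(x) value for the batch) and a val loss/perplexity pair every 100 steps, ending at step 8749 with val loss 4.0185 and perplexity 55.6157. A small sketch, assuming the file is read from the uploaded folder, that pulls out the validation pairs and checks that the logged perplexity is simply exp(loss):

```python
import math
import re

# Parse "<step> val loss <x>" / "<step> val perplexity <y>" pairs out of log2.txt.
val_loss, val_ppl = {}, {}
with open("lr8e-4_total_batch_size61440_baseline_seed1340/log2.txt") as f:
    for line in f:
        m = re.match(r"(\d+) val (loss|perplexity) ([\d.]+)", line.strip())
        if not m:
            continue
        step, kind, value = int(m.group(1)), m.group(2), float(m.group(3))
        (val_loss if kind == "loss" else val_ppl)[step] = value

# Perplexity in this log is exp(loss), up to rounding in the printed values.
for step in sorted(val_loss):
    assert abs(math.exp(val_loss[step]) - val_ppl[step]) / val_ppl[step] < 1e-3

last = max(val_loss)
print(f"final: step {last}, loss {val_loss[last]}, ppl {val_ppl[last]}")
# e.g. exp(4.0185) ≈ 55.62, matching the logged 55.6157 at step 8749
```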
lr8e-4_total_batch_size61440_baseline_seed1340/model_08749.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a0f1619dd8d4621d4f83155177dfe843d66005c298d6eb6c6a08a9de2201267
3
+ size 92843394
lr8e-4_total_batch_size61440_baseline_seed1340/optimizer_08749.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ca2a073f94983c8a04cf45dec16b670ace74b0893e4ee7e6a2edb9d37e71b14
3
+ size 179406214
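model_08749.pt and optimizer_08749.pt are the step-8749 checkpoints (roughly 93 MB and 179 MB once their LFS blobs are fetched). Their internal layout is not documented in this commit, so as a hedged sketch one might start by loading them on CPU and printing the top-level structure:

```python
import torch

run = "lr8e-4_total_batch_size61440_baseline_seed1340"

# Loading on CPU avoids needing a GPU just to inspect the checkpoints.
# Recent torch versions default to weights_only=True; optimizer pickles with
# non-tensor state may require weights_only=False (only for trusted files).
model_ckpt = torch.load(f"{run}/model_08749.pt", map_location="cpu")
optim_ckpt = torch.load(f"{run}/optimizer_08749.pt", map_location="cpu")

# Assumption: these are (or contain) plain state dicts; inspect the top-level
# keys before trying to load them into a module or optimizer.
for name, ckpt in [("model", model_ckpt), ("optimizer", optim_ckpt)]:
    summary = list(ckpt)[:8] if isinstance(ckpt, dict) else type(ckpt)
    print(name, summary)
```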