andrew-healey commited on
Commit
179ff27
·
verified ·
1 Parent(s): 565ae2b

Upload folder using huggingface_hub

Browse files
lr2e-4_total_batch_size5120_seq_len128/args.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "proxy_model_sweep/lr2e-4_total_batch_size5120_seq_len128", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 2, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": null, "group": "proxy_model_sweep", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1337, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 128, "batch_size": null, "total_batch_size": 5120, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": true, "mup_enable_coord_check_logging": false, "max_lr": 0.0002, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "n_embd": 128}
lr2e-4_total_batch_size5120_seq_len128/dataloader_00999.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60582085b67f79a59d68bfd29cd9c94a65421fc83fc6750cd587ed93c2812fa4
3
+ size 964
lr2e-4_total_batch_size5120_seq_len128/log2.txt ADDED
@@ -0,0 +1,1043 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ max_steps: 1000
2
+ 0 val loss 11.6870
3
+ 0 val perplexity 119016.5000
4
+ 0 train 11.683437 (lr=6.9930e-07) (hash(x)=6154740)
5
+ 1 train 11.694798 (lr=1.3986e-06) (hash(x)=6605789)
6
+ 2 train 11.686706 (lr=2.0979e-06) (hash(x)=6696707)
7
+ 3 train 11.721859 (lr=2.7972e-06) (hash(x)=5113283)
8
+ 4 train 11.638785 (lr=3.4965e-06) (hash(x)=5607069)
9
+ 5 train 11.652862 (lr=4.1958e-06) (hash(x)=7007748)
10
+ 6 train 11.653969 (lr=4.8951e-06) (hash(x)=7643299)
11
+ 7 train 11.661434 (lr=5.5944e-06) (hash(x)=5534142)
12
+ 8 train 11.631168 (lr=6.2937e-06) (hash(x)=6539293)
13
+ 9 train 11.643259 (lr=6.9930e-06) (hash(x)=5974274)
14
+ 10 train 11.644226 (lr=7.6923e-06) (hash(x)=7199085)
15
+ 11 train 11.602199 (lr=8.3916e-06) (hash(x)=6763231)
16
+ 12 train 11.525333 (lr=9.0909e-06) (hash(x)=6720398)
17
+ 13 train 11.517467 (lr=9.7902e-06) (hash(x)=6706513)
18
+ 14 train 11.546041 (lr=1.0490e-05) (hash(x)=5727925)
19
+ 15 train 11.510773 (lr=1.1189e-05) (hash(x)=6328012)
20
+ 16 train 11.458251 (lr=1.1888e-05) (hash(x)=5812722)
21
+ 17 train 11.434595 (lr=1.2587e-05) (hash(x)=6183012)
22
+ 18 train 11.447615 (lr=1.3287e-05) (hash(x)=5952539)
23
+ 19 train 11.361456 (lr=1.3986e-05) (hash(x)=7052161)
24
+ 20 train 11.310294 (lr=1.4685e-05) (hash(x)=5810871)
25
+ 21 train 11.319653 (lr=1.5385e-05) (hash(x)=6586466)
26
+ 22 train 11.257821 (lr=1.6084e-05) (hash(x)=5061781)
27
+ 23 train 11.243176 (lr=1.6783e-05) (hash(x)=5556098)
28
+ 24 train 11.290564 (lr=1.7483e-05) (hash(x)=7268535)
29
+ 25 train 11.152784 (lr=1.8182e-05) (hash(x)=8155572)
30
+ 26 train 11.089466 (lr=1.8881e-05) (hash(x)=5019590)
31
+ 27 train 11.053359 (lr=1.9580e-05) (hash(x)=4696943)
32
+ 28 train 10.974409 (lr=2.0280e-05) (hash(x)=5984010)
33
+ 29 train 11.075852 (lr=2.0979e-05) (hash(x)=7649389)
34
+ 30 train 10.880369 (lr=2.1678e-05) (hash(x)=6332817)
35
+ 31 train 10.832123 (lr=2.2378e-05) (hash(x)=6219138)
36
+ 32 train 10.904154 (lr=2.3077e-05) (hash(x)=6895681)
37
+ 33 train 10.755385 (lr=2.3776e-05) (hash(x)=7075794)
38
+ 34 train 10.661633 (lr=2.4476e-05) (hash(x)=5024163)
39
+ 35 train 10.716202 (lr=2.5175e-05) (hash(x)=6049878)
40
+ 36 train 10.740591 (lr=2.5874e-05) (hash(x)=4774943)
41
+ 37 train 10.716571 (lr=2.6573e-05) (hash(x)=6385927)
42
+ 38 train 10.569189 (lr=2.7273e-05) (hash(x)=5070217)
43
+ 39 train 10.541676 (lr=2.7972e-05) (hash(x)=5491972)
44
+ 40 train 10.567021 (lr=2.8671e-05) (hash(x)=5695751)
45
+ 41 train 10.516397 (lr=2.9371e-05) (hash(x)=6061420)
46
+ 42 train 10.451446 (lr=3.0070e-05) (hash(x)=7836710)
47
+ 43 train 10.524962 (lr=3.0769e-05) (hash(x)=8075458)
48
+ 44 train 10.365536 (lr=3.1469e-05) (hash(x)=7190584)
49
+ 45 train 10.368127 (lr=3.2168e-05) (hash(x)=7238769)
50
+ 46 train 10.563442 (lr=3.2867e-05) (hash(x)=4931205)
51
+ 47 train 10.362887 (lr=3.3566e-05) (hash(x)=5917741)
52
+ 48 train 10.323801 (lr=3.4266e-05) (hash(x)=5530785)
53
+ 49 train 10.377196 (lr=3.4965e-05) (hash(x)=5430605)
54
+ 50 val loss 10.3576
55
+ 50 val perplexity 31496.1992
56
+ 50 train 10.364745 (lr=3.5664e-05) (hash(x)=6056774)
57
+ 51 train 10.276453 (lr=3.6364e-05) (hash(x)=5750403)
58
+ 52 train 10.288654 (lr=3.7063e-05) (hash(x)=6187497)
59
+ 53 train 10.287844 (lr=3.7762e-05) (hash(x)=4792640)
60
+ 54 train 10.371317 (lr=3.8462e-05) (hash(x)=6868430)
61
+ 55 train 10.253813 (lr=3.9161e-05) (hash(x)=6112458)
62
+ 56 train 10.190598 (lr=3.9860e-05) (hash(x)=5523601)
63
+ 57 train 10.168907 (lr=4.0559e-05) (hash(x)=5801122)
64
+ 58 train 10.254910 (lr=4.1259e-05) (hash(x)=6826295)
65
+ 59 train 10.352236 (lr=4.1958e-05) (hash(x)=5806490)
66
+ 60 train 10.204342 (lr=4.2657e-05) (hash(x)=6223100)
67
+ 61 train 10.174080 (lr=4.3357e-05) (hash(x)=5754140)
68
+ 62 train 10.150809 (lr=4.4056e-05) (hash(x)=6675367)
69
+ 63 train 10.067300 (lr=4.4755e-05) (hash(x)=6568379)
70
+ 64 train 9.939812 (lr=4.5455e-05) (hash(x)=5648233)
71
+ 65 train 10.102877 (lr=4.6154e-05) (hash(x)=7134932)
72
+ 66 train 10.099470 (lr=4.6853e-05) (hash(x)=5242536)
73
+ 67 train 10.125234 (lr=4.7552e-05) (hash(x)=7212403)
74
+ 68 train 9.986435 (lr=4.8252e-05) (hash(x)=5342967)
75
+ 69 train 10.102911 (lr=4.8951e-05) (hash(x)=7720703)
76
+ 70 train 9.992676 (lr=4.9650e-05) (hash(x)=7351337)
77
+ 71 train 10.072321 (lr=5.0350e-05) (hash(x)=7386082)
78
+ 72 train 9.945378 (lr=5.1049e-05) (hash(x)=6931318)
79
+ 73 train 9.957379 (lr=5.1748e-05) (hash(x)=6848513)
80
+ 74 train 9.965687 (lr=5.2448e-05) (hash(x)=7714274)
81
+ 75 train 10.014606 (lr=5.3147e-05) (hash(x)=6359743)
82
+ 76 train 9.857609 (lr=5.3846e-05) (hash(x)=6801650)
83
+ 77 train 9.840652 (lr=5.4545e-05) (hash(x)=7536538)
84
+ 78 train 9.943393 (lr=5.5245e-05) (hash(x)=6957410)
85
+ 79 train 9.950122 (lr=5.5944e-05) (hash(x)=6151674)
86
+ 80 train 9.948956 (lr=5.6643e-05) (hash(x)=6033844)
87
+ 81 train 9.844255 (lr=5.7343e-05) (hash(x)=6317099)
88
+ 82 train 9.821302 (lr=5.8042e-05) (hash(x)=5357844)
89
+ 83 train 9.864862 (lr=5.8741e-05) (hash(x)=7168704)
90
+ 84 train 9.929786 (lr=5.9441e-05) (hash(x)=7157938)
91
+ 85 train 9.808735 (lr=6.0140e-05) (hash(x)=6736092)
92
+ 86 train 9.737628 (lr=6.0839e-05) (hash(x)=5383012)
93
+ 87 train 9.775320 (lr=6.1538e-05) (hash(x)=7859060)
94
+ 88 train 9.778187 (lr=6.2238e-05) (hash(x)=5869288)
95
+ 89 train 9.777124 (lr=6.2937e-05) (hash(x)=6958223)
96
+ 90 train 9.655857 (lr=6.3636e-05) (hash(x)=6731344)
97
+ 91 train 9.594296 (lr=6.4336e-05) (hash(x)=6976239)
98
+ 92 train 9.644037 (lr=6.5035e-05) (hash(x)=7769441)
99
+ 93 train 9.646369 (lr=6.5734e-05) (hash(x)=6956845)
100
+ 94 train 9.777517 (lr=6.6434e-05) (hash(x)=6638037)
101
+ 95 train 9.561569 (lr=6.7133e-05) (hash(x)=6391506)
102
+ 96 train 9.756290 (lr=6.7832e-05) (hash(x)=7319928)
103
+ 97 train 9.733913 (lr=6.8531e-05) (hash(x)=6113701)
104
+ 98 train 9.460426 (lr=6.9231e-05) (hash(x)=6246453)
105
+ 99 train 9.509788 (lr=6.9930e-05) (hash(x)=5245999)
106
+ 100 val loss 9.5695
107
+ 100 val perplexity 14321.9639
108
+ 100 train 9.529686 (lr=7.0629e-05) (hash(x)=6227256)
109
+ 101 train 9.528876 (lr=7.1329e-05) (hash(x)=5774189)
110
+ 102 train 9.466109 (lr=7.2028e-05) (hash(x)=5588220)
111
+ 103 train 9.593409 (lr=7.2727e-05) (hash(x)=5985675)
112
+ 104 train 9.437527 (lr=7.3427e-05) (hash(x)=6259214)
113
+ 105 train 9.478571 (lr=7.4126e-05) (hash(x)=6451776)
114
+ 106 train 9.235435 (lr=7.4825e-05) (hash(x)=4940406)
115
+ 107 train 9.349859 (lr=7.5524e-05) (hash(x)=4355733)
116
+ 108 train 9.393755 (lr=7.6224e-05) (hash(x)=6186065)
117
+ 109 train 9.349123 (lr=7.6923e-05) (hash(x)=6669188)
118
+ 110 train 10.091042 (lr=7.7622e-05) (hash(x)=7118997)
119
+ 111 train 9.393293 (lr=7.8322e-05) (hash(x)=6984772)
120
+ 112 train 9.238573 (lr=7.9021e-05) (hash(x)=6472877)
121
+ 113 train 9.324425 (lr=7.9720e-05) (hash(x)=7440580)
122
+ 114 train 9.382460 (lr=8.0420e-05) (hash(x)=6378428)
123
+ 115 train 9.299797 (lr=8.1119e-05) (hash(x)=6244096)
124
+ 116 train 9.278568 (lr=8.1818e-05) (hash(x)=9100578)
125
+ 117 train 9.443444 (lr=8.2517e-05) (hash(x)=8381560)
126
+ 118 train 9.250710 (lr=8.3217e-05) (hash(x)=6066072)
127
+ 119 train 9.108931 (lr=8.3916e-05) (hash(x)=6603717)
128
+ 120 train 9.090149 (lr=8.4615e-05) (hash(x)=5138438)
129
+ 121 train 8.980786 (lr=8.5315e-05) (hash(x)=6437516)
130
+ 122 train 9.121131 (lr=8.6014e-05) (hash(x)=5743807)
131
+ 123 train 8.972855 (lr=8.6713e-05) (hash(x)=5669522)
132
+ 124 train 9.039129 (lr=8.7413e-05) (hash(x)=5963623)
133
+ 125 train 8.843126 (lr=8.8112e-05) (hash(x)=5505889)
134
+ 126 train 9.094586 (lr=8.8811e-05) (hash(x)=7607512)
135
+ 127 train 8.917691 (lr=8.9510e-05) (hash(x)=6190579)
136
+ 128 train 8.983807 (lr=9.0210e-05) (hash(x)=7444834)
137
+ 129 train 8.979677 (lr=9.0909e-05) (hash(x)=5153196)
138
+ 130 train 8.894050 (lr=9.1608e-05) (hash(x)=6495326)
139
+ 131 train 9.003565 (lr=9.2308e-05) (hash(x)=4912983)
140
+ 132 train 9.047916 (lr=9.3007e-05) (hash(x)=7094046)
141
+ 133 train 8.755109 (lr=9.3706e-05) (hash(x)=5109125)
142
+ 134 train 8.652092 (lr=9.4406e-05) (hash(x)=7119136)
143
+ 135 train 8.683964 (lr=9.5105e-05) (hash(x)=7276303)
144
+ 136 train 8.834178 (lr=9.5804e-05) (hash(x)=9147123)
145
+ 137 train 8.849945 (lr=9.6503e-05) (hash(x)=6645716)
146
+ 138 train 8.788457 (lr=9.7203e-05) (hash(x)=6347875)
147
+ 139 train 8.605400 (lr=9.7902e-05) (hash(x)=5479947)
148
+ 140 train 8.752111 (lr=9.8601e-05) (hash(x)=5832595)
149
+ 141 train 8.846031 (lr=9.9301e-05) (hash(x)=7850360)
150
+ 142 train 9.129520 (lr=1.0000e-04) (hash(x)=8499683)
151
+ 143 train 8.694483 (lr=1.0070e-04) (hash(x)=6630378)
152
+ 144 train 8.672168 (lr=1.0140e-04) (hash(x)=5340846)
153
+ 145 train 8.552694 (lr=1.0210e-04) (hash(x)=6295358)
154
+ 146 train 8.630613 (lr=1.0280e-04) (hash(x)=5974729)
155
+ 147 train 8.905967 (lr=1.0350e-04) (hash(x)=8097010)
156
+ 148 train 8.707867 (lr=1.0420e-04) (hash(x)=6681250)
157
+ 149 train 8.394732 (lr=1.0490e-04) (hash(x)=3958188)
158
+ 150 val loss 8.4740
159
+ 150 val perplexity 4788.7915
160
+ 150 train 8.460539 (lr=1.0559e-04) (hash(x)=5975539)
161
+ 151 train 9.807529 (lr=1.0629e-04) (hash(x)=9989744)
162
+ 152 train 9.122705 (lr=1.0699e-04) (hash(x)=6748487)
163
+ 153 train 8.303342 (lr=1.0769e-04) (hash(x)=6343832)
164
+ 154 train 8.324159 (lr=1.0839e-04) (hash(x)=5688949)
165
+ 155 train 8.699677 (lr=1.0909e-04) (hash(x)=6694932)
166
+ 156 train 8.354797 (lr=1.0979e-04) (hash(x)=5872835)
167
+ 157 train 8.368828 (lr=1.1049e-04) (hash(x)=6429137)
168
+ 158 train 8.472591 (lr=1.1119e-04) (hash(x)=6999390)
169
+ 159 train 8.349797 (lr=1.1189e-04) (hash(x)=6317894)
170
+ 160 train 8.124361 (lr=1.1259e-04) (hash(x)=5813657)
171
+ 161 train 8.177979 (lr=1.1329e-04) (hash(x)=5165595)
172
+ 162 train 8.151420 (lr=1.1399e-04) (hash(x)=5682633)
173
+ 163 train 8.337730 (lr=1.1469e-04) (hash(x)=5046327)
174
+ 164 train 8.464416 (lr=1.1538e-04) (hash(x)=6195808)
175
+ 165 train 8.275805 (lr=1.1608e-04) (hash(x)=7060363)
176
+ 166 train 8.220079 (lr=1.1678e-04) (hash(x)=5837126)
177
+ 167 train 8.193572 (lr=1.1748e-04) (hash(x)=7947703)
178
+ 168 train 7.893754 (lr=1.1818e-04) (hash(x)=5277865)
179
+ 169 train 8.093842 (lr=1.1888e-04) (hash(x)=5971240)
180
+ 170 train 8.007186 (lr=1.1958e-04) (hash(x)=6590408)
181
+ 171 train 8.146614 (lr=1.2028e-04) (hash(x)=5916068)
182
+ 172 train 8.083666 (lr=1.2098e-04) (hash(x)=6069727)
183
+ 173 train 8.069501 (lr=1.2168e-04) (hash(x)=7426277)
184
+ 174 train 8.245203 (lr=1.2238e-04) (hash(x)=5356513)
185
+ 175 train 8.044041 (lr=1.2308e-04) (hash(x)=5777498)
186
+ 176 train 8.251587 (lr=1.2378e-04) (hash(x)=5989756)
187
+ 177 train 8.015437 (lr=1.2448e-04) (hash(x)=6403331)
188
+ 178 train 7.996015 (lr=1.2517e-04) (hash(x)=7650667)
189
+ 179 train 8.122531 (lr=1.2587e-04) (hash(x)=5753092)
190
+ 180 train 7.969062 (lr=1.2657e-04) (hash(x)=5751457)
191
+ 181 train 7.827992 (lr=1.2727e-04) (hash(x)=5981486)
192
+ 182 train 7.892427 (lr=1.2797e-04) (hash(x)=5918229)
193
+ 183 train 8.063690 (lr=1.2867e-04) (hash(x)=7723226)
194
+ 184 train 8.113661 (lr=1.2937e-04) (hash(x)=6111721)
195
+ 185 train 7.830946 (lr=1.3007e-04) (hash(x)=6069558)
196
+ 186 train 8.000052 (lr=1.3077e-04) (hash(x)=5626336)
197
+ 187 train 8.198039 (lr=1.3147e-04) (hash(x)=7007016)
198
+ 188 train 7.762515 (lr=1.3217e-04) (hash(x)=5883013)
199
+ 189 train 7.651258 (lr=1.3287e-04) (hash(x)=5027958)
200
+ 190 train 7.965401 (lr=1.3357e-04) (hash(x)=5474948)
201
+ 191 train 7.776146 (lr=1.3427e-04) (hash(x)=6491229)
202
+ 192 train 7.866118 (lr=1.3497e-04) (hash(x)=5518341)
203
+ 193 train 7.886746 (lr=1.3566e-04) (hash(x)=7036653)
204
+ 194 train 7.922240 (lr=1.3636e-04) (hash(x)=5527616)
205
+ 195 train 8.179319 (lr=1.3706e-04) (hash(x)=6390752)
206
+ 196 train 7.904310 (lr=1.3776e-04) (hash(x)=6032557)
207
+ 197 train 8.550974 (lr=1.3846e-04) (hash(x)=7758167)
208
+ 198 train 8.419663 (lr=1.3916e-04) (hash(x)=7968070)
209
+ 199 train 7.961051 (lr=1.3986e-04) (hash(x)=6019389)
210
+ 200 val loss 7.9012
211
+ 200 val perplexity 2700.5374
212
+ 200 train 8.022850 (lr=1.4056e-04) (hash(x)=7276744)
213
+ 201 train 7.989044 (lr=1.4126e-04) (hash(x)=6503191)
214
+ 202 train 7.828589 (lr=1.4196e-04) (hash(x)=6006880)
215
+ 203 train 8.067000 (lr=1.4266e-04) (hash(x)=7662067)
216
+ 204 train 7.795067 (lr=1.4336e-04) (hash(x)=6297345)
217
+ 205 train 8.048116 (lr=1.4406e-04) (hash(x)=7901992)
218
+ 206 train 8.151708 (lr=1.4476e-04) (hash(x)=6579950)
219
+ 207 train 7.696474 (lr=1.4545e-04) (hash(x)=4648609)
220
+ 208 train 7.942523 (lr=1.4615e-04) (hash(x)=6903216)
221
+ 209 train 7.778346 (lr=1.4685e-04) (hash(x)=5897288)
222
+ 210 train 7.837867 (lr=1.4755e-04) (hash(x)=7300160)
223
+ 211 train 7.690275 (lr=1.4825e-04) (hash(x)=4725966)
224
+ 212 train 7.484192 (lr=1.4895e-04) (hash(x)=5060060)
225
+ 213 train 8.026106 (lr=1.4965e-04) (hash(x)=6243442)
226
+ 214 train 7.728159 (lr=1.5035e-04) (hash(x)=5893816)
227
+ 215 train 7.764685 (lr=1.5105e-04) (hash(x)=5558355)
228
+ 216 train 7.880827 (lr=1.5175e-04) (hash(x)=4378747)
229
+ 217 train 7.740253 (lr=1.5245e-04) (hash(x)=5400407)
230
+ 218 train 7.972810 (lr=1.5315e-04) (hash(x)=6900554)
231
+ 219 train 7.896116 (lr=1.5385e-04) (hash(x)=6524933)
232
+ 220 train 7.797544 (lr=1.5455e-04) (hash(x)=5409944)
233
+ 221 train 7.893077 (lr=1.5524e-04) (hash(x)=5889724)
234
+ 222 train 7.834795 (lr=1.5594e-04) (hash(x)=4970496)
235
+ 223 train 7.634544 (lr=1.5664e-04) (hash(x)=6369326)
236
+ 224 train 7.717482 (lr=1.5734e-04) (hash(x)=6563975)
237
+ 225 train 7.848202 (lr=1.5804e-04) (hash(x)=5911906)
238
+ 226 train 7.927455 (lr=1.5874e-04) (hash(x)=6111462)
239
+ 227 train 7.956487 (lr=1.5944e-04) (hash(x)=6022625)
240
+ 228 train 7.772564 (lr=1.6014e-04) (hash(x)=5884663)
241
+ 229 train 7.826407 (lr=1.6084e-04) (hash(x)=6584810)
242
+ 230 train 7.827516 (lr=1.6154e-04) (hash(x)=5289998)
243
+ 231 train 7.542418 (lr=1.6224e-04) (hash(x)=4906853)
244
+ 232 train 7.586934 (lr=1.6294e-04) (hash(x)=5862071)
245
+ 233 train 7.211428 (lr=1.6364e-04) (hash(x)=4564564)
246
+ 234 train 7.587811 (lr=1.6434e-04) (hash(x)=5723971)
247
+ 235 train 7.707727 (lr=1.6503e-04) (hash(x)=5589269)
248
+ 236 train 7.671451 (lr=1.6573e-04) (hash(x)=5683368)
249
+ 237 train 7.922763 (lr=1.6643e-04) (hash(x)=6969180)
250
+ 238 train 7.824206 (lr=1.6713e-04) (hash(x)=5341421)
251
+ 239 train 7.595468 (lr=1.6783e-04) (hash(x)=6437376)
252
+ 240 train 8.101495 (lr=1.6853e-04) (hash(x)=7527007)
253
+ 241 train 7.933755 (lr=1.6923e-04) (hash(x)=7116258)
254
+ 242 train 7.899949 (lr=1.6993e-04) (hash(x)=6463930)
255
+ 243 train 7.936605 (lr=1.7063e-04) (hash(x)=7304892)
256
+ 244 train 8.065814 (lr=1.7133e-04) (hash(x)=5881808)
257
+ 245 train 7.869835 (lr=1.7203e-04) (hash(x)=7366721)
258
+ 246 train 7.764749 (lr=1.7273e-04) (hash(x)=6457521)
259
+ 247 train 8.144732 (lr=1.7343e-04) (hash(x)=7084093)
260
+ 248 train 7.822935 (lr=1.7413e-04) (hash(x)=6015300)
261
+ 249 train 8.075032 (lr=1.7483e-04) (hash(x)=7197652)
262
+ 250 val loss 7.7788
263
+ 250 val perplexity 2389.3330
264
+ 250 train 8.073785 (lr=1.7552e-04) (hash(x)=4830828)
265
+ 251 train 7.828621 (lr=1.7622e-04) (hash(x)=5908178)
266
+ 252 train 7.798131 (lr=1.7692e-04) (hash(x)=6299092)
267
+ 253 train 7.783942 (lr=1.7762e-04) (hash(x)=6349974)
268
+ 254 train 7.822704 (lr=1.7832e-04) (hash(x)=5641494)
269
+ 255 train 7.777048 (lr=1.7902e-04) (hash(x)=7048804)
270
+ 256 train 8.007664 (lr=1.7972e-04) (hash(x)=7035244)
271
+ 257 train 7.696496 (lr=1.8042e-04) (hash(x)=7909039)
272
+ 258 train 7.936101 (lr=1.8112e-04) (hash(x)=5470939)
273
+ 259 train 7.748548 (lr=1.8182e-04) (hash(x)=6085549)
274
+ 260 train 8.059592 (lr=1.8252e-04) (hash(x)=5882649)
275
+ 261 train 7.917109 (lr=1.8322e-04) (hash(x)=7463181)
276
+ 262 train 8.189245 (lr=1.8392e-04) (hash(x)=5215574)
277
+ 263 train 7.670319 (lr=1.8462e-04) (hash(x)=5752594)
278
+ 264 train 7.827533 (lr=1.8531e-04) (hash(x)=6416619)
279
+ 265 train 7.827281 (lr=1.8601e-04) (hash(x)=6114601)
280
+ 266 train 7.731312 (lr=1.8671e-04) (hash(x)=5646171)
281
+ 267 train 7.672514 (lr=1.8741e-04) (hash(x)=7662769)
282
+ 268 train 7.963041 (lr=1.8811e-04) (hash(x)=6394376)
283
+ 269 train 7.552702 (lr=1.8881e-04) (hash(x)=7485666)
284
+ 270 train 7.689405 (lr=1.8951e-04) (hash(x)=6636823)
285
+ 271 train 7.603408 (lr=1.9021e-04) (hash(x)=6393520)
286
+ 272 train 7.760921 (lr=1.9091e-04) (hash(x)=5235659)
287
+ 273 train 7.811428 (lr=1.9161e-04) (hash(x)=5586291)
288
+ 274 train 8.089213 (lr=1.9231e-04) (hash(x)=7034674)
289
+ 275 train 8.170650 (lr=1.9301e-04) (hash(x)=5942867)
290
+ 276 train 7.852236 (lr=1.9371e-04) (hash(x)=7812344)
291
+ 277 train 8.116363 (lr=1.9441e-04) (hash(x)=7574027)
292
+ 278 train 8.076002 (lr=1.9510e-04) (hash(x)=5960216)
293
+ 279 train 7.858212 (lr=1.9580e-04) (hash(x)=6793550)
294
+ 280 train 7.828798 (lr=1.9650e-04) (hash(x)=6879965)
295
+ 281 train 7.541502 (lr=1.9720e-04) (hash(x)=6264789)
296
+ 282 train 7.743322 (lr=1.9790e-04) (hash(x)=5175356)
297
+ 283 train 7.964029 (lr=1.9860e-04) (hash(x)=7105976)
298
+ 284 train 7.956531 (lr=1.9930e-04) (hash(x)=7286386)
299
+ 285 train 8.113026 (lr=2.0000e-04) (hash(x)=7244279)
300
+ 286 train 7.624428 (lr=2.0000e-04) (hash(x)=4877247)
301
+ 287 train 7.599567 (lr=2.0000e-04) (hash(x)=6581348)
302
+ 288 train 7.605886 (lr=2.0000e-04) (hash(x)=6397208)
303
+ 289 train 7.696164 (lr=1.9999e-04) (hash(x)=6543825)
304
+ 290 train 7.472377 (lr=1.9999e-04) (hash(x)=6088993)
305
+ 291 train 7.815174 (lr=1.9998e-04) (hash(x)=5555598)
306
+ 292 train 7.867016 (lr=1.9997e-04) (hash(x)=6924240)
307
+ 293 train 8.014560 (lr=1.9996e-04) (hash(x)=7254232)
308
+ 294 train 7.609447 (lr=1.9994e-04) (hash(x)=7689743)
309
+ 295 train 7.610065 (lr=1.9993e-04) (hash(x)=6235837)
310
+ 296 train 7.831267 (lr=1.9991e-04) (hash(x)=5216883)
311
+ 297 train 7.480352 (lr=1.9989e-04) (hash(x)=4738550)
312
+ 298 train 7.616215 (lr=1.9987e-04) (hash(x)=5126918)
313
+ 299 train 7.817634 (lr=1.9985e-04) (hash(x)=5591770)
314
+ 300 val loss 7.7530
315
+ 300 val perplexity 2328.5071
316
+ 300 train 8.652920 (lr=1.9983e-04) (hash(x)=8001573)
317
+ 301 train 8.383079 (lr=1.9980e-04) (hash(x)=8012491)
318
+ 302 train 8.058331 (lr=1.9978e-04) (hash(x)=3997428)
319
+ 303 train 7.362261 (lr=1.9975e-04) (hash(x)=7083021)
320
+ 304 train 7.986622 (lr=1.9972e-04) (hash(x)=7177371)
321
+ 305 train 7.787862 (lr=1.9969e-04) (hash(x)=6456022)
322
+ 306 train 7.865339 (lr=1.9965e-04) (hash(x)=5238861)
323
+ 307 train 7.842036 (lr=1.9962e-04) (hash(x)=7281348)
324
+ 308 train 8.021835 (lr=1.9958e-04) (hash(x)=6848441)
325
+ 309 train 8.087339 (lr=1.9954e-04) (hash(x)=7824794)
326
+ 310 train 8.200470 (lr=1.9950e-04) (hash(x)=7440208)
327
+ 311 train 7.985400 (lr=1.9946e-04) (hash(x)=6155821)
328
+ 312 train 7.749827 (lr=1.9941e-04) (hash(x)=5350461)
329
+ 313 train 7.956250 (lr=1.9937e-04) (hash(x)=7807564)
330
+ 314 train 7.788625 (lr=1.9932e-04) (hash(x)=5992165)
331
+ 315 train 7.755401 (lr=1.9927e-04) (hash(x)=5736241)
332
+ 316 train 7.743199 (lr=1.9922e-04) (hash(x)=6536253)
333
+ 317 train 7.931822 (lr=1.9916e-04) (hash(x)=5688415)
334
+ 318 train 7.577342 (lr=1.9911e-04) (hash(x)=5839705)
335
+ 319 train 7.771173 (lr=1.9905e-04) (hash(x)=5657123)
336
+ 320 train 7.780609 (lr=1.9899e-04) (hash(x)=6354364)
337
+ 321 train 7.805384 (lr=1.9893e-04) (hash(x)=6102985)
338
+ 322 train 8.022583 (lr=1.9887e-04) (hash(x)=7161967)
339
+ 323 train 7.984571 (lr=1.9881e-04) (hash(x)=6452095)
340
+ 324 train 7.725709 (lr=1.9874e-04) (hash(x)=5547396)
341
+ 325 train 8.043575 (lr=1.9868e-04) (hash(x)=5848921)
342
+ 326 train 7.798221 (lr=1.9861e-04) (hash(x)=4890294)
343
+ 327 train 7.421914 (lr=1.9854e-04) (hash(x)=5312267)
344
+ 328 train 7.481781 (lr=1.9847e-04) (hash(x)=5348929)
345
+ 329 train 7.443196 (lr=1.9839e-04) (hash(x)=5025297)
346
+ 330 train 7.469763 (lr=1.9832e-04) (hash(x)=5670052)
347
+ 331 train 7.329932 (lr=1.9824e-04) (hash(x)=5434493)
348
+ 332 train 8.107336 (lr=1.9816e-04) (hash(x)=9122453)
349
+ 333 train 7.850559 (lr=1.9808e-04) (hash(x)=5197474)
350
+ 334 train 7.713950 (lr=1.9800e-04) (hash(x)=5550786)
351
+ 335 train 7.784082 (lr=1.9792e-04) (hash(x)=6830813)
352
+ 336 train 7.580809 (lr=1.9783e-04) (hash(x)=5258435)
353
+ 337 train 7.616575 (lr=1.9774e-04) (hash(x)=5242548)
354
+ 338 train 7.509399 (lr=1.9765e-04) (hash(x)=3754886)
355
+ 339 train 7.416758 (lr=1.9756e-04) (hash(x)=4752771)
356
+ 340 train 7.609068 (lr=1.9747e-04) (hash(x)=5926875)
357
+ 341 train 8.055240 (lr=1.9738e-04) (hash(x)=6691249)
358
+ 342 train 7.809646 (lr=1.9728e-04) (hash(x)=7514623)
359
+ 343 train 7.882940 (lr=1.9718e-04) (hash(x)=6424933)
360
+ 344 train 7.872913 (lr=1.9709e-04) (hash(x)=6347757)
361
+ 345 train 7.820632 (lr=1.9698e-04) (hash(x)=6960120)
362
+ 346 train 7.834363 (lr=1.9688e-04) (hash(x)=7597197)
363
+ 347 train 7.758900 (lr=1.9678e-04) (hash(x)=5786517)
364
+ 348 train 7.774318 (lr=1.9667e-04) (hash(x)=6424363)
365
+ 349 train 7.871087 (lr=1.9656e-04) (hash(x)=7229689)
366
+ 350 val loss 7.7736
367
+ 350 val perplexity 2377.0247
368
+ 350 train 7.964574 (lr=1.9646e-04) (hash(x)=6772284)
369
+ 351 train 8.060130 (lr=1.9634e-04) (hash(x)=6680023)
370
+ 352 train 7.923871 (lr=1.9623e-04) (hash(x)=6218520)
371
+ 353 train 7.969292 (lr=1.9612e-04) (hash(x)=7688393)
372
+ 354 train 7.834572 (lr=1.9600e-04) (hash(x)=6327977)
373
+ 355 train 7.732324 (lr=1.9588e-04) (hash(x)=6474729)
374
+ 356 train 7.851903 (lr=1.9576e-04) (hash(x)=6657266)
375
+ 357 train 7.874036 (lr=1.9564e-04) (hash(x)=5946650)
376
+ 358 train 7.784548 (lr=1.9552e-04) (hash(x)=6734878)
377
+ 359 train 7.835220 (lr=1.9540e-04) (hash(x)=7523279)
378
+ 360 train 7.688325 (lr=1.9527e-04) (hash(x)=6179478)
379
+ 361 train 7.739112 (lr=1.9514e-04) (hash(x)=5531605)
380
+ 362 train 7.750559 (lr=1.9501e-04) (hash(x)=5844980)
381
+ 363 train 7.431763 (lr=1.9488e-04) (hash(x)=5508768)
382
+ 364 train 7.271553 (lr=1.9475e-04) (hash(x)=6337190)
383
+ 365 train 8.805032 (lr=1.9462e-04) (hash(x)=6415982)
384
+ 366 train 7.899531 (lr=1.9448e-04) (hash(x)=6828959)
385
+ 367 train 7.871853 (lr=1.9434e-04) (hash(x)=6635925)
386
+ 368 train 7.598778 (lr=1.9421e-04) (hash(x)=6358540)
387
+ 369 train 7.972240 (lr=1.9406e-04) (hash(x)=5923706)
388
+ 370 train 7.589488 (lr=1.9392e-04) (hash(x)=4843320)
389
+ 371 train 7.893913 (lr=1.9378e-04) (hash(x)=6663801)
390
+ 372 train 7.688990 (lr=1.9363e-04) (hash(x)=5583287)
391
+ 373 train 7.672047 (lr=1.9349e-04) (hash(x)=6113540)
392
+ 374 train 7.779304 (lr=1.9334e-04) (hash(x)=6389936)
393
+ 375 train 7.763093 (lr=1.9319e-04) (hash(x)=5869441)
394
+ 376 train 7.818826 (lr=1.9304e-04) (hash(x)=5148617)
395
+ 377 train 8.073936 (lr=1.9288e-04) (hash(x)=6913454)
396
+ 378 train 7.798039 (lr=1.9273e-04) (hash(x)=5356233)
397
+ 379 train 7.787842 (lr=1.9257e-04) (hash(x)=6698878)
398
+ 380 train 7.702433 (lr=1.9241e-04) (hash(x)=6280665)
399
+ 381 train 7.908127 (lr=1.9225e-04) (hash(x)=6094899)
400
+ 382 train 7.470886 (lr=1.9209e-04) (hash(x)=6249746)
401
+ 383 train 7.797125 (lr=1.9193e-04) (hash(x)=6349004)
402
+ 384 train 7.723500 (lr=1.9176e-04) (hash(x)=7013468)
403
+ 385 train 7.697244 (lr=1.9160e-04) (hash(x)=6045933)
404
+ 386 train 8.018248 (lr=1.9143e-04) (hash(x)=6571735)
405
+ 387 train 7.623819 (lr=1.9126e-04) (hash(x)=4986137)
406
+ 388 train 7.290435 (lr=1.9109e-04) (hash(x)=5244912)
407
+ 389 train 7.655291 (lr=1.9091e-04) (hash(x)=4798229)
408
+ 390 train 7.692018 (lr=1.9074e-04) (hash(x)=5815783)
409
+ 391 train 7.502819 (lr=1.9056e-04) (hash(x)=6008454)
410
+ 392 train 7.693169 (lr=1.9039e-04) (hash(x)=6407333)
411
+ 393 train 7.590372 (lr=1.9021e-04) (hash(x)=5938362)
412
+ 394 train 7.456230 (lr=1.9003e-04) (hash(x)=6077124)
413
+ 395 train 7.752810 (lr=1.8985e-04) (hash(x)=6550770)
414
+ 396 train 7.590805 (lr=1.8966e-04) (hash(x)=6181528)
415
+ 397 train 7.866416 (lr=1.8948e-04) (hash(x)=7055344)
416
+ 398 train 7.972468 (lr=1.8929e-04) (hash(x)=7689348)
417
+ 399 train 7.879353 (lr=1.8910e-04) (hash(x)=7682741)
418
+ 400 val loss 7.7600
419
+ 400 val perplexity 2344.9790
420
+ 400 train 7.693010 (lr=1.8891e-04) (hash(x)=5189401)
421
+ 401 train 7.728419 (lr=1.8872e-04) (hash(x)=7372095)
422
+ 402 train 7.807273 (lr=1.8853e-04) (hash(x)=6838219)
423
+ 403 train 8.284609 (lr=1.8834e-04) (hash(x)=7892158)
424
+ 404 train 7.937898 (lr=1.8814e-04) (hash(x)=5677077)
425
+ 405 train 7.556235 (lr=1.8794e-04) (hash(x)=5929740)
426
+ 406 train 7.717421 (lr=1.8774e-04) (hash(x)=5285055)
427
+ 407 train 7.802217 (lr=1.8754e-04) (hash(x)=7794028)
428
+ 408 train 7.903992 (lr=1.8734e-04) (hash(x)=6640802)
429
+ 409 train 7.978027 (lr=1.8714e-04) (hash(x)=6839923)
430
+ 410 train 7.449770 (lr=1.8693e-04) (hash(x)=4020997)
431
+ 411 train 7.623665 (lr=1.8673e-04) (hash(x)=7093523)
432
+ 412 train 7.926448 (lr=1.8652e-04) (hash(x)=7370045)
433
+ 413 train 7.645683 (lr=1.8631e-04) (hash(x)=5341312)
434
+ 414 train 7.665128 (lr=1.8610e-04) (hash(x)=5589312)
435
+ 415 train 8.124632 (lr=1.8589e-04) (hash(x)=6166062)
436
+ 416 train 8.043650 (lr=1.8567e-04) (hash(x)=8354149)
437
+ 417 train 7.647033 (lr=1.8546e-04) (hash(x)=5992164)
438
+ 418 train 7.665911 (lr=1.8524e-04) (hash(x)=4748657)
439
+ 419 train 7.701187 (lr=1.8502e-04) (hash(x)=6645781)
440
+ 420 train 7.918882 (lr=1.8480e-04) (hash(x)=6689147)
441
+ 421 train 7.786612 (lr=1.8458e-04) (hash(x)=6207724)
442
+ 422 train 7.739450 (lr=1.8436e-04) (hash(x)=6232863)
443
+ 423 train 7.502040 (lr=1.8414e-04) (hash(x)=4811192)
444
+ 424 train 7.512648 (lr=1.8391e-04) (hash(x)=6211761)
445
+ 425 train 7.701155 (lr=1.8369e-04) (hash(x)=6906335)
446
+ 426 train 7.622811 (lr=1.8346e-04) (hash(x)=4922365)
447
+ 427 train 7.658427 (lr=1.8323e-04) (hash(x)=5970866)
448
+ 428 train 7.590514 (lr=1.8300e-04) (hash(x)=6417833)
449
+ 429 train 7.700044 (lr=1.8276e-04) (hash(x)=5238015)
450
+ 430 train 7.603992 (lr=1.8253e-04) (hash(x)=5375451)
451
+ 431 train 7.719973 (lr=1.8230e-04) (hash(x)=7822680)
452
+ 432 train 7.762851 (lr=1.8206e-04) (hash(x)=6173200)
453
+ 433 train 7.519185 (lr=1.8182e-04) (hash(x)=5478019)
454
+ 434 train 7.732025 (lr=1.8158e-04) (hash(x)=5597859)
455
+ 435 train 7.572690 (lr=1.8134e-04) (hash(x)=6317317)
456
+ 436 train 7.768342 (lr=1.8110e-04) (hash(x)=7120467)
457
+ 437 train 7.882036 (lr=1.8086e-04) (hash(x)=5832344)
458
+ 438 train 7.758165 (lr=1.8061e-04) (hash(x)=6813802)
459
+ 439 train 7.818798 (lr=1.8036e-04) (hash(x)=6431409)
460
+ 440 train 7.844862 (lr=1.8012e-04) (hash(x)=5886861)
461
+ 441 train 7.614089 (lr=1.7987e-04) (hash(x)=5900130)
462
+ 442 train 7.852407 (lr=1.7962e-04) (hash(x)=6061563)
463
+ 443 train 7.794435 (lr=1.7937e-04) (hash(x)=6653337)
464
+ 444 train 7.707383 (lr=1.7911e-04) (hash(x)=6575992)
465
+ 445 train 7.706908 (lr=1.7886e-04) (hash(x)=6696784)
466
+ 446 train 7.847198 (lr=1.7860e-04) (hash(x)=6242161)
467
+ 447 train 7.702934 (lr=1.7835e-04) (hash(x)=5323032)
468
+ 448 train 7.489307 (lr=1.7809e-04) (hash(x)=6439367)
469
+ 449 train 7.732404 (lr=1.7783e-04) (hash(x)=5372100)
470
+ 450 val loss 7.7250
471
+ 450 val perplexity 2264.2266
472
+ 450 train 7.761188 (lr=1.7757e-04) (hash(x)=6344849)
473
+ 451 train 7.412691 (lr=1.7731e-04) (hash(x)=5125339)
474
+ 452 train 7.405895 (lr=1.7704e-04) (hash(x)=4876156)
475
+ 453 train 7.547815 (lr=1.7678e-04) (hash(x)=5664463)
476
+ 454 train 7.445304 (lr=1.7651e-04) (hash(x)=6010042)
477
+ 455 train 7.658559 (lr=1.7624e-04) (hash(x)=7363286)
478
+ 456 train 8.242727 (lr=1.7597e-04) (hash(x)=4873268)
479
+ 457 train 7.959551 (lr=1.7570e-04) (hash(x)=7444468)
480
+ 458 train 7.980326 (lr=1.7543e-04) (hash(x)=6698134)
481
+ 459 train 7.801604 (lr=1.7516e-04) (hash(x)=7670050)
482
+ 460 train 7.816780 (lr=1.7489e-04) (hash(x)=6280866)
483
+ 461 train 7.714090 (lr=1.7461e-04) (hash(x)=8189420)
484
+ 462 train 7.806901 (lr=1.7434e-04) (hash(x)=6878292)
485
+ 463 train 7.519126 (lr=1.7406e-04) (hash(x)=5616075)
486
+ 464 train 7.719515 (lr=1.7378e-04) (hash(x)=5839939)
487
+ 465 train 7.606757 (lr=1.7350e-04) (hash(x)=6360924)
488
+ 466 train 8.046442 (lr=1.7322e-04) (hash(x)=6466475)
489
+ 467 train 7.824868 (lr=1.7294e-04) (hash(x)=6593764)
490
+ 468 train 8.261981 (lr=1.7265e-04) (hash(x)=7389698)
491
+ 469 train 7.910909 (lr=1.7237e-04) (hash(x)=6764377)
492
+ 470 train 7.637399 (lr=1.7208e-04) (hash(x)=7293570)
493
+ 471 train 7.663312 (lr=1.7179e-04) (hash(x)=5672608)
494
+ 472 train 7.532629 (lr=1.7151e-04) (hash(x)=5587975)
495
+ 473 train 7.415164 (lr=1.7122e-04) (hash(x)=6985256)
496
+ 474 train 8.010103 (lr=1.7092e-04) (hash(x)=6549231)
497
+ 475 train 7.711175 (lr=1.7063e-04) (hash(x)=6828653)
498
+ 476 train 7.524570 (lr=1.7034e-04) (hash(x)=5478970)
499
+ 477 train 7.569299 (lr=1.7004e-04) (hash(x)=5199845)
500
+ 478 train 7.650083 (lr=1.6975e-04) (hash(x)=6482073)
501
+ 479 train 7.745840 (lr=1.6945e-04) (hash(x)=6839867)
502
+ 480 train 7.740882 (lr=1.6915e-04) (hash(x)=7073454)
503
+ 481 train 7.989164 (lr=1.6886e-04) (hash(x)=6928712)
504
+ 482 train 7.723703 (lr=1.6856e-04) (hash(x)=5962982)
505
+ 483 train 7.821363 (lr=1.6825e-04) (hash(x)=8426992)
506
+ 484 train 7.738289 (lr=1.6795e-04) (hash(x)=6422394)
507
+ 485 train 7.680492 (lr=1.6765e-04) (hash(x)=5330355)
508
+ 486 train 7.154161 (lr=1.6734e-04) (hash(x)=4111491)
509
+ 487 train 7.181684 (lr=1.6704e-04) (hash(x)=4916928)
510
+ 488 train 7.350986 (lr=1.6673e-04) (hash(x)=6610244)
511
+ 489 train 7.846370 (lr=1.6642e-04) (hash(x)=7287928)
512
+ 490 train 7.739513 (lr=1.6611e-04) (hash(x)=6525083)
513
+ 491 train 7.667002 (lr=1.6580e-04) (hash(x)=6484050)
514
+ 492 train 7.834908 (lr=1.6549e-04) (hash(x)=6573424)
515
+ 493 train 7.700305 (lr=1.6518e-04) (hash(x)=5062850)
516
+ 494 train 7.652481 (lr=1.6487e-04) (hash(x)=6215995)
517
+ 495 train 7.765934 (lr=1.6455e-04) (hash(x)=8353379)
518
+ 496 train 7.702172 (lr=1.6424e-04) (hash(x)=6344823)
519
+ 497 train 7.660632 (lr=1.6392e-04) (hash(x)=6805358)
520
+ 498 train 7.522215 (lr=1.6360e-04) (hash(x)=6018070)
521
+ 499 train 7.801767 (lr=1.6328e-04) (hash(x)=6552510)
522
+ 500 val loss 7.6751
523
+ 500 val perplexity 2153.9329
524
+ 500 train 7.598320 (lr=1.6296e-04) (hash(x)=7218108)
525
+ 501 train 7.801229 (lr=1.6264e-04) (hash(x)=5105545)
526
+ 502 train 7.763143 (lr=1.6232e-04) (hash(x)=5997860)
527
+ 503 train 7.574910 (lr=1.6200e-04) (hash(x)=4838871)
528
+ 504 train 7.606785 (lr=1.6168e-04) (hash(x)=7105888)
529
+ 505 train 7.934749 (lr=1.6135e-04) (hash(x)=5945286)
530
+ 506 train 7.158486 (lr=1.6103e-04) (hash(x)=4381720)
531
+ 507 train 6.841398 (lr=1.6070e-04) (hash(x)=3021697)
532
+ 508 train 7.235814 (lr=1.6037e-04) (hash(x)=6826973)
533
+ 509 train 7.787283 (lr=1.6004e-04) (hash(x)=6443070)
534
+ 510 train 7.477246 (lr=1.5971e-04) (hash(x)=5520637)
535
+ 511 train 7.743924 (lr=1.5938e-04) (hash(x)=6795665)
536
+ 512 train 7.534363 (lr=1.5905e-04) (hash(x)=6063891)
537
+ 513 train 8.028758 (lr=1.5872e-04) (hash(x)=7683068)
538
+ 514 train 7.306225 (lr=1.5838e-04) (hash(x)=5526694)
539
+ 515 train 7.341669 (lr=1.5805e-04) (hash(x)=5486935)
540
+ 516 train 7.697443 (lr=1.5772e-04) (hash(x)=6002252)
541
+ 517 train 7.683260 (lr=1.5738e-04) (hash(x)=6115232)
542
+ 518 train 7.963529 (lr=1.5704e-04) (hash(x)=9039136)
543
+ 519 train 7.608561 (lr=1.5670e-04) (hash(x)=6678038)
544
+ 520 train 7.770864 (lr=1.5636e-04) (hash(x)=5383926)
545
+ 521 train 7.816237 (lr=1.5602e-04) (hash(x)=5747629)
546
+ 522 train 7.683764 (lr=1.5568e-04) (hash(x)=7090040)
547
+ 523 train 7.914528 (lr=1.5534e-04) (hash(x)=6657714)
548
+ 524 train 7.608367 (lr=1.5500e-04) (hash(x)=5414034)
549
+ 525 train 7.646364 (lr=1.5466e-04) (hash(x)=6190642)
550
+ 526 train 7.786247 (lr=1.5431e-04) (hash(x)=7431177)
551
+ 527 train 7.660243 (lr=1.5397e-04) (hash(x)=6112215)
552
+ 528 train 7.798132 (lr=1.5362e-04) (hash(x)=6785874)
553
+ 529 train 7.526227 (lr=1.5327e-04) (hash(x)=7314989)
554
+ 530 train 7.751375 (lr=1.5293e-04) (hash(x)=6222614)
555
+ 531 train 7.832531 (lr=1.5258e-04) (hash(x)=8353143)
556
+ 532 train 7.912910 (lr=1.5223e-04) (hash(x)=6752498)
557
+ 533 train 7.655527 (lr=1.5188e-04) (hash(x)=5912570)
558
+ 534 train 7.808606 (lr=1.5153e-04) (hash(x)=5621785)
559
+ 535 train 7.706855 (lr=1.5118e-04) (hash(x)=5915361)
560
+ 536 train 7.535885 (lr=1.5082e-04) (hash(x)=6853672)
561
+ 537 train 7.512347 (lr=1.5047e-04) (hash(x)=6369494)
562
+ 538 train 7.678410 (lr=1.5012e-04) (hash(x)=6039652)
563
+ 539 train 7.797748 (lr=1.4976e-04) (hash(x)=6254885)
564
+ 540 train 7.650626 (lr=1.4941e-04) (hash(x)=5829860)
565
+ 541 train 7.572127 (lr=1.4905e-04) (hash(x)=5425120)
566
+ 542 train 7.360905 (lr=1.4869e-04) (hash(x)=5209746)
567
+ 543 train 7.917442 (lr=1.4833e-04) (hash(x)=5771588)
568
+ 544 train 7.634370 (lr=1.4798e-04) (hash(x)=8298790)
569
+ 545 train 7.691929 (lr=1.4762e-04) (hash(x)=6763967)
570
+ 546 train 7.675530 (lr=1.4726e-04) (hash(x)=4882397)
571
+ 547 train 7.466078 (lr=1.4690e-04) (hash(x)=5561507)
572
+ 548 train 7.477010 (lr=1.4653e-04) (hash(x)=5762755)
573
+ 549 train 7.580343 (lr=1.4617e-04) (hash(x)=7081151)
574
+ 550 val loss 7.6262
575
+ 550 val perplexity 2051.2283
576
+ 550 train 7.497870 (lr=1.4581e-04) (hash(x)=5047520)
577
+ 551 train 7.627199 (lr=1.4545e-04) (hash(x)=5688829)
578
+ 552 train 7.527453 (lr=1.4508e-04) (hash(x)=7008925)
579
+ 553 train 7.640628 (lr=1.4472e-04) (hash(x)=7821554)
580
+ 554 train 7.648827 (lr=1.4435e-04) (hash(x)=6469035)
581
+ 555 train 7.465584 (lr=1.4398e-04) (hash(x)=5371951)
582
+ 556 train 7.763328 (lr=1.4362e-04) (hash(x)=6436050)
583
+ 557 train 7.707864 (lr=1.4325e-04) (hash(x)=8120149)
584
+ 558 train 7.935392 (lr=1.4288e-04) (hash(x)=6189935)
585
+ 559 train 7.638746 (lr=1.4251e-04) (hash(x)=5443305)
586
+ 560 train 7.731345 (lr=1.4214e-04) (hash(x)=6294312)
587
+ 561 train 7.365246 (lr=1.4177e-04) (hash(x)=7066030)
588
+ 562 train 7.767698 (lr=1.4140e-04) (hash(x)=7744989)
589
+ 563 train 7.913129 (lr=1.4103e-04) (hash(x)=5016757)
590
+ 564 train 7.742455 (lr=1.4066e-04) (hash(x)=8001284)
591
+ 565 train 7.668474 (lr=1.4029e-04) (hash(x)=6296163)
592
+ 566 train 7.776181 (lr=1.3991e-04) (hash(x)=6027688)
593
+ 567 train 7.681470 (lr=1.3954e-04) (hash(x)=6901933)
594
+ 568 train 7.534114 (lr=1.3916e-04) (hash(x)=5124305)
595
+ 569 train 7.664458 (lr=1.3879e-04) (hash(x)=8056633)
596
+ 570 train 7.721654 (lr=1.3841e-04) (hash(x)=6677566)
597
+ 571 train 7.726655 (lr=1.3804e-04) (hash(x)=6019085)
598
+ 572 train 7.505062 (lr=1.3766e-04) (hash(x)=5924495)
599
+ 573 train 7.598801 (lr=1.3728e-04) (hash(x)=7003893)
600
+ 574 train 7.566664 (lr=1.3691e-04) (hash(x)=5563075)
601
+ 575 train 7.392373 (lr=1.3653e-04) (hash(x)=4294425)
602
+ 576 train 7.646019 (lr=1.3615e-04) (hash(x)=5677870)
603
+ 577 train 7.462424 (lr=1.3577e-04) (hash(x)=6568540)
604
+ 578 train 7.718164 (lr=1.3539e-04) (hash(x)=7473394)
605
+ 579 train 7.736392 (lr=1.3501e-04) (hash(x)=6095229)
606
+ 580 train 7.714333 (lr=1.3463e-04) (hash(x)=6946168)
607
+ 581 train 7.529428 (lr=1.3425e-04) (hash(x)=5512150)
608
+ 582 train 7.166545 (lr=1.3387e-04) (hash(x)=4380268)
609
+ 583 train 7.236151 (lr=1.3348e-04) (hash(x)=4363941)
610
+ 584 train 7.392097 (lr=1.3310e-04) (hash(x)=4836009)
611
+ 585 train 7.688985 (lr=1.3272e-04) (hash(x)=6457783)
612
+ 586 train 7.555036 (lr=1.3234e-04) (hash(x)=6378784)
613
+ 587 train 7.450460 (lr=1.3195e-04) (hash(x)=4693798)
614
+ 588 train 7.809880 (lr=1.3157e-04) (hash(x)=9193310)
615
+ 589 train 8.476392 (lr=1.3118e-04) (hash(x)=8759806)
616
+ 590 train 8.821692 (lr=1.3080e-04) (hash(x)=9176140)
617
+ 591 train 7.562479 (lr=1.3041e-04) (hash(x)=5486335)
618
+ 592 train 7.778407 (lr=1.3003e-04) (hash(x)=7868436)
619
+ 593 train 7.674592 (lr=1.2964e-04) (hash(x)=6964993)
620
+ 594 train 7.861164 (lr=1.2925e-04) (hash(x)=6741270)
621
+ 595 train 8.123202 (lr=1.2887e-04) (hash(x)=7907450)
622
+ 596 train 7.988882 (lr=1.2848e-04) (hash(x)=6443452)
623
+ 597 train 7.608569 (lr=1.2809e-04) (hash(x)=5236642)
624
+ 598 train 7.718880 (lr=1.2770e-04) (hash(x)=5604795)
625
+ 599 train 7.670129 (lr=1.2732e-04) (hash(x)=7295165)
626
+ 600 val loss 7.6030
627
+ 600 val perplexity 2004.2572
628
+ 600 train 7.613661 (lr=1.2693e-04) (hash(x)=6034549)
629
+ 601 train 7.646752 (lr=1.2654e-04) (hash(x)=4913602)
630
+ 602 train 7.591200 (lr=1.2615e-04) (hash(x)=6271294)
631
+ 603 train 7.732038 (lr=1.2576e-04) (hash(x)=6814026)
632
+ 604 train 7.621162 (lr=1.2537e-04) (hash(x)=6848040)
633
+ 605 train 7.907537 (lr=1.2498e-04) (hash(x)=7042051)
634
+ 606 train 7.821778 (lr=1.2459e-04) (hash(x)=6481002)
635
+ 607 train 7.451346 (lr=1.2420e-04) (hash(x)=6267424)
636
+ 608 train 7.788858 (lr=1.2381e-04) (hash(x)=7665306)
637
+ 609 train 7.678466 (lr=1.2341e-04) (hash(x)=5614727)
638
+ 610 train 7.689917 (lr=1.2302e-04) (hash(x)=7039197)
639
+ 611 train 7.740019 (lr=1.2263e-04) (hash(x)=8086437)
640
+ 612 train 7.813795 (lr=1.2224e-04) (hash(x)=6993846)
641
+ 613 train 7.820864 (lr=1.2185e-04) (hash(x)=5900143)
642
+ 614 train 7.714909 (lr=1.2145e-04) (hash(x)=4841318)
643
+ 615 train 7.693137 (lr=1.2106e-04) (hash(x)=5270452)
644
+ 616 train 7.609314 (lr=1.2067e-04) (hash(x)=5955026)
645
+ 617 train 7.695532 (lr=1.2027e-04) (hash(x)=8617707)
646
+ 618 train 7.539489 (lr=1.1988e-04) (hash(x)=5159401)
647
+ 619 train 7.532191 (lr=1.1949e-04) (hash(x)=6420820)
648
+ 620 train 7.515136 (lr=1.1909e-04) (hash(x)=6628863)
649
+ 621 train 7.332473 (lr=1.1870e-04) (hash(x)=6215930)
650
+ 622 train 7.665909 (lr=1.1830e-04) (hash(x)=5248645)
651
+ 623 train 7.551149 (lr=1.1791e-04) (hash(x)=6305297)
652
+ 624 train 8.071685 (lr=1.1752e-04) (hash(x)=6989704)
653
+ 625 train 7.618983 (lr=1.1712e-04) (hash(x)=4026682)
654
+ 626 train 7.497633 (lr=1.1673e-04) (hash(x)=6367688)
655
+ 627 train 7.579348 (lr=1.1633e-04) (hash(x)=7889849)
656
+ 628 train 7.751318 (lr=1.1594e-04) (hash(x)=7334719)
657
+ 629 train 7.876184 (lr=1.1554e-04) (hash(x)=6141721)
658
+ 630 train 8.653469 (lr=1.1515e-04) (hash(x)=5056572)
659
+ 631 train 8.088810 (lr=1.1475e-04) (hash(x)=6040077)
660
+ 632 train 8.184292 (lr=1.1435e-04) (hash(x)=6866651)
661
+ 633 train 8.428826 (lr=1.1396e-04) (hash(x)=6155449)
662
+ 634 train 7.986074 (lr=1.1356e-04) (hash(x)=6690174)
663
+ 635 train 7.651904 (lr=1.1317e-04) (hash(x)=5652497)
664
+ 636 train 7.693996 (lr=1.1277e-04) (hash(x)=5931050)
665
+ 637 train 7.784171 (lr=1.1238e-04) (hash(x)=6961314)
666
+ 638 train 7.601748 (lr=1.1198e-04) (hash(x)=5973636)
667
+ 639 train 7.615609 (lr=1.1158e-04) (hash(x)=7130251)
668
+ 640 train 7.390998 (lr=1.1119e-04) (hash(x)=7503390)
669
+ 641 train 7.430662 (lr=1.1079e-04) (hash(x)=5927461)
670
+ 642 train 7.602638 (lr=1.1040e-04) (hash(x)=6196741)
671
+ 643 train 7.501169 (lr=1.1000e-04) (hash(x)=6610177)
672
+ 644 train 7.478892 (lr=1.0960e-04) (hash(x)=6635147)
673
+ 645 train 7.742132 (lr=1.0921e-04) (hash(x)=7277580)
674
+ 646 train 7.385297 (lr=1.0881e-04) (hash(x)=5050080)
675
+ 647 train 7.461150 (lr=1.0842e-04) (hash(x)=6508350)
676
+ 648 train 7.691919 (lr=1.0802e-04) (hash(x)=5276338)
677
+ 649 train 7.827346 (lr=1.0762e-04) (hash(x)=6536034)
678
+ 650 val loss 7.5901
679
+ 650 val perplexity 1978.4186
680
+ 650 train 7.621641 (lr=1.0723e-04) (hash(x)=6944772)
681
+ 651 train 7.631462 (lr=1.0683e-04) (hash(x)=6994983)
682
+ 652 train 7.616026 (lr=1.0644e-04) (hash(x)=7172017)
683
+ 653 train 7.673979 (lr=1.0604e-04) (hash(x)=8700721)
684
+ 654 train 7.661611 (lr=1.0565e-04) (hash(x)=6774360)
685
+ 655 train 7.493419 (lr=1.0525e-04) (hash(x)=5859576)
686
+ 656 train 7.219246 (lr=1.0485e-04) (hash(x)=5899275)
687
+ 657 train 7.160003 (lr=1.0446e-04) (hash(x)=5264962)
688
+ 658 train 6.957269 (lr=1.0406e-04) (hash(x)=5679861)
689
+ 659 train 7.390823 (lr=1.0367e-04) (hash(x)=5487065)
690
+ 660 train 7.173598 (lr=1.0327e-04) (hash(x)=4239476)
691
+ 661 train 7.470143 (lr=1.0288e-04) (hash(x)=5731624)
692
+ 662 train 7.588861 (lr=1.0248e-04) (hash(x)=5883465)
693
+ 663 train 7.387310 (lr=1.0209e-04) (hash(x)=4892065)
694
+ 664 train 7.202816 (lr=1.0170e-04) (hash(x)=5858782)
695
+ 665 train 7.510429 (lr=1.0130e-04) (hash(x)=5489496)
696
+ 666 train 7.182369 (lr=1.0091e-04) (hash(x)=4485195)
697
+ 667 train 7.217450 (lr=1.0051e-04) (hash(x)=4933674)
698
+ 668 train 7.469052 (lr=1.0012e-04) (hash(x)=5746292)
699
+ 669 train 7.459678 (lr=9.9726e-05) (hash(x)=7021003)
700
+ 670 train 7.635485 (lr=9.9333e-05) (hash(x)=5876710)
701
+ 671 train 7.672353 (lr=9.8940e-05) (hash(x)=7317289)
702
+ 672 train 7.790796 (lr=9.8547e-05) (hash(x)=5598226)
703
+ 673 train 7.611783 (lr=9.8154e-05) (hash(x)=7869305)
704
+ 674 train 7.651715 (lr=9.7762e-05) (hash(x)=6611408)
705
+ 675 train 7.652100 (lr=9.7370e-05) (hash(x)=6811522)
706
+ 676 train 7.627545 (lr=9.6978e-05) (hash(x)=6704714)
707
+ 677 train 7.488461 (lr=9.6586e-05) (hash(x)=6601423)
708
+ 678 train 7.721136 (lr=9.6195e-05) (hash(x)=6726071)
709
+ 679 train 7.195115 (lr=9.5804e-05) (hash(x)=5510218)
710
+ 680 train 7.964314 (lr=9.5413e-05) (hash(x)=7950952)
711
+ 681 train 7.630976 (lr=9.5022e-05) (hash(x)=7180298)
712
+ 682 train 7.462156 (lr=9.4632e-05) (hash(x)=6068813)
713
+ 683 train 7.607046 (lr=9.4242e-05) (hash(x)=7304235)
714
+ 684 train 7.630424 (lr=9.3852e-05) (hash(x)=7441806)
715
+ 685 train 7.734046 (lr=9.3463e-05) (hash(x)=8111920)
716
+ 686 train 8.340057 (lr=9.3073e-05) (hash(x)=6222783)
717
+ 687 train 7.610721 (lr=9.2685e-05) (hash(x)=6752265)
718
+ 688 train 7.428543 (lr=9.2296e-05) (hash(x)=6147634)
719
+ 689 train 7.472649 (lr=9.1908e-05) (hash(x)=6788720)
720
+ 690 train 7.627368 (lr=9.1520e-05) (hash(x)=6413518)
721
+ 691 train 7.488778 (lr=9.1133e-05) (hash(x)=5994476)
722
+ 692 train 7.406815 (lr=9.0746e-05) (hash(x)=5462082)
723
+ 693 train 7.420877 (lr=9.0359e-05) (hash(x)=5862533)
724
+ 694 train 7.574021 (lr=8.9973e-05) (hash(x)=7132796)
725
+ 695 train 7.494799 (lr=8.9587e-05) (hash(x)=6530867)
726
+ 696 train 7.530682 (lr=8.9202e-05) (hash(x)=7932207)
727
+ 697 train 7.499899 (lr=8.8817e-05) (hash(x)=5914734)
728
+ 698 train 7.673356 (lr=8.8432e-05) (hash(x)=6361594)
729
+ 699 train 7.623429 (lr=8.8048e-05) (hash(x)=5746260)
730
+ 700 val loss 7.5352
731
+ 700 val perplexity 1872.7638
732
+ 700 train 7.565264 (lr=8.7664e-05) (hash(x)=7065743)
733
+ 701 train 7.482403 (lr=8.7281e-05) (hash(x)=6856097)
734
+ 702 train 7.543030 (lr=8.6898e-05) (hash(x)=7588648)
735
+ 703 train 7.481628 (lr=8.6515e-05) (hash(x)=5790078)
736
+ 704 train 7.503336 (lr=8.6133e-05) (hash(x)=6069422)
737
+ 705 train 7.838685 (lr=8.5751e-05) (hash(x)=5805002)
738
+ 706 train 7.543121 (lr=8.5370e-05) (hash(x)=5344711)
739
+ 707 train 7.379295 (lr=8.4990e-05) (hash(x)=6430135)
740
+ 708 train 7.474722 (lr=8.4610e-05) (hash(x)=6317763)
741
+ 709 train 7.401411 (lr=8.4230e-05) (hash(x)=6156715)
742
+ 710 train 7.773821 (lr=8.3851e-05) (hash(x)=6321494)
743
+ 711 train 7.798543 (lr=8.3472e-05) (hash(x)=7614023)
744
+ 712 train 7.760346 (lr=8.3094e-05) (hash(x)=6740380)
745
+ 713 train 7.610069 (lr=8.2716e-05) (hash(x)=4861744)
746
+ 714 train 7.491691 (lr=8.2339e-05) (hash(x)=6542179)
747
+ 715 train 7.546649 (lr=8.1963e-05) (hash(x)=5244861)
748
+ 716 train 7.666377 (lr=8.1587e-05) (hash(x)=7306636)
749
+ 717 train 7.476132 (lr=8.1211e-05) (hash(x)=7163697)
750
+ 718 train 7.699056 (lr=8.0836e-05) (hash(x)=6421642)
751
+ 719 train 7.472867 (lr=8.0462e-05) (hash(x)=5245146)
752
+ 720 train 7.556446 (lr=8.0088e-05) (hash(x)=6046027)
753
+ 721 train 7.329065 (lr=7.9715e-05) (hash(x)=6153866)
754
+ 722 train 7.507441 (lr=7.9342e-05) (hash(x)=5827481)
755
+ 723 train 7.454361 (lr=7.8970e-05) (hash(x)=6415565)
756
+ 724 train 7.593674 (lr=7.8599e-05) (hash(x)=6409570)
757
+ 725 train 8.035931 (lr=7.8228e-05) (hash(x)=7835853)
758
+ 726 train 7.173873 (lr=7.7858e-05) (hash(x)=4827589)
759
+ 727 train 7.263602 (lr=7.7488e-05) (hash(x)=5786972)
760
+ 728 train 7.712968 (lr=7.7119e-05) (hash(x)=6736612)
761
+ 729 train 7.564220 (lr=7.6751e-05) (hash(x)=6733560)
762
+ 730 train 7.738716 (lr=7.6383e-05) (hash(x)=4877208)
763
+ 731 train 7.229712 (lr=7.6016e-05) (hash(x)=6131703)
764
+ 732 train 7.370987 (lr=7.5650e-05) (hash(x)=6533769)
765
+ 733 train 7.318854 (lr=7.5284e-05) (hash(x)=6001331)
766
+ 734 train 7.894945 (lr=7.4919e-05) (hash(x)=10602643)
767
+ 735 train 7.787372 (lr=7.4555e-05) (hash(x)=6346459)
768
+ 736 train 7.488129 (lr=7.4191e-05) (hash(x)=6728215)
769
+ 737 train 7.806063 (lr=7.3828e-05) (hash(x)=8943770)
770
+ 738 train 7.601428 (lr=7.3466e-05) (hash(x)=7141912)
771
+ 739 train 7.556621 (lr=7.3105e-05) (hash(x)=6504131)
772
+ 740 train 7.415032 (lr=7.2744e-05) (hash(x)=6461667)
773
+ 741 train 7.624809 (lr=7.2384e-05) (hash(x)=5869339)
774
+ 742 train 7.996296 (lr=7.2024e-05) (hash(x)=7948065)
775
+ 743 train 7.273292 (lr=7.1666e-05) (hash(x)=5209234)
776
+ 744 train 7.402139 (lr=7.1308e-05) (hash(x)=6372244)
777
+ 745 train 7.599845 (lr=7.0950e-05) (hash(x)=7678937)
778
+ 746 train 7.509338 (lr=7.0594e-05) (hash(x)=6519438)
779
+ 747 train 7.557699 (lr=7.0238e-05) (hash(x)=6163272)
780
+ 748 train 7.590439 (lr=6.9884e-05) (hash(x)=7025209)
781
+ 749 train 7.440211 (lr=6.9529e-05) (hash(x)=5774172)
782
+ 750 val loss 7.5079
783
+ 750 val perplexity 1822.4437
784
+ 750 train 7.597845 (lr=6.9176e-05) (hash(x)=5327301)
785
+ 751 train 7.398571 (lr=6.8824e-05) (hash(x)=5676768)
786
+ 752 train 7.103422 (lr=6.8472e-05) (hash(x)=4823304)
787
+ 753 train 7.213264 (lr=6.8121e-05) (hash(x)=6443895)
788
+ 754 train 8.005233 (lr=6.7771e-05) (hash(x)=8032890)
789
+ 755 train 8.063845 (lr=6.7422e-05) (hash(x)=6090561)
790
+ 756 train 7.426452 (lr=6.7073e-05) (hash(x)=4514262)
791
+ 757 train 7.851338 (lr=6.6725e-05) (hash(x)=7788628)
792
+ 758 train 7.784301 (lr=6.6379e-05) (hash(x)=7239456)
793
+ 759 train 7.842075 (lr=6.6033e-05) (hash(x)=5863092)
794
+ 760 train 7.511584 (lr=6.5688e-05) (hash(x)=5572628)
795
+ 761 train 7.589452 (lr=6.5343e-05) (hash(x)=7141354)
796
+ 762 train 7.789364 (lr=6.5000e-05) (hash(x)=7272638)
797
+ 763 train 7.647056 (lr=6.4657e-05) (hash(x)=7201312)
798
+ 764 train 7.641791 (lr=6.4316e-05) (hash(x)=6015388)
799
+ 765 train 7.714960 (lr=6.3975e-05) (hash(x)=6414278)
800
+ 766 train 7.494732 (lr=6.3635e-05) (hash(x)=7137466)
801
+ 767 train 8.065648 (lr=6.3296e-05) (hash(x)=7019489)
802
+ 768 train 7.632164 (lr=6.2958e-05) (hash(x)=7233453)
803
+ 769 train 7.369885 (lr=6.2621e-05) (hash(x)=7914626)
804
+ 770 train 7.453147 (lr=6.2285e-05) (hash(x)=5764080)
805
+ 771 train 7.473040 (lr=6.1950e-05) (hash(x)=6225608)
806
+ 772 train 7.567411 (lr=6.1615e-05) (hash(x)=8097255)
807
+ 773 train 7.565201 (lr=6.1282e-05) (hash(x)=5998078)
808
+ 774 train 7.712693 (lr=6.0949e-05) (hash(x)=5416254)
809
+ 775 train 7.399111 (lr=6.0618e-05) (hash(x)=5483019)
810
+ 776 train 7.212022 (lr=6.0287e-05) (hash(x)=4702208)
811
+ 777 train 7.528126 (lr=5.9958e-05) (hash(x)=5911642)
812
+ 778 train 7.556705 (lr=5.9629e-05) (hash(x)=6132487)
813
+ 779 train 7.693803 (lr=5.9301e-05) (hash(x)=5903258)
814
+ 780 train 7.630049 (lr=5.8974e-05) (hash(x)=7915382)
815
+ 781 train 7.620937 (lr=5.8649e-05) (hash(x)=5632006)
816
+ 782 train 7.422455 (lr=5.8324e-05) (hash(x)=6518211)
817
+ 783 train 7.515987 (lr=5.8000e-05) (hash(x)=5968716)
818
+ 784 train 7.628013 (lr=5.7678e-05) (hash(x)=7344525)
819
+ 785 train 7.527868 (lr=5.7356e-05) (hash(x)=6401968)
820
+ 786 train 7.453711 (lr=5.7035e-05) (hash(x)=6276127)
821
+ 787 train 7.565621 (lr=5.6716e-05) (hash(x)=5778017)
822
+ 788 train 7.112717 (lr=5.6397e-05) (hash(x)=5387306)
823
+ 789 train 7.061220 (lr=5.6079e-05) (hash(x)=5772567)
824
+ 790 train 7.309787 (lr=5.5763e-05) (hash(x)=6383748)
825
+ 791 train 7.590770 (lr=5.5447e-05) (hash(x)=7780194)
826
+ 792 train 7.487756 (lr=5.5133e-05) (hash(x)=7119030)
827
+ 793 train 7.604721 (lr=5.4820e-05) (hash(x)=6424771)
828
+ 794 train 7.412771 (lr=5.4507e-05) (hash(x)=6540151)
829
+ 795 train 7.396997 (lr=5.4196e-05) (hash(x)=6140998)
830
+ 796 train 7.475514 (lr=5.3886e-05) (hash(x)=6208271)
831
+ 797 train 7.357804 (lr=5.3577e-05) (hash(x)=7859566)
832
+ 798 train 7.898904 (lr=5.3269e-05) (hash(x)=7064477)
833
+ 799 train 7.147756 (lr=5.2962e-05) (hash(x)=3784321)
834
+ 800 val loss 7.4864
835
+ 800 val perplexity 1783.5763
836
+ 800 train 7.446015 (lr=5.2656e-05) (hash(x)=4472758)
837
+ 801 train 7.408524 (lr=5.2352e-05) (hash(x)=5557891)
838
+ 802 train 7.551142 (lr=5.2048e-05) (hash(x)=7969325)
839
+ 803 train 7.572937 (lr=5.1746e-05) (hash(x)=5860821)
840
+ 804 train 7.563385 (lr=5.1444e-05) (hash(x)=6750848)
841
+ 805 train 7.520956 (lr=5.1144e-05) (hash(x)=5674826)
842
+ 806 train 7.317789 (lr=5.0845e-05) (hash(x)=5529163)
843
+ 807 train 7.714856 (lr=5.0547e-05) (hash(x)=7774109)
844
+ 808 train 7.490806 (lr=5.0251e-05) (hash(x)=6762509)
845
+ 809 train 7.624145 (lr=4.9955e-05) (hash(x)=5466449)
846
+ 810 train 7.466751 (lr=4.9661e-05) (hash(x)=7046935)
847
+ 811 train 8.306866 (lr=4.9367e-05) (hash(x)=7997664)
848
+ 812 train 7.646118 (lr=4.9075e-05) (hash(x)=6882594)
849
+ 813 train 7.805164 (lr=4.8784e-05) (hash(x)=7006517)
850
+ 814 train 7.661189 (lr=4.8495e-05) (hash(x)=6832530)
851
+ 815 train 7.502591 (lr=4.8206e-05) (hash(x)=6576749)
852
+ 816 train 7.601992 (lr=4.7919e-05) (hash(x)=8276629)
853
+ 817 train 7.410618 (lr=4.7633e-05) (hash(x)=6896198)
854
+ 818 train 7.613954 (lr=4.7348e-05) (hash(x)=5829252)
855
+ 819 train 7.760229 (lr=4.7064e-05) (hash(x)=7266655)
856
+ 820 train 7.481735 (lr=4.6782e-05) (hash(x)=6015975)
857
+ 821 train 7.483913 (lr=4.6501e-05) (hash(x)=5696696)
858
+ 822 train 7.504061 (lr=4.6221e-05) (hash(x)=5411666)
859
+ 823 train 7.731577 (lr=4.5942e-05) (hash(x)=7072404)
860
+ 824 train 7.397497 (lr=4.5664e-05) (hash(x)=4910095)
861
+ 825 train 7.505491 (lr=4.5388e-05) (hash(x)=6590657)
862
+ 826 train 7.449922 (lr=4.5113e-05) (hash(x)=7665574)
863
+ 827 train 7.499396 (lr=4.4839e-05) (hash(x)=6626459)
864
+ 828 train 7.530712 (lr=4.4567e-05) (hash(x)=6731971)
865
+ 829 train 7.513950 (lr=4.4295e-05) (hash(x)=5644977)
866
+ 830 train 7.482000 (lr=4.4025e-05) (hash(x)=6238654)
867
+ 831 train 7.373971 (lr=4.3757e-05) (hash(x)=6556025)
868
+ 832 train 7.689452 (lr=4.3489e-05) (hash(x)=7175563)
869
+ 833 train 7.572334 (lr=4.3223e-05) (hash(x)=6407492)
870
+ 834 train 7.386360 (lr=4.2958e-05) (hash(x)=4934335)
871
+ 835 train 7.427689 (lr=4.2695e-05) (hash(x)=5942129)
872
+ 836 train 7.455790 (lr=4.2432e-05) (hash(x)=5995643)
873
+ 837 train 7.502191 (lr=4.2171e-05) (hash(x)=6012163)
874
+ 838 train 7.617628 (lr=4.1912e-05) (hash(x)=8464831)
875
+ 839 train 7.849965 (lr=4.1654e-05) (hash(x)=7325027)
876
+ 840 train 7.721559 (lr=4.1397e-05) (hash(x)=6785865)
877
+ 841 train 7.675906 (lr=4.1141e-05) (hash(x)=4425520)
878
+ 842 train 7.418003 (lr=4.0887e-05) (hash(x)=5388267)
879
+ 843 train 7.656733 (lr=4.0634e-05) (hash(x)=7322467)
880
+ 844 train 7.461135 (lr=4.0382e-05) (hash(x)=6681766)
881
+ 845 train 7.544234 (lr=4.0132e-05) (hash(x)=7482800)
882
+ 846 train 7.466361 (lr=3.9883e-05) (hash(x)=5554493)
883
+ 847 train 7.706922 (lr=3.9635e-05) (hash(x)=6373412)
884
+ 848 train 7.074380 (lr=3.9389e-05) (hash(x)=5117517)
885
+ 849 train 7.335587 (lr=3.9144e-05) (hash(x)=6981426)
886
+ 850 val loss 7.4738
887
+ 850 val perplexity 1761.3721
888
+ 850 train 7.315806 (lr=3.8901e-05) (hash(x)=6886188)
889
+ 851 train 7.533036 (lr=3.8659e-05) (hash(x)=7332255)
890
+ 852 train 7.547958 (lr=3.8418e-05) (hash(x)=6172042)
891
+ 853 train 7.681037 (lr=3.8178e-05) (hash(x)=5930894)
892
+ 854 train 7.743274 (lr=3.7941e-05) (hash(x)=7448958)
893
+ 855 train 7.529554 (lr=3.7704e-05) (hash(x)=5262868)
894
+ 856 train 7.401314 (lr=3.7469e-05) (hash(x)=5558427)
895
+ 857 train 7.289130 (lr=3.7235e-05) (hash(x)=5585769)
896
+ 858 train 7.198334 (lr=3.7003e-05) (hash(x)=5838081)
897
+ 859 train 7.363285 (lr=3.6772e-05) (hash(x)=5688247)
898
+ 860 train 7.469645 (lr=3.6542e-05) (hash(x)=5162020)
899
+ 861 train 7.653494 (lr=3.6314e-05) (hash(x)=7462079)
900
+ 862 train 7.476114 (lr=3.6088e-05) (hash(x)=6516108)
901
+ 863 train 7.279115 (lr=3.5862e-05) (hash(x)=8055563)
902
+ 864 train 7.494496 (lr=3.5639e-05) (hash(x)=6271901)
903
+ 865 train 7.525388 (lr=3.5416e-05) (hash(x)=6221701)
904
+ 866 train 7.514720 (lr=3.5195e-05) (hash(x)=5772861)
905
+ 867 train 7.521962 (lr=3.4976e-05) (hash(x)=5352405)
906
+ 868 train 7.546047 (lr=3.4758e-05) (hash(x)=6111630)
907
+ 869 train 7.554129 (lr=3.4541e-05) (hash(x)=6730666)
908
+ 870 train 7.734034 (lr=3.4326e-05) (hash(x)=8698315)
909
+ 871 train 7.415056 (lr=3.4113e-05) (hash(x)=5932790)
910
+ 872 train 7.788636 (lr=3.3900e-05) (hash(x)=8099892)
911
+ 873 train 7.450464 (lr=3.3690e-05) (hash(x)=6223114)
912
+ 874 train 7.614958 (lr=3.3480e-05) (hash(x)=5798363)
913
+ 875 train 7.408576 (lr=3.3273e-05) (hash(x)=6249312)
914
+ 876 train 7.540551 (lr=3.3066e-05) (hash(x)=6929692)
915
+ 877 train 7.634775 (lr=3.2862e-05) (hash(x)=7242827)
916
+ 878 train 7.392862 (lr=3.2658e-05) (hash(x)=6332123)
917
+ 879 train 7.297074 (lr=3.2457e-05) (hash(x)=5680154)
918
+ 880 train 7.679817 (lr=3.2256e-05) (hash(x)=6352331)
919
+ 881 train 7.508467 (lr=3.2058e-05) (hash(x)=6332332)
920
+ 882 train 7.548125 (lr=3.1860e-05) (hash(x)=6180674)
921
+ 883 train 7.498994 (lr=3.1665e-05) (hash(x)=8156280)
922
+ 884 train 7.546700 (lr=3.1471e-05) (hash(x)=5874247)
923
+ 885 train 7.459682 (lr=3.1278e-05) (hash(x)=6659781)
924
+ 886 train 7.583869 (lr=3.1087e-05) (hash(x)=5780147)
925
+ 887 train 7.409980 (lr=3.0897e-05) (hash(x)=5914217)
926
+ 888 train 7.254773 (lr=3.0709e-05) (hash(x)=5411762)
927
+ 889 train 7.571443 (lr=3.0522e-05) (hash(x)=6714110)
928
+ 890 train 7.235568 (lr=3.0337e-05) (hash(x)=5999685)
929
+ 891 train 7.425292 (lr=3.0154e-05) (hash(x)=7120215)
930
+ 892 train 7.688450 (lr=2.9972e-05) (hash(x)=6546587)
931
+ 893 train 7.449863 (lr=2.9791e-05) (hash(x)=6593413)
932
+ 894 train 7.549087 (lr=2.9613e-05) (hash(x)=6369927)
933
+ 895 train 7.445959 (lr=2.9435e-05) (hash(x)=6424128)
934
+ 896 train 7.419571 (lr=2.9260e-05) (hash(x)=5736158)
935
+ 897 train 7.250756 (lr=2.9085e-05) (hash(x)=5903043)
936
+ 898 train 7.343702 (lr=2.8913e-05) (hash(x)=4419128)
937
+ 899 train 7.178699 (lr=2.8742e-05) (hash(x)=4390027)
938
+ 900 val loss 7.4552
939
+ 900 val perplexity 1728.8169
940
+ 900 train 7.410570 (lr=2.8572e-05) (hash(x)=6728135)
941
+ 901 train 7.522957 (lr=2.8404e-05) (hash(x)=6945760)
942
+ 902 train 7.503778 (lr=2.8238e-05) (hash(x)=6081534)
943
+ 903 train 7.566470 (lr=2.8073e-05) (hash(x)=7804089)
944
+ 904 train 7.625813 (lr=2.7910e-05) (hash(x)=6225832)
945
+ 905 train 7.653649 (lr=2.7749e-05) (hash(x)=6273417)
946
+ 906 train 7.705318 (lr=2.7589e-05) (hash(x)=7775633)
947
+ 907 train 7.673293 (lr=2.7430e-05) (hash(x)=7130267)
948
+ 908 train 7.435651 (lr=2.7274e-05) (hash(x)=6554076)
949
+ 909 train 7.541444 (lr=2.7119e-05) (hash(x)=6140697)
950
+ 910 train 7.367994 (lr=2.6965e-05) (hash(x)=6128181)
951
+ 911 train 7.466553 (lr=2.6813e-05) (hash(x)=6490149)
952
+ 912 train 7.045875 (lr=2.6663e-05) (hash(x)=5422426)
953
+ 913 train 7.449208 (lr=2.6514e-05) (hash(x)=4847733)
954
+ 914 train 7.468187 (lr=2.6367e-05) (hash(x)=6349747)
955
+ 915 train 7.752071 (lr=2.6221e-05) (hash(x)=9276378)
956
+ 916 train 7.362792 (lr=2.6077e-05) (hash(x)=6869524)
957
+ 917 train 7.405890 (lr=2.5935e-05) (hash(x)=7082053)
958
+ 918 train 7.457149 (lr=2.5795e-05) (hash(x)=7799351)
959
+ 919 train 7.636553 (lr=2.5656e-05) (hash(x)=6283540)
960
+ 920 train 7.408257 (lr=2.5518e-05) (hash(x)=6660860)
961
+ 921 train 7.345269 (lr=2.5383e-05) (hash(x)=5200956)
962
+ 922 train 7.435207 (lr=2.5249e-05) (hash(x)=5653241)
963
+ 923 train 7.591687 (lr=2.5116e-05) (hash(x)=5781680)
964
+ 924 train 7.562698 (lr=2.4985e-05) (hash(x)=6146666)
965
+ 925 train 7.392581 (lr=2.4856e-05) (hash(x)=8652907)
966
+ 926 train 7.211637 (lr=2.4729e-05) (hash(x)=5311546)
967
+ 927 train 7.363598 (lr=2.4603e-05) (hash(x)=5728337)
968
+ 928 train 7.330002 (lr=2.4479e-05) (hash(x)=5413240)
969
+ 929 train 7.469073 (lr=2.4356e-05) (hash(x)=5832401)
970
+ 930 train 7.220998 (lr=2.4235e-05) (hash(x)=7420230)
971
+ 931 train 7.529319 (lr=2.4116e-05) (hash(x)=6278202)
972
+ 932 train 7.408755 (lr=2.3998e-05) (hash(x)=6189873)
973
+ 933 train 7.496275 (lr=2.3883e-05) (hash(x)=7403868)
974
+ 934 train 7.386362 (lr=2.3768e-05) (hash(x)=6867459)
975
+ 935 train 7.583867 (lr=2.3656e-05) (hash(x)=7400272)
976
+ 936 train 7.607432 (lr=2.3545e-05) (hash(x)=7936275)
977
+ 937 train 7.318448 (lr=2.3436e-05) (hash(x)=5943413)
978
+ 938 train 7.540411 (lr=2.3328e-05) (hash(x)=5442166)
979
+ 939 train 7.460488 (lr=2.3222e-05) (hash(x)=5795128)
980
+ 940 train 7.633378 (lr=2.3118e-05) (hash(x)=6831680)
981
+ 941 train 7.341377 (lr=2.3016e-05) (hash(x)=5519980)
982
+ 942 train 7.339648 (lr=2.2915e-05) (hash(x)=6765461)
983
+ 943 train 7.318742 (lr=2.2816e-05) (hash(x)=6473432)
984
+ 944 train 7.286235 (lr=2.2718e-05) (hash(x)=6490395)
985
+ 945 train 7.504956 (lr=2.2623e-05) (hash(x)=5484669)
986
+ 946 train 7.488675 (lr=2.2528e-05) (hash(x)=7278970)
987
+ 947 train 7.469147 (lr=2.2436e-05) (hash(x)=5201502)
988
+ 948 train 7.700660 (lr=2.2345e-05) (hash(x)=7277915)
989
+ 949 train 7.403878 (lr=2.2256e-05) (hash(x)=7227057)
990
+ 950 val loss 7.4425
991
+ 950 val perplexity 1706.9459
992
+ 950 train 7.644131 (lr=2.2169e-05) (hash(x)=8178996)
993
+ 951 train 7.791108 (lr=2.2084e-05) (hash(x)=6922646)
994
+ 952 train 7.313836 (lr=2.2000e-05) (hash(x)=5941772)
995
+ 953 train 7.256076 (lr=2.1918e-05) (hash(x)=6056673)
996
+ 954 train 7.675739 (lr=2.1837e-05) (hash(x)=6912178)
997
+ 955 train 7.223477 (lr=2.1758e-05) (hash(x)=5484641)
998
+ 956 train 7.545163 (lr=2.1681e-05) (hash(x)=5121117)
999
+ 957 train 7.572541 (lr=2.1606e-05) (hash(x)=7100865)
1000
+ 958 train 7.269023 (lr=2.1532e-05) (hash(x)=5334280)
1001
+ 959 train 7.351160 (lr=2.1461e-05) (hash(x)=7277141)
1002
+ 960 train 7.377275 (lr=2.1390e-05) (hash(x)=6401960)
1003
+ 961 train 7.406311 (lr=2.1322e-05) (hash(x)=6407603)
1004
+ 962 train 7.536669 (lr=2.1255e-05) (hash(x)=5786089)
1005
+ 963 train 7.392788 (lr=2.1190e-05) (hash(x)=5604614)
1006
+ 964 train 7.421803 (lr=2.1127e-05) (hash(x)=5454287)
1007
+ 965 train 7.556461 (lr=2.1065e-05) (hash(x)=6200612)
1008
+ 966 train 7.516517 (lr=2.1005e-05) (hash(x)=5691759)
1009
+ 967 train 7.473177 (lr=2.0947e-05) (hash(x)=7443124)
1010
+ 968 train 7.490295 (lr=2.0891e-05) (hash(x)=6711366)
1011
+ 969 train 7.426388 (lr=2.0836e-05) (hash(x)=6728764)
1012
+ 970 train 7.727061 (lr=2.0783e-05) (hash(x)=6870240)
1013
+ 971 train 7.426309 (lr=2.0732e-05) (hash(x)=5601968)
1014
+ 972 train 7.180040 (lr=2.0682e-05) (hash(x)=5311504)
1015
+ 973 train 7.445254 (lr=2.0634e-05) (hash(x)=6877761)
1016
+ 974 train 7.535763 (lr=2.0588e-05) (hash(x)=6457033)
1017
+ 975 train 7.482474 (lr=2.0544e-05) (hash(x)=5632735)
1018
+ 976 train 7.692316 (lr=2.0501e-05) (hash(x)=6024880)
1019
+ 977 train 7.368898 (lr=2.0460e-05) (hash(x)=5658845)
1020
+ 978 train 7.265898 (lr=2.0421e-05) (hash(x)=6089461)
1021
+ 979 train 7.314706 (lr=2.0384e-05) (hash(x)=5594810)
1022
+ 980 train 7.111052 (lr=2.0348e-05) (hash(x)=4667026)
1023
+ 981 train 6.978990 (lr=2.0314e-05) (hash(x)=4738523)
1024
+ 982 train 7.308672 (lr=2.0282e-05) (hash(x)=7451876)
1025
+ 983 train 7.553682 (lr=2.0252e-05) (hash(x)=5881015)
1026
+ 984 train 7.567001 (lr=2.0223e-05) (hash(x)=6255031)
1027
+ 985 train 7.395120 (lr=2.0196e-05) (hash(x)=5263325)
1028
+ 986 train 7.373227 (lr=2.0171e-05) (hash(x)=6654585)
1029
+ 987 train 7.415499 (lr=2.0147e-05) (hash(x)=6865361)
1030
+ 988 train 7.525691 (lr=2.0125e-05) (hash(x)=6978986)
1031
+ 989 train 7.624014 (lr=2.0105e-05) (hash(x)=5644619)
1032
+ 990 train 7.438036 (lr=2.0087e-05) (hash(x)=6551023)
1033
+ 991 train 7.402590 (lr=2.0071e-05) (hash(x)=5809411)
1034
+ 992 train 7.472279 (lr=2.0056e-05) (hash(x)=5501837)
1035
+ 993 train 7.491112 (lr=2.0043e-05) (hash(x)=7044374)
1036
+ 994 train 7.449114 (lr=2.0031e-05) (hash(x)=6526130)
1037
+ 995 train 7.546727 (lr=2.0022e-05) (hash(x)=6293025)
1038
+ 996 train 7.572012 (lr=2.0014e-05) (hash(x)=6295768)
1039
+ 997 train 7.363132 (lr=2.0008e-05) (hash(x)=5189023)
1040
+ 998 train 7.186160 (lr=2.0003e-05) (hash(x)=6255467)
1041
+ 999 val loss 7.4355
1042
+ 999 val perplexity 1695.0616
1043
+ 999 train 7.543315 (lr=2.0001e-05) (hash(x)=7232964)
lr2e-4_total_batch_size5120_seq_len128/model_00999.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77ce5e02390324cfa5539b7ad11ee4019d87cfd456a49088c2a97c2964f18b16
3
+ size 36163010
lr2e-4_total_batch_size5120_seq_len128/optimizer_00999.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c62f785ef9fb826078987ab7693053c030faacd78456f003d781b8fbf73705a2
3
+ size 70764358