andrew-healey commited on
Commit
7b33959
·
verified ·
1 Parent(s): 9c23277

Upload folder using huggingface_hub

Browse files
lr1e-4_total_batch_size40960_seq_len256/args.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hellaswag": true, "attention_kind": "selective", "log_dir": "proxy_model_sweep/lr1e-4_total_batch_size40960_seq_len256", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 2, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": null, "group": "proxy_model_sweep", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1337, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": null, "total_batch_size": 40960, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": true, "mup_enable_coord_check_logging": false, "max_lr": 0.0001, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "n_embd": 128}
lr1e-4_total_batch_size40960_seq_len256/dataloader_00999.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cec289292cf51a5462e2631af59109e64df3eca2d9e62e1fbd1c53a776a0f874
3
+ size 964
lr1e-4_total_batch_size40960_seq_len256/log2.txt ADDED
@@ -0,0 +1,1043 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ max_steps: 1000
2
+ 0 val loss 11.7488
3
+ 0 val perplexity 126607.6172
4
+ 0 train 11.747648 (lr=3.4965e-07) (hash(x)=9915468)
5
+ 1 train 11.762551 (lr=6.9930e-07) (hash(x)=12684830)
6
+ 2 train 11.753098 (lr=1.0490e-06) (hash(x)=12014715)
7
+ 3 train 11.729498 (lr=1.3986e-06) (hash(x)=12595649)
8
+ 4 train 11.746849 (lr=1.7483e-06) (hash(x)=11101031)
9
+ 5 train 11.737625 (lr=2.0979e-06) (hash(x)=12036401)
10
+ 6 train 11.732759 (lr=2.4476e-06) (hash(x)=11823810)
11
+ 7 train 11.721992 (lr=2.7972e-06) (hash(x)=13549925)
12
+ 8 train 11.712526 (lr=3.1469e-06) (hash(x)=13252262)
13
+ 9 train 11.705689 (lr=3.4965e-06) (hash(x)=13749054)
14
+ 10 train 11.690114 (lr=3.8462e-06) (hash(x)=14461706)
15
+ 11 train 11.677700 (lr=4.1958e-06) (hash(x)=11735995)
16
+ 12 train 11.675545 (lr=4.5455e-06) (hash(x)=12231701)
17
+ 13 train 11.669949 (lr=4.8951e-06) (hash(x)=14117945)
18
+ 14 train 11.615375 (lr=5.2448e-06) (hash(x)=12678303)
19
+ 15 train 11.638197 (lr=5.5944e-06) (hash(x)=12758461)
20
+ 16 train 11.620955 (lr=5.9441e-06) (hash(x)=13701311)
21
+ 17 train 11.582423 (lr=6.2937e-06) (hash(x)=14346433)
22
+ 18 train 11.581202 (lr=6.6434e-06) (hash(x)=18910052)
23
+ 19 train 11.561735 (lr=6.9930e-06) (hash(x)=12508350)
24
+ 20 train 11.538468 (lr=7.3427e-06) (hash(x)=14043988)
25
+ 21 train 11.501307 (lr=7.6923e-06) (hash(x)=11968227)
26
+ 22 train 11.489028 (lr=8.0420e-06) (hash(x)=15670776)
27
+ 23 train 11.472087 (lr=8.3916e-06) (hash(x)=11289341)
28
+ 24 train 11.442546 (lr=8.7413e-06) (hash(x)=11482477)
29
+ 25 train 11.423048 (lr=9.0909e-06) (hash(x)=10345057)
30
+ 26 train 11.383562 (lr=9.4406e-06) (hash(x)=11157124)
31
+ 27 train 11.354762 (lr=9.7902e-06) (hash(x)=10750452)
32
+ 28 train 11.325596 (lr=1.0140e-05) (hash(x)=8921601)
33
+ 29 train 11.280510 (lr=1.0490e-05) (hash(x)=12165500)
34
+ 30 train 11.264828 (lr=1.0839e-05) (hash(x)=13569316)
35
+ 31 train 11.241072 (lr=1.1189e-05) (hash(x)=14043713)
36
+ 32 train 11.202614 (lr=1.1538e-05) (hash(x)=11969650)
37
+ 33 train 11.108937 (lr=1.1888e-05) (hash(x)=13680255)
38
+ 34 train 11.147068 (lr=1.2238e-05) (hash(x)=13871675)
39
+ 35 train 11.069932 (lr=1.2587e-05) (hash(x)=12263431)
40
+ 36 train 11.072086 (lr=1.2937e-05) (hash(x)=12414778)
41
+ 37 train 11.031780 (lr=1.3287e-05) (hash(x)=14443665)
42
+ 38 train 10.983942 (lr=1.3636e-05) (hash(x)=13100449)
43
+ 39 train 10.919054 (lr=1.3986e-05) (hash(x)=13953073)
44
+ 40 train 10.840235 (lr=1.4336e-05) (hash(x)=10956041)
45
+ 41 train 10.878267 (lr=1.4685e-05) (hash(x)=12840879)
46
+ 42 train 10.845440 (lr=1.5035e-05) (hash(x)=14428050)
47
+ 43 train 10.782969 (lr=1.5385e-05) (hash(x)=14104576)
48
+ 44 train 10.746889 (lr=1.5734e-05) (hash(x)=14453314)
49
+ 45 train 10.696529 (lr=1.6084e-05) (hash(x)=12918172)
50
+ 46 train 10.683006 (lr=1.6434e-05) (hash(x)=12368572)
51
+ 47 train 10.663581 (lr=1.6783e-05) (hash(x)=13340443)
52
+ 48 train 10.654287 (lr=1.7133e-05) (hash(x)=10742101)
53
+ 49 train 10.629972 (lr=1.7483e-05) (hash(x)=13025742)
54
+ 50 val loss 10.5884
55
+ 50 val perplexity 39672.1484
56
+ 50 train 10.641424 (lr=1.7832e-05) (hash(x)=14351068)
57
+ 51 train 10.554951 (lr=1.8182e-05) (hash(x)=13155663)
58
+ 52 train 10.543849 (lr=1.8531e-05) (hash(x)=9157946)
59
+ 53 train 10.529142 (lr=1.8881e-05) (hash(x)=13872294)
60
+ 54 train 10.468439 (lr=1.9231e-05) (hash(x)=12773612)
61
+ 55 train 10.484112 (lr=1.9580e-05) (hash(x)=12235713)
62
+ 56 train 10.461968 (lr=1.9930e-05) (hash(x)=13686441)
63
+ 57 train 10.497540 (lr=2.0280e-05) (hash(x)=13214962)
64
+ 58 train 10.410728 (lr=2.0629e-05) (hash(x)=12602681)
65
+ 59 train 10.426524 (lr=2.0979e-05) (hash(x)=13829219)
66
+ 60 train 10.441619 (lr=2.1329e-05) (hash(x)=9335220)
67
+ 61 train 10.396661 (lr=2.1678e-05) (hash(x)=15713183)
68
+ 62 train 10.396031 (lr=2.2028e-05) (hash(x)=9917297)
69
+ 63 train 10.342007 (lr=2.2378e-05) (hash(x)=12688412)
70
+ 64 train 10.316389 (lr=2.2727e-05) (hash(x)=12108991)
71
+ 65 train 10.362356 (lr=2.3077e-05) (hash(x)=12152216)
72
+ 66 train 10.335101 (lr=2.3427e-05) (hash(x)=11421483)
73
+ 67 train 10.310894 (lr=2.3776e-05) (hash(x)=13448155)
74
+ 68 train 10.319539 (lr=2.4126e-05) (hash(x)=11905081)
75
+ 69 train 10.261634 (lr=2.4476e-05) (hash(x)=12290857)
76
+ 70 train 10.385275 (lr=2.4825e-05) (hash(x)=13234933)
77
+ 71 train 10.347075 (lr=2.5175e-05) (hash(x)=9768085)
78
+ 72 train 10.268010 (lr=2.5524e-05) (hash(x)=9528974)
79
+ 73 train 10.323756 (lr=2.5874e-05) (hash(x)=10720403)
80
+ 74 train 10.257861 (lr=2.6224e-05) (hash(x)=12874592)
81
+ 75 train 10.301887 (lr=2.6573e-05) (hash(x)=11025404)
82
+ 76 train 10.317125 (lr=2.6923e-05) (hash(x)=12567779)
83
+ 77 train 10.329890 (lr=2.7273e-05) (hash(x)=11826091)
84
+ 78 train 10.358979 (lr=2.7622e-05) (hash(x)=11259652)
85
+ 79 train 10.314079 (lr=2.7972e-05) (hash(x)=14642293)
86
+ 80 train 10.209698 (lr=2.8322e-05) (hash(x)=12923132)
87
+ 81 train 10.260011 (lr=2.8671e-05) (hash(x)=12169964)
88
+ 82 train 10.206309 (lr=2.9021e-05) (hash(x)=10948640)
89
+ 83 train 10.242997 (lr=2.9371e-05) (hash(x)=14103013)
90
+ 84 train 10.280931 (lr=2.9720e-05) (hash(x)=11135042)
91
+ 85 train 10.229772 (lr=3.0070e-05) (hash(x)=12404200)
92
+ 86 train 10.239367 (lr=3.0420e-05) (hash(x)=11534055)
93
+ 87 train 10.218316 (lr=3.0769e-05) (hash(x)=10914085)
94
+ 88 train 10.230884 (lr=3.1119e-05) (hash(x)=14570713)
95
+ 89 train 10.272120 (lr=3.1469e-05) (hash(x)=11680329)
96
+ 90 train 10.205911 (lr=3.1818e-05) (hash(x)=12437096)
97
+ 91 train 10.184072 (lr=3.2168e-05) (hash(x)=12654843)
98
+ 92 train 10.194653 (lr=3.2517e-05) (hash(x)=10606850)
99
+ 93 train 10.165556 (lr=3.2867e-05) (hash(x)=11281537)
100
+ 94 train 10.215721 (lr=3.3217e-05) (hash(x)=12363856)
101
+ 95 train 10.212786 (lr=3.3566e-05) (hash(x)=14001265)
102
+ 96 train 10.155742 (lr=3.3916e-05) (hash(x)=10444420)
103
+ 97 train 10.187316 (lr=3.4266e-05) (hash(x)=12249732)
104
+ 98 train 10.127490 (lr=3.4615e-05) (hash(x)=13803785)
105
+ 99 train 10.141859 (lr=3.4965e-05) (hash(x)=11689978)
106
+ 100 val loss 10.1378
107
+ 100 val perplexity 25280.8887
108
+ 100 train 10.124390 (lr=3.5315e-05) (hash(x)=13865869)
109
+ 101 train 10.228412 (lr=3.5664e-05) (hash(x)=13030618)
110
+ 102 train 10.101461 (lr=3.6014e-05) (hash(x)=14299833)
111
+ 103 train 10.117199 (lr=3.6364e-05) (hash(x)=12908875)
112
+ 104 train 10.122482 (lr=3.6713e-05) (hash(x)=13859304)
113
+ 105 train 10.114618 (lr=3.7063e-05) (hash(x)=14396964)
114
+ 106 train 10.059211 (lr=3.7413e-05) (hash(x)=11269801)
115
+ 107 train 10.020694 (lr=3.7762e-05) (hash(x)=14486071)
116
+ 108 train 10.065903 (lr=3.8112e-05) (hash(x)=11997361)
117
+ 109 train 10.086704 (lr=3.8462e-05) (hash(x)=11010336)
118
+ 110 train 10.057624 (lr=3.8811e-05) (hash(x)=10864248)
119
+ 111 train 10.023162 (lr=3.9161e-05) (hash(x)=11906151)
120
+ 112 train 10.029696 (lr=3.9510e-05) (hash(x)=15166897)
121
+ 113 train 9.990315 (lr=3.9860e-05) (hash(x)=12338963)
122
+ 114 train 9.955505 (lr=4.0210e-05) (hash(x)=13235312)
123
+ 115 train 10.000879 (lr=4.0559e-05) (hash(x)=11031343)
124
+ 116 train 9.993717 (lr=4.0909e-05) (hash(x)=14597757)
125
+ 117 train 9.992077 (lr=4.1259e-05) (hash(x)=13571036)
126
+ 118 train 9.984200 (lr=4.1608e-05) (hash(x)=13225100)
127
+ 119 train 9.974423 (lr=4.1958e-05) (hash(x)=12281916)
128
+ 120 train 9.919270 (lr=4.2308e-05) (hash(x)=13634084)
129
+ 121 train 9.945660 (lr=4.2657e-05) (hash(x)=11801649)
130
+ 122 train 9.901111 (lr=4.3007e-05) (hash(x)=11746289)
131
+ 123 train 9.913164 (lr=4.3357e-05) (hash(x)=10783313)
132
+ 124 train 9.924176 (lr=4.3706e-05) (hash(x)=14212467)
133
+ 125 train 9.920055 (lr=4.4056e-05) (hash(x)=12994231)
134
+ 126 train 9.933949 (lr=4.4406e-05) (hash(x)=11485137)
135
+ 127 train 9.896671 (lr=4.4755e-05) (hash(x)=10470487)
136
+ 128 train 9.916708 (lr=4.5105e-05) (hash(x)=12205695)
137
+ 129 train 9.829318 (lr=4.5455e-05) (hash(x)=13985657)
138
+ 130 train 9.775917 (lr=4.5804e-05) (hash(x)=12944090)
139
+ 131 train 9.778201 (lr=4.6154e-05) (hash(x)=12831661)
140
+ 132 train 9.819001 (lr=4.6503e-05) (hash(x)=12806095)
141
+ 133 train 9.782230 (lr=4.6853e-05) (hash(x)=14617414)
142
+ 134 train 9.776893 (lr=4.7203e-05) (hash(x)=13221393)
143
+ 135 train 9.789565 (lr=4.7552e-05) (hash(x)=11041023)
144
+ 136 train 9.734136 (lr=4.7902e-05) (hash(x)=11362709)
145
+ 137 train 9.786682 (lr=4.8252e-05) (hash(x)=13186073)
146
+ 138 train 9.804090 (lr=4.8601e-05) (hash(x)=13274177)
147
+ 139 train 9.715269 (lr=4.8951e-05) (hash(x)=13507868)
148
+ 140 train 9.693123 (lr=4.9301e-05) (hash(x)=12512404)
149
+ 141 train 9.705489 (lr=4.9650e-05) (hash(x)=11221721)
150
+ 142 train 9.721018 (lr=5.0000e-05) (hash(x)=15372354)
151
+ 143 train 9.697434 (lr=5.0350e-05) (hash(x)=14689778)
152
+ 144 train 9.694244 (lr=5.0699e-05) (hash(x)=13156509)
153
+ 145 train 9.639981 (lr=5.1049e-05) (hash(x)=15734207)
154
+ 146 train 9.612728 (lr=5.1399e-05) (hash(x)=13758140)
155
+ 147 train 9.609246 (lr=5.1748e-05) (hash(x)=12925955)
156
+ 148 train 9.624068 (lr=5.2098e-05) (hash(x)=12750849)
157
+ 149 train 9.580193 (lr=5.2448e-05) (hash(x)=14327624)
158
+ 150 val loss 9.5604
159
+ 150 val perplexity 14192.1982
160
+ 150 train 9.552365 (lr=5.2797e-05) (hash(x)=11819313)
161
+ 151 train 9.517182 (lr=5.3147e-05) (hash(x)=12559422)
162
+ 152 train 9.563116 (lr=5.3497e-05) (hash(x)=12816359)
163
+ 153 train 9.569820 (lr=5.3846e-05) (hash(x)=10162896)
164
+ 154 train 9.537175 (lr=5.4196e-05) (hash(x)=12041552)
165
+ 155 train 9.508113 (lr=5.4545e-05) (hash(x)=10256290)
166
+ 156 train 9.490771 (lr=5.4895e-05) (hash(x)=14312559)
167
+ 157 train 9.521740 (lr=5.5245e-05) (hash(x)=11443472)
168
+ 158 train 9.448408 (lr=5.5594e-05) (hash(x)=10480150)
169
+ 159 train 9.516107 (lr=5.5944e-05) (hash(x)=13927238)
170
+ 160 train 9.416839 (lr=5.6294e-05) (hash(x)=16448258)
171
+ 161 train 9.425428 (lr=5.6643e-05) (hash(x)=10701487)
172
+ 162 train 9.415069 (lr=5.6993e-05) (hash(x)=13034568)
173
+ 163 train 9.482796 (lr=5.7343e-05) (hash(x)=14575913)
174
+ 164 train 9.351413 (lr=5.7692e-05) (hash(x)=10875163)
175
+ 165 train 9.482010 (lr=5.8042e-05) (hash(x)=13744329)
176
+ 166 train 9.368473 (lr=5.8392e-05) (hash(x)=8860388)
177
+ 167 train 9.324383 (lr=5.8741e-05) (hash(x)=14097210)
178
+ 168 train 9.334857 (lr=5.9091e-05) (hash(x)=11088127)
179
+ 169 train 9.369176 (lr=5.9441e-05) (hash(x)=14363085)
180
+ 170 train 9.345120 (lr=5.9790e-05) (hash(x)=14133983)
181
+ 171 train 9.312074 (lr=6.0140e-05) (hash(x)=9539498)
182
+ 172 train 9.352511 (lr=6.0490e-05) (hash(x)=17793549)
183
+ 173 train 9.311731 (lr=6.0839e-05) (hash(x)=14163458)
184
+ 174 train 9.231804 (lr=6.1189e-05) (hash(x)=14609724)
185
+ 175 train 9.319579 (lr=6.1538e-05) (hash(x)=18154448)
186
+ 176 train 9.413657 (lr=6.1888e-05) (hash(x)=11995482)
187
+ 177 train 9.318861 (lr=6.2238e-05) (hash(x)=13713689)
188
+ 178 train 9.279714 (lr=6.2587e-05) (hash(x)=11555030)
189
+ 179 train 9.273702 (lr=6.2937e-05) (hash(x)=14115772)
190
+ 180 train 9.183664 (lr=6.3287e-05) (hash(x)=12279208)
191
+ 181 train 9.131492 (lr=6.3636e-05) (hash(x)=12021395)
192
+ 182 train 9.188429 (lr=6.3986e-05) (hash(x)=12617648)
193
+ 183 train 9.084084 (lr=6.4336e-05) (hash(x)=12798942)
194
+ 184 train 9.130439 (lr=6.4685e-05) (hash(x)=12366544)
195
+ 185 train 9.111936 (lr=6.5035e-05) (hash(x)=10886073)
196
+ 186 train 9.105676 (lr=6.5385e-05) (hash(x)=12555076)
197
+ 187 train 9.162394 (lr=6.5734e-05) (hash(x)=14777879)
198
+ 188 train 9.132498 (lr=6.6084e-05) (hash(x)=9927010)
199
+ 189 train 9.079164 (lr=6.6434e-05) (hash(x)=13073054)
200
+ 190 train 8.974812 (lr=6.6783e-05) (hash(x)=14967464)
201
+ 191 train 8.999478 (lr=6.7133e-05) (hash(x)=13463198)
202
+ 192 train 9.019534 (lr=6.7483e-05) (hash(x)=12209984)
203
+ 193 train 9.059615 (lr=6.7832e-05) (hash(x)=11983929)
204
+ 194 train 9.043838 (lr=6.8182e-05) (hash(x)=13502211)
205
+ 195 train 8.945989 (lr=6.8531e-05) (hash(x)=12563030)
206
+ 196 train 8.988230 (lr=6.8881e-05) (hash(x)=12774663)
207
+ 197 train 8.981385 (lr=6.9231e-05) (hash(x)=12170905)
208
+ 198 train 9.042742 (lr=6.9580e-05) (hash(x)=13348784)
209
+ 199 train 8.914835 (lr=6.9930e-05) (hash(x)=11525433)
210
+ 200 val loss 8.9068
211
+ 200 val perplexity 7381.7212
212
+ 200 train 9.012314 (lr=7.0280e-05) (hash(x)=9800296)
213
+ 201 train 8.903841 (lr=7.0629e-05) (hash(x)=10168268)
214
+ 202 train 8.878007 (lr=7.0979e-05) (hash(x)=10276843)
215
+ 203 train 9.050154 (lr=7.1329e-05) (hash(x)=12497404)
216
+ 204 train 8.864252 (lr=7.1678e-05) (hash(x)=12932339)
217
+ 205 train 8.936125 (lr=7.2028e-05) (hash(x)=14203584)
218
+ 206 train 8.854735 (lr=7.2378e-05) (hash(x)=13921321)
219
+ 207 train 8.844292 (lr=7.2727e-05) (hash(x)=10946961)
220
+ 208 train 8.827967 (lr=7.3077e-05) (hash(x)=12168723)
221
+ 209 train 8.852965 (lr=7.3427e-05) (hash(x)=10942254)
222
+ 210 train 8.863027 (lr=7.3776e-05) (hash(x)=10371632)
223
+ 211 train 8.917255 (lr=7.4126e-05) (hash(x)=16703203)
224
+ 212 train 8.792215 (lr=7.4476e-05) (hash(x)=17013185)
225
+ 213 train 8.816187 (lr=7.4825e-05) (hash(x)=15000294)
226
+ 214 train 8.727157 (lr=7.5175e-05) (hash(x)=10834161)
227
+ 215 train 8.699159 (lr=7.5524e-05) (hash(x)=14862449)
228
+ 216 train 8.767098 (lr=7.5874e-05) (hash(x)=10386532)
229
+ 217 train 8.593474 (lr=7.6224e-05) (hash(x)=13806874)
230
+ 218 train 8.707203 (lr=7.6573e-05) (hash(x)=12733894)
231
+ 219 train 8.791993 (lr=7.6923e-05) (hash(x)=12259111)
232
+ 220 train 8.652357 (lr=7.7273e-05) (hash(x)=10003332)
233
+ 221 train 8.612259 (lr=7.7622e-05) (hash(x)=11837867)
234
+ 222 train 8.703379 (lr=7.7972e-05) (hash(x)=13840537)
235
+ 223 train 8.619564 (lr=7.8322e-05) (hash(x)=12159808)
236
+ 224 train 8.604895 (lr=7.8671e-05) (hash(x)=13797226)
237
+ 225 train 8.619501 (lr=7.9021e-05) (hash(x)=12372219)
238
+ 226 train 8.644806 (lr=7.9371e-05) (hash(x)=11371435)
239
+ 227 train 8.593147 (lr=7.9720e-05) (hash(x)=10910611)
240
+ 228 train 8.618279 (lr=8.0070e-05) (hash(x)=10985712)
241
+ 229 train 8.556727 (lr=8.0420e-05) (hash(x)=12509276)
242
+ 230 train 8.616318 (lr=8.0769e-05) (hash(x)=12953787)
243
+ 231 train 8.571847 (lr=8.1119e-05) (hash(x)=12720900)
244
+ 232 train 8.466321 (lr=8.1469e-05) (hash(x)=16075757)
245
+ 233 train 8.555449 (lr=8.1818e-05) (hash(x)=14048369)
246
+ 234 train 8.675552 (lr=8.2168e-05) (hash(x)=12602604)
247
+ 235 train 8.601347 (lr=8.2517e-05) (hash(x)=11552959)
248
+ 236 train 8.499423 (lr=8.2867e-05) (hash(x)=9890481)
249
+ 237 train 8.394553 (lr=8.3217e-05) (hash(x)=13423367)
250
+ 238 train 8.416847 (lr=8.3566e-05) (hash(x)=11555634)
251
+ 239 train 8.717155 (lr=8.3916e-05) (hash(x)=14914789)
252
+ 240 train 8.893326 (lr=8.4266e-05) (hash(x)=16763637)
253
+ 241 train 8.874437 (lr=8.4615e-05) (hash(x)=15783017)
254
+ 242 train 8.892593 (lr=8.4965e-05) (hash(x)=16354494)
255
+ 243 train 8.589039 (lr=8.5315e-05) (hash(x)=13512761)
256
+ 244 train 8.440418 (lr=8.5664e-05) (hash(x)=13751537)
257
+ 245 train 8.389757 (lr=8.6014e-05) (hash(x)=13830852)
258
+ 246 train 8.386204 (lr=8.6364e-05) (hash(x)=14841133)
259
+ 247 train 8.449017 (lr=8.6713e-05) (hash(x)=11832803)
260
+ 248 train 8.763041 (lr=8.7063e-05) (hash(x)=27572323)
261
+ 249 train 10.836052 (lr=8.7413e-05) (hash(x)=31819669)
262
+ 250 val loss 8.3191
263
+ 250 val perplexity 4101.4258
264
+ 250 train 8.387470 (lr=8.7762e-05) (hash(x)=11499251)
265
+ 251 train 8.379189 (lr=8.8112e-05) (hash(x)=13332674)
266
+ 252 train 8.290973 (lr=8.8462e-05) (hash(x)=11728990)
267
+ 253 train 8.322297 (lr=8.8811e-05) (hash(x)=15028799)
268
+ 254 train 8.304644 (lr=8.9161e-05) (hash(x)=13966115)
269
+ 255 train 8.297104 (lr=8.9510e-05) (hash(x)=10783848)
270
+ 256 train 8.248372 (lr=8.9860e-05) (hash(x)=11179737)
271
+ 257 train 8.341174 (lr=9.0210e-05) (hash(x)=10874609)
272
+ 258 train 8.331173 (lr=9.0559e-05) (hash(x)=9234657)
273
+ 259 train 8.325091 (lr=9.0909e-05) (hash(x)=12212655)
274
+ 260 train 8.255197 (lr=9.1259e-05) (hash(x)=12238338)
275
+ 261 train 8.263827 (lr=9.1608e-05) (hash(x)=11079662)
276
+ 262 train 8.211183 (lr=9.1958e-05) (hash(x)=13092665)
277
+ 263 train 8.240063 (lr=9.2308e-05) (hash(x)=11087308)
278
+ 264 train 8.257947 (lr=9.2657e-05) (hash(x)=17694381)
279
+ 265 train 8.157444 (lr=9.3007e-05) (hash(x)=13305061)
280
+ 266 train 8.201578 (lr=9.3357e-05) (hash(x)=11290716)
281
+ 267 train 8.187524 (lr=9.3706e-05) (hash(x)=13664986)
282
+ 268 train 8.186601 (lr=9.4056e-05) (hash(x)=12421790)
283
+ 269 train 8.145159 (lr=9.4406e-05) (hash(x)=12060664)
284
+ 270 train 8.251472 (lr=9.4755e-05) (hash(x)=13653671)
285
+ 271 train 8.215740 (lr=9.5105e-05) (hash(x)=15746941)
286
+ 272 train 8.119474 (lr=9.5455e-05) (hash(x)=12335295)
287
+ 273 train 8.206117 (lr=9.5804e-05) (hash(x)=17767151)
288
+ 274 train 8.081421 (lr=9.6154e-05) (hash(x)=13419880)
289
+ 275 train 8.079351 (lr=9.6503e-05) (hash(x)=13277101)
290
+ 276 train 8.023427 (lr=9.6853e-05) (hash(x)=10535318)
291
+ 277 train 8.132802 (lr=9.7203e-05) (hash(x)=10935661)
292
+ 278 train 8.101584 (lr=9.7552e-05) (hash(x)=12907485)
293
+ 279 train 8.103436 (lr=9.7902e-05) (hash(x)=8154583)
294
+ 280 train 8.082265 (lr=9.8252e-05) (hash(x)=13402549)
295
+ 281 train 8.261983 (lr=9.8601e-05) (hash(x)=12543473)
296
+ 282 train 8.067114 (lr=9.8951e-05) (hash(x)=12144679)
297
+ 283 train 8.050070 (lr=9.9301e-05) (hash(x)=12101143)
298
+ 284 train 8.048075 (lr=9.9650e-05) (hash(x)=15908582)
299
+ 285 train 8.067490 (lr=1.0000e-04) (hash(x)=11655051)
300
+ 286 train 7.910710 (lr=1.0000e-04) (hash(x)=9481417)
301
+ 287 train 7.980206 (lr=1.0000e-04) (hash(x)=12561418)
302
+ 288 train 7.893242 (lr=9.9998e-05) (hash(x)=11036935)
303
+ 289 train 7.855979 (lr=9.9996e-05) (hash(x)=13778264)
304
+ 290 train 8.102590 (lr=9.9993e-05) (hash(x)=15215237)
305
+ 291 train 8.413849 (lr=9.9989e-05) (hash(x)=9877852)
306
+ 292 train 7.889930 (lr=9.9984e-05) (hash(x)=10232210)
307
+ 293 train 7.984270 (lr=9.9979e-05) (hash(x)=11968670)
308
+ 294 train 7.916226 (lr=9.9972e-05) (hash(x)=14272928)
309
+ 295 train 7.990905 (lr=9.9965e-05) (hash(x)=9256992)
310
+ 296 train 7.971549 (lr=9.9956e-05) (hash(x)=12593445)
311
+ 297 train 7.881117 (lr=9.9947e-05) (hash(x)=13208924)
312
+ 298 train 7.919998 (lr=9.9937e-05) (hash(x)=11578007)
313
+ 299 train 7.887828 (lr=9.9926e-05) (hash(x)=9762793)
314
+ 300 val loss 7.8798
315
+ 300 val perplexity 2643.4468
316
+ 300 train 8.044203 (lr=9.9915e-05) (hash(x)=12966781)
317
+ 301 train 8.144435 (lr=9.9902e-05) (hash(x)=15724292)
318
+ 302 train 8.154574 (lr=9.9889e-05) (hash(x)=15100923)
319
+ 303 train 8.106503 (lr=9.9874e-05) (hash(x)=13336322)
320
+ 304 train 7.954369 (lr=9.9859e-05) (hash(x)=10985744)
321
+ 305 train 7.909472 (lr=9.9843e-05) (hash(x)=12715509)
322
+ 306 train 7.854469 (lr=9.9826e-05) (hash(x)=11842498)
323
+ 307 train 7.821999 (lr=9.9808e-05) (hash(x)=9614210)
324
+ 308 train 7.822833 (lr=9.9789e-05) (hash(x)=11918721)
325
+ 309 train 7.825284 (lr=9.9770e-05) (hash(x)=12988502)
326
+ 310 train 7.813247 (lr=9.9749e-05) (hash(x)=9377388)
327
+ 311 train 7.803995 (lr=9.9728e-05) (hash(x)=12043981)
328
+ 312 train 7.899476 (lr=9.9706e-05) (hash(x)=12250543)
329
+ 313 train 7.759882 (lr=9.9683e-05) (hash(x)=11337762)
330
+ 314 train 7.880283 (lr=9.9659e-05) (hash(x)=13608297)
331
+ 315 train 7.922638 (lr=9.9634e-05) (hash(x)=10432200)
332
+ 316 train 7.813871 (lr=9.9609e-05) (hash(x)=12645115)
333
+ 317 train 7.774505 (lr=9.9582e-05) (hash(x)=12948458)
334
+ 318 train 7.893652 (lr=9.9555e-05) (hash(x)=12187458)
335
+ 319 train 7.769280 (lr=9.9526e-05) (hash(x)=15117606)
336
+ 320 train 7.773646 (lr=9.9497e-05) (hash(x)=12491246)
337
+ 321 train 7.762526 (lr=9.9467e-05) (hash(x)=9760285)
338
+ 322 train 7.838205 (lr=9.9437e-05) (hash(x)=12041167)
339
+ 323 train 7.760711 (lr=9.9405e-05) (hash(x)=11748314)
340
+ 324 train 7.561112 (lr=9.9372e-05) (hash(x)=11735134)
341
+ 325 train 7.791825 (lr=9.9339e-05) (hash(x)=14573695)
342
+ 326 train 7.704654 (lr=9.9305e-05) (hash(x)=14306208)
343
+ 327 train 7.715310 (lr=9.9270e-05) (hash(x)=13862746)
344
+ 328 train 7.775133 (lr=9.9234e-05) (hash(x)=10815095)
345
+ 329 train 7.725420 (lr=9.9197e-05) (hash(x)=13686163)
346
+ 330 train 7.808816 (lr=9.9159e-05) (hash(x)=14377992)
347
+ 331 train 7.766162 (lr=9.9121e-05) (hash(x)=12421087)
348
+ 332 train 7.726565 (lr=9.9081e-05) (hash(x)=13085792)
349
+ 333 train 7.953267 (lr=9.9041e-05) (hash(x)=7228805)
350
+ 334 train 7.834518 (lr=9.9000e-05) (hash(x)=10015517)
351
+ 335 train 7.663844 (lr=9.8958e-05) (hash(x)=11774322)
352
+ 336 train 7.799214 (lr=9.8915e-05) (hash(x)=12674150)
353
+ 337 train 7.758960 (lr=9.8872e-05) (hash(x)=13116742)
354
+ 338 train 7.789169 (lr=9.8827e-05) (hash(x)=15946962)
355
+ 339 train 7.761064 (lr=9.8782e-05) (hash(x)=13171632)
356
+ 340 train 7.751712 (lr=9.8736e-05) (hash(x)=14334587)
357
+ 341 train 7.706895 (lr=9.8689e-05) (hash(x)=8960004)
358
+ 342 train 7.652092 (lr=9.8641e-05) (hash(x)=11642701)
359
+ 343 train 7.780340 (lr=9.8592e-05) (hash(x)=12969713)
360
+ 344 train 7.631021 (lr=9.8543e-05) (hash(x)=13050602)
361
+ 345 train 7.678365 (lr=9.8492e-05) (hash(x)=13929053)
362
+ 346 train 7.796197 (lr=9.8441e-05) (hash(x)=13096472)
363
+ 347 train 7.672542 (lr=9.8389e-05) (hash(x)=10986878)
364
+ 348 train 7.601457 (lr=9.8336e-05) (hash(x)=13231569)
365
+ 349 train 7.534268 (lr=9.8282e-05) (hash(x)=10360614)
366
+ 350 val loss 7.6831
367
+ 350 val perplexity 2171.3555
368
+ 350 train 7.608648 (lr=9.8228e-05) (hash(x)=14738108)
369
+ 351 train 7.671708 (lr=9.8172e-05) (hash(x)=13341647)
370
+ 352 train 7.739453 (lr=9.8116e-05) (hash(x)=12687712)
371
+ 353 train 7.630794 (lr=9.8059e-05) (hash(x)=12645920)
372
+ 354 train 7.616489 (lr=9.8001e-05) (hash(x)=13050918)
373
+ 355 train 7.656601 (lr=9.7942e-05) (hash(x)=10203871)
374
+ 356 train 7.644189 (lr=9.7882e-05) (hash(x)=12904927)
375
+ 357 train 7.754632 (lr=9.7822e-05) (hash(x)=14048300)
376
+ 358 train 7.654190 (lr=9.7761e-05) (hash(x)=11802275)
377
+ 359 train 7.660752 (lr=9.7699e-05) (hash(x)=13853279)
378
+ 360 train 7.741071 (lr=9.7636e-05) (hash(x)=12126692)
379
+ 361 train 7.605831 (lr=9.7572e-05) (hash(x)=14104463)
380
+ 362 train 7.534350 (lr=9.7507e-05) (hash(x)=13536583)
381
+ 363 train 7.702738 (lr=9.7442e-05) (hash(x)=13197896)
382
+ 364 train 7.672488 (lr=9.7376e-05) (hash(x)=13408658)
383
+ 365 train 7.601171 (lr=9.7309e-05) (hash(x)=11875111)
384
+ 366 train 7.623244 (lr=9.7241e-05) (hash(x)=15057683)
385
+ 367 train 7.693527 (lr=9.7172e-05) (hash(x)=11993762)
386
+ 368 train 7.569124 (lr=9.7103e-05) (hash(x)=10677495)
387
+ 369 train 7.619697 (lr=9.7032e-05) (hash(x)=10105877)
388
+ 370 train 7.580214 (lr=9.6961e-05) (hash(x)=14131566)
389
+ 371 train 7.602437 (lr=9.6889e-05) (hash(x)=9447587)
390
+ 372 train 7.548701 (lr=9.6817e-05) (hash(x)=10069430)
391
+ 373 train 7.531095 (lr=9.6743e-05) (hash(x)=14464065)
392
+ 374 train 7.559900 (lr=9.6669e-05) (hash(x)=11265273)
393
+ 375 train 7.549254 (lr=9.6593e-05) (hash(x)=12235258)
394
+ 376 train 7.584501 (lr=9.6518e-05) (hash(x)=13640309)
395
+ 377 train 7.612022 (lr=9.6441e-05) (hash(x)=12814938)
396
+ 378 train 7.670636 (lr=9.6363e-05) (hash(x)=12343839)
397
+ 379 train 7.557689 (lr=9.6285e-05) (hash(x)=13007077)
398
+ 380 train 7.500021 (lr=9.6206e-05) (hash(x)=11835624)
399
+ 381 train 7.525685 (lr=9.6126e-05) (hash(x)=9419131)
400
+ 382 train 7.417609 (lr=9.6045e-05) (hash(x)=11209923)
401
+ 383 train 7.707348 (lr=9.5963e-05) (hash(x)=11977301)
402
+ 384 train 7.745738 (lr=9.5881e-05) (hash(x)=11326187)
403
+ 385 train 7.511985 (lr=9.5798e-05) (hash(x)=11772591)
404
+ 386 train 7.805121 (lr=9.5714e-05) (hash(x)=15539329)
405
+ 387 train 7.887372 (lr=9.5629e-05) (hash(x)=10882741)
406
+ 388 train 7.533478 (lr=9.5544e-05) (hash(x)=9341637)
407
+ 389 train 8.038224 (lr=9.5457e-05) (hash(x)=13174638)
408
+ 390 train 7.656454 (lr=9.5370e-05) (hash(x)=12490208)
409
+ 391 train 7.657305 (lr=9.5282e-05) (hash(x)=9018988)
410
+ 392 train 7.663507 (lr=9.5194e-05) (hash(x)=15752665)
411
+ 393 train 7.767238 (lr=9.5104e-05) (hash(x)=12406522)
412
+ 394 train 7.592674 (lr=9.5014e-05) (hash(x)=12323425)
413
+ 395 train 7.620439 (lr=9.4923e-05) (hash(x)=11486400)
414
+ 396 train 7.616739 (lr=9.4831e-05) (hash(x)=13731035)
415
+ 397 train 7.680372 (lr=9.4739e-05) (hash(x)=11434626)
416
+ 398 train 7.610010 (lr=9.4646e-05) (hash(x)=12174800)
417
+ 399 train 7.577083 (lr=9.4551e-05) (hash(x)=11810699)
418
+ 400 val loss 7.5846
419
+ 400 val perplexity 1967.6256
420
+ 400 train 7.597749 (lr=9.4457e-05) (hash(x)=10805897)
421
+ 401 train 7.573923 (lr=9.4361e-05) (hash(x)=15232839)
422
+ 402 train 7.706252 (lr=9.4265e-05) (hash(x)=15206031)
423
+ 403 train 7.733481 (lr=9.4168e-05) (hash(x)=12121527)
424
+ 404 train 7.754501 (lr=9.4070e-05) (hash(x)=12401285)
425
+ 405 train 7.556056 (lr=9.3971e-05) (hash(x)=12535938)
426
+ 406 train 7.476427 (lr=9.3872e-05) (hash(x)=11242362)
427
+ 407 train 7.517230 (lr=9.3772e-05) (hash(x)=13461204)
428
+ 408 train 7.570787 (lr=9.3671e-05) (hash(x)=13281798)
429
+ 409 train 7.696587 (lr=9.3569e-05) (hash(x)=13139637)
430
+ 410 train 7.460077 (lr=9.3467e-05) (hash(x)=13854352)
431
+ 411 train 7.471572 (lr=9.3364e-05) (hash(x)=12479097)
432
+ 412 train 7.483540 (lr=9.3260e-05) (hash(x)=12741634)
433
+ 413 train 8.229985 (lr=9.3155e-05) (hash(x)=13309670)
434
+ 414 train 7.548812 (lr=9.3050e-05) (hash(x)=11812959)
435
+ 415 train 7.493743 (lr=9.2944e-05) (hash(x)=13759175)
436
+ 416 train 7.544148 (lr=9.2837e-05) (hash(x)=12390919)
437
+ 417 train 7.795444 (lr=9.2729e-05) (hash(x)=14417261)
438
+ 418 train 7.578106 (lr=9.2621e-05) (hash(x)=13350903)
439
+ 419 train 7.510305 (lr=9.2512e-05) (hash(x)=13854240)
440
+ 420 train 7.731241 (lr=9.2402e-05) (hash(x)=13510491)
441
+ 421 train 7.579117 (lr=9.2292e-05) (hash(x)=12890682)
442
+ 422 train 7.544815 (lr=9.2181e-05) (hash(x)=12185495)
443
+ 423 train 7.680448 (lr=9.2069e-05) (hash(x)=12600677)
444
+ 424 train 7.681854 (lr=9.1956e-05) (hash(x)=13041967)
445
+ 425 train 7.567707 (lr=9.1843e-05) (hash(x)=10171528)
446
+ 426 train 7.563455 (lr=9.1729e-05) (hash(x)=10640767)
447
+ 427 train 7.663056 (lr=9.1614e-05) (hash(x)=13458171)
448
+ 428 train 7.446726 (lr=9.1499e-05) (hash(x)=11457792)
449
+ 429 train 7.531363 (lr=9.1382e-05) (hash(x)=13471569)
450
+ 430 train 7.786706 (lr=9.1266e-05) (hash(x)=15321414)
451
+ 431 train 7.635223 (lr=9.1148e-05) (hash(x)=11813286)
452
+ 432 train 7.466856 (lr=9.1030e-05) (hash(x)=12763755)
453
+ 433 train 7.630887 (lr=9.0911e-05) (hash(x)=10811811)
454
+ 434 train 7.483490 (lr=9.0791e-05) (hash(x)=9411122)
455
+ 435 train 7.456389 (lr=9.0671e-05) (hash(x)=12462956)
456
+ 436 train 7.529957 (lr=9.0550e-05) (hash(x)=12764015)
457
+ 437 train 7.701620 (lr=9.0428e-05) (hash(x)=13306710)
458
+ 438 train 7.434993 (lr=9.0306e-05) (hash(x)=12243473)
459
+ 439 train 7.583931 (lr=9.0182e-05) (hash(x)=10772649)
460
+ 440 train 7.447593 (lr=9.0059e-05) (hash(x)=15248203)
461
+ 441 train 7.389831 (lr=8.9934e-05) (hash(x)=14886975)
462
+ 442 train 7.450053 (lr=8.9809e-05) (hash(x)=10539754)
463
+ 443 train 7.496155 (lr=8.9683e-05) (hash(x)=12534460)
464
+ 444 train 7.381038 (lr=8.9557e-05) (hash(x)=11540156)
465
+ 445 train 7.285108 (lr=8.9430e-05) (hash(x)=11701258)
466
+ 446 train 7.525066 (lr=8.9302e-05) (hash(x)=13548749)
467
+ 447 train 7.604809 (lr=8.9173e-05) (hash(x)=13546394)
468
+ 448 train 7.484581 (lr=8.9044e-05) (hash(x)=13046479)
469
+ 449 train 7.573463 (lr=8.8914e-05) (hash(x)=12655917)
470
+ 450 val loss 7.5528
471
+ 450 val perplexity 1905.9863
472
+ 450 train 7.347743 (lr=8.8784e-05) (hash(x)=10602039)
473
+ 451 train 7.479167 (lr=8.8653e-05) (hash(x)=12289584)
474
+ 452 train 7.550191 (lr=8.8521e-05) (hash(x)=12105301)
475
+ 453 train 7.454079 (lr=8.8388e-05) (hash(x)=11593067)
476
+ 454 train 7.540832 (lr=8.8255e-05) (hash(x)=13061011)
477
+ 455 train 7.564131 (lr=8.8122e-05) (hash(x)=13707506)
478
+ 456 train 7.612805 (lr=8.7987e-05) (hash(x)=11745344)
479
+ 457 train 7.537314 (lr=8.7852e-05) (hash(x)=13382803)
480
+ 458 train 7.526602 (lr=8.7717e-05) (hash(x)=10951259)
481
+ 459 train 7.505188 (lr=8.7580e-05) (hash(x)=15784201)
482
+ 460 train 7.572526 (lr=8.7444e-05) (hash(x)=13413178)
483
+ 461 train 7.417849 (lr=8.7306e-05) (hash(x)=14569416)
484
+ 462 train 7.443406 (lr=8.7168e-05) (hash(x)=10369396)
485
+ 463 train 7.625181 (lr=8.7029e-05) (hash(x)=12612709)
486
+ 464 train 7.568489 (lr=8.6890e-05) (hash(x)=12196831)
487
+ 465 train 7.504036 (lr=8.6750e-05) (hash(x)=13603725)
488
+ 466 train 7.433267 (lr=8.6609e-05) (hash(x)=12426620)
489
+ 467 train 7.850050 (lr=8.6468e-05) (hash(x)=13034137)
490
+ 468 train 7.567615 (lr=8.6326e-05) (hash(x)=10501727)
491
+ 469 train 7.469006 (lr=8.6184e-05) (hash(x)=13569897)
492
+ 470 train 7.570903 (lr=8.6041e-05) (hash(x)=10398118)
493
+ 471 train 7.524466 (lr=8.5897e-05) (hash(x)=11379182)
494
+ 472 train 7.539942 (lr=8.5753e-05) (hash(x)=9972768)
495
+ 473 train 7.529263 (lr=8.5608e-05) (hash(x)=10772043)
496
+ 474 train 7.440810 (lr=8.5462e-05) (hash(x)=11453247)
497
+ 475 train 7.505361 (lr=8.5316e-05) (hash(x)=13194257)
498
+ 476 train 7.447592 (lr=8.5170e-05) (hash(x)=13336861)
499
+ 477 train 7.492377 (lr=8.5022e-05) (hash(x)=11410706)
500
+ 478 train 7.301122 (lr=8.4875e-05) (hash(x)=14280695)
501
+ 479 train 7.342374 (lr=8.4726e-05) (hash(x)=12234355)
502
+ 480 train 7.691111 (lr=8.4577e-05) (hash(x)=11419225)
503
+ 481 train 7.461441 (lr=8.4428e-05) (hash(x)=13227893)
504
+ 482 train 7.688045 (lr=8.4278e-05) (hash(x)=11841994)
505
+ 483 train 7.339045 (lr=8.4127e-05) (hash(x)=11006832)
506
+ 484 train 7.378447 (lr=8.3976e-05) (hash(x)=11812481)
507
+ 485 train 7.487636 (lr=8.3824e-05) (hash(x)=10942327)
508
+ 486 train 7.404693 (lr=8.3672e-05) (hash(x)=13048572)
509
+ 487 train 7.273370 (lr=8.3519e-05) (hash(x)=11307988)
510
+ 488 train 7.393594 (lr=8.3366e-05) (hash(x)=10678624)
511
+ 489 train 7.445647 (lr=8.3212e-05) (hash(x)=11990825)
512
+ 490 train 7.357392 (lr=8.3057e-05) (hash(x)=11321271)
513
+ 491 train 7.523115 (lr=8.2902e-05) (hash(x)=11230858)
514
+ 492 train 7.374776 (lr=8.2746e-05) (hash(x)=11886513)
515
+ 493 train 7.607181 (lr=8.2590e-05) (hash(x)=13052292)
516
+ 494 train 7.199693 (lr=8.2434e-05) (hash(x)=10598277)
517
+ 495 train 7.665239 (lr=8.2276e-05) (hash(x)=13337837)
518
+ 496 train 7.593916 (lr=8.2119e-05) (hash(x)=12914018)
519
+ 497 train 7.690903 (lr=8.1960e-05) (hash(x)=13648351)
520
+ 498 train 7.588863 (lr=8.1801e-05) (hash(x)=11819798)
521
+ 499 train 7.302183 (lr=8.1642e-05) (hash(x)=8133784)
522
+ 500 val loss 7.5125
523
+ 500 val perplexity 1830.7985
524
+ 500 train 7.438015 (lr=8.1482e-05) (hash(x)=13041235)
525
+ 501 train 7.648342 (lr=8.1322e-05) (hash(x)=12571284)
526
+ 502 train 7.372251 (lr=8.1161e-05) (hash(x)=10724747)
527
+ 503 train 7.651697 (lr=8.1000e-05) (hash(x)=11366947)
528
+ 504 train 7.543904 (lr=8.0838e-05) (hash(x)=9729255)
529
+ 505 train 7.674073 (lr=8.0676e-05) (hash(x)=12997836)
530
+ 506 train 7.418233 (lr=8.0513e-05) (hash(x)=11778175)
531
+ 507 train 7.584882 (lr=8.0349e-05) (hash(x)=14611119)
532
+ 508 train 7.580052 (lr=8.0186e-05) (hash(x)=12349115)
533
+ 509 train 7.400591 (lr=8.0021e-05) (hash(x)=10423579)
534
+ 510 train 7.554747 (lr=7.9856e-05) (hash(x)=14187304)
535
+ 511 train 7.822964 (lr=7.9691e-05) (hash(x)=12581602)
536
+ 512 train 7.571360 (lr=7.9525e-05) (hash(x)=13476452)
537
+ 513 train 7.536673 (lr=7.9359e-05) (hash(x)=12215534)
538
+ 514 train 7.419743 (lr=7.9192e-05) (hash(x)=9409868)
539
+ 515 train 7.321919 (lr=7.9025e-05) (hash(x)=10812793)
540
+ 516 train 7.457541 (lr=7.8858e-05) (hash(x)=12379852)
541
+ 517 train 7.532557 (lr=7.8689e-05) (hash(x)=10559198)
542
+ 518 train 7.632159 (lr=7.8521e-05) (hash(x)=12350446)
543
+ 519 train 7.626479 (lr=7.8352e-05) (hash(x)=12102265)
544
+ 520 train 7.486510 (lr=7.8182e-05) (hash(x)=14849069)
545
+ 521 train 7.684856 (lr=7.8012e-05) (hash(x)=10739336)
546
+ 522 train 7.487222 (lr=7.7842e-05) (hash(x)=11768670)
547
+ 523 train 7.464436 (lr=7.7671e-05) (hash(x)=12247648)
548
+ 524 train 7.451417 (lr=7.7500e-05) (hash(x)=14364765)
549
+ 525 train 7.436358 (lr=7.7328e-05) (hash(x)=11816454)
550
+ 526 train 7.440238 (lr=7.7156e-05) (hash(x)=12220238)
551
+ 527 train 7.373689 (lr=7.6984e-05) (hash(x)=9564907)
552
+ 528 train 7.569030 (lr=7.6811e-05) (hash(x)=13654616)
553
+ 529 train 7.550700 (lr=7.6637e-05) (hash(x)=16165649)
554
+ 530 train 7.384135 (lr=7.6463e-05) (hash(x)=9844555)
555
+ 531 train 7.573589 (lr=7.6289e-05) (hash(x)=12152874)
556
+ 532 train 7.478400 (lr=7.6115e-05) (hash(x)=17090108)
557
+ 533 train 7.358916 (lr=7.5940e-05) (hash(x)=7136761)
558
+ 534 train 7.482426 (lr=7.5764e-05) (hash(x)=15315040)
559
+ 535 train 7.456536 (lr=7.5588e-05) (hash(x)=11163149)
560
+ 536 train 7.465367 (lr=7.5412e-05) (hash(x)=12765838)
561
+ 537 train 7.521924 (lr=7.5235e-05) (hash(x)=11285917)
562
+ 538 train 7.345324 (lr=7.5058e-05) (hash(x)=10931519)
563
+ 539 train 7.446640 (lr=7.4881e-05) (hash(x)=12002261)
564
+ 540 train 7.405920 (lr=7.4703e-05) (hash(x)=11302510)
565
+ 541 train 7.447069 (lr=7.4525e-05) (hash(x)=13341656)
566
+ 542 train 7.576398 (lr=7.4346e-05) (hash(x)=14598783)
567
+ 543 train 7.432443 (lr=7.4167e-05) (hash(x)=11617372)
568
+ 544 train 7.524226 (lr=7.3988e-05) (hash(x)=15659388)
569
+ 545 train 7.414237 (lr=7.3808e-05) (hash(x)=12952882)
570
+ 546 train 7.439837 (lr=7.3628e-05) (hash(x)=10511717)
571
+ 547 train 7.357103 (lr=7.3448e-05) (hash(x)=12134313)
572
+ 548 train 7.356125 (lr=7.3267e-05) (hash(x)=11614627)
573
+ 549 train 7.427023 (lr=7.3086e-05) (hash(x)=13628925)
574
+ 550 val loss 7.4963
575
+ 550 val perplexity 1801.4083
576
+ 550 train 7.487904 (lr=7.2904e-05) (hash(x)=11918874)
577
+ 551 train 7.508917 (lr=7.2723e-05) (hash(x)=13040907)
578
+ 552 train 7.312903 (lr=7.2540e-05) (hash(x)=11698109)
579
+ 553 train 7.658591 (lr=7.2358e-05) (hash(x)=13396573)
580
+ 554 train 7.418825 (lr=7.2175e-05) (hash(x)=11733616)
581
+ 555 train 7.397337 (lr=7.1992e-05) (hash(x)=12694349)
582
+ 556 train 7.425029 (lr=7.1808e-05) (hash(x)=12312060)
583
+ 557 train 7.498386 (lr=7.1624e-05) (hash(x)=11344189)
584
+ 558 train 7.612772 (lr=7.1440e-05) (hash(x)=13385426)
585
+ 559 train 7.750234 (lr=7.1256e-05) (hash(x)=13491438)
586
+ 560 train 7.530766 (lr=7.1071e-05) (hash(x)=12488110)
587
+ 561 train 7.311863 (lr=7.0886e-05) (hash(x)=11821644)
588
+ 562 train 7.677312 (lr=7.0701e-05) (hash(x)=12743266)
589
+ 563 train 7.469641 (lr=7.0515e-05) (hash(x)=13057230)
590
+ 564 train 7.439109 (lr=7.0329e-05) (hash(x)=12158411)
591
+ 565 train 7.568459 (lr=7.0143e-05) (hash(x)=13676361)
592
+ 566 train 7.524297 (lr=6.9956e-05) (hash(x)=12596927)
593
+ 567 train 7.454080 (lr=6.9769e-05) (hash(x)=10013331)
594
+ 568 train 7.513072 (lr=6.9582e-05) (hash(x)=8940660)
595
+ 569 train 7.416886 (lr=6.9394e-05) (hash(x)=11426899)
596
+ 570 train 7.522228 (lr=6.9207e-05) (hash(x)=13527285)
597
+ 571 train 7.596833 (lr=6.9019e-05) (hash(x)=13164710)
598
+ 572 train 7.513178 (lr=6.8830e-05) (hash(x)=11663038)
599
+ 573 train 7.350200 (lr=6.8642e-05) (hash(x)=11992117)
600
+ 574 train 7.416256 (lr=6.8453e-05) (hash(x)=11887123)
601
+ 575 train 7.411874 (lr=6.8264e-05) (hash(x)=8821401)
602
+ 576 train 7.537101 (lr=6.8075e-05) (hash(x)=12938718)
603
+ 577 train 7.488029 (lr=6.7885e-05) (hash(x)=12160782)
604
+ 578 train 7.606245 (lr=6.7695e-05) (hash(x)=11053576)
605
+ 579 train 7.437332 (lr=6.7505e-05) (hash(x)=11361433)
606
+ 580 train 7.403644 (lr=6.7315e-05) (hash(x)=11241602)
607
+ 581 train 7.691932 (lr=6.7124e-05) (hash(x)=13467938)
608
+ 582 train 7.384058 (lr=6.6933e-05) (hash(x)=13390909)
609
+ 583 train 7.482238 (lr=6.6742e-05) (hash(x)=11331081)
610
+ 584 train 7.595996 (lr=6.6551e-05) (hash(x)=13257683)
611
+ 585 train 7.404633 (lr=6.6360e-05) (hash(x)=11335829)
612
+ 586 train 7.418481 (lr=6.6168e-05) (hash(x)=12181497)
613
+ 587 train 7.591026 (lr=6.5976e-05) (hash(x)=12036515)
614
+ 588 train 7.328038 (lr=6.5784e-05) (hash(x)=11645653)
615
+ 589 train 7.340950 (lr=6.5592e-05) (hash(x)=9603893)
616
+ 590 train 7.293810 (lr=6.5399e-05) (hash(x)=11916423)
617
+ 591 train 7.394503 (lr=6.5206e-05) (hash(x)=11969785)
618
+ 592 train 7.311136 (lr=6.5013e-05) (hash(x)=11087913)
619
+ 593 train 7.462017 (lr=6.4820e-05) (hash(x)=12961307)
620
+ 594 train 7.383583 (lr=6.4627e-05) (hash(x)=13208036)
621
+ 595 train 7.432051 (lr=6.4433e-05) (hash(x)=11425295)
622
+ 596 train 7.413149 (lr=6.4240e-05) (hash(x)=13460645)
623
+ 597 train 7.940939 (lr=6.4046e-05) (hash(x)=11699159)
624
+ 598 train 7.472442 (lr=6.3852e-05) (hash(x)=14195403)
625
+ 599 train 7.600129 (lr=6.3658e-05) (hash(x)=14729056)
626
+ 600 val loss 7.4779
627
+ 600 val perplexity 1768.4590
628
+ 600 train 7.681319 (lr=6.3463e-05) (hash(x)=13745864)
629
+ 601 train 7.736248 (lr=6.3269e-05) (hash(x)=12321449)
630
+ 602 train 7.428635 (lr=6.3074e-05) (hash(x)=12672696)
631
+ 603 train 7.469262 (lr=6.2879e-05) (hash(x)=11672756)
632
+ 604 train 7.510104 (lr=6.2684e-05) (hash(x)=10336288)
633
+ 605 train 7.598173 (lr=6.2489e-05) (hash(x)=11800594)
634
+ 606 train 7.706091 (lr=6.2294e-05) (hash(x)=13024293)
635
+ 607 train 7.515874 (lr=6.2098e-05) (hash(x)=12575397)
636
+ 608 train 7.583494 (lr=6.1903e-05) (hash(x)=13287310)
637
+ 609 train 7.603239 (lr=6.1707e-05) (hash(x)=11738884)
638
+ 610 train 7.512786 (lr=6.1511e-05) (hash(x)=11748148)
639
+ 611 train 7.536718 (lr=6.1315e-05) (hash(x)=12947093)
640
+ 612 train 7.607229 (lr=6.1119e-05) (hash(x)=10794667)
641
+ 613 train 7.490963 (lr=6.0923e-05) (hash(x)=14377955)
642
+ 614 train 7.570071 (lr=6.0726e-05) (hash(x)=10807688)
643
+ 615 train 7.626755 (lr=6.0530e-05) (hash(x)=11244769)
644
+ 616 train 7.590099 (lr=6.0333e-05) (hash(x)=14038283)
645
+ 617 train 7.497193 (lr=6.0137e-05) (hash(x)=13749200)
646
+ 618 train 7.489282 (lr=5.9940e-05) (hash(x)=12653340)
647
+ 619 train 7.465350 (lr=5.9743e-05) (hash(x)=12603266)
648
+ 620 train 7.511334 (lr=5.9546e-05) (hash(x)=13028854)
649
+ 621 train 7.516984 (lr=5.9349e-05) (hash(x)=13143575)
650
+ 622 train 7.828506 (lr=5.9152e-05) (hash(x)=14763952)
651
+ 623 train 7.534526 (lr=5.8955e-05) (hash(x)=10361816)
652
+ 624 train 7.567252 (lr=5.8758e-05) (hash(x)=14037539)
653
+ 625 train 7.576318 (lr=5.8560e-05) (hash(x)=13103193)
654
+ 626 train 7.682274 (lr=5.8363e-05) (hash(x)=11556600)
655
+ 627 train 7.606869 (lr=5.8165e-05) (hash(x)=13579265)
656
+ 628 train 7.345531 (lr=5.7968e-05) (hash(x)=9852762)
657
+ 629 train 7.441614 (lr=5.7770e-05) (hash(x)=12545114)
658
+ 630 train 7.668101 (lr=5.7573e-05) (hash(x)=9486226)
659
+ 631 train 7.422272 (lr=5.7375e-05) (hash(x)=12209411)
660
+ 632 train 7.423884 (lr=5.7177e-05) (hash(x)=11864068)
661
+ 633 train 7.589712 (lr=5.6979e-05) (hash(x)=12672926)
662
+ 634 train 7.490307 (lr=5.6782e-05) (hash(x)=12433434)
663
+ 635 train 7.396738 (lr=5.6584e-05) (hash(x)=12597133)
664
+ 636 train 7.464248 (lr=5.6386e-05) (hash(x)=12650359)
665
+ 637 train 7.563903 (lr=5.6188e-05) (hash(x)=14114689)
666
+ 638 train 7.700822 (lr=5.5990e-05) (hash(x)=11676050)
667
+ 639 train 7.439381 (lr=5.5792e-05) (hash(x)=12983523)
668
+ 640 train 7.531420 (lr=5.5594e-05) (hash(x)=12379110)
669
+ 641 train 7.491056 (lr=5.5396e-05) (hash(x)=12423458)
670
+ 642 train 7.664492 (lr=5.5198e-05) (hash(x)=10325729)
671
+ 643 train 7.738941 (lr=5.5000e-05) (hash(x)=12621340)
672
+ 644 train 7.454315 (lr=5.4802e-05) (hash(x)=12775620)
673
+ 645 train 7.479337 (lr=5.4604e-05) (hash(x)=14660226)
674
+ 646 train 7.553749 (lr=5.4406e-05) (hash(x)=14893320)
675
+ 647 train 7.755746 (lr=5.4208e-05) (hash(x)=13929193)
676
+ 648 train 7.604027 (lr=5.4010e-05) (hash(x)=14983930)
677
+ 649 train 7.540837 (lr=5.3812e-05) (hash(x)=12874680)
678
+ 650 val loss 7.4510
679
+ 650 val perplexity 1721.5818
680
+ 650 train 7.568626 (lr=5.3614e-05) (hash(x)=14092954)
681
+ 651 train 7.560796 (lr=5.3416e-05) (hash(x)=12140734)
682
+ 652 train 7.597850 (lr=5.3218e-05) (hash(x)=12249928)
683
+ 653 train 7.549144 (lr=5.3021e-05) (hash(x)=14920593)
684
+ 654 train 7.586181 (lr=5.2823e-05) (hash(x)=13026900)
685
+ 655 train 7.529675 (lr=5.2625e-05) (hash(x)=12432281)
686
+ 656 train 7.478646 (lr=5.2427e-05) (hash(x)=11236804)
687
+ 657 train 7.639862 (lr=5.2230e-05) (hash(x)=9254879)
688
+ 658 train 7.520547 (lr=5.2032e-05) (hash(x)=12792209)
689
+ 659 train 7.508458 (lr=5.1835e-05) (hash(x)=12002995)
690
+ 660 train 7.549903 (lr=5.1637e-05) (hash(x)=12750282)
691
+ 661 train 7.602744 (lr=5.1440e-05) (hash(x)=14885943)
692
+ 662 train 7.615678 (lr=5.1242e-05) (hash(x)=15191168)
693
+ 663 train 7.353627 (lr=5.1045e-05) (hash(x)=10854054)
694
+ 664 train 7.426368 (lr=5.0848e-05) (hash(x)=12012154)
695
+ 665 train 7.500179 (lr=5.0651e-05) (hash(x)=11853966)
696
+ 666 train 7.588507 (lr=5.0454e-05) (hash(x)=12464600)
697
+ 667 train 7.552716 (lr=5.0257e-05) (hash(x)=10084405)
698
+ 668 train 7.541563 (lr=5.0060e-05) (hash(x)=12777405)
699
+ 669 train 7.524692 (lr=4.9863e-05) (hash(x)=12641903)
700
+ 670 train 7.648370 (lr=4.9667e-05) (hash(x)=12451662)
701
+ 671 train 7.607507 (lr=4.9470e-05) (hash(x)=14240666)
702
+ 672 train 7.539137 (lr=4.9274e-05) (hash(x)=12695347)
703
+ 673 train 7.612584 (lr=4.9077e-05) (hash(x)=10072428)
704
+ 674 train 7.465147 (lr=4.8881e-05) (hash(x)=10876939)
705
+ 675 train 7.443495 (lr=4.8685e-05) (hash(x)=10745946)
706
+ 676 train 7.555520 (lr=4.8489e-05) (hash(x)=11558164)
707
+ 677 train 7.489335 (lr=4.8293e-05) (hash(x)=13059228)
708
+ 678 train 7.564990 (lr=4.8097e-05) (hash(x)=13832610)
709
+ 679 train 7.606705 (lr=4.7902e-05) (hash(x)=12662035)
710
+ 680 train 7.466330 (lr=4.7706e-05) (hash(x)=13676336)
711
+ 681 train 7.589656 (lr=4.7511e-05) (hash(x)=12576794)
712
+ 682 train 7.438496 (lr=4.7316e-05) (hash(x)=10900074)
713
+ 683 train 7.430740 (lr=4.7121e-05) (hash(x)=14080983)
714
+ 684 train 7.514321 (lr=4.6926e-05) (hash(x)=8462025)
715
+ 685 train 7.484394 (lr=4.6731e-05) (hash(x)=12418730)
716
+ 686 train 7.537107 (lr=4.6537e-05) (hash(x)=12309643)
717
+ 687 train 7.521620 (lr=4.6342e-05) (hash(x)=10845546)
718
+ 688 train 7.469253 (lr=4.6148e-05) (hash(x)=10677186)
719
+ 689 train 7.373917 (lr=4.5954e-05) (hash(x)=11011150)
720
+ 690 train 7.441876 (lr=4.5760e-05) (hash(x)=10740665)
721
+ 691 train 7.315762 (lr=4.5567e-05) (hash(x)=8208509)
722
+ 692 train 7.597192 (lr=4.5373e-05) (hash(x)=12911745)
723
+ 693 train 7.382231 (lr=4.5180e-05) (hash(x)=14670016)
724
+ 694 train 7.486005 (lr=4.4987e-05) (hash(x)=12673793)
725
+ 695 train 7.602462 (lr=4.4794e-05) (hash(x)=10969018)
726
+ 696 train 7.560809 (lr=4.4601e-05) (hash(x)=14046245)
727
+ 697 train 7.547968 (lr=4.4408e-05) (hash(x)=13830783)
728
+ 698 train 7.371757 (lr=4.4216e-05) (hash(x)=9300243)
729
+ 699 train 7.706614 (lr=4.4024e-05) (hash(x)=13431817)
730
+ 700 val loss 7.4264
731
+ 700 val perplexity 1679.7427
732
+ 700 train 7.545178 (lr=4.3832e-05) (hash(x)=13305186)
733
+ 701 train 7.838220 (lr=4.3640e-05) (hash(x)=12615836)
734
+ 702 train 7.611536 (lr=4.3449e-05) (hash(x)=9258497)
735
+ 703 train 7.443942 (lr=4.3258e-05) (hash(x)=10578535)
736
+ 704 train 7.506702 (lr=4.3067e-05) (hash(x)=12205101)
737
+ 705 train 7.451088 (lr=4.2876e-05) (hash(x)=10689204)
738
+ 706 train 7.717144 (lr=4.2685e-05) (hash(x)=14424336)
739
+ 707 train 7.394636 (lr=4.2495e-05) (hash(x)=9930132)
740
+ 708 train 7.458880 (lr=4.2305e-05) (hash(x)=13219226)
741
+ 709 train 7.984379 (lr=4.2115e-05) (hash(x)=10658907)
742
+ 710 train 7.455736 (lr=4.1925e-05) (hash(x)=11464602)
743
+ 711 train 7.541266 (lr=4.1736e-05) (hash(x)=13086683)
744
+ 712 train 7.465263 (lr=4.1547e-05) (hash(x)=9898311)
745
+ 713 train 7.438806 (lr=4.1358e-05) (hash(x)=12827162)
746
+ 714 train 7.637509 (lr=4.1170e-05) (hash(x)=10395991)
747
+ 715 train 7.558787 (lr=4.0981e-05) (hash(x)=10106741)
748
+ 716 train 7.496244 (lr=4.0793e-05) (hash(x)=12366127)
749
+ 717 train 7.428021 (lr=4.0606e-05) (hash(x)=12604368)
750
+ 718 train 7.496137 (lr=4.0418e-05) (hash(x)=12860026)
751
+ 719 train 7.548615 (lr=4.0231e-05) (hash(x)=11989943)
752
+ 720 train 7.597871 (lr=4.0044e-05) (hash(x)=12195929)
753
+ 721 train 7.453842 (lr=3.9857e-05) (hash(x)=11633622)
754
+ 722 train 7.471001 (lr=3.9671e-05) (hash(x)=8909020)
755
+ 723 train 7.606308 (lr=3.9485e-05) (hash(x)=13736247)
756
+ 724 train 7.623188 (lr=3.9299e-05) (hash(x)=13388850)
757
+ 725 train 7.458272 (lr=3.9114e-05) (hash(x)=14420699)
758
+ 726 train 7.515702 (lr=3.8929e-05) (hash(x)=12901599)
759
+ 727 train 7.559571 (lr=3.8744e-05) (hash(x)=13131512)
760
+ 728 train 7.466420 (lr=3.8560e-05) (hash(x)=14095087)
761
+ 729 train 7.638329 (lr=3.8376e-05) (hash(x)=11022067)
762
+ 730 train 7.397621 (lr=3.8192e-05) (hash(x)=11698468)
763
+ 731 train 7.593631 (lr=3.8008e-05) (hash(x)=16086089)
764
+ 732 train 7.661191 (lr=3.7825e-05) (hash(x)=17935722)
765
+ 733 train 7.545993 (lr=3.7642e-05) (hash(x)=13405918)
766
+ 734 train 7.619768 (lr=3.7460e-05) (hash(x)=13479533)
767
+ 735 train 7.485451 (lr=3.7277e-05) (hash(x)=13913676)
768
+ 736 train 7.567332 (lr=3.7096e-05) (hash(x)=14965789)
769
+ 737 train 7.531419 (lr=3.6914e-05) (hash(x)=12903915)
770
+ 738 train 7.614206 (lr=3.6733e-05) (hash(x)=9558687)
771
+ 739 train 7.496002 (lr=3.6552e-05) (hash(x)=13902058)
772
+ 740 train 7.428246 (lr=3.6372e-05) (hash(x)=14056511)
773
+ 741 train 7.486388 (lr=3.6192e-05) (hash(x)=11532081)
774
+ 742 train 7.627848 (lr=3.6012e-05) (hash(x)=10915032)
775
+ 743 train 7.437314 (lr=3.5833e-05) (hash(x)=11934888)
776
+ 744 train 7.406747 (lr=3.5654e-05) (hash(x)=14380849)
777
+ 745 train 7.434485 (lr=3.5475e-05) (hash(x)=10958610)
778
+ 746 train 7.573936 (lr=3.5297e-05) (hash(x)=14082888)
779
+ 747 train 7.556470 (lr=3.5119e-05) (hash(x)=11310809)
780
+ 748 train 7.461943 (lr=3.4942e-05) (hash(x)=9675993)
781
+ 749 train 7.327909 (lr=3.4765e-05) (hash(x)=13548210)
782
+ 750 val loss 7.4085
783
+ 750 val perplexity 1649.9751
784
+ 750 train 7.323922 (lr=3.4588e-05) (hash(x)=12254091)
785
+ 751 train 7.475982 (lr=3.4412e-05) (hash(x)=13891471)
786
+ 752 train 7.486430 (lr=3.4236e-05) (hash(x)=14903352)
787
+ 753 train 7.516104 (lr=3.4060e-05) (hash(x)=11682127)
788
+ 754 train 7.564634 (lr=3.3885e-05) (hash(x)=14061124)
789
+ 755 train 7.359085 (lr=3.3711e-05) (hash(x)=10965362)
790
+ 756 train 7.436463 (lr=3.3537e-05) (hash(x)=13004497)
791
+ 757 train 7.370955 (lr=3.3363e-05) (hash(x)=10651880)
792
+ 758 train 7.411738 (lr=3.3189e-05) (hash(x)=13616109)
793
+ 759 train 7.508320 (lr=3.3016e-05) (hash(x)=13669344)
794
+ 760 train 7.601729 (lr=3.2844e-05) (hash(x)=13135904)
795
+ 761 train 7.523783 (lr=3.2672e-05) (hash(x)=11985052)
796
+ 762 train 7.412024 (lr=3.2500e-05) (hash(x)=13024915)
797
+ 763 train 7.500429 (lr=3.2329e-05) (hash(x)=14389928)
798
+ 764 train 7.421288 (lr=3.2158e-05) (hash(x)=13129054)
799
+ 765 train 7.365158 (lr=3.1988e-05) (hash(x)=11030412)
800
+ 766 train 7.606483 (lr=3.1818e-05) (hash(x)=11791787)
801
+ 767 train 7.521462 (lr=3.1648e-05) (hash(x)=14432102)
802
+ 768 train 7.505769 (lr=3.1479e-05) (hash(x)=10822477)
803
+ 769 train 7.711597 (lr=3.1311e-05) (hash(x)=11064424)
804
+ 770 train 7.514955 (lr=3.1142e-05) (hash(x)=9890383)
805
+ 771 train 7.463249 (lr=3.0975e-05) (hash(x)=12588549)
806
+ 772 train 7.514605 (lr=3.0808e-05) (hash(x)=11270706)
807
+ 773 train 7.573109 (lr=3.0641e-05) (hash(x)=10906084)
808
+ 774 train 7.421782 (lr=3.0475e-05) (hash(x)=12346731)
809
+ 775 train 7.562531 (lr=3.0309e-05) (hash(x)=14731332)
810
+ 776 train 7.678433 (lr=3.0144e-05) (hash(x)=14388379)
811
+ 777 train 7.386539 (lr=2.9979e-05) (hash(x)=13783087)
812
+ 778 train 7.387280 (lr=2.9814e-05) (hash(x)=10702017)
813
+ 779 train 7.340581 (lr=2.9651e-05) (hash(x)=9645726)
814
+ 780 train 7.435367 (lr=2.9487e-05) (hash(x)=13637945)
815
+ 781 train 7.579770 (lr=2.9324e-05) (hash(x)=11606266)
816
+ 782 train 7.415176 (lr=2.9162e-05) (hash(x)=12287211)
817
+ 783 train 7.745209 (lr=2.9000e-05) (hash(x)=12132180)
818
+ 784 train 7.712450 (lr=2.8839e-05) (hash(x)=12051977)
819
+ 785 train 7.353262 (lr=2.8678e-05) (hash(x)=11647865)
820
+ 786 train 7.215220 (lr=2.8518e-05) (hash(x)=11502703)
821
+ 787 train 7.394165 (lr=2.8358e-05) (hash(x)=16037808)
822
+ 788 train 7.664723 (lr=2.8199e-05) (hash(x)=13068798)
823
+ 789 train 7.605221 (lr=2.8040e-05) (hash(x)=13188212)
824
+ 790 train 7.688595 (lr=2.7881e-05) (hash(x)=13226443)
825
+ 791 train 7.617218 (lr=2.7724e-05) (hash(x)=13549840)
826
+ 792 train 7.534480 (lr=2.7566e-05) (hash(x)=15731405)
827
+ 793 train 7.388684 (lr=2.7410e-05) (hash(x)=10579211)
828
+ 794 train 7.592125 (lr=2.7254e-05) (hash(x)=11500722)
829
+ 795 train 7.395550 (lr=2.7098e-05) (hash(x)=13184509)
830
+ 796 train 7.435340 (lr=2.6943e-05) (hash(x)=12193563)
831
+ 797 train 7.271892 (lr=2.6788e-05) (hash(x)=14514636)
832
+ 798 train 7.514935 (lr=2.6634e-05) (hash(x)=11457349)
833
+ 799 train 7.530169 (lr=2.6481e-05) (hash(x)=12182586)
834
+ 800 val loss 7.3987
835
+ 800 val perplexity 1633.7861
836
+ 800 train 7.580489 (lr=2.6328e-05) (hash(x)=12855317)
837
+ 801 train 7.462350 (lr=2.6176e-05) (hash(x)=8492746)
838
+ 802 train 7.413579 (lr=2.6024e-05) (hash(x)=11626864)
839
+ 803 train 7.484621 (lr=2.5873e-05) (hash(x)=12443902)
840
+ 804 train 7.420046 (lr=2.5722e-05) (hash(x)=10467514)
841
+ 805 train 7.326946 (lr=2.5572e-05) (hash(x)=11537620)
842
+ 806 train 7.438431 (lr=2.5423e-05) (hash(x)=10471275)
843
+ 807 train 7.632340 (lr=2.5274e-05) (hash(x)=12244149)
844
+ 808 train 7.484954 (lr=2.5125e-05) (hash(x)=14515998)
845
+ 809 train 7.611898 (lr=2.4978e-05) (hash(x)=14317138)
846
+ 810 train 7.327219 (lr=2.4830e-05) (hash(x)=10820515)
847
+ 811 train 7.474921 (lr=2.4684e-05) (hash(x)=11624808)
848
+ 812 train 7.361389 (lr=2.4538e-05) (hash(x)=13053386)
849
+ 813 train 7.326178 (lr=2.4392e-05) (hash(x)=11776318)
850
+ 814 train 7.432728 (lr=2.4247e-05) (hash(x)=11765380)
851
+ 815 train 7.362702 (lr=2.4103e-05) (hash(x)=12564010)
852
+ 816 train 7.251594 (lr=2.3959e-05) (hash(x)=11756334)
853
+ 817 train 7.401788 (lr=2.3816e-05) (hash(x)=12559496)
854
+ 818 train 7.558629 (lr=2.3674e-05) (hash(x)=13373821)
855
+ 819 train 7.416736 (lr=2.3532e-05) (hash(x)=14061530)
856
+ 820 train 7.413780 (lr=2.3391e-05) (hash(x)=11734902)
857
+ 821 train 7.458072 (lr=2.3250e-05) (hash(x)=10562192)
858
+ 822 train 7.443389 (lr=2.3110e-05) (hash(x)=13531445)
859
+ 823 train 7.546387 (lr=2.2971e-05) (hash(x)=17383177)
860
+ 824 train 7.534544 (lr=2.2832e-05) (hash(x)=10038826)
861
+ 825 train 7.283525 (lr=2.2694e-05) (hash(x)=11420026)
862
+ 826 train 7.411062 (lr=2.2556e-05) (hash(x)=13320274)
863
+ 827 train 7.479941 (lr=2.2420e-05) (hash(x)=15724893)
864
+ 828 train 7.421375 (lr=2.2283e-05) (hash(x)=11537371)
865
+ 829 train 7.512021 (lr=2.2148e-05) (hash(x)=11899847)
866
+ 830 train 7.392877 (lr=2.2013e-05) (hash(x)=11572171)
867
+ 831 train 7.362029 (lr=2.1878e-05) (hash(x)=13291139)
868
+ 832 train 7.454466 (lr=2.1745e-05) (hash(x)=13476426)
869
+ 833 train 7.727875 (lr=2.1612e-05) (hash(x)=12476228)
870
+ 834 train 7.365035 (lr=2.1479e-05) (hash(x)=10645329)
871
+ 835 train 7.461557 (lr=2.1347e-05) (hash(x)=9132980)
872
+ 836 train 7.536265 (lr=2.1216e-05) (hash(x)=11338056)
873
+ 837 train 7.467300 (lr=2.1086e-05) (hash(x)=11174161)
874
+ 838 train 7.378388 (lr=2.0956e-05) (hash(x)=14244522)
875
+ 839 train 7.319438 (lr=2.0827e-05) (hash(x)=11507164)
876
+ 840 train 7.788777 (lr=2.0698e-05) (hash(x)=20968914)
877
+ 841 train 7.393871 (lr=2.0570e-05) (hash(x)=11694810)
878
+ 842 train 7.221704 (lr=2.0443e-05) (hash(x)=13298854)
879
+ 843 train 7.730706 (lr=2.0317e-05) (hash(x)=11884977)
880
+ 844 train 7.507550 (lr=2.0191e-05) (hash(x)=12405359)
881
+ 845 train 7.479666 (lr=2.0066e-05) (hash(x)=12862078)
882
+ 846 train 7.519173 (lr=1.9941e-05) (hash(x)=11634930)
883
+ 847 train 7.316336 (lr=1.9818e-05) (hash(x)=13225195)
884
+ 848 train 7.481653 (lr=1.9694e-05) (hash(x)=14179955)
885
+ 849 train 7.354677 (lr=1.9572e-05) (hash(x)=12005199)
886
+ 850 val loss 7.3879
887
+ 850 val perplexity 1616.3535
888
+ 850 train 7.488586 (lr=1.9450e-05) (hash(x)=13403175)
889
+ 851 train 7.395118 (lr=1.9329e-05) (hash(x)=10734745)
890
+ 852 train 7.531964 (lr=1.9209e-05) (hash(x)=12914137)
891
+ 853 train 7.398307 (lr=1.9089e-05) (hash(x)=12256492)
892
+ 854 train 7.337024 (lr=1.8970e-05) (hash(x)=13071881)
893
+ 855 train 7.438777 (lr=1.8852e-05) (hash(x)=10200730)
894
+ 856 train 7.344612 (lr=1.8734e-05) (hash(x)=13857089)
895
+ 857 train 7.502236 (lr=1.8618e-05) (hash(x)=11137771)
896
+ 858 train 7.437394 (lr=1.8501e-05) (hash(x)=12393535)
897
+ 859 train 7.535953 (lr=1.8386e-05) (hash(x)=11431388)
898
+ 860 train 7.518038 (lr=1.8271e-05) (hash(x)=8864044)
899
+ 861 train 7.536674 (lr=1.8157e-05) (hash(x)=17958071)
900
+ 862 train 7.417372 (lr=1.8044e-05) (hash(x)=10908226)
901
+ 863 train 7.458572 (lr=1.7931e-05) (hash(x)=13867151)
902
+ 864 train 7.649601 (lr=1.7819e-05) (hash(x)=15308571)
903
+ 865 train 7.485988 (lr=1.7708e-05) (hash(x)=11767996)
904
+ 866 train 7.474010 (lr=1.7598e-05) (hash(x)=11493377)
905
+ 867 train 7.429469 (lr=1.7488e-05) (hash(x)=7116954)
906
+ 868 train 7.310431 (lr=1.7379e-05) (hash(x)=11702864)
907
+ 869 train 7.423225 (lr=1.7271e-05) (hash(x)=14051764)
908
+ 870 train 7.518801 (lr=1.7163e-05) (hash(x)=12634339)
909
+ 871 train 7.376614 (lr=1.7056e-05) (hash(x)=14576040)
910
+ 872 train 7.443768 (lr=1.6950e-05) (hash(x)=11113089)
911
+ 873 train 7.387144 (lr=1.6845e-05) (hash(x)=12823709)
912
+ 874 train 7.660709 (lr=1.6740e-05) (hash(x)=9995251)
913
+ 875 train 7.845885 (lr=1.6636e-05) (hash(x)=11451043)
914
+ 876 train 7.314564 (lr=1.6533e-05) (hash(x)=11894772)
915
+ 877 train 7.377823 (lr=1.6431e-05) (hash(x)=13446358)
916
+ 878 train 7.522431 (lr=1.6329e-05) (hash(x)=12318042)
917
+ 879 train 7.501781 (lr=1.6228e-05) (hash(x)=12735546)
918
+ 880 train 7.443748 (lr=1.6128e-05) (hash(x)=9676917)
919
+ 881 train 7.440009 (lr=1.6029e-05) (hash(x)=14742769)
920
+ 882 train 7.470830 (lr=1.5930e-05) (hash(x)=10560897)
921
+ 883 train 7.328776 (lr=1.5832e-05) (hash(x)=13520412)
922
+ 884 train 7.465231 (lr=1.5735e-05) (hash(x)=12425900)
923
+ 885 train 7.483104 (lr=1.5639e-05) (hash(x)=11633472)
924
+ 886 train 7.408842 (lr=1.5543e-05) (hash(x)=10625407)
925
+ 887 train 7.400389 (lr=1.5449e-05) (hash(x)=10330193)
926
+ 888 train 7.161911 (lr=1.5354e-05) (hash(x)=12213510)
927
+ 889 train 7.431158 (lr=1.5261e-05) (hash(x)=13392891)
928
+ 890 train 7.476339 (lr=1.5169e-05) (hash(x)=11215036)
929
+ 891 train 7.274102 (lr=1.5077e-05) (hash(x)=10349282)
930
+ 892 train 7.436772 (lr=1.4986e-05) (hash(x)=13700131)
931
+ 893 train 7.462317 (lr=1.4896e-05) (hash(x)=13147693)
932
+ 894 train 7.354665 (lr=1.4806e-05) (hash(x)=11480067)
933
+ 895 train 7.304891 (lr=1.4718e-05) (hash(x)=10416293)
934
+ 896 train 7.307038 (lr=1.4630e-05) (hash(x)=12020755)
935
+ 897 train 7.335043 (lr=1.4543e-05) (hash(x)=10382090)
936
+ 898 train 7.789202 (lr=1.4456e-05) (hash(x)=16574151)
937
+ 899 train 7.581019 (lr=1.4371e-05) (hash(x)=11403044)
938
+ 900 val loss 7.3793
939
+ 900 val perplexity 1602.4288
940
+ 900 train 7.413050 (lr=1.4286e-05) (hash(x)=12647667)
941
+ 901 train 7.282578 (lr=1.4202e-05) (hash(x)=11055290)
942
+ 902 train 7.288463 (lr=1.4119e-05) (hash(x)=13105089)
943
+ 903 train 7.267941 (lr=1.4037e-05) (hash(x)=10543763)
944
+ 904 train 7.352409 (lr=1.3955e-05) (hash(x)=11998234)
945
+ 905 train 7.401732 (lr=1.3874e-05) (hash(x)=10278798)
946
+ 906 train 7.408617 (lr=1.3794e-05) (hash(x)=14632798)
947
+ 907 train 7.336319 (lr=1.3715e-05) (hash(x)=12739117)
948
+ 908 train 7.201749 (lr=1.3637e-05) (hash(x)=14406428)
949
+ 909 train 7.632793 (lr=1.3559e-05) (hash(x)=15644778)
950
+ 910 train 7.337009 (lr=1.3482e-05) (hash(x)=11144683)
951
+ 911 train 7.328800 (lr=1.3407e-05) (hash(x)=12262130)
952
+ 912 train 7.224071 (lr=1.3331e-05) (hash(x)=12038725)
953
+ 913 train 7.530795 (lr=1.3257e-05) (hash(x)=13080738)
954
+ 914 train 7.606737 (lr=1.3183e-05) (hash(x)=12338355)
955
+ 915 train 7.378728 (lr=1.3111e-05) (hash(x)=13000590)
956
+ 916 train 7.482515 (lr=1.3039e-05) (hash(x)=11862013)
957
+ 917 train 7.495930 (lr=1.2968e-05) (hash(x)=12257037)
958
+ 918 train 7.399986 (lr=1.2897e-05) (hash(x)=10977395)
959
+ 919 train 7.434460 (lr=1.2828e-05) (hash(x)=14493957)
960
+ 920 train 7.457164 (lr=1.2759e-05) (hash(x)=12659659)
961
+ 921 train 7.387201 (lr=1.2691e-05) (hash(x)=14810595)
962
+ 922 train 7.409313 (lr=1.2624e-05) (hash(x)=13100526)
963
+ 923 train 7.562007 (lr=1.2558e-05) (hash(x)=11123341)
964
+ 924 train 7.425322 (lr=1.2493e-05) (hash(x)=13565184)
965
+ 925 train 7.461747 (lr=1.2428e-05) (hash(x)=12312758)
966
+ 926 train 7.523674 (lr=1.2364e-05) (hash(x)=11515370)
967
+ 927 train 7.302715 (lr=1.2301e-05) (hash(x)=13091244)
968
+ 928 train 7.320412 (lr=1.2239e-05) (hash(x)=12887643)
969
+ 929 train 7.458811 (lr=1.2178e-05) (hash(x)=15277954)
970
+ 930 train 7.397930 (lr=1.2118e-05) (hash(x)=11614361)
971
+ 931 train 7.415065 (lr=1.2058e-05) (hash(x)=11439940)
972
+ 932 train 7.289891 (lr=1.1999e-05) (hash(x)=10590452)
973
+ 933 train 7.339827 (lr=1.1941e-05) (hash(x)=10904100)
974
+ 934 train 7.281889 (lr=1.1884e-05) (hash(x)=13412637)
975
+ 935 train 7.281852 (lr=1.1828e-05) (hash(x)=12457412)
976
+ 936 train 7.408570 (lr=1.1772e-05) (hash(x)=13425846)
977
+ 937 train 7.522624 (lr=1.1718e-05) (hash(x)=13293146)
978
+ 938 train 7.439196 (lr=1.1664e-05) (hash(x)=12577151)
979
+ 939 train 7.323372 (lr=1.1611e-05) (hash(x)=12445595)
980
+ 940 train 7.387302 (lr=1.1559e-05) (hash(x)=12525367)
981
+ 941 train 7.417640 (lr=1.1508e-05) (hash(x)=10596022)
982
+ 942 train 7.426594 (lr=1.1457e-05) (hash(x)=11488899)
983
+ 943 train 7.374903 (lr=1.1408e-05) (hash(x)=13400893)
984
+ 944 train 7.544822 (lr=1.1359e-05) (hash(x)=13518118)
985
+ 945 train 7.322078 (lr=1.1311e-05) (hash(x)=13969792)
986
+ 946 train 7.459402 (lr=1.1264e-05) (hash(x)=11569968)
987
+ 947 train 7.151908 (lr=1.1218e-05) (hash(x)=11587799)
988
+ 948 train 7.301124 (lr=1.1173e-05) (hash(x)=10471877)
989
+ 949 train 7.334065 (lr=1.1128e-05) (hash(x)=13316743)
990
+ 950 val loss 7.3762
991
+ 950 val perplexity 1597.5118
992
+ 950 train 7.409931 (lr=1.1085e-05) (hash(x)=10667129)
993
+ 951 train 7.340261 (lr=1.1042e-05) (hash(x)=11093427)
994
+ 952 train 7.190322 (lr=1.1000e-05) (hash(x)=11433879)
995
+ 953 train 7.396222 (lr=1.0959e-05) (hash(x)=14275750)
996
+ 954 train 7.466445 (lr=1.0919e-05) (hash(x)=13079600)
997
+ 955 train 7.329465 (lr=1.0879e-05) (hash(x)=12556789)
998
+ 956 train 7.386924 (lr=1.0841e-05) (hash(x)=13226520)
999
+ 957 train 7.349053 (lr=1.0803e-05) (hash(x)=11533849)
1000
+ 958 train 7.539642 (lr=1.0766e-05) (hash(x)=15632652)
1001
+ 959 train 7.283098 (lr=1.0730e-05) (hash(x)=13690397)
1002
+ 960 train 7.298715 (lr=1.0695e-05) (hash(x)=14961564)
1003
+ 961 train 7.443252 (lr=1.0661e-05) (hash(x)=12793426)
1004
+ 962 train 7.322987 (lr=1.0628e-05) (hash(x)=11519292)
1005
+ 963 train 7.461276 (lr=1.0595e-05) (hash(x)=15699093)
1006
+ 964 train 7.649209 (lr=1.0563e-05) (hash(x)=14151583)
1007
+ 965 train 7.524677 (lr=1.0533e-05) (hash(x)=12375363)
1008
+ 966 train 7.494086 (lr=1.0503e-05) (hash(x)=11486413)
1009
+ 967 train 7.389461 (lr=1.0474e-05) (hash(x)=12757217)
1010
+ 968 train 7.347562 (lr=1.0445e-05) (hash(x)=11080058)
1011
+ 969 train 7.373491 (lr=1.0418e-05) (hash(x)=13865969)
1012
+ 970 train 7.376685 (lr=1.0391e-05) (hash(x)=12301277)
1013
+ 971 train 7.443698 (lr=1.0366e-05) (hash(x)=13365028)
1014
+ 972 train 7.328244 (lr=1.0341e-05) (hash(x)=11511196)
1015
+ 973 train 7.403480 (lr=1.0317e-05) (hash(x)=14686910)
1016
+ 974 train 7.171707 (lr=1.0294e-05) (hash(x)=13214062)
1017
+ 975 train 7.541519 (lr=1.0272e-05) (hash(x)=15655824)
1018
+ 976 train 7.370890 (lr=1.0251e-05) (hash(x)=11323316)
1019
+ 977 train 7.264448 (lr=1.0230e-05) (hash(x)=10909712)
1020
+ 978 train 7.283164 (lr=1.0211e-05) (hash(x)=14617155)
1021
+ 979 train 7.260581 (lr=1.0192e-05) (hash(x)=8413796)
1022
+ 980 train 7.389220 (lr=1.0174e-05) (hash(x)=9112154)
1023
+ 981 train 7.179283 (lr=1.0157e-05) (hash(x)=9600331)
1024
+ 982 train 7.341630 (lr=1.0141e-05) (hash(x)=12843658)
1025
+ 983 train 7.362242 (lr=1.0126e-05) (hash(x)=13336155)
1026
+ 984 train 7.594541 (lr=1.0111e-05) (hash(x)=15924834)
1027
+ 985 train 7.708756 (lr=1.0098e-05) (hash(x)=12161743)
1028
+ 986 train 7.397781 (lr=1.0085e-05) (hash(x)=11615346)
1029
+ 987 train 7.337195 (lr=1.0074e-05) (hash(x)=11098157)
1030
+ 988 train 7.329396 (lr=1.0063e-05) (hash(x)=13405233)
1031
+ 989 train 7.402912 (lr=1.0053e-05) (hash(x)=10512978)
1032
+ 990 train 7.445797 (lr=1.0044e-05) (hash(x)=12449591)
1033
+ 991 train 7.345307 (lr=1.0035e-05) (hash(x)=12353085)
1034
+ 992 train 7.267563 (lr=1.0028e-05) (hash(x)=10439054)
1035
+ 993 train 7.450946 (lr=1.0021e-05) (hash(x)=12846917)
1036
+ 994 train 7.719939 (lr=1.0016e-05) (hash(x)=15342566)
1037
+ 995 train 7.288847 (lr=1.0011e-05) (hash(x)=10222870)
1038
+ 996 train 7.556872 (lr=1.0007e-05) (hash(x)=13109457)
1039
+ 997 train 7.482511 (lr=1.0004e-05) (hash(x)=12815840)
1040
+ 998 train 7.388678 (lr=1.0002e-05) (hash(x)=11914939)
1041
+ 999 val loss 7.3727
1042
+ 999 val perplexity 1591.9333
1043
+ 999 train 7.799417 (lr=1.0000e-05) (hash(x)=19320835)
lr1e-4_total_batch_size40960_seq_len256/model_00999.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3da0f59b43f3172e417c614bc19376627b71ab1b81d2d78cab8a584add61ab4e
3
+ size 38587970
lr1e-4_total_batch_size40960_seq_len256/optimizer_00999.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4afd354f13ddb3a1934e7ce337f097b985d5aa0546f40aa84a57872a9d33a0f0
3
+ size 70895430