Upload folder using huggingface_hub
Browse files
attention_kindselective_n_heads4_seed1343/args.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_9/attention_kindselective_n_heads4_seed1343", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_9", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1343, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr":
|
|
|
|
| 1 |
+
{"hellaswag": true, "attention_kind": "selective", "log_dir": "wider_is_better_9/attention_kindselective_n_heads4_seed1343", "resume_checkpoint": null, "resume_optimizer": false, "add_a_head": false, "add_head_to_start": true, "new_head_init": "normal", "n_heads": 4, "protect_bos_token": true, "prevent_from_masking_myself": true, "max_steps": 8750, "warmup_steps": 500, "group": "wider_is_better_9", "use_wandb": true, "kill_self_after_run": false, "random_seed": 1343, "memory_penalty_epsilon": 0.1, "selection_head_linear_combo": "none", "selection_head_linear_combo_scale": 1.0, "protection_kind": "none", "leaky_relu_alpha": null, "leaky_relu_bias": null, "use_compile": true, "use_mini_model": false, "upload_to_hf": true, "seq_len": 256, "batch_size": 120, "total_batch_size": 61440, "protection_head_scaling_factor": 1.0, "protection_head_bias": 0.0, "n_sliced_masks": null, "n_latent_masks": null, "mask_layernorm": false, "residual_attention_masks": false, "compute_base_shapes": false, "base_shapes_savefile": null, "mup": true, "disable_selection": false, "mup_enable_coord_check_logging": false, "max_lr": 5e-05, "decay_lr": true, "readout_zero_init": false, "query_zero_init": false, "l1_loss": false, "debugpy": false, "key": "5e-5_61440_4_1343", "n_embd": 256}
|
attention_kindselective_n_heads4_seed1343/log2.txt
CHANGED
|
@@ -1,267 +1,282 @@
|
|
| 1 |
max_steps: 8750
|
| 2 |
0 val loss 11.3014
|
| 3 |
0 val perplexity 80932.9531
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
8500 val loss 5.
|
| 258 |
-
8500 val perplexity
|
| 259 |
-
8500 train 5.
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
max_steps: 8750
|
| 2 |
0 val loss 11.3014
|
| 3 |
0 val perplexity 80932.9531
|
| 4 |
+
5300 val loss 5.9350
|
| 5 |
+
5300 val perplexity 378.0438
|
| 6 |
+
5300 train 5.929630 (lr=1.3068e-05) (hash(x)=157909233)
|
| 7 |
+
0 train 11.305982 (lr=1.0000e-07) (hash(x)=153418542)
|
| 8 |
+
100 val loss 9.9370
|
| 9 |
+
100 val perplexity 20681.1777
|
| 10 |
+
100 train 9.942489 (lr=1.0100e-05) (hash(x)=143722001)
|
| 11 |
+
5400 val loss 5.9298
|
| 12 |
+
5400 val perplexity 376.0854
|
| 13 |
+
5400 train 5.810923 (lr=1.2573e-05) (hash(x)=142250860)
|
| 14 |
+
200 val loss 9.1371
|
| 15 |
+
200 val perplexity 9294.0771
|
| 16 |
+
200 train 9.129601 (lr=2.0100e-05) (hash(x)=152989689)
|
| 17 |
+
300 val loss 7.9842
|
| 18 |
+
300 val perplexity 2934.3018
|
| 19 |
+
300 train 7.992355 (lr=3.0100e-05) (hash(x)=150071018)
|
| 20 |
+
5500 val loss 5.9204
|
| 21 |
+
5500 val perplexity 372.5773
|
| 22 |
+
5500 train 5.791650 (lr=1.2085e-05) (hash(x)=145694703)
|
| 23 |
+
400 val loss 7.5357
|
| 24 |
+
400 val perplexity 1873.8107
|
| 25 |
+
400 train 7.571447 (lr=4.0100e-05) (hash(x)=153358238)
|
| 26 |
+
500 val loss 7.4068
|
| 27 |
+
500 val perplexity 1647.1067
|
| 28 |
+
500 train 7.281205 (lr=5.0000e-05) (hash(x)=148983354)
|
| 29 |
+
5600 val loss 5.9127
|
| 30 |
+
5600 val perplexity 369.6888
|
| 31 |
+
5600 train 5.836017 (lr=1.1602e-05) (hash(x)=141788252)
|
| 32 |
+
600 val loss 7.2971
|
| 33 |
+
600 val perplexity 1475.9901
|
| 34 |
+
600 train 7.181204 (lr=4.9984e-05) (hash(x)=150770333)
|
| 35 |
+
5700 val loss 5.9078
|
| 36 |
+
5700 val perplexity 367.8842
|
| 37 |
+
5700 train 5.805344 (lr=1.1127e-05) (hash(x)=150886692)
|
| 38 |
+
700 val loss 7.1623
|
| 39 |
+
700 val perplexity 1289.9027
|
| 40 |
+
700 train 7.078976 (lr=4.9935e-05) (hash(x)=145246201)
|
| 41 |
+
800 val loss 7.0603
|
| 42 |
+
800 val perplexity 1164.7977
|
| 43 |
+
800 train 6.888188 (lr=4.9853e-05) (hash(x)=142635842)
|
| 44 |
+
5800 val loss 5.9017
|
| 45 |
+
5800 val perplexity 365.6725
|
| 46 |
+
5800 train 5.802686 (lr=1.0659e-05) (hash(x)=151814419)
|
| 47 |
+
900 val loss 6.9689
|
| 48 |
+
900 val perplexity 1063.0038
|
| 49 |
+
900 train 6.991961 (lr=4.9739e-05) (hash(x)=148299832)
|
| 50 |
+
1000 val loss 6.8839
|
| 51 |
+
1000 val perplexity 976.4011
|
| 52 |
+
1000 train 6.845263 (lr=4.9593e-05) (hash(x)=152887721)
|
| 53 |
+
5900 val loss 5.8942
|
| 54 |
+
5900 val perplexity 362.9083
|
| 55 |
+
5900 train 5.902763 (lr=1.0200e-05) (hash(x)=146972539)
|
| 56 |
+
1100 val loss 6.8028
|
| 57 |
+
1100 val perplexity 900.3577
|
| 58 |
+
1100 train 6.696290 (lr=4.9415e-05) (hash(x)=153721194)
|
| 59 |
+
6000 val loss 5.8877
|
| 60 |
+
6000 val perplexity 360.5864
|
| 61 |
+
6000 train 5.768605 (lr=9.7500e-06) (hash(x)=150219954)
|
| 62 |
+
1200 val loss 6.7168
|
| 63 |
+
1200 val perplexity 826.1346
|
| 64 |
+
1200 train 6.708447 (lr=4.9205e-05) (hash(x)=153047184)
|
| 65 |
+
1300 val loss 6.6280
|
| 66 |
+
1300 val perplexity 755.9318
|
| 67 |
+
1300 train 6.528294 (lr=4.8964e-05) (hash(x)=152466045)
|
| 68 |
+
6100 val loss 5.8844
|
| 69 |
+
6100 val perplexity 359.3886
|
| 70 |
+
6100 train 5.883430 (lr=9.3098e-06) (hash(x)=161131435)
|
| 71 |
+
1400 val loss 6.5690
|
| 72 |
+
1400 val perplexity 712.6658
|
| 73 |
+
1400 train 6.363219 (lr=4.8691e-05) (hash(x)=151991552)
|
| 74 |
+
1500 val loss 6.5018
|
| 75 |
+
1500 val perplexity 666.3365
|
| 76 |
+
1500 train 6.451866 (lr=4.8388e-05) (hash(x)=147892594)
|
| 77 |
+
6200 val loss 5.8741
|
| 78 |
+
6200 val perplexity 355.6878
|
| 79 |
+
6200 train 5.881380 (lr=8.8800e-06) (hash(x)=150226893)
|
| 80 |
+
1600 val loss 6.4469
|
| 81 |
+
1600 val perplexity 630.7286
|
| 82 |
+
1600 train 6.254613 (lr=4.8055e-05) (hash(x)=135678663)
|
| 83 |
+
6300 val loss 5.8708
|
| 84 |
+
6300 val perplexity 354.5362
|
| 85 |
+
6300 train 5.700086 (lr=8.4613e-06) (hash(x)=144389552)
|
| 86 |
+
1700 val loss 6.4135
|
| 87 |
+
1700 val perplexity 610.0433
|
| 88 |
+
1700 train 6.435209 (lr=4.7691e-05) (hash(x)=143909276)
|
| 89 |
+
1800 val loss 6.3751
|
| 90 |
+
1800 val perplexity 587.0577
|
| 91 |
+
1800 train 6.530271 (lr=4.7299e-05) (hash(x)=155393468)
|
| 92 |
+
6400 val loss 5.8670
|
| 93 |
+
6400 val perplexity 353.2032
|
| 94 |
+
6400 train 5.655887 (lr=8.0542e-06) (hash(x)=147757943)
|
| 95 |
+
1900 val loss 6.3249
|
| 96 |
+
1900 val perplexity 558.3013
|
| 97 |
+
1900 train 6.369246 (lr=4.6878e-05) (hash(x)=150957388)
|
| 98 |
+
2000 val loss 6.2900
|
| 99 |
+
2000 val perplexity 539.1518
|
| 100 |
+
2000 train 6.559343 (lr=4.6428e-05) (hash(x)=144030938)
|
| 101 |
+
6500 val loss 5.8594
|
| 102 |
+
6500 val perplexity 350.5174
|
| 103 |
+
6500 train 5.836080 (lr=7.6594e-06) (hash(x)=144616373)
|
| 104 |
+
2100 val loss 6.2696
|
| 105 |
+
2100 val perplexity 528.2617
|
| 106 |
+
2100 train 6.147165 (lr=4.5951e-05) (hash(x)=146254754)
|
| 107 |
+
6600 val loss 5.8593
|
| 108 |
+
6600 val perplexity 350.4654
|
| 109 |
+
6600 train 5.690321 (lr=7.2774e-06) (hash(x)=145266452)
|
| 110 |
+
2200 val loss 6.2267
|
| 111 |
+
2200 val perplexity 506.0966
|
| 112 |
+
2200 train 6.254446 (lr=4.5448e-05) (hash(x)=150630461)
|
| 113 |
+
2300 val loss 6.2015
|
| 114 |
+
2300 val perplexity 493.4760
|
| 115 |
+
2300 train 6.340286 (lr=4.4918e-05) (hash(x)=165208942)
|
| 116 |
+
6700 val loss 5.8523
|
| 117 |
+
6700 val perplexity 348.0405
|
| 118 |
+
6700 train 5.836007 (lr=6.9087e-06) (hash(x)=149952261)
|
| 119 |
+
2400 val loss 6.1805
|
| 120 |
+
2400 val perplexity 483.2501
|
| 121 |
+
2400 train 6.103783 (lr=4.4363e-05) (hash(x)=153448706)
|
| 122 |
+
2500 val loss 6.1521
|
| 123 |
+
2500 val perplexity 469.6801
|
| 124 |
+
2500 train 6.168651 (lr=4.3784e-05) (hash(x)=148505056)
|
| 125 |
+
6800 val loss 5.8483
|
| 126 |
+
6800 val perplexity 346.6548
|
| 127 |
+
6800 train 5.733516 (lr=6.5540e-06) (hash(x)=143351199)
|
| 128 |
+
2600 val loss 6.1297
|
| 129 |
+
2600 val perplexity 459.3144
|
| 130 |
+
2600 train 6.091936 (lr=4.3181e-05) (hash(x)=142911960)
|
| 131 |
+
2700 val loss 6.1088
|
| 132 |
+
2700 val perplexity 449.7947
|
| 133 |
+
2700 train 6.114074 (lr=4.2555e-05) (hash(x)=149951660)
|
| 134 |
+
6900 val loss 5.8465
|
| 135 |
+
6900 val perplexity 346.0152
|
| 136 |
+
6900 train 5.807109 (lr=6.2137e-06) (hash(x)=156055618)
|
| 137 |
+
2800 val loss 6.0780
|
| 138 |
+
2800 val perplexity 436.1577
|
| 139 |
+
2800 train 6.063082 (lr=4.1908e-05) (hash(x)=152956713)
|
| 140 |
+
7000 val loss 5.8402
|
| 141 |
+
7000 val perplexity 343.8394
|
| 142 |
+
7000 train 5.918152 (lr=5.8883e-06) (hash(x)=164290908)
|
| 143 |
+
2900 val loss 6.0604
|
| 144 |
+
2900 val perplexity 428.5489
|
| 145 |
+
2900 train 5.973416 (lr=4.1240e-05) (hash(x)=147247056)
|
| 146 |
+
3000 val loss 6.0407
|
| 147 |
+
3000 val perplexity 420.2025
|
| 148 |
+
3000 train 5.843521 (lr=4.0551e-05) (hash(x)=146911716)
|
| 149 |
+
7100 val loss 5.8382
|
| 150 |
+
7100 val perplexity 343.1647
|
| 151 |
+
7100 train 5.936929 (lr=5.5783e-06) (hash(x)=150263832)
|
| 152 |
+
3100 val loss 6.0184
|
| 153 |
+
3100 val perplexity 410.9281
|
| 154 |
+
3100 train 6.035954 (lr=3.9844e-05) (hash(x)=153282809)
|
| 155 |
+
3200 val loss 6.0051
|
| 156 |
+
3200 val perplexity 405.4742
|
| 157 |
+
3200 train 5.907290 (lr=3.9119e-05) (hash(x)=152009984)
|
| 158 |
+
7200 val loss 5.8362
|
| 159 |
+
7200 val perplexity 342.4646
|
| 160 |
+
7200 train 5.518635 (lr=5.2841e-06) (hash(x)=139219680)
|
| 161 |
+
3300 val loss 5.9858
|
| 162 |
+
3300 val perplexity 397.7392
|
| 163 |
+
3300 train 5.894918 (lr=3.8377e-05) (hash(x)=150012952)
|
| 164 |
+
7300 val loss 5.8303
|
| 165 |
+
7300 val perplexity 340.4775
|
| 166 |
+
7300 train 5.771295 (lr=5.0062e-06) (hash(x)=150662994)
|
| 167 |
+
3400 val loss 5.9756
|
| 168 |
+
3400 val perplexity 393.7145
|
| 169 |
+
3400 train 5.807843 (lr=3.7619e-05) (hash(x)=146217477)
|
| 170 |
+
3500 val loss 5.9508
|
| 171 |
+
3500 val perplexity 384.0594
|
| 172 |
+
3500 train 6.119646 (lr=3.6847e-05) (hash(x)=180156144)
|
| 173 |
+
7400 val loss 5.8271
|
| 174 |
+
7400 val perplexity 339.3801
|
| 175 |
+
7400 train 5.673862 (lr=4.7449e-06) (hash(x)=139007967)
|
| 176 |
+
3600 val loss 5.9351
|
| 177 |
+
3600 val perplexity 378.0744
|
| 178 |
+
3600 train 5.971004 (lr=3.6061e-05) (hash(x)=154243319)
|
| 179 |
+
3700 val loss 5.9196
|
| 180 |
+
3700 val perplexity 372.2513
|
| 181 |
+
3700 train 5.821632 (lr=3.5263e-05) (hash(x)=154330476)
|
| 182 |
+
7500 val loss 5.8258
|
| 183 |
+
7500 val perplexity 338.9212
|
| 184 |
+
7500 train 5.653856 (lr=4.5007e-06) (hash(x)=138142461)
|
| 185 |
+
3800 val loss 5.9064
|
| 186 |
+
3800 val perplexity 367.3907
|
| 187 |
+
3800 train 5.841512 (lr=3.4453e-05) (hash(x)=145988858)
|
| 188 |
+
3900 val loss 5.8835
|
| 189 |
+
3900 val perplexity 359.0600
|
| 190 |
+
3900 train 5.893821 (lr=3.3633e-05) (hash(x)=149648609)
|
| 191 |
+
7600 val loss 5.8216
|
| 192 |
+
7600 val perplexity 337.5147
|
| 193 |
+
7600 train 5.742036 (lr=4.2739e-06) (hash(x)=150023998)
|
| 194 |
+
4000 val loss 5.8752
|
| 195 |
+
4000 val perplexity 356.0929
|
| 196 |
+
4000 train 5.851351 (lr=3.2805e-05) (hash(x)=154149272)
|
| 197 |
+
7700 val loss 5.8184
|
| 198 |
+
7700 val perplexity 336.4380
|
| 199 |
+
7700 train 5.712600 (lr=4.0648e-06) (hash(x)=143393355)
|
| 200 |
+
4100 val loss 5.8550
|
| 201 |
+
4100 val perplexity 348.9578
|
| 202 |
+
4100 train 5.763957 (lr=3.1968e-05) (hash(x)=132608538)
|
| 203 |
+
4200 val loss 5.8416
|
| 204 |
+
4200 val perplexity 344.3347
|
| 205 |
+
4200 train 5.797907 (lr=3.1126e-05) (hash(x)=140443636)
|
| 206 |
+
7800 val loss 5.8188
|
| 207 |
+
7800 val perplexity 336.5681
|
| 208 |
+
7800 train 5.698625 (lr=3.8738e-06) (hash(x)=144916472)
|
| 209 |
+
4300 val loss 5.8399
|
| 210 |
+
4300 val perplexity 343.7368
|
| 211 |
+
4300 train 5.770792 (lr=3.0277e-05) (hash(x)=138919540)
|
| 212 |
+
7900 val loss 5.8142
|
| 213 |
+
7900 val perplexity 335.0300
|
| 214 |
+
7900 train 5.762723 (lr=3.7010e-06) (hash(x)=150236934)
|
| 215 |
+
4400 val loss 5.8217
|
| 216 |
+
4400 val perplexity 337.5602
|
| 217 |
+
4400 train 5.814669 (lr=2.9425e-05) (hash(x)=153594684)
|
| 218 |
+
8000 val loss 5.8109
|
| 219 |
+
8000 val perplexity 333.9146
|
| 220 |
+
8000 train 5.801303 (lr=3.5468e-06) (hash(x)=146536422)
|
| 221 |
+
4500 val loss 5.8054
|
| 222 |
+
4500 val perplexity 332.0867
|
| 223 |
+
4500 train 5.757834 (lr=2.8571e-05) (hash(x)=144084750)
|
| 224 |
+
4600 val loss 5.7896
|
| 225 |
+
4600 val perplexity 326.8730
|
| 226 |
+
4600 train 5.754485 (lr=2.7714e-05) (hash(x)=147423675)
|
| 227 |
+
8100 val loss 5.8116
|
| 228 |
+
8100 val perplexity 334.1635
|
| 229 |
+
8100 train 5.641131 (lr=3.4114e-06) (hash(x)=151300857)
|
| 230 |
+
4700 val loss 5.7797
|
| 231 |
+
4700 val perplexity 323.6523
|
| 232 |
+
4700 train 5.849201 (lr=2.6857e-05) (hash(x)=162157039)
|
| 233 |
+
8200 val loss 5.8100
|
| 234 |
+
8200 val perplexity 333.6100
|
| 235 |
+
8200 train 5.856618 (lr=3.2950e-06) (hash(x)=160351956)
|
| 236 |
+
4800 val loss 5.7646
|
| 237 |
+
4800 val perplexity 318.8203
|
| 238 |
+
4800 train 5.665515 (lr=2.6002e-05) (hash(x)=134970942)
|
| 239 |
+
4900 val loss 5.7593
|
| 240 |
+
4900 val perplexity 317.1256
|
| 241 |
+
4900 train 5.672352 (lr=2.5148e-05) (hash(x)=160093370)
|
| 242 |
+
8300 val loss 5.8071
|
| 243 |
+
8300 val perplexity 332.6576
|
| 244 |
+
8300 train 5.831361 (lr=3.1977e-06) (hash(x)=150514540)
|
| 245 |
+
5000 val loss 5.7443
|
| 246 |
+
5000 val perplexity 312.3950
|
| 247 |
+
5000 train 5.635426 (lr=2.4298e-05) (hash(x)=154986299)
|
| 248 |
+
8400 val loss 5.8044
|
| 249 |
+
8400 val perplexity 331.7480
|
| 250 |
+
8400 train 5.756098 (lr=3.1197e-06) (hash(x)=155904762)
|
| 251 |
+
5100 val loss 5.7323
|
| 252 |
+
5100 val perplexity 308.6844
|
| 253 |
+
5100 train 5.580538 (lr=2.3452e-05) (hash(x)=145406582)
|
| 254 |
+
5200 val loss 5.7249
|
| 255 |
+
5200 val perplexity 306.3990
|
| 256 |
+
5200 train 5.787915 (lr=2.2613e-05) (hash(x)=148029261)
|
| 257 |
+
8500 val loss 5.8045
|
| 258 |
+
8500 val perplexity 331.8015
|
| 259 |
+
8500 train 5.608176 (lr=3.0611e-06) (hash(x)=146923196)
|
| 260 |
+
5300 val loss 5.7115
|
| 261 |
+
5300 val perplexity 302.3222
|
| 262 |
+
5300 train 5.707219 (lr=2.1780e-05) (hash(x)=157909233)
|
| 263 |
+
5400 val loss 5.7009
|
| 264 |
+
5400 val perplexity 299.1322
|
| 265 |
+
5400 train 5.564538 (lr=2.0956e-05) (hash(x)=142250860)
|
| 266 |
+
8600 val loss 5.8018
|
| 267 |
+
8600 val perplexity 330.8946
|
| 268 |
+
8600 train 5.774884 (lr=3.0220e-06) (hash(x)=138977080)
|
| 269 |
+
5500 val loss 5.6929
|
| 270 |
+
5500 val perplexity 296.7486
|
| 271 |
+
5500 train 5.574304 (lr=2.0141e-05) (hash(x)=145694703)
|
| 272 |
+
5600 val loss 5.6864
|
| 273 |
+
5600 val perplexity 294.8181
|
| 274 |
+
5600 train 5.632887 (lr=1.9337e-05) (hash(x)=141788252)
|
| 275 |
+
8700 val loss 5.7992
|
| 276 |
+
8700 val perplexity 330.0240
|
| 277 |
+
8700 train 5.604139 (lr=3.0024e-06) (hash(x)=146442792)
|
| 278 |
+
8749 val loss 5.8000
|
| 279 |
+
8749 val perplexity 330.3081
|
| 280 |
+
5700 val loss 5.6781
|
| 281 |
+
5700 val perplexity 292.3879
|
| 282 |
+
5700 train 5.579291 (lr=1.8545e-05) (hash(x)=150886692)
|
attention_kindselective_n_heads4_seed1343/model_08749.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 92843394
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5643d53c3193f22149858b63d66e6274d0af1c83ad7e9e75f9ac9bc19d0527e4
|
| 3 |
size 92843394
|
attention_kindselective_n_heads4_seed1343/optimizer_08749.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 179406214
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b094829fc00bde95f33181023bb171bd419b78db7c0a3cac679dc7d571a1504
|
| 3 |
size 179406214
|